@INBOOK{0888Jayant1984, AUTHOR = {N. S. Jayant and Peter Noll}, TITLE = {Digital Coding of Waveforms, Principles and Applications to Speech and Video}, YEAR = {1984}, PUBLISHER = {Prentice-Hall}, PAGES = {688}, ADDRESS = {Englewood Cliffs NJ, USA}, NOTE = {N. S. Jayant: Bell Laboratories; ISBN 0-13-211913-7} } @ARTICLE{0935Bochow1988, AUTHOR = {Bernd Bochow and Thomas Gries and Peter Noll}, TITLE = {Bitfehlerresistente Teilbandcodierung von Sprache mit Vektorquantisierung der Nebeninformation}, JOURNAL = {ITG - Fachbericht 105 "Digitale Sprachverarbeitung - Prinzipien und Anwendungen"}, YEAR = {1988}, PAGES = {31--35} } @INPROCEEDINGS{0889Ohm1989, AUTHOR = {Jens-Rainer Ohm and Peter Noll}, TITLE = {Predictive Tree Encoding of Still Images with Vector Quantization}, BOOKTITLE = {URSI International Symposium on Signals, Systems, and Electronics (ISSSE'89)}, YEAR = {1989}, PAGES = {325--328}, ADDRESS = {Erlangen} } @INPROCEEDINGS{0890Bochow1989, AUTHOR = {Bernd Bochow and Bernd Czyrnik}, TITLE = {Multiprocessor Implementation of an ATC Audio Codec}, BOOKTITLE = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP '89)}, YEAR = {1989} } @INPROCEEDINGS{0891Fellbaum1989, AUTHOR = {Klaus Fellbaum}, TITLE = {Methods of Interfacing to Public Terminals Using Speech}, BOOKTITLE = {John Gill and Cathy Rundle ,Technical Developement Department RNIB}, YEAR = {1989}, ADDRESS = {London, UK} } @INPROCEEDINGS{0892Heinstein1989, AUTHOR = {R. 
Heinstein and Klaus Fellbaum and Helmut Loebner}, TITLE = {Speech Dialogue Systems - State of the Art and Selected Applications.}, BOOKTITLE = {European Conference on Speech Communication and Technology}, YEAR = {1989}, ADDRESS = {Paris} } @INPROCEEDINGS{0934Fellbaum1989, AUTHOR = {Klaus Fellbaum}, TITLE = {Access to Visual Computer Information for Blind People}, BOOKTITLE = {Proceedings of the Concerted Action on Technology and Blindness}, YEAR = {1989}, PAGES = {145}, ORGANIZATION = {TUB, Institut für Fernmeldetechnik}, ADDRESS = {Berlin} } @ARTICLE{0893Bochow1990, AUTHOR = {Bernd Bochow and Bernd Czyrnik and Peter Noll}, TITLE = {Realzeitimplementierung von Transformationscodierverfahren für hochqualitative Audiosignale}, JOURNAL = {Kleinheubacher Berichte}, YEAR = {1990}, PAGES = {605--612}, NUMBER = {33} } @INPROCEEDINGS{0894Podilchuk1990, AUTHOR = {C. Podilchuk and N. S. Jayant and Peter Noll}, TITLE = {Sparse Codebooks for the Quantization of Non-Dominant Subbands in Image Coding}, BOOKTITLE = {IEEE Intern. Conference on Acoustics, Speech, and Signal Processing(ICASSP'90)}, YEAR = {1990}, PAGES = {2101--2104}, NOTE = {C. Podilchuk, N. S. Jayant: AT&T Bell Laboratories} } @INPROCEEDINGS{0895Noll1990, AUTHOR = {Peter Noll}, TITLE = {Data Compression Techniques}, BOOKTITLE = {1st Working Conference on Common Standards for Quantitative Electrocardiography, "Digital ECG Data: Communication, Encoding and Storage"}, YEAR = {1990}, PAGES = {39--57}, ADDRESS = {Leuven (Belgien)}, NOTE = {invited paper} } @INPROCEEDINGS{0896Noll1990, AUTHOR = {Peter Noll}, TITLE = {Data Compression Techniques for New Standards in Speech and Image Coding}, BOOKTITLE = {VI. 
Internationales Weiterbildungsprogramm Berlin '90, TU Berlin, Zentrum für Technologische Zusammenarbeit}, YEAR = {1990}, PAGES = {245--264} } @ARTICLE{0897Ohm1990, AUTHOR = {Jens-Rainer Ohm and Peter Noll}, TITLE = {Predictive Tree Encoding of Still Images with Vector Quantization}, JOURNAL = {Annales des Télécommunications}, YEAR = {1990}, PAGES = {465--470}, VOLUME = {45}, NUMBER = {9-10} } @ARTICLE{0936Bochow1990, AUTHOR = {Bernd Bochow and Bernd Czyrnik and Peter Noll}, TITLE = {Realzeitimplementierung von Transformationscodierverfahren für hochqualitative Audiosignale}, JOURNAL = {Kleinheubacher Berichte}, YEAR = {1990}, PAGES = {605--612}, NOTE = {Band 33} } @INBOOK{0977Jayant1990, AUTHOR = {N. S. Jayant and Peter Noll}, TITLE = {Digital Coding of Waveforms - Principles and Applications to Speech and Video}, YEAR = {1990}, PUBLISHER = {Kai Fa Book Company}, PAGES = {688}, ADDRESS = {Taipei, Taiwan}, NOTE = {N. S. Jayant: AT&T Bell Laboratories} } @INPROCEEDINGS{0980Ohm1990, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Still Image Coding Using Predictive Tree-VQ with Sub-Band Decomposition}, BOOKTITLE = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 90)}, YEAR = {1990}, ADDRESS = {Albuquerque} } @INPROCEEDINGS{0981Ohm1990, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Classified Predictive Tree-VQ for Still Image Coding}, BOOKTITLE = {Picture Coding Symposium, MIT Media Lab}, YEAR = {1990}, ADDRESS = {USA} } @ARTICLE{0982Fellbaum1990, AUTHOR = {Klaus Fellbaum}, TITLE = {Sprache - elektronisch im Griff}, JOURNAL = {Funkschau}, YEAR = {1990}, NUMBER = {4/5} } @INPROCEEDINGS{0983Weber1990, AUTHOR = {R. 
Weber and Ernst Kabot}, TITLE = {Kreidequietschen - Ein Beispiel für ein unangenehmes Geräusch}, BOOKTITLE = {Fortschritte der Akustik - DAGA 90}, YEAR = {1990}, PAGES = {703--706} } @INPROCEEDINGS{0979Clüver1990, AUTHOR = {Kai Clüver and Thomas Gries and Hui Li}, TITLE = {Echtzeitimplementierung eines CELP-Codec mit ungleichgewichtigem Fehlerschutz}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1990}, MONTH = sep, EDITOR = {Klaus Fellbaum}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0920Fellbaum1991, AUTHOR = {Klaus Fellbaum and Peter Noll}, TITLE = {Arbeiten zur Sprachcodierung und Sprachverarbeitung am Berliner Institut für Fernmeldetechnik}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung, TU Dresden}, YEAR = {1991}, EDITOR = {R. Hoffmann}, PAGES = {8--30}, ADDRESS = {Dresden} } @INPROCEEDINGS{0921Li1991, AUTHOR = {Hui Li and Peter Noll}, TITLE = {The Performance of Rate Selective Punctured Convolutional Codes of Equal Code Length}, BOOKTITLE = {Sixth International Conference on Digital Signal Processing in Communications, Conf. Rec., IEEE Conf. Publication 340}, YEAR = {1991}, PAGES = {198--201}, ADDRESS = {Loughborough, Great Britain}, NOTE = {H. 
Li: Jiao-Tong-Universität Shanghai} } @INPROCEEDINGS{0922Becker1991, AUTHOR = {Dieter Becker and Klaus Fellbaum}, TITLE = {Isolated Word Recognition with Integrated Noise Reduction}, BOOKTITLE = {2nd European Conference on Speech Communication and Technology}, YEAR = {1991}, ADDRESS = {Genova, Italy} } @ARTICLE{0923Fellbaum1991, AUTHOR = {Klaus Fellbaum}, TITLE = {Digitale Sprachübertragung und Sprachverarbeitung}, JOURNAL = {ntz, Nachrichtentechnische Zeitschrift}, YEAR = {1991}, NUMBER = {4 und 5} } @ARTICLE{0924Fellbaum1991, AUTHOR = {Klaus Fellbaum}, TITLE = {Einführung in die elektronische Sprachverarbeitung}, JOURNAL = {Elektronische Sprachverarbeitung}, YEAR = {1991} } @INPROCEEDINGS{0925Fellbaum1991, AUTHOR = {Klaus Fellbaum}, TITLE = {Speech, Hearing, and Telecommunication}, BOOKTITLE = {Issues in Telecommunication and Disability}, YEAR = {1991}, ORGANIZATION = {Commission of the European Community}, ADDRESS = {Brussels, Belgium} } @ARTICLE{0926Fellbaum1991, AUTHOR = {Klaus Fellbaum}, TITLE = {Sprachausgabesysteme und ihre Anwendungen in der Praxis}, JOURNAL = {Elektronische Sprachverarbeitung}, YEAR = {1991} } @INPROCEEDINGS{0927Hettwer1991, AUTHOR = {G. Hettwer and Klaus Fellbaum}, TITLE = {Speech Processing and Noise Cancellation for Cars - State of the Art, Utility, Problems}, BOOKTITLE = {ISATA Proceedings}, YEAR = {1991}, ADDRESS = {Croydon, England} } @INPROCEEDINGS{0928Hoffmann1991, AUTHOR = {R. Hoffmann and Klaus Fellbaum}, TITLE = {Elektronische Sprachsignalverarbeitung}, BOOKTITLE = {Konferenzband, TU Dresden}, YEAR = {1991} } @BOOK{0929Kanbach1991, AUTHOR = {Andreas Kanbach and Andreas Körber}, TITLE = {ISDN - Die Technik}, YEAR = {1991}, PUBLISHER = {Hüthig, Heidelberg}, EDITION = {2.}, NOTE = {1. Aufl. 1990} } @INPROCEEDINGS{0930Klaus1991, AUTHOR = {Harald Klaus and Klaus Fellbaum}, TITLE = {Anwendungen der Spracherkennung für motorisch behinderte Benutzer}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung - 2. 
gemeinsame Konferenz der TU Berlin und Dresden und der HU Berlin}, YEAR = {1991} } @INPROCEEDINGS{0931Klaus1991, AUTHOR = {Harald Klaus and Klaus Fellbaum}, TITLE = {Einsatz der Spracherkennung und Sprachausgabe bei Personal Computern für motorisch behinderte Benutzer}, BOOKTITLE = {DAGA-Konferenzband Teil B}, YEAR = {1991}, ADDRESS = {Bochum, Germany} } @INPROCEEDINGS{0932Ohm1991, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Region-Oriented Predictive Tree-VQ- A New Approach for Image Coding Schemes Based on Segmentation Techniques}, BOOKTITLE = {Proceedings Picture Coding Symposium}, YEAR = {1991}, ADDRESS = {Kyoto, Japan} } @INPROCEEDINGS{0933Weber1991, AUTHOR = {R. Weber and Ernst Kabot}, TITLE = {Psychoakustische Parameter von Alarmsignalen}, BOOKTITLE = {Fortschritte der Akustik - DAGA 91}, YEAR = {1991}, PAGES = {509--512} } @ARTICLE{0914Kuang1992, AUTHOR = {J. Kuang and Peter Noll and F. Fu and J. Liu}, TITLE = {A Typical Channel Model of Digital Mobile Communications Applied in Speech Coding (auf chinesisch)}, JOURNAL = {Journal of Beijing Institute of Technology (Beijing, China)}, YEAR = {1992}, PAGES = {43--48}, VOLUME = {12}, NUMBER = {3}, NOTE = {J. Kuang, F. Fu, J. Liu: Beijing Institute of Technology, Peking} } @INPROCEEDINGS{0915Barthel1992, AUTHOR = {Kai Barthel and Stefan Bruhn and Peter Noll}, TITLE = {Sigvid - ein Programmsystem zur Unterstützung der Lehre auf dem Gebiet der digitalen Nachrichtensignalverarbeitung}, BOOKTITLE = {6. 
CIP-Kongreß- HU Berlin, Konferenzband Multimedia und Computeranwendungen in der Lehre}, YEAR = {1992}, PUBLISHER = {(Springer-Verlag)}, PAGES = {375--382}, ORGANIZATION = {Mikrocomputer-Forum für Bildung und Wissenschaft 5} } @INPROCEEDINGS{0917Jürgens1992, AUTHOR = {Carsten Jürgens}, TITLE = {Zur Klassifikation und Beurteilung von Sprachsyntheseverfahren}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1992}, ADDRESS = {Dresden} } @INPROCEEDINGS{0918Jürgens1992, AUTHOR = {Carsten Jürgens}, TITLE = {Arbeiten zur Sprachsynthese an der TU Berlin.}, BOOKTITLE = {DEGA-ITG-Diskussionssitzung "Vergleich realisierter Sprachsynthese-Systeme"}, YEAR = {1992}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0919Li1992, AUTHOR = {Hui Li and Peter Noll}, TITLE = {Hybrid Phase Trellis-Coded Modulation for Unequal Error Protection Coding}, BOOKTITLE = {Proceedings 1992 URSI International Symposium on Signals, Systems, and Electronics (ISSSE'92)}, YEAR = {1992}, PAGES = {113--116}, NOTE = {H. Li: Jiao-Tong-Universität Shanghai} } @INPROCEEDINGS{0916Ohm1992, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Temporal Domain Sub-Band Video Coding with Motion Compensation}, BOOKTITLE = {International Conference on Acoustics, Speech and Signal Processing (ICASSP-92)}, YEAR = {1992}, MONTH = mar, PAGES = {III/229 - III/232}, ADDRESS = {San Francisco, CA, USA} } @INPROCEEDINGS{0913Clüver1992, AUTHOR = {Kai Clüver and Thomas Gries and Hui Li and Peter Noll}, TITLE = {Real-Time Implementation of a CELP Codec with Unequal Error Protection}, BOOKTITLE = {Signal Processing VI - Proceedings of the Sixth European Signal Processing Conference, EUSIPCO-92}, YEAR = {1992}, MONTH = aug, PUBLISHER = {Elsevier}, PAGES = {1541--1544}, ADDRESS = {Brussels, Belgium}, NOTE = {H. 
Li: Jiao-Tong-Universität Shanghai} } @INPROCEEDINGS{0899Barthel1993, AUTHOR = {Kai Barthel and Thomas Voyé and Peter Noll}, TITLE = {Improved Fractal Image Coding}, BOOKTITLE = {Picture Coding Symposium PCS '93}, YEAR = {1993}, ADDRESS = {Lausanne, Switzerland}, NOTE = {Proceedings Section 1.5} } @INPROCEEDINGS{0900Noll1993, AUTHOR = {Peter Noll}, TITLE = {High Quality Audio Coding: The ISO/MPEG Standard(s)}, BOOKTITLE = {Cost 229 Workshop on Intelligent Terminals and Source and Channel Coding}, YEAR = {1993}, EDITOR = {K. Fazekas}, PAGES = {5--14}, ADDRESS = {Budapest, Hungary}, NOTE = {invited paper} } @INPROCEEDINGS{0901Noll1993, AUTHOR = {Peter Noll}, TITLE = {Speech Coding for Communications}, BOOKTITLE = {European Speech Processing Conference (EUSIPCO'93)}, YEAR = {1993}, PAGES = {479--488}, ADDRESS = {Berlin, Germany}, NOTE = {invited paper} } @INPROCEEDINGS{0902Noll1993, AUTHOR = {Peter Noll and G. Stoll}, TITLE = {ISO/MPEG High Quality Audio Coding}, BOOKTITLE = {High Definition Television Conference (HDTV '93)}, YEAR = {1993}, ADDRESS = {Ottawa, Canada}, NOTE = {invited paper, G. Stoll: Institut für Rundfunktechnik, München} } @INPROCEEDINGS{0904Noll1993, AUTHOR = {Peter Noll}, TITLE = {ISO/MPEG Audio Coding: Status and Trends}, BOOKTITLE = {Workshop on Mobile Multimedia Communications, MoMuc-1}, YEAR = {1993}, ADDRESS = {Tokyo}, NOTE = {invited paper} } @INPROCEEDINGS{0905Schamel1993, AUTHOR = {G. Schamel and H. Li}, TITLE = {Frequence Scanning in Digital Coding for HDTV Broadcasting}, BOOKTITLE = {SPIE EUROPTO-93}, YEAR = {1993}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0906Kabot1993, AUTHOR = {Ernst Kabot and R. 
Weber}, TITLE = {Kategorial beurteilte Schärfe von künstlichen und natürlichen Schallen}, BOOKTITLE = {Fortschritte der Akustik - DAGA 93}, YEAR = {1993}, PAGES = {840--843} } @INPROCEEDINGS{0912Jürgens1993, AUTHOR = {Carsten Jürgens}, TITLE = {Sprachsynthese auf Clusterbasis nach dem PSOLA-Verfahren}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1993}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0898Bitó1993, AUTHOR = {János Bitó and Jens-Rainer Ohm and Peter Noll}, TITLE = {A Simple Model for the Loss Process in the Cell Stream of Variable Bit Rate Video Sources}, BOOKTITLE = {VISICOM '93, 5th International Workshop on Packet Video}, YEAR = {1993}, MONTH = mar, EDITOR = {R. Schäfer}, PAGES = {C4.1 - C4.6}, ADDRESS = {Berlin, Germany}, NOTE = {J. Bitó: Technical University Budapest} } @INPROCEEDINGS{0907Ohm1993, AUTHOR = {Jens-Rainer Ohm}, TITLE = {3-D SBC-VQ with Motion Compensation}, BOOKTITLE = {Picture Coding Symposium (PCS'93)}, YEAR = {1993}, MONTH = mar, PAGES = {11.5--1 - 11.5--2}, ADDRESS = {Lausanne, Switzerland} } @INPROCEEDINGS{0909Ohm1993, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Layered VQ and SBC Techniques for Packet Video Applications}, BOOKTITLE = {5th International Workshop on Packet Video}, YEAR = {1993}, MONTH = mar, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0910Ohm1993, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Three-Dimensional Motion-Compensated Subband Coding}, BOOKTITLE = {International Symposium on Video Communications and Fiber Optic Services, SPIE}, YEAR = {1993}, MONTH = apr, PAGES = {188--197}, ADDRESS = {Berlin, Germany} } @ARTICLE{0911Ohm1993, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Advanced Packet Video Coding Based on Layered VQ and SBC Techniques}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology}, YEAR = {1993}, MONTH = jun, PAGES = {208--221}, VOLUME = {CSVT-3}, NUMBER = {3} } @ARTICLE{0903Noll1993, AUTHOR = {Peter Noll}, TITLE = {Wideband Speech and Audio Coding}, JOURNAL = {IEEE 
Communications Magazine}, YEAR = {1993}, MONTH = nov, PAGES = {34--44}, VOLUME = {31}, NUMBER = {11}, DOI = {10.1109/35.256878}, ABSTRACT = {Typical parameters of wideband speech and audio signals, including digitized versions of each, potential applications, and available transmission media, are described. Facts about human auditory perception that are exploited in audio coding and quality measures that play an important role in coder evaluations and designs are reviewed. Techniques for efficient coding of wideband speech and audio signals, with an emphasis on existing standards, are discussed. The audio coding standard developed by the Moving Pictures Expert Group within the International Organization for standardization (ISO/MPEG) is covered in some detail, since it will be used in many application areas, including digital storage, transmission, and broadcasting of audio-only signals and audiovisual applications such as videotelephony, videoconferencing, and TV broadcasting. Ongoing research and standardization work is outlined} } @INPROCEEDINGS{0880Bitó1994, AUTHOR = {János Bitó and Peter Noll}, TITLE = {Ein adaptives digitales Modell für satellitengestützte Landmobilfunkkanäle}, BOOKTITLE = {8. Aachener Kolloquium "Signaltheorie"}, YEAR = {1994}, ADDRESS = {Aachen}, NOTE = {J. Bitó: Technische Universität Budapest} } @INPROCEEDINGS{0964Kabot1994, AUTHOR = {Ernst Kabot and R. 
Weber}, TITLE = {Der Einfluß der Bandbreite von Rauschsignalen auf die Schärfeempfindung}, BOOKTITLE = {Fortschritte der Akustik - DAGA 94}, YEAR = {1994}, PAGES = {1029--1032} } @INPROCEEDINGS{0969Drews1994, AUTHOR = {Martin Drews}, TITLE = {Mehrkanal-Sprachpausendetektoren für Laufzeitsteuerungen}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1994}, PAGES = {341--348} } @ARTICLE{0970Drews1994, AUTHOR = {Martin Drews}, TITLE = {Echtzeitimplementierung eines Verfahrens zur Geräuschreduktion mit Mikrofonarrays}, JOURNAL = {Kleinheubacher Berichte}, YEAR = {1994}, PAGES = {337--346}, NUMBER = {38} } @INPROCEEDINGS{0971Thiede1994, AUTHOR = {Thilo Thiede}, TITLE = {Gehörrichtige Qualitätsbewertung von Audiosignalen - Übersicht und Einschätzung der gegenwärtigen Verfahren}, BOOKTITLE = {Bericht zur 18. Tonmeistertagung}, YEAR = {1994}, PAGES = {623--642}, ADDRESS = {Karlsruhe}, PDF = {http://elvera.nue.tu-berlin.de/files/0971Thiede1994.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0971Thiede1994.pdf}, ABSTRACT = {Bei der digitalen Übertragung und Speicherung von Audiosignalen werden in zunehmendem Maße Datenreduktionsverfahren eingesetzt, die an die Eigenschaften des menschlichen Gehörs angepaßt sind und insbesondere Verdeckungseffekte ausnutzen. Dabei wird in erster Linie nicht versucht, die absolute Größe der bei einer vorgegebenen Datenrate auftretenden Fehler zu minimieren, sondern diese Fehler so zu formen, daß sie möglichst wenig hörbar sind. Daher würde ein konventionelles Meßverfahren einem derart verarbeiteten Audiosignal stets eine bedeutend geringere Qualität zuordnen als tatsächlich wahrgenommen wird.Aus diesem Grunde sind zur Beurteilung derartiger Datenreduktionsverfahren, die oft als "perceptual coder" (gehörrichtige Codierverfahren) bezeichnet werden,heutzutage noch subjektive Hörtests üblich. 
Da diese Versuche sehr umfangreich sein müssen, um eine hinreichende Genauigkeit und Reproduzierbarkeit der Ergebnisse zu erreichen, ist der damit verbundene Zeit- und Arbeitsaufwand sehr hoch.} } @INPROCEEDINGS{0973Jürgens1994, AUTHOR = {Carsten Jürgens}, TITLE = {TUBSY - Sprachsynthese auf Clusterbasis nach dem PSOLA- Verfahren.}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1994}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0974Jürgens1994, AUTHOR = {Carsten Jürgens and Harald Klaus}, TITLE = {Speech Quality Assessment of Synthesized Speech Using Different Reference Systems.}, BOOKTITLE = {Workshop on Speech Quality Assessment}, YEAR = {1994}, ADDRESS = {Bochum, Germany} } @ARTICLE{0975Thiede1994, AUTHOR = {Thilo Thiede and G. Steinke}, TITLE = {Arbeitsweise und Eigenschaften von Verfahren zur gehörrichtigen Qualitätsbewertung von bitratenreduzierten Audiosignalen}, JOURNAL = {Rundfunktechnische Mitteilungen,}, YEAR = {1994}, PAGES = {102--114}, VOLUME = {38}, NUMBER = {3} } @INPROCEEDINGS{0963Barthel1994, AUTHOR = {Kai Uwe Barthel and Thomas Voyé}, TITLE = {Adaptive fractal image coding in the frequency domain}, BOOKTITLE = {Proceedings of International Workshop on Image Processing}, YEAR = {1994}, MONTH = jun, ADDRESS = {Budapest} } @ARTICLE{0965Ohm1994, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Three-Dimensional Subband Coding with Motion Compensation}, JOURNAL = {IEEE Transactions on Image Processing}, YEAR = {1994}, MONTH = sep, PAGES = {559--571}, VOLUME = {IP-3}, NUMBER = {5} } @INPROCEEDINGS{0882Clüver1994, AUTHOR = {Kai Clüver and Peter Noll}, TITLE = {PCM-Sprachübertragung in ATM-Netzen}, BOOKTITLE = {ITG-Fachtagung "Codierung für Quelle, Kanal und Übertragung", München}, YEAR = {1994}, MONTH = oct } @INPROCEEDINGS{0966Ohm1994, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Hierarchische Codierung von Videosignalen durch bewegungskompensierte Teilbandzerlegung}, BOOKTITLE = {ITG-Fachtagung "Codierung für Quelle, Kanal und Übertragung"}, YEAR = 
{1994}, MONTH = oct, ADDRESS = {München} } @INPROCEEDINGS{0968Li1994, AUTHOR = {Hui Li and Peter Noll}, TITLE = {Comparative Study of two Rate-Selectable Channel Coding Techniques}, BOOKTITLE = {ITG-Fachtagung "Codierung für Quelle, Kanal und Übertragung"}, YEAR = {1994}, MONTH = oct, ADDRESS = {München} } @INPROCEEDINGS{0972Clüver1994, AUTHOR = {Kai Clüver and Lutz Schröder}, TITLE = {Ein Sprachcodec für den asynchronen Transfermodus}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1994}, MONTH = oct, PAGES = {319--325}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0881Barthel1994, AUTHOR = {Kai Barthel and Jörg Schüttemeyer and Thomas Voyé and Peter Noll}, TITLE = {A New Image Coding Technique Unifying Fractal and Transform Coding}, BOOKTITLE = {ICIP}, YEAR = {1994}, MONTH = nov, PDF = {http://elvera.nue.tu-berlin.de/files/0881Barthel1994.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0881Barthel1994.ps}, ABSTRACT = {We present a new image coding scheme based on an unification of fractal and transform coding. We introduce a generalization of the luminance transformation used by fractal coding schemes. By extending the luminance transformation to the frequency domain, fractal and transform coding become subsets of the proposed transformation. Our new coding scheme FTC (fractal based transform coding) combines the advantages of both techniques. Compared to JPEG a coding gain of 1.5 - 2.5 dB [PSNR] is obtained. The encoding time is reduced compared to conventional fractal coding schemes and a better convergence at the decoder is attained. 
At equal error rates the subjective quality of images coded with the new scheme is superior compared to transform coded images.} } @INPROCEEDINGS{0967Ohm1994, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Motion-compensated 3-D subband coding with multiresolution representation of motion parameters}, BOOKTITLE = {Proceedings IEEE 1st International Conference on Image Processing (ICIP-94)}, YEAR = {1994}, MONTH = nov, ADDRESS = {Austin, TX} } @INPROCEEDINGS{0878Barthel1995, AUTHOR = {Kai Barthel and Thomas Voyé}, TITLE = {Three-Dimensional Fractal Video Coding}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP 95)}, YEAR = {1995}, ADDRESS = {Washington, D.C., USA} } @INPROCEEDINGS{0879Barthel1995, AUTHOR = {Kai Barthel}, TITLE = {Entropy constrained fractal image coding}, BOOKTITLE = {NATO ASI on Fractal Image Coding}, YEAR = {1995}, ADDRESS = {Trondheim, Norwegen} } @ARTICLE{0954Drews1995, AUTHOR = {Martin Drews}, TITLE = {Echtzeitimplementierung eines Verfahrens zur Geräuschreduktion mit Mikrofonarrays}, JOURNAL = {Kleinheubacher Berichte}, YEAR = {1995}, PAGES = {337--346}, NUMBER = {38} } @INPROCEEDINGS{0955Drews1995, AUTHOR = {Martin Drews}, TITLE = {Time Delay Estimation for Microphone Array Speech Enhancement Systems}, BOOKTITLE = {Tagungsband EUROSPEECH '95}, YEAR = {1995}, PAGES = {2013--2016}, ADDRESS = {Madrid, Spain} } @INPROCEEDINGS{0956Hardt1995, AUTHOR = {Detlef Hardt and Lutz Fliegner}, TITLE = {Einsatz der Sprecherverifizierung in Open- und Closed-line Systemen}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung, 6. gemeinsame Konferenz der TU Berlin, der TU Dresden und der HU Berlin}, YEAR = {1995}, ADDRESS = {Wolfenbüttel} } @INPROCEEDINGS{0957Jürgens1995, AUTHOR = {Carsten Jürgens and B. 
Wehen and Wiebke Johannsen}, TITLE = {Untersuchungen zur Auswahl von Sprechern für die Sprachsynthese im Zeitbereich}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung}, YEAR = {1995}, ADDRESS = {Wolfenbüttel} } @INPROCEEDINGS{0958Jürgens1995, AUTHOR = {Carsten Jürgens and M. Wunderlich}, TITLE = {A Comparison of Different Speech Units for the German TTS-System TUBSY}, BOOKTITLE = {EUROSPEECH}, YEAR = {1995}, ADDRESS = {Madrid, Spain} } @INBOOK{0959Ohm1995, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Digitale Bildcodierung - Repräsentation, Kompression und Übertragung von Bildsignalen}, YEAR = {1995}, PUBLISHER = {Springer-Verlag}, PAGES = {487}, ADDRESS = {Berlin Heidelberg New York}, NOTE = {ISBN 3-540-58579-6} } @INPROCEEDINGS{0960Purat1995, AUTHOR = {Marcus Purat and Peter Noll}, TITLE = {A New Orthonormal Wavelet Packet Decomposition For Audio Coding Using Frequency-Varying Modulated Lapped Transforms}, BOOKTITLE = {IEEE 1995 Workshop on Applications of Signal Processing to Audio and Acoustics}, YEAR = {1995}, ADDRESS = {New Paltz, N.Y. (USA)} } @INPROCEEDINGS{0961Suhardi1995, AUTHOR = {Suhardi and Klaus Fellbaum}, TITLE = {Schlüsselworterkennung in fließender Sprache unter Verwendung neuronaler Netze}, BOOKTITLE = {Elektronische Sprachverarbeitung- 6. gemeinsame Konferenz der TU Berlin,TU Dresden, HU Berlin}, YEAR = {1995}, ADDRESS = {Wolfenbüttel} } @INPROCEEDINGS{0962Weber1995, AUTHOR = {R. 
Weber and Ernst Kabot}, TITLE = {Objektive Beschreibungsgrößen für die subjektive Wahrnehmung von Rasselgeräuschen}, BOOKTITLE = {Fortschritte der Akustik - DAGA 95}, YEAR = {1995}, PAGES = {891--894} } @ARTICLE{0877Noll1995, AUTHOR = {Peter Noll}, TITLE = {Digital Audio Coding for Visual Communications}, JOURNAL = {Proceedings of the IEEE}, YEAR = {1995}, MONTH = jun, PAGES = {925--943}, VOLUME = {83}, NUMBER = {6}, DOI = {10.1109/5.387093}, ABSTRACT = {Current and future visual communications for applications such as broadcasting videotelephony, video- and audiographic-conferencing, and interactive multimedia services assume a substantial audio component. Even text, graphics, fax, still images, email documents, etc. will gain from voice annotation and audio clips. A wide range of speech, wideband speech, and wideband audio coders is available for such applications. In the context of audiovisual communications, the quality of telephone-bandwidth speech is acceptable for some videotelephony and videoconferencing services. Higher bandwidths (wideband speech) may be necessary to improve the intelligibility and naturalness of speech. High quality audio coding including multichannel audio will be necessary in advanced digital TV and multimedia services. This paper explains basic approaches to speech, wideband speech, and audio bit rate compressions in audiovisual communications. These signal classes differ in bandwidth, dynamic range, and in listener expectation of offered quality. It will become obvious that the use of our knowledge of auditory perception helps minimizing perception of coding artifacts and leads to efficient low bit rate coding algorithms which can achieve substantially more compression than was thought possible only a few years ago. 
The paper concentrates on worldwide source coding standards beneficial for consumers, service providers, and manufacturers} } @INPROCEEDINGS{0811Thiede1996, AUTHOR = {Thilo Thiede and Ernst Kabot}, TITLE = {Ein neues psychoakustisches Meßverfahren zur Bestimmung der Wahrnehmbarkeit von Codierfehlern}, BOOKTITLE = {DAGA 96}, YEAR = {1996}, PAGES = {362--363}, ADDRESS = {Bonn}, PDF = {http://elvera.nue.tu-berlin.de/files/0811Thiede1996.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0811Thiede1996.pdf} } @INCOLLECTION{0847Noll1996, AUTHOR = {Peter Noll}, TITLE = {Source Compression: Audio Coding}, YEAR = {1996}, BOOKTITLE = {The Communications Handbook}, EDITOR = {J. Gibson}, PUBLISHER = {CRC, Inc.}, PAGES = {1475--1487}, CHAPTER = {7.8}, NOTE = {J. Gibson: Texas A\&M University} } @INPROCEEDINGS{0854Noll1996, AUTHOR = {Peter Noll}, TITLE = {Zur geschichtlichen Entwicklung der Nachrichtentechnik an der Technischen Hochschule Berlin-Charlottenburg}, BOOKTITLE = {Tagungsband 2. ITG-Diskussionssitzung}, YEAR = {1996}, PUBLISHER = {VDE-Verlag Berlin u. Offenbach}, PAGES = {233--246} } @PROCEEDINGS{0855Mathis1996, TITLE = {Neue Anwendungen theoretischer Konzepte in der Elektrotechnik mit Gedenksitzung zum 50. Todestag von Wilhelm Cauer, 2. ITG-Diskussionssitzung}, EDITOR = {W. Mathis and Peter Noll}, YEAR = {1996}, PUBLISHER = {VDE-Verlag}, ADDRESS = {Berlin u. Offenbach}, NOTE = {ISBN 3-8007-2190-2} } @ARTICLE{0867Drews1996, AUTHOR = {Martin Drews}, TITLE = {Mehrkanal-Geräuschreduktion - Ein Verfahren zur Verbesserung der Sprachqualität beim Freisprechen}, JOURNAL = {Telekom Praxis}, YEAR = {1996}, PAGES = {32--38}, NUMBER = {8} } @INPROCEEDINGS{0869Hardt1996, AUTHOR = {Detlef Hardt}, TITLE = {Untersuchungen zum Einsatz der Störreduktion in der Sprecherverifizierung}, BOOKTITLE = {7. 
Konferenz "Elektronische Sprachsignalverarbeitung"}, YEAR = {1996}, PAGES = {122--130} } @INPROCEEDINGS{0874Purat1996, AUTHOR = {Marcus Purat and Peter Noll}, TITLE = {Audio Coding with a Dynamic Wavelet Packet Decomposition Based on Frequency-Varying Modulated Lapped Transforms}, BOOKTITLE = {IEEE Acoustics, Speech and Signal Processing Conference (ICASSP)}, YEAR = {1996}, ADDRESS = {Atlanta (USA)} } @INPROCEEDINGS{0875Suhardi1996, AUTHOR = {Suhardi and Klaus Fellbaum}, TITLE = {Zur Schlüsselworterkennung unter Verwendung prädiktiver neuronaler Modelle}, BOOKTITLE = {7. Konferenz "Elektronische Sprachsignalverarbeitung"}, YEAR = {1996}, PAGES = {108--114}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0876Thiede1996, AUTHOR = {Thilo Thiede and Ernst Kabot}, TITLE = {A New Perceptual Quality Measure for Bit Rate Reduced Audio}, BOOKTITLE = {Contribution to the 100th AES Convention}, YEAR = {1996}, ADDRESS = {Copenhagen}, NOTE = {Preprint 4280.}, PDF = {http://elvera.nue.tu-berlin.de/files/0876Thiede1996.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0876Thiede1996.pdf}, ABSTRACT = {For the quality evaluation of perceptual audio codecs, appropriate measurement algorithms are needed, which detect and assess audible artefacts by comparing the output of the codec with the uncoded reference. A filter bank based perceptual model is presented, which yields better temporal resolution than FFT-based approaches and thus allows a more precise modelling of pre- and post-masking and a refined analysis of the envelopes within each filter channel.} } @INPROCEEDINGS{0883Barthel1996, AUTHOR = {Kai Uwe Barthel}, TITLE = {Entropy constrained fractal image coding}, BOOKTITLE = {2. ITG-Diskussionssitzung}, YEAR = {1996}, PUBLISHER = {VDE-Verlag Berlin und Offenbach}, PAGES = {3--10}, NOTE = {ISBN 3-8007-2190-2} } @INPROCEEDINGS{0884Drews1996, AUTHOR = {Martin Drews}, TITLE = {Mehrkanal-Geräuschreduktion mit adaptivem Wiener-Kolmogoroff- Filter}, BOOKTITLE = {2. 
ITG-Diskussionssitzung}, YEAR = {1996}, PUBLISHER = {VDE-Verlag Berlin und Offenbach}, PAGES = {129--136}, NOTE = {ISBN 3-8007-2190-2} } @INPROCEEDINGS{0885Bos1996, AUTHOR = {Jürgen Bos and Martin Drews}, TITLE = {Simulation von Raumimpulsantworten zur Nachbildung der Ausgangssignale eines Mikrofonarrays in kleinen Räumen}, BOOKTITLE = {Fortschritte der Akustik - DAGA 96}, YEAR = {1996}, PAGES = {458--459} } @INPROCEEDINGS{0886Purat1996, AUTHOR = {Marcus Purat}, TITLE = {Waveletcodierung}, BOOKTITLE = {2. ITG-Diskussionssitzung}, YEAR = {1996}, PUBLISHER = {VDE-Verlag Berlin u. Offenbach}, PAGES = {165--170}, NOTE = {ISBN 3-8007-2190-2} } @INPROCEEDINGS{0864Clüver1996, AUTHOR = {Kai Clüver and Peter Noll}, TITLE = {Reconstruction of Missing Speech Frames using Sub-Band Excitation}, BOOKTITLE = {International Symposium on Time-Frequency and Time-Scale Analysis}, YEAR = {1996}, MONTH = jun, PAGES = {277--280}, ADDRESS = {Paris}, PDF = {http://elvera.nue.tu-berlin.de/files/0864Cluever1996.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0864Cluever1996.pdf}, ABSTRACT = {A new reconstruction method for frame erasures in speech transmission is presented which is based on parameterization of the speech signal by means of linear prediction (LPC) and voicing analysis. The problem of generating partially voiced substitute speech signals is solved by performing separate voicing decisions in sub-bands. The method yields considerable improvements compared with silence substitution for frame erasure ratios of up to 10 % or even 20 %. 
The combination of the reconstruction method with adaptive speech coders showed virtually the same good results for forward adaptation, whereas a higher degradation is caused by backward-adaptive coders.} } @ARTICLE{0859Barthel1996, AUTHOR = {Kai Uwe Barthel}, TITLE = {Niederratige Festbildcodierung unter Verwendung fraktaler Methoden im Orts- und Frequenzbereich}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {237--244}, VOLUME = {50}, NUMBER = {9-10} } @INPROCEEDINGS{0861Bitó1996, AUTHOR = {János Bitó and Thomas Höhne and Peter Noll}, TITLE = {Simulation Study of Multipath Fading Effects on Integrated Video, Voice and Data Transmission in Asynchronous BPSK_DS/CDMA Systems}, BOOKTITLE = {IEEE/EURASIP First International Workshop on Wireless Image/Video Communications}, YEAR = {1996}, MONTH = sep, PAGES = {99--104}, ADDRESS = {Loughborough, GB} } @ARTICLE{0862Bruhn1996, AUTHOR = {Stefan Bruhn}, TITLE = {Effiziente sehr niederratige Sprachübertragung mittels Interblock-Codierung}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {245--252}, VOLUME = {50}, NUMBER = {9-10} } @ARTICLE{0863Clüver1996, AUTHOR = {Kai Clüver}, TITLE = {Ein Verfahren zur Rekonstruktion fehlender Sprachsignalrahmen mit linearer Prädiktion und Teilbandanregung}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {211--215}, VOLUME = {50}, NUMBER = {9-10} } @INPROCEEDINGS{0865Clüver1996, AUTHOR = {Kai Clüver}, TITLE = {An ATM Speech Codec with Improved Reconstruction of Lost Cells}, BOOKTITLE = {EUSIPCO 96}, YEAR = {1996}, MONTH = sep, PAGES = {1641--1643}, ADDRESS = {Trieste}, PDF = {http://elvera.nue.tu-berlin.de/files/0865Cluever1996.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0865Cluever1996.pdf}, ABSTRACT = {A speech codec for ATM networks is presented which includes ATM adaptation layer functions, a voice activity detection, and a new method for the reconstruction of lost cells. 
As the cell assembly already requires a relatively high buffering delay, only algorithms are applied which introduce small additional delays. The reconstruction of lost cells is based on an analysis of the LPC and pitch parameters of the speech signal. The new waveform substitution method considerably reduces the speech quality impairment caused by cell loss.} } @ARTICLE{0866Drews1996, AUTHOR = {Martin Drews}, TITLE = {Aufbau von Mikrofon-Arrays zur Optimierung von Verfahren der Mehrkanal-Geräuschreduktion}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {223--227}, VOLUME = {50}, NUMBER = {9-10} } @INPROCEEDINGS{0868Drews1996, AUTHOR = {Martin Drews}, TITLE = {Speaker Localization and Its Application to Time Delay Estimators for Multi-Microphone Speech Enhancement Systems}, BOOKTITLE = {EUSIPCO 96}, YEAR = {1996}, MONTH = sep, PAGES = {483--486}, ADDRESS = {Trieste} } @ARTICLE{0872Ohm1996, AUTHOR = {Jens-Rainer Ohm}, TITLE = {Skalierbare Videocodierung für asynchrone und heterogene Datennetze}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {216--222}, VOLUME = {50}, NUMBER = {9-10} } @ARTICLE{0873Purat1996, AUTHOR = {Marcus Purat}, TITLE = {Audiocodierung unter Verwendung dynamischer Zeit-Frequenz-Zerlegungen}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = sep, PAGES = {205--210}, VOLUME = {50}, NUMBER = {9-10} } @INPROCEEDINGS{0870Heising1996, AUTHOR = {Guido Heising and G. Ruhl}, TITLE = {Video Coding Using Spatial Extrapolation Based Motion Field Segmentation}, BOOKTITLE = {IEEE Int. Conf. 
on Image Processing (ICIP'96)}, YEAR = {1996}, MONTH = sep, PAGES = {481--484}, ADDRESS = {Lausanne, Switzerland} } @ARTICLE{0860Bitó1996, AUTHOR = {János Bitó}, TITLE = {Adaptive digitale Kanalmodelle für Mobilfunkkanäle}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = nov, PAGES = {261--267}, VOLUME = {50}, NUMBER = {11-12} } @ARTICLE{0871Heising1996, AUTHOR = {Guido Heising}, TITLE = {Bewegtbildcodierung unter Verwendung von Blockverzerrungsmodellen zur Bewegungskompensation}, JOURNAL = {FREQUENZ}, YEAR = {1996}, MONTH = nov, VOLUME = {50}, NUMBER = {11-12} } @INPROCEEDINGS{0832Noll1997, AUTHOR = {Peter Noll}, TITLE = {Speech and Audio Coding}, BOOKTITLE = {internetaudio.org = 14th Conference of the Acoustical Engineering Society (AES), WWW-Proceedings}, YEAR = {1997}, ADDRESS = {Seattle}, NOTE = {invited paper} } @BOOK{0833Noll1997, AUTHOR = {Peter Noll}, TITLE = {MPEG-based Audio Coding}, YEAR = {1997}, BOOKTITLE = {Handbook on Digital Consumer Electronics}, PUBLISHER = {McGrawHill} } @ARTICLE{0834Noll1997, AUTHOR = {Peter Noll and Davis Pan}, TITLE = {ISO/MPEG Audio Coding}, JOURNAL = {International Journal of High-Speed Electronics and Systems}, YEAR = {1997}, PAGES = {69--118}, VOLUME = {8}, NUMBER = {1}, NOTE = {D. Pan: Digital Equipment Corp.(USA); auch/also in: Signal Compression - Coding of Speech, Audio, Text, Image and Video (Ed.: N. Jayant), World Scientific Publ. Co., 1997.} } @INPROCEEDINGS{0835Barthel1997, AUTHOR = {Kai Uwe Barthel}, TITLE = {Entropy Constrained Zerotree Wavelet Image Coding Using Fractal Prediction}, BOOKTITLE = {Picture Coding Symposium PCS '97}, YEAR = {1997} } @INPROCEEDINGS{0839Hardt1997, AUTHOR = {Detlef Hardt}, TITLE = {Untersuchungen zur textunabhängigen Sprecherverifizierung mit begrenztem Wortschatz}, BOOKTITLE = {8. 
Konferenz "Elektronische Sprachsignalverarbeitung"}, YEAR = {1997}, PAGES = {61--69}, ADDRESS = {Cottbus} } @INPROCEEDINGS{0840Heising1997, AUTHOR = {Guido Heising}, TITLE = {Blocking Artefact Free Video Coding by Combining Warping Based Prediction with Wavelet Error Coding}, BOOKTITLE = {Picture Coding Symposium PCS '97}, YEAR = {1997}, ADDRESS = {Berlin, Germany} } @INPROCEEDINGS{0843Purat1997, AUTHOR = {Marcus Purat and Tilman Liebchen and Peter Noll}, TITLE = {Lossless Transform Coding of Audio Signals}, BOOKTITLE = {102nd AES Convention}, YEAR = {1997}, ADDRESS = {München}, NOTE = {Preprint No. 4414}, PDF = {http://elvera.nue.tu-berlin.de/files/0843Purat1997.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0843Purat1997.pdf}, ABSTRACT = {Recent papers have proposed linear prediction as a useful method for lossless audio coding. Transform coding, however, hasn’t been investigated so far, although it seems to be more adapted to the harmonic structure of most audio signals. In this paper we present first results on lossless transform coding of CD-quality audio data. One main aspect lies on a suitable quantization method to obtain perfect reconstruction. Using a codebook with different entropy codes for the transform coefficients we achieve bitrates, slightly better than those obtained by the lossless linear prediction schemes mentioned above.} } @INPROCEEDINGS{0858Hardt1997, AUTHOR = {Detlef Hardt and Klaus Fellbaum}, TITLE = {Spectral Subtraction and RASTA-Filtering in Text-Dependent HMM-Based Speaker Verification}, BOOKTITLE = {ICASSP}, YEAR = {1997}, PAGES = {867--870}, ADDRESS = {München} } @INPROCEEDINGS{0845Suhardi1997, AUTHOR = {Suhardi and Hyoung-Gook Kim}, TITLE = {Untersuchungen zur hybriden HMM/PLP-basierten Schlüsselworterkennung}, BOOKTITLE = {8. 
Konferenz "Elektronische Sprachsignalverarbeitung"}, YEAR = {1997}, PAGES = {55--60}, ADDRESS = {Cottbus} } @INPROCEEDINGS{0831Noll1997, AUTHOR = {Peter Noll}, TITLE = {Neue Verfahren der Audiocodierung}, BOOKTITLE = {8. Konferenz Elektronische Sprachsignalverarbeitung}, YEAR = {1997}, MONTH = aug, PAGES = {7--26}, ADDRESS = {Cottbus}, NOTE = {invited paper} } @INPROCEEDINGS{0842Kliche1997, AUTHOR = {Ingmar Kliche and Rolf Kapust and Peter Noll}, TITLE = {Ein offener modularer Simulator für Untersuchungen zur Sprachübertragung über ATM}, BOOKTITLE = {8. Konferenz Elektronische Sprachsignalverarbeitung}, YEAR = {1997}, MONTH = aug, PAGES = {184--191}, ADDRESS = {Cottbus} } @INPROCEEDINGS{0857Suhardi1997, AUTHOR = {Suhardi and Klaus Fellbaum}, TITLE = {Empirical Comparison of two Multilayer Perceptron-Based Keyword Speech Recognition Algorithms}, BOOKTITLE = {Eurospeech 1997}, YEAR = {1997}, MONTH = sep, PAGES = {2835--2838}, ORGANIZATION = {ESCA}, ADDRESS = {Rhodes, Greece} } @ARTICLE{0830Noll1997, AUTHOR = {Peter Noll}, TITLE = {MPEG Digital Audio Coding - Setting the Standard for High-Quality Audio Compression}, JOURNAL = {IEEE Signal Processing Magazine, Special Issue on MPEG Audio and Video Coding}, YEAR = {1997}, MONTH = sep, PAGES = {59--81}, VOLUME = {14}, NUMBER = {5} } @INPROCEEDINGS{0837Drews1997, AUTHOR = {Martin Drews and Martin Streckfuß}, TITLE = {Multi-Channel Speech Enhancement using an Adaptive Post-Filter with Frequency-Dependent Channel Selection and Auditory Constraints}, BOOKTITLE = {International Workshop on Acoustic Echo and Noise Control (IWAENC`97)}, YEAR = {1997}, MONTH = sep, ADDRESS = {London} } @INPROCEEDINGS{0836Barthel1997, AUTHOR = {Kai Uwe Barthel and Sven Brandau and W. Hermesmeier and Guido Heising}, TITLE = {Zerotree Wavelet Coding Using Fractal Prediction}, BOOKTITLE = {IEEE Int. Conf. 
on Image Processing (ICIP'97)}, YEAR = {1997}, MONTH = oct, ADDRESS = {Santa Barbara, CA, USA} } @INPROCEEDINGS{0841Heising1997, AUTHOR = {Guido Heising and Kai Uwe Barthel and Wiebke Johannsen and Christoph Steinbach}, TITLE = {Blocking Artefact Free Video Coding Based on a Bilinear Forward Image Warping Model}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'97)}, YEAR = {1997}, MONTH = oct, ADDRESS = {Santa Barbara, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0841Heising1997.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0841Heising1997.pdf} } @INBOOK{0824Noll1998, AUTHOR = {Peter Noll}, TITLE = {MPEG Digital Audio Coding Standards}, YEAR = {1998}, BOOKTITLE = {The Digital Signal Processing Handbook}, EDITOR = {V.K. Madisetti, D. B. Williams}, PUBLISHER = {IEEE Press/CRC Press}, PAGES = {40/1--40/28} } @INPROCEEDINGS{0825Hardt1998, AUTHOR = {Detlef Hardt and Klaus Fellbaum and Rolf Kapust and K.-D. Michael}, TITLE = {Einfluß der Sprachcodierung in der Telekommunikation auf die Qualität einer textabhängigen Sprecherverifizierung}, BOOKTITLE = {9. Konferenz "Elektronische Sprachsignalverarbeitung" und 5. ITG-Fachtagung "Sprachkommunikation"}, YEAR = {1998}, PAGES = {93--96}, ADDRESS = {Dresden} } @INPROCEEDINGS{0826Eckert1998, AUTHOR = {M. Eckert and Detlef Hardt}, TITLE = {Einsatz der phonetischen Gewichtung und Cohort-Normalisierung in einem phonembasierten Sprecherverifizierungssystem}, BOOKTITLE = {9. Konferenz "Elektronische Sprachsignalverarbeitung" und 5. ITG-Fachtagung "Sprachkommunikation"}, YEAR = {1998}, PAGES = {37--40}, ADDRESS = {Dresden} } @INPROCEEDINGS{0827Kamceva1998, AUTHOR = {T. Kamceva and Detlef Hardt and G. Klassmeyer}, TITLE = {Mögliche Zusammenhänge zwischen den Ergebnissen einer Analyse von Formantverläufen und der Fehlerrate eines textabhängigen Sprecherverifizierungssystem bei unterschiedlichen Sprechweisen}, BOOKTITLE = {9. Konferenz "Elektronische Sprachsignalverarbeitung" und 5. 
ITG-Fachtagung "Sprachkommunikation"}, YEAR = {1998}, PAGES = {41--44}, ADDRESS = {Dresden} } @INPROCEEDINGS{0829Thiede1998, AUTHOR = {Thilo Thiede and William C. Treurniet and Roland Bitto and Thomas Sporer and Karlheinz Brandenburg and Christian Schmidmer and Michael Keyhl and John G. Beerends and Catherine Colomes and Gerhard Stoll and Bernhard Feiten}, TITLE = {PEAQ - der künftige ITU-Standard zur objektiven Messung der wahrgenommenen Audioqualität}, BOOKTITLE = {20. Tonmeistertagung}, YEAR = {1998}, ADDRESS = {Karlsruhe}, PDF = {http://elvera.nue.tu-berlin.de/files/0829Thiede1998.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0829Thiede1998.pdf}, ABSTRACT = {Bei der digitalen Übertragung und Speicherung von Audiosignalen werden in zunehmendem Maße Datenreduktionsverfahren verwendet, die Eigenschaften des menschlichen Gehörs ausnutzen. Dabei wird versucht, die spektrale Verteilung der entstehenden Quantisierungsfehler so zu beeinflussen, daß sie unterhalb der Hörschwelle liegen. Die auf diese Weise unhörbar gemachten Störungen sind jedoch immer noch physikalisch vorhanden. Die wahrgenommene Qualität solcher gehörangepaßten Codierverfahren kann somit mit konventionellen Meßverfahren, die lediglich die insgesamt vorhanden Störungen erfassen, nicht bestimmt werden. Daher wird die Qualität von gehörangepaßten Codierverfahren üblicherweise mittels subjektiver Hörtests bestimmt. Solche Hörtests müssen unter optimalen Abhörbedingungen und mit einer großen Anzahl von Testhörern durchgeführt werden, so daß dieser Weg der Qualitätsbestimmung in vielen Fällen zu aufwendig ist. Ein objektives Meßverfahren, das die zum subjektiven Qualitätseindruck führenden physiologischen und kognitiven Vorgänge} } @INPROCEEDINGS{0856Noll1998, AUTHOR = {Peter Noll}, TITLE = {Audio Coding: From Broadcast Standard(s) to Advanced Audio Coding}, BOOKTITLE = {ITG-Fachbericht Nr. 
146, "Codierung für Quelle, Kanal und Übertragung"}, YEAR = {1998}, PUBLISHER = {VDE-Verlag}, PAGES = {13--22}, NOTE = {invited paper} } @ARTICLE{0823Bitó1998, AUTHOR = {János Bitó and Thomas Höhne and Holger Schulz and Peter Noll}, TITLE = {Multipath Fading Effects on Integrated Video, Voice and Data Transmission in Hybrid-Code BPSK-DS/CDMA Systems}, JOURNAL = {Image Communication Journal, Special Issue on Mobile Image/Video Transmission}, YEAR = {1998}, MONTH = apr, PAGES = {83--192}, VOLUME = {12}, NUMBER = {2} } @INPROCEEDINGS{0828Heising1998, AUTHOR = {Guido Heising and D. Marpe and H. L. Cycon}, TITLE = {A Wavelet-Based Video Coding Scheme Using Image Warping Prediction}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'98)}, YEAR = {1998}, MONTH = oct, ADDRESS = {Chicago, IL, USA}, NOTE = {D. Marpe, H. L. Cycon: Fachhochschule für Technik und Wirtschaft Berlin} } @INPROCEEDINGS{0820Noll1999, AUTHOR = {Peter Noll}, TITLE = {High Quality Audio for Multimedia: Key Technologies and MPEG Standards}, BOOKTITLE = {Proceedings IEEE Global Telecommunications Conference (GLOBECOM)}, YEAR = {1999}, ADDRESS = {Rio de Janeiro, Brasilien}, NOTE = {invited paper}, ABSTRACT = { } } @INPROCEEDINGS{0821Noll1999, AUTHOR = {Peter Noll and Tilman Liebchen}, TITLE = {Digital Audio: From Lossless to Transparent Coding}, BOOKTITLE = {Proceedings IEEE Signal Processing Workshop}, YEAR = {1999}, PAGES = {53--60}, ADDRESS = {Poznan}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/0821Noll1999.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0821Noll1999.pdf}, ABSTRACT = {We have seen rapid progress in high quality compression of wideband audio signals. Today's coding algorithms can achieve substantially more compression than was thought possible only a few years ago. In the case of audio coding with its bandwidth of 20 kHz and more, the concept of perceptual coding has paved the way for significant bit rate reductions. 
However, multiple codings can reveal originally masked distortions. In addition, reproduction of critical music} } @ARTICLE{0848Noll1999, AUTHOR = {Peter Noll}, TITLE = {Audiocodierung: Vom Hörfunkstandard zum Advanced Audio Coding}, JOURNAL = {it+ti - Informationstechnik und Technische Informatik}, YEAR = {1999}, PAGES = {12--18}, VOLUME = {41}, PDF = {http://elvera.nue.tu-berlin.de/files/0848Noll1999.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0848Noll1999.pdf}, ABSTRACT = {P} } @INBOOK{0850Kanbach1999, AUTHOR = {Andreas Kanbach and Andreas Körber}, TITLE = {ISDN - Die Technik; Schnittstellen - Protokolle - Dienste - Endsysteme}, YEAR = {1999}, PUBLISHER = {Hüthig}, PAGES = {549}, EDITION = {3}, ADDRESS = {Heidelberg}, NOTE = {ISBN 3-7785-2288-4} } @ARTICLE{0851Liebchen1999, AUTHOR = {Tilman Liebchen and Marcus Purat and Peter Noll}, TITLE = {Improved Lossless Transform Coding of Audio Signals}, JOURNAL = {Impulse und Antworten - Festschrift für Manfred Krause}, YEAR = {1999}, PAGES = {159--170}, PDF = {http://elvera.nue.tu-berlin.de/files/0851Liebchen1999.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0851Liebchen1999.pdf}, ABSTRACT = {Recent papers have proposed linear prediction as a useful method for lossless audio coding. Transform coding, however, has hardly been investigated, although it seems to be more suited for the harmonic structure of most audio signals. In this paper we present some results on lossless transform coding of CD-quality audio data. One main aspect lies on a convenient quantization method to guarantee perfect reconstruction. 
We achieve bit rates which are lower than those obtained by lossless linear prediction schemes.} } @ARTICLE{0852Thiede1999, AUTHOR = {Thilo Thiede}, TITLE = {Eine rechenzeiteffektive gehörrichtige Filterbank mit signalabhängiger Filtercharakteristik}, JOURNAL = {Impulse und Antworten - Festschrift für Manfred Krause}, YEAR = {1999}, PAGES = {263--272}, PDF = {http://elvera.nue.tu-berlin.de/files/0852Thiede1999.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0852Thiede1999.pdf}, ABSTRACT = {Ende 1998 wurde von der internationalen Telekommunikationsunion (ITU) eine Empfehlung für ein Verfahren zur objektiven Messung der wahrnehmbaren Qualität digital übertragener Audiosignale herausgegeben [2]. Die in diesem Standard enthaltene Modellvariante für die höchsten Genauigkeitsanforderungen ("advanced version") verwendet neben einem transformationsbasierten Gehörmodell auch eine gehörangepaßte Filterbank mit signalabhängigen Filtercharakteristiken. Der vorliegende Artikel beschreibt diese Filterbank sowie eine rechenzeiteffektive Filterstruktur zu ihrer Realisierung.} } @ARTICLE{0853Thiede1999, AUTHOR = {Thilo Thiede and W. Schaller and J. Hensel}, TITLE = {Eine hochsymmetrische Anordnung zur virtuellen räumlichen Ausrichtung von Richtcharakteristiken zweiter Ordnung}, JOURNAL = {Impulse und Antworten - Festschrift für Manfred Krause}, YEAR = {1999}, PAGES = {273--284}, PDF = {http://elvera.nue.tu-berlin.de/files/0853Thiede1999.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0853Thiede1999.pdf}, ABSTRACT = {Die Abtastung eines Schallfeldes durch Mikrofongruppen mit zueinander orthogonalen Richtcharakteristiken bietet eine mathematisch „saubere” Möglichkeit zur Aufnahme und Wiedergabe eines räumlichen Schallfeldes mit einer minimalen Anzahl von Übertragungskanälen. Zum Erreichen einer hohen räumlichen Auflösung ist dabei die Verwendung von Richtcharakteristiken zweiter Ordnung wünschenswert. 
Die am einfachsten zu erzeugenden Richtcharakteristiken zweiter Ordnung enthalten jedoch auch einen Anteil nullter Ordnung und lassen sich daher nicht mehr nach dem Additionstheorem der Kugelfunktionen ausrichten. Es lassen sich jedoch auch für solche Richtcharakteristiken Koeffizientenmatrizen zur räumlichen Ausrichtung und zur Orthogonalzerlegung vorgegebener Richtcharakteristiken angeben.} } @INPROCEEDINGS{0849Ekmekci1999, AUTHOR = {Sila Ekmekci and Jens-Rainer Ohm}, TITLE = {"Incomplete 3D Representation" and View Synthesis for Video Objects Captured by Multiple Cameras}, BOOKTITLE = {Proceedings Picture Coding Symposium PCS 99}, YEAR = {1999}, MONTH = apr, PAGES = {81--86}, ADDRESS = {Portland, Oregon, USA} } @INPROCEEDINGS{0818Noll2000, AUTHOR = {Peter Noll}, TITLE = {Speech and Audio Coding for Multimedia Communications}, BOOKTITLE = {Proceedings International Cost 254 Workshop on Intelligent Communication Technologies and Applications}, YEAR = {2000}, PAGES = {253--263}, ADDRESS = {Neuchâtel, Schweiz}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/0818Noll2000.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0818Noll2000.pdf}, ABSTRACT = {We have seen rapid progress in high-quality compression of telephone speech and wideband speech signals. Linear prediction, subband coding, transform coding, as well as various forms of vector quantization and entropy coding techniques have been used to design efficient coding algorithms which can achieve substantially more compression than was thought possible only a few years ago. In the case of audio coding with its bandwidth of 20 kHz and more, the concept of perceptual coding has paved the way for significant bit rate reductions. The paper will explain basic approaches to such compressions, with concentration on existing and upcoming international standards. 
As typical signal classes we shall consider telephone speech, wideband speech, and wideband audio signals all of which differ in listener expectation of offered quality. The main motivations for low bit rate coding are outlined as well as basic and network related requirements. It will become obvious that speech and audio coders must be both source-specific and hearing-specific to perform adequately at low bit rates} } @ARTICLE{0819Ekmekci2000, AUTHOR = {Sila Ekmekci}, TITLE = {Encoding and Reconstruction of Incomplete 3D Video Objects}, JOURNAL = {Special Issue of IEEE Transactions on Circuits and Systems for Video Technology}, YEAR = {2000}, MONTH = oct, PAGES = {1198--1207}, VOLUME = {10}, NUMBER = {7}, ABSTRACT = { } } @INPROCEEDINGS{0816Noll2001, AUTHOR = {Peter Noll}, TITLE = {Speech and Audio Coding - Status and Directions}, BOOKTITLE = {10. Symposium Maritime Elektronik}, YEAR = {2001}, PUBLISHER = {Univ.-Druckerei Rostock 442-01}, PAGES = {9--22}, ORGANIZATION = {Universität Rostock, Arbeitskreis Mess- und Informationselektronik} } @INPROCEEDINGS{0817Noll2001, AUTHOR = {Peter Noll}, TITLE = {Digital Audio for Multimedia}, BOOKTITLE = {Proceedings Signal Processing for Multimedia}, YEAR = {2001}, ORGANIZATION = {NATO Advanced Study Institute}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/0817Noll2001.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0817Noll2001.pdf}, ABSTRACT = {The paper covers many key technologies in wideband audio coding including auditory masking, perceptual coding, frequency domain coding, and dynamic bit allocation. The MPEG standardization work is then described. MPEG algorithms have found a wide range of communications-based and storage-based applications. For example, the European digital audio broadcast (DAB) makes use of MPEG-1. 
It will then be shown that the MPEG-2 Advanced Audio Coding (AAC) standard offers a powerful collection of very flexible tools for stereo and multichannel coding, and that AAC outperforms many other coding algorithms (including MPEG-1 coders). Finally, we will address the current MPEG-4 speech and audio coding standardization work which merges the whole range of audio from high fidelity audio coding and speech coding down to synthetic audio, synthetic speech and text to speech conversion} } @ARTICLE{1218Chang2001, AUTHOR = {Shih-Fu Chang and Thomas Sikora and Atul Puri}, TITLE = {Overview of the MPEG-7 standard}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology}, YEAR = {2001}, MONTH = jun, PAGES = {688--695}, VOLUME = {11}, NUMBER = {6}, DOI = {10.1109/76.927421}, ABSTRACT = {MPEG-7, formally known as the Multimedia Content Description Interface, includes standardized tools (descriptors, description schemes, and language) enabling structural, detailed descriptions of audio-visual information at different granularity levels (region, image, video segment, collection) and in different areas (content description, management, organization, navigation, and user interaction). It aims to support and facilitate a wide range of applications, such as media portals, content broadcasting, and ubiquitous multimedia. We present a high-level overview of the MPEG-7 standard. We first discuss the scope, basic terminology, and potential applications. Next, we discuss the constituent components. 
Then, we compare the relationship with other standards to highlight its capabilities} } @INPROCEEDINGS{1217Sikora2001, AUTHOR = {Thomas Sikora}, TITLE = {Visualization and navigation in image database applications based on MPEG-7 descriptors}, BOOKTITLE = {Proceedings of the IEEE International Conference on Image Processing (ICIP 2001)}, YEAR = {2001}, MONTH = oct, PAGES = {583--586}, NOTE = {ISBN 0-7803-6725-1}, DOI = {10.1109/ICIP.2001.958185}, ABSTRACT = {In this paper we address the user-navigation through large volumes of image data. A similarity-measure based on MPEG-7 color histograms is introduced and multidimensional scaling concepts are employed to display images in two dimensions according to their mutual similarities. With such a view the user can easily see relations and color similarity between images and understand the structure of the data base. In order to cope with large volumes of images a modified version of the k-means clustering technique is introduced which identifies representative image samples for each cluster. Representative images (up to 100) are then displayed in two dimensions using MDS structuring. The modified clustering technique proposed produces a hierarchical structure of clusters similar to street maps with various resolutions of detail. The user can zoom into various cluster levels to obtain more or less detail if required. The results obtained verify the attractiveness of the approach for navigation and retrieval applications.} } @INPROCEEDINGS{0809Meiers2002, AUTHOR = {T. Meiers and Thomas Sikora and I. Keller}, TITLE = {Hierarchical Image Database Browsing Environment with Embedded Relevance Feedback}, BOOKTITLE = {IEEE 2002 Int. Conf. on Image Processing}, YEAR = {2002}, ADDRESS = {Rochester, NJ}, DOI = {10.1109/ICIP.2002.1040020}, ABSTRACT = {We address the user-navigation through large volumes of image data. A tree structured K-means clustering is introduced which will hierarchically group images into similar groups. 
Providing the nodes of the different levels with representative image samples leads to different "image maps" similar to street maps with various resolutions of details. The user can zoom into various cluster levels to obtain more or less detail if required. Further a new query refinement method is introduced. The retrieval process is controlled by learning from positive examples from the user, often called the relevance feedback of the user. The combination of the relevance feedback and the hierarchical structure together with a three-dimensional visualization of the "image maps" leads to an intuitive browsing environment. The results obtained verify the attractiveness of the approach for navigation and retrieval applications.} } @INPROCEEDINGS{0810Liebchen2002, AUTHOR = {Tilman Liebchen}, TITLE = {Lossless Audio Coding Using Adaptive Multichannel Prediction}, BOOKTITLE = {113th AES Convention}, YEAR = {2002}, ADDRESS = {Los Angeles}, PDF = {http://elvera.nue.tu-berlin.de/files/0810Liebchen2002.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0810Liebchen2002.pdf}, ABSTRACT = {Losless audio coding enables the compression of digital audio data without any loss in quality due to a perfect reconstruction of the digital signal. The compression is achieved by means of decorrelation methods such as linear prediction. However since audio signals usually consists of at least two channels, which are often highly correlated with each other, it is worthwhile to make use of inter-channel correlations as well} } @INBOOK{0814Noll2002, AUTHOR = {Peter Noll}, TITLE = {High Quality Audio Coding}, YEAR = {2002}, BOOKTITLE = {The Communications Handbook}, EDITOR = {J. Gibson, Texas A&M University}, PUBLISHER = {CRC, Inc}, PAGES = {97/1--97/16}, CHAPTER = {97}, EDITION = {Second} } @INPROCEEDINGS{0815Meiers2002, AUTHOR = {T. Meiers and I. 
Keller and Thomas Sikora}, TITLE = {Three-dimensional browsing environment for MPEG-7 image databases}, BOOKTITLE = {IS&T/SPIE Workshop: ``Storage and Retrieval for Media Databases 2002''}, YEAR = {2002}, PAGES = {324--335}, ADDRESS = {San José CA}, DOI = {10.1117/12.451103} } @INBOOK{1045Manjunath2002, TITLE = {Introduction to MPEG-7: Multimedia Content Description Interface}, YEAR = {2002}, EDITOR = {B. S. Manjunath, Philippe Salembier, Thomas Sikora}, PUBLISHER = {John Wiley & Sons}, PAGES = {396}, NOTE = {ISBN: 978-0-471-48678-7}, ABSTRACT = {The MPEG standards are an evolving set of standards for video and audio compression. MPEG 7 technology covers the most recent developments in multimedia search and retrieval, designed to standardise the description of multimedia content supporting a wide range of applications including DVD, CD and HDTV. Multimedia content description, search and retrieval is a rapidly expanding research area due to the increasing amount of audiovisual (AV) data available. The wealth of practical applications available and currently under development (for example, large scale multimedia search engines and AV broadcast servers) has led to the development of processing tools to create the description of AV material or to support the identification or retrieval of AV documents. Written by experts in the field, this book has been designed as a unique tutorial in the new MPEG 7 standard covering content creation, content distribution and content consumption. At present there are no books documenting the available technologies in such a comprehensive way. 
* Presents a comprehensive overview of the principles and concepts involved in the complete range of Audio Visual material indexing, metadata description, information retrieval and browsing * Details the major processing tools used for indexing and retrieval of images and video sequences * Individual chapters, written by experts who have contributed to the development of MPEG 7, provide clear explanations of the underlying tools and technologies contributing to the standard * Demonstration software offering step-by-step guidance to the multi-media system components and eXperimentation model (XM) MPEG reference software * Coincides with the release of the ISO standard in late 2001. A valuable reference resource for practising electronic and communications engineers designing and implementing MPEG 7 compliant systems, as well as for researchers and students working with multimedia database technology.} } @ARTICLE{0799Sikora2003, AUTHOR = {Thomas Sikora}, TITLE = {MPEG-4 objektbasierte Videocodierung}, JOURNAL = {it Information Technology}, YEAR = {2003}, PAGES = {273}, VOLUME = {45}, NUMBER = {5}, PDF = {http://elvera.nue.tu-berlin.de/files/0799Sikora2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0799Sikora2003.pdf}, ABSTRACT = {Neben einer hohen Kompressionseffizienz und einer guten Fehlerrobustheit stellt MPEG-4 Video als einziger internationaler Standard objektbasierte Funktionalitäten für Anwendungen zur Verfügung. In diesem Beitrag werden die Ziele und Techniken der MPEG-4 objektbasierten Kodierung beschrieben. Summary MPEG-4 is the first international standard that supports applications with object-based functionalities. This article outlines the goals and technical details of the MPEG-4 object-based coding algorithm.} } @INPROCEEDINGS{0807Belkoura2003, AUTHOR = {Zouhair Belkoura and L. 
Naviner}, TITLE = {Hardware Implementation Issues of a BMS Decoding Approach for AG Based Codes}, BOOKTITLE = {Communications and Networking Conference (WCNC)}, YEAR = {2003}, ADDRESS = {New Orleans}, NOTE = {L. Naviner: École Nationale Supérieure des Télécommunications (ENST), Paris} } @INPROCEEDINGS{0808Ekmekci2003, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Unbalanced Quantized Multiple Description Video Transmission using Path Diversity}, BOOKTITLE = {IS&T/SPIE's Electronic Imaging 2003}, YEAR = {2003}, MONTH = jan, ADDRESS = {Santa Clara, CA}, ABSTRACT = {Multiple Description Coding is a forward error correction scheme where two or more descriptions of the source are sent to the receiver over different channels. If only one channel is received the signal can be reconstructed with distortion D1 or D2. On the other hand, if both channels are received the combined information is used to achieve a lower distortion D0. Our approach is based on the Multiple State Video Coding with the novelty that we achieve a flexible unbalance rate of the two streams by varying the quantization step size while keeping the original frame rate constant. The total bitrate RT is fixed which is to be allocated between the two streams. If the assigned bitrates are not balanced there will be PSNR (peak signal to noise ratio) variations between neighbouring frames after reconstruction. Our goal is to find the optimal rate allocation while maximizing the average reconstructed frame PSNR and minimizing the PSNR variations given the total bitrate RT and the packet loss probabilities p1 and p2 over the two paths. The reconstruction algorithm is also taken into account in the optimization process. The paper will report results presenting optimal system designs for balanced (equal packet loss probabilities) but also for unbalanced path conditions (different packet loss probabilities). } } @INPROCEEDINGS{1065Ekmekci2003, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Multistate vs. 
single-state video coding over error-prone channels}, BOOKTITLE = {Proc. of the Thirty-Seventh Asilomar Conference on Signals, Systems and Computers}, YEAR = {2003}, MONTH = mar, PAGES = {1544--1547}, ADDRESS = {Pacific Grove, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1065Ekmekci2003.pdf}, DOI = {10.1109/ACSSC.2003.1292244}, URL = {http://elvera.nue.tu-berlin.de/files/1065Ekmekci2003.pdf}, ABSTRACT = {Multiple description coding (MDC) is a forward error correction scheme where two or more descriptions of the source are sent to the receiver over different channels. If only one description is received the signal can be reconstructed with distortion D/sub 1/ or D/sub 2/. If both channels are received on the other hand, the combined information is used to achieve a lower distortion D/sub 0/. Multistate video coding (MSVC) is a specific MDC scheme where the video is coded into multiple independently decodable streams each with its own prediction process and state. In this paper we compare MSVC to single description coding (SDC) at different loss rates and coding options under the assumption that the motion vectors are always available. Results show that when motion vectors are received, SDC performs better than MSVC at every coding option tested. The performance difference is bigger for low motion sequences.} } @INPROCEEDINGS{0806Ekmekci2003, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Model for Unbalanced Multiple Description Video Transmission using Path Diversity}, BOOKTITLE = {VCIP 2003}, YEAR = {2003}, MONTH = jul, ADDRESS = {Lugano, Switzerland}, PDF = {http://elvera.nue.tu-berlin.de/files/0806Ekmekci2003.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0806Ekmekci2003.ps}, ABSTRACT = {Multiple State Video Coding (MSVC) is a Multiple Description Coding Schema where the video is coded into multiple independently decodable streams, each with its own prediction process and state. 
The system subject to this work is composed of two subsystems: 1-multiple state encoding/coding, 2- path diversity transmission system.} } @INPROCEEDINGS{1073Burred2003, AUTHOR = {Juan José Burred and Alexander Lerch}, TITLE = {A Hierarchical Approach to Automatic Musical Genre Classification}, BOOKTITLE = {6th International Conference on Digital Audio Effects (DAFX)}, YEAR = {2003}, MONTH = sep, ADDRESS = {London, UK}, PDF = {http://elvera.nue.tu-berlin.de/files/1073Burred2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1073Burred2003.pdf} } @INPROCEEDINGS{0804Kim2003, AUTHOR = {Hyoung-Gook Kim and Edgar Berdahl and Nicolas Moreau and Thomas Sikora}, TITLE = {Speaker Recognition Using MPEG-7 Descriptors}, BOOKTITLE = {EUROSPEECH 2003}, YEAR = {2003}, MONTH = sep, ADDRESS = {Geneva, Switzerland}, PDF = {http://elvera.nue.tu-berlin.de/files/0804Kim2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0804Kim2003.pdf}, ABSTRACT = {Our purpose is to evaluate the efficiency of MPEG-7 audio descriptors for speaker recognition. The upcoming MPEG-7 standard provides audio feature descriptors, which are useful for many applications. One example application is a speaker recognition system, in which reduced-dimension log-spectral features based on MPEG-7 descriptors are used to train hidden Markov models for individual speakers. The feature extraction based on MPEG-7 descriptors consists of three main stages: Normalized Audio Spectrum Envelope (NASE), Principal Component Analysis (PCA) and Independent Component Analysis (ICA). An experimental study is presented where the speaker recognition rates are compared for different feature extraction methods. 
Using ICA, we achieved better results than NASE and PCA in a speaker recognition system.} } @INPROCEEDINGS{0805Kim2003, AUTHOR = {Hyoung-Gook Kim and Markus Schwab and Nicolas Moreau and Thomas Sikora}, TITLE = {Enhancement of Noisy Speech for Noise Robust Front-End and Speech Reconstruction at Back-End of DSR System}, BOOKTITLE = {EUROSPEECH 2003}, YEAR = {2003}, MONTH = sep, ADDRESS = {Geneva, Switzerland}, PDF = {http://elvera.nue.tu-berlin.de/files/0805Kim2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0805Kim2003.pdf}, ABSTRACT = {This paper presents a speech enhancement method for noise robust front-end and speech reconstruction at the back-end of Distributed Speech Recognition (DSR). The speech noise removal algorithm is based on a two stage noise filtering LSAHT by log spectral amplitude speech estimator (LSA) and harmonic tunneling (HT) prior to feature extraction. The noise reduced features are transmitted with some parameters, viz., pitch period, the number of harmonic peaks from the mobile terminal to the server along noise-robust mel-frequency cepstral coefficients. Speech reconstruction at the back end is achieved by sinusoidal speech representation. Finally, the performance of the system is measured by the segmental signal-noise ratio, MOS tests, and the recognition accuracy of an Automatic Speech Recognition (ASR) in comparison to other noise reduction methods.} } @INPROCEEDINGS{0802Liem2003, AUTHOR = {Marco Liem and Hyoung-Gook Kim and Otto Manck}, TITLE = {Algorithm of a Single Chip Acoustic Echo Canceller Using Cascaded Cross Spectral Estimation}, BOOKTITLE = {IWAENC 2003}, YEAR = {2003}, MONTH = sep, ADDRESS = {Kyoto, Japan}, PDF = {http://elvera.nue.tu-berlin.de/files/0802Liem2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0802Liem2003.pdf}, ABSTRACT = {This paper details the algorithm used by a low cost, single chip acoustic echo canceller. The algorithm is based on classical cross spectral estimation. 
It is employed in a cascaded filter structure where a short, fast filter operates on the output of a longer but slower filter to optimize the tracking performance of changes in the echo path without affecting the steady state performance. This combination allows the use of a fixed configuration for a wide range of acoustic environments. This contrasts to the predominately employed LMS type of algorithms which are much more sensitive to noise and often require an extensive parameterization specific to the operating environment.} } @INPROCEEDINGS{0803Kim2003, AUTHOR = {Hyoung-Gook Kim and Markus Schwab and Nicolas Moreau and Thomas Sikora}, TITLE = {Speech Enhancement of Noisy Speech Using Log-Spectral Amplitude Estimator and Harmonic Tunnelling}, BOOKTITLE = {IWAENC 2003}, YEAR = {2003}, MONTH = sep, ADDRESS = {Kyoto, Japan}, PDF = {http://elvera.nue.tu-berlin.de/files/0803Kim2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0803Kim2003.pdf}, ABSTRACT = {In this paper we present a two stage noise reduction algorithm for speech enhancement. The speech noise removal algorithm is based on a two stage noise filtering LSAHT by log spectral amplitude speech estimator (LSA) and harmonic tunneling (HT) with spectral subtraction. The performance of the system is measured by the segmental signal-to-noise ratio, mean opinion score (MOS) tests, and the recognition accuracy of an Automatic Speech Recognition (ASR).} } @INPROCEEDINGS{0801Liebchen2003, AUTHOR = {Tilman Liebchen}, TITLE = {MPEG-4 Lossless Coding for High-Definition Audio}, BOOKTITLE = {115th AES Convention}, YEAR = {2003}, MONTH = oct, ADDRESS = {New York}, PDF = {http://elvera.nue.tu-berlin.de/files/0801Liebchen2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0801Liebchen2003.pdf}, ABSTRACT = {In response to a call for proposals, many companies have submitted lossless audio codecs for evaluation. 
The codec of the Technical University of Berlin was chosen as reference model for MPEG-4 Audio Lossless Coding, attaining working draft status in July 2003. The encoder is based on linear prediction, which enables high compression even with moderate complexity, while the corresponding decoder is straightforward. The paper describes the basic elements of the codec, points out envisaged applications, and gives an outline of the standardization process. } } @INPROCEEDINGS{0798Ekmekci2003, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Multi-State vs. Single-State Video Coding over Error Prone Channels}, BOOKTITLE = {2003 Asilomar Conference}, YEAR = {2003}, MONTH = nov, ADDRESS = {Monterey, CA}, PDF = {http://elvera.nue.tu-berlin.de/files/0798Ekmekci2003.pdf}, DOI = {10.1109/ACSSC.2003.1292244}, URL = {http://elvera.nue.tu-berlin.de/files/0798Ekmekci2003.pdf}, ABSTRACT = {Multiple description coding (MDC) is a forward error correction scheme where two or more descriptions of the source are sent to the receiver over different channels. If only one description is received the signal can be reconstructed with distortion D/sub 1/ or D/sub 2/. If both channels are received on the other hand, the combined information is used to achieve a lower distortion D/sub 0/. Multistate video coding (MSVC) is a specific MDC scheme where the video is coded into multiple independently decodable streams each with its own prediction process and state. In this paper we compare MSVC to single description coding (SDC) at different loss rates and coding options under the assumption that the motion vectors are always available. Results show that when motion vectors are received, SDC performs better than MSVC at every coding option tested. 
The performance difference is bigger for low motion sequences.} } @INPROCEEDINGS{0800Schwab2003, AUTHOR = {Markus Schwab and Hyoung-Gook Kim and Wiryadi and Peter Noll}, TITLE = {Robust Noise Estimation applied to different speech estimators}, BOOKTITLE = {Asilomar Conference on Signals, Systems, and Computers}, YEAR = {2003}, MONTH = nov, PDF = {http://elvera.nue.tu-berlin.de/files/0800Schwab2003.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0800Schwab2003.pdf}, ABSTRACT = {In this paper we present a robust noise estimation for speech enhancement algorithms. The robust noise estimation based on a modified minima controlled recursive averaging noise estimator was applied to different speech estimators. The investigated speech estimators were spectral subtraction (SS), log spectral amplitude speech estimator (LSA) and optimally modified log spectral amplitude estimator (OM-LSA). The performance of the different algorithms were measured both by the signal-to-noise ratio (SNR) and recognition accuracy of an Automatic Speech Recognition (ASR).} } @INPROCEEDINGS{0797Belkoura2003, AUTHOR = {Zouhair Belkoura and L. Naviner}, TITLE = {Hardware Analysis of Decoding Algorithms for AGC}, BOOKTITLE = {46th IEEE Midwest Symposium on Circuits and Systems (MWSCAS)}, YEAR = {2003}, MONTH = dec, ADDRESS = {Cairo, Egypt}, NOTE = {L. Naviner: École Nationale Supérieure des Télécommunications (ENST), Paris}, PDF = {http://elvera.nue.tu-berlin.de/files/0797Belkoura2003.pdf}, DOI = {10.1109/MWSCAS.2003.1562357}, URL = {http://elvera.nue.tu-berlin.de/files/0797Belkoura2003.pdf}, ABSTRACT = {The algebraic-geometry (AG) family of codes contains sequences with excellent asymptotic behavior, but only few works treating their hardware implementation can be found. 
In this work, the authors investigated some algorithms for decoding AG codes attempting to identify the differences between these algorithms from a hardware implementation point of view.} } @INPROCEEDINGS{0792Ekmekci2004, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Temporal Layered vs. Multistate Video Coding}, BOOKTITLE = {IS\&T/SPIE's Electronic Imaging 2004}, YEAR = {2004}, MONTH = jan, ADDRESS = {San Jose, CA}, PDF = {http://elvera.nue.tu-berlin.de/files/0792Ekmekci.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0792Ekmekci.ps}, ABSTRACT = {This document shows the desired format and appearance of a manuscript prepared for the Proceedings of the SPIE. It contains general formatting instructions and hints about how to use LaTeX.} } @INPROCEEDINGS{0793Kim2004, AUTHOR = {Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Performance of MPEG-7 spectral basis representations for retrieval of home video abstract}, BOOKTITLE = {IS\&T/SPIE's Electronic Imaging 2004}, YEAR = {2004}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0793Kim2004.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0793Kim2004.ps}, ABSTRACT = {In this paper, we present a classification and retrieval technique targeted for retrieval of home video abstract using dimension-reduced, decorrelated spectral features of audio content. The feature extraction based on MPEG-7 descriptors consists of three main stages: Normalized Audio Spectrum Envelope (NASE), basis decomposition algorithm and basis projection, obtained by multiplying the NASE with a set of extracted basis functions. A classifier based on continuous hidden Markov models is applied. For retrieval with accurate performance the system consists of a two-level hierarchy method using speech recognition and sound classification. For the measure of the performance we compare the classification results of MPEG-7 standardized features vs. Mel-scale Frequency Cepstrum Coefficients (MFCC). 
Results show that the MFCC features yield better performance compared to MPEG-7 features.} } @INPROCEEDINGS{0794Kim2004, AUTHOR = {Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Automatic segmentation of speakers in broadcast audio material}, BOOKTITLE = {IS\&T/SPIE's Electronic Imaging 2004}, YEAR = {2004}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0794Kim2004.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0794Kim2004.ps}, ABSTRACT = {In this paper, dimension-reduced, decorrelated spectral features for general sound recognition are applied to segment conversational speech of both broadcast news audio and panel discussion television programs. Without a priori information about number of speakers, the audio stream is segmented by a hybrid metric-based and model-based segmentation algorithm. For the measure of the performance we compare the segmentation results of the hybrid method versus metric-based segmentation with both the MPEG-7 standardized features and Mel-scale Frequency Cepstrum Coefficients (MFCC). Results show that the MFCC features yield better performance compared to MPEG-7 features. The hybrid approach significantly outperforms direct metric based segmentation.} } @INPROCEEDINGS{0796Goldmann2004, AUTHOR = {Lutz Goldmann and Mustafa Karaman and Thomas Sikora}, TITLE = {Human Body Posture Recognition Using MPEG-7 Descriptors}, BOOKTITLE = {IS\&T/SPIE's Electronic Imaging 2004}, YEAR = {2004}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0796Goldmann2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0796Goldmann2004.pdf}, ABSTRACT = {This paper presents a novel approach to human body posture recognition based on the MPEG-7 contour-based shape descriptor and the widely used projection histogram. A combination of them was used to recognize the main posture and the view of a human based on the binary object mask obtained by the segmentation process. 
The recognition is treated as a typical pattern recognition task and is carried out through a hierarchy of classifiers. Therefore various structures both hierarchical and non-hierarchical, in combination with different classifiers, are compared to each other with respect to recognition performance and computational complexity. Based on this an optimal system design with recognition rates of 95.59\% for the main posture, 77.84\% for the view and 79.77\% in combination is achieved.} } @INPROCEEDINGS{0791Liebchen2004, AUTHOR = {Tilman Liebchen and Y. Reznik}, TITLE = {MPEG-4 ALS: An Emerging Standard for Lossless Audio Coding}, BOOKTITLE = {IEEE Data Compression Conference}, YEAR = {2004}, MONTH = mar, ADDRESS = {Snowbird, USA}, NOTE = {Y. Reznik: RealNetworks Inc.}, PDF = {http://elvera.nue.tu-berlin.de/files/0791Liebchen2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0791Liebchen2004.pdf} } @INPROCEEDINGS{0790Moriya2004, AUTHOR = {T. Moriya and D. Yang and Tilman Liebchen}, TITLE = {A Design of Lossless Compression for High Quality Audio Signals}, BOOKTITLE = {18th International Congress on Acoustics}, YEAR = {2004}, MONTH = apr, ADDRESS = {Kyoto}, NOTE = {T. Moriya: NTT Cyber Space Labs; D. Yang: University of Southern California}, PDF = {http://elvera.nue.tu-berlin.de/files/0790Moriya2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0790Moriya2004.pdf}, ABSTRACT = {Three extension tools for extending and enhancing the compression performance of prediction-based lossless audio coding are proposed. The first extension aims at supporting floating-point data input in addition to integer PCM data. The second is progressive-order prediction of the starting samples at each random-access frame, where the information on previous frame is not available. The third is inter-channel joint coding. Both predictive coefficients and prediction-error signals are efficiently coded making use of the inter-channel correlation. 
These new prediction tools will contribute to enhance the performance of the coming MPEG-4 Audio Lossless Coding (ALS) scheme, currently under development as an extension of the ISO/IEC MPEG-4 audio standard.} } @INPROCEEDINGS{0789Clemens2004, AUTHOR = {Carsten Clemens and Matthias Kunter and Sebastian Knorr and Thomas Sikora}, TITLE = {A Hybrid Approach for Error Concealment in Stereoscopic Images}, BOOKTITLE = {5th International Workshop on Image Analysis for Multimedia Interactive Services}, YEAR = {2004}, MONTH = apr, ADDRESS = {Lisbon, Portugal}, PDF = {http://elvera.nue.tu-berlin.de/files/0789Clemens2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0789Clemens2004.pdf}, ABSTRACT = {Error concealment for stereoscopic images receives little attention in research of image processing. While many methods have been proposed for monocular images, this paper considers a concealment strategy for block loss in stereoscopic image pairs, utilizing the information of the associated image to fulfill the higher quality demand. We present a hybrid approach, combining a 2D projective transformation and a monoscopic error concealment technique. Pixel values from the associated stereo image are warped to their corresponding positions in the lost block. To reduce discontinuities at the block borders, a monoscopic error concealment algorithm with low-pass properties is integrated. The stereoscopic depth perception is much less affected in our approach than using only monoscopic error concealment techniques. } } @INPROCEEDINGS{0782Liebchen2004, AUTHOR = {Tilman Liebchen}, TITLE = {An Introduction to MPEG-4 Audio Lossless Coding}, BOOKTITLE = {IEEE ICASSP 2004}, YEAR = {2004}, MONTH = may, ADDRESS = {Montreal, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/0782Liebchen2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0782Liebchen2004.pdf}, ABSTRACT = {Lossless coding will become the latest extension of the MPEG-4 audio standard. 
In response to a call for proposals, many companies have submitted lossless audio codecs for evaluation. The codec of the Technical University of Berlin was chosen as reference model for MPEG-4 Audio Lossless Coding (ALS), attaining working draft status in July 2003. The encoder is based on linear prediction, which enables high compression even with moderate complexity, while the corresponding decoder is straightforward. The paper describes the basic elements of the codec, points out envisaged applications, and gives an outline of the standardization process.} } @INPROCEEDINGS{0783Moriya2004, AUTHOR = {T. Moriya and D. Yang and Tilman Liebchen}, TITLE = {Extended Linear Prediction Tools for Lossless Audio Coding}, BOOKTITLE = {IEEE ICASSP 2004}, YEAR = {2004}, MONTH = may, ADDRESS = {Montreal, Canada}, NOTE = {T. Moriya: NTT Cyber Space Labs; D. Yang: University of Southern California}, PDF = {http://elvera.nue.tu-berlin.de/files/0783Moriya2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0783Moriya2004.pdf}, ABSTRACT = {Two extension tools for enhancing the compression performance of prediction-based lossless audio coding are proposed. One is progressive-order prediction of the starting samples at the random access points, where the information of previous samples is not available. The first sample is coded as is, the second is predicted by first-order prediction, the third is predicted by second-order prediction, and so on. This can be efficiently carried out with PARCOR (PARtial autoCORrelation) coefficients. The second tool is inter-channel joint coding. Both predictive coefficients and prediction error signals are efficiently coded by inter-channel differential or three-tap adaptive prediction. These new prediction tools lead to a steady reduction in bit rate when random access is activated and the inter-channel correlation is strong.} } @INPROCEEDINGS{0784Yang2004, AUTHOR = {D. Yang and T. 
Moriya and Tilman Liebchen}, TITLE = {A Lossless Audio Compression Scheme with Random Access Property}, BOOKTITLE = {IEEE ICASSP 2004}, YEAR = {2004}, MONTH = may, ADDRESS = {Montreal, Canada}, NOTE = {D. Yang: University of Southern California; T. Moriya: NTT Cyber Space Labs}, PDF = {http://elvera.nue.tu-berlin.de/files/0784Yang2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0784Yang2004.pdf}, ABSTRACT = {In this paper, we propose an efficient lossless coding algorithm that not only handles both PCM format data and IEEE floating-point format data, but also provides end users with random access property. In the worst-case scenario, where the proposed algorithm was applied to artificially generated full 32-bit floating-point sound files with 48- or 96-kHz sampling frequencies, an average compression rate of more than 1.5 and 1.7, respectively, was still achieved, which is much better than the average compression rate of less than 1.1 achieved by general purpose lossless coding algorithm gzip. Moreover, input sound files with samples' magnitude out-of-range can also be perfectly reconstructed by our algorithm.} } @INPROCEEDINGS{0785Rein2004, AUTHOR = {Stephan Rein and Martin Reisslein and Thomas Sikora}, TITLE = {Audio Content Description with Wavelets and Neural Nets}, BOOKTITLE = {IEEE ICASSP 2004}, YEAR = {2004}, MONTH = may, ADDRESS = {Montreal, Canada}, NOTE = {Martin Reisslein: Arizona State University}, PDF = {http://elvera.nue.tu-berlin.de/files/0785Rein2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0785Rein2004.pdf}, ABSTRACT = {Precision audio content description is one of the key components of next generation internet multimedia search machines. We examine the usability of a combination of 39 different wavelets and three different types of neural nets for precision audio content description. More specifically, we develop a novel wavelet dispersion measure that measures obtained ranks of wavelet coefficients. 
Our dispersion measure in conjunction with a probabilistic radial basis neural network trained by only three independent example sets obtains a success rate of approximately 78\% in identifying unknown complex classical music movements.} } @INPROCEEDINGS{0788Eisenberg2004, AUTHOR = {Gunnar Eisenberg and Jan-Mark Batke and Thomas Sikora}, TITLE = {BeatBank - An MPEG-7 compliant Query by Tapping System}, BOOKTITLE = {116th AES Convention}, YEAR = {2004}, MONTH = may, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/0788Eisenberg2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0788Eisenberg2004.pdf}, ABSTRACT = {A Query by Tapping System is a multi-media database containing rhythmic metadata descriptions of songs. This paper presents a Query by Tapping system called BeatBank. The system allows to formulate queries by tapping the melody line's rhythm of a song requested on a MIDI keyboard or an e-drum. The query entered is converted into an MPEG-7 compliant representation. The actual search process takes only rhythmic aspects of the melodies into account by comparing the values of the MPEG-7 Beat Description Scheme. An efficiently computable similarity measure is presented which enables the comparison of two database entries. This system works in real-time and computes the search process online. 
It computes and presents a new search result list after every tap made by the user.} } @ARTICLE{0795Kim2004, AUTHOR = {Hyoung-Gook Kim and Nicolas Moreau and Thomas Sikora}, TITLE = {Audio Classification Based on MPEG-7 Spectral Basis Representations}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology 7, Special Issue on Audio and Video Analysis for Multimedia Interactive Services}, YEAR = {2004}, MONTH = may, PAGES = {716--725}, VOLUME = {14}, NUMBER = {5}, PDF = {http://elvera.nue.tu-berlin.de/files/0795Kim2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0795Kim2004.pdf}, ABSTRACT = {classification and retrieval technique targeted for analysis of film material. The technique consists of low-level descriptors and high-level description schemes. For low-level descriptors, low-dimensional features such as audio spectrum projection based on audio spectrum basis descriptors is produced in order to find a balanced tradeoff between reducing dimensionality and retaining maximum information content. High-level description schemes are used to describe the modeling of reduced-dimension features, the procedure of audio classification, and retrieval. A classifier based on continuous hidden Markov models is applied. The sound model state path, which is selected according to the maximum-likelihood model, is stored in an MPEG-7 sound database and used as an index for query applications. Various experiments are presented where the speaker- and sound-recognition rates are compared for different feature extraction methods. Using independent component analysis, we achieved better results than normalized audio spectrum envelope and principal component analysis in a speaker recognition system. In audio classification experiments, audio sounds are classified into selected sound classes in real time with an accuracy of 96\%.} } @INPROCEEDINGS{0786Liebchen2004, AUTHOR = {Tilman Liebchen and Y. Reznik and T. Moriya and D. 
Yang}, TITLE = {MPEG-4 Audio Lossless Coding}, BOOKTITLE = {116th AES Convention}, YEAR = {2004}, MONTH = may, ADDRESS = {Berlin, Germany}, NOTE = {Y. Reznik: RealNetworks Inc.; T. Moriya: NTT Cyber Space Labs; D. Yang: University of Southern California}, PDF = {http://elvera.nue.tu-berlin.de/files/0786Liebchen2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0786Liebchen2004.pdf}, ABSTRACT = {Lossless coding will become the latest extension of the MPEG-4 audio standard. The lossless audio codec of the Technical University of Berlin was chosen as reference model for MPEG-4 Audio Lossless Coding (ALS). The MPEG-4 ALS encoder is based on linear prediction, which enables high compression even with moderate complexity, while the corresponding decoder is straightforward. The paper describes the basic elements of the codec as well as some additional features, gives compression results, and points out envisaged applications.} } @INPROCEEDINGS{0787Batke2004, AUTHOR = {Jan-Mark Batke and Gunnar Eisenberg and Philipp Weishaupt and Thomas Sikora}, TITLE = {A Query by Humming system using MPEG-7 Descriptors}, BOOKTITLE = {116th AES Convention}, YEAR = {2004}, MONTH = may, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/0787Batke2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0787Batke2004.pdf}, ABSTRACT = {Query by Humming (QBH) is a method for searching in a multimedia database system containing meta data descriptions of songs. The database can be searched by hummed queries, this means that a user can hum a melody into a microphone which is connected to the computer hosting the system. The QBH system searches the database for songs which are similar to the input query and presents the result to the user as a list of matching songs. This paper presents a modular QBH system using MPEG-7 descriptors in all processing stages. Due to the modular design all components can easily be substituted. 
The system is evaluated by changing parameters defined by the MPEG-7 descriptors.} } @INPROCEEDINGS{0781Kim2004, AUTHOR = {Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Comparison of MPEG-7 Audio Spectrum Projection Features and MFCC applied to Speaker Recognition, Sound Classification and Audio Segmentation}, BOOKTITLE = {ICASSP 2004}, YEAR = {2004}, MONTH = may, ORGANIZATION = {IEEE}, ADDRESS = {Montreal, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/0781Kim2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0781Kim2004.pdf}, ABSTRACT = {Our purpose is to evaluate the MPEG-7 Audio Spectrum Projection (ASP) features for general sound recognition performance vs. well established MFCC. The recognition tasks of interest are speaker recognition, sound classification, and segmentation of audio using sound/speaker identification. For the sound classification we use three approaches: the direct approach, the hierarchical approach without hints, and the hierarchical approach with hints. For audio segmentation the MPEG-7 ASP features and MFCCs are used to train hidden Markov models (HMM) for individual speakers and sounds. The trained sound/speaker models are then used to segment conversational speech involving a given subset of people in panel discussion television programs. Results show that MFCC approach yields sound/speaker recognition rate superior to MPEG-7 implementations.} } @INPROCEEDINGS{0778Kim2004, AUTHOR = {Hyoung-Gook Kim and Martin Haller and Thomas Sikora}, TITLE = {Comparison of MPEG-7 Basis Projection Features and MFCC applied to Robust Speaker Recognition}, BOOKTITLE = {ISCA - A Speaker Odyssey 2004}, YEAR = {2004}, MONTH = may, ADDRESS = {Toledo, Spain}, PDF = {http://elvera.nue.tu-berlin.de/files/0778Kim2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0778Kim2004.pdf}, ABSTRACT = {Our purpose is to evaluate the efficiency of MPEG-7 basis projection (BP) features vs. 
Mel-scale Frequency Cepstrum Coefficients (MFCC) for speaker recognition in noisy environments.The MPEG-7 feature extraction mainly consists of a Normalized Audio Spectrum Envelope (NASE), a basis decomposition algorithm and a spectrum basis projection. Prior to the feature extraction the noise reduction algorithm is performed by using a modified log spectral amplitude speech estimator (LSA) and a minima controlled noise estimation (MC). The noise-reduced features can be effectively used in a HMM-based recognition system. The performance is measured by the segmental signal-to-noise ratio, and the recognition results of the MPEG-7 standardized features vs. Mel-scale Frequency Cepstrum Coefficients (MFCC) in comparison to other noise reduction methods. Results show that the MFCC features yield better performance compared to MPEG-7 features.} } {1330Knorr2004, } @INPROCEEDINGS{0779Moreau2004, AUTHOR = {Nicolas Moreau and Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Phone-based Spoken Document Retrieval in Conformance with the MPEG-7 Standard}, BOOKTITLE = {25th International AES Conference Metadata for Audio}, YEAR = {2004}, MONTH = jun, ADDRESS = {London, UK}, PDF = {http://elvera.nue.tu-berlin.de/files/0779Moreau2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0779Moreau2004.pdf}, ABSTRACT = {This paper presents a phone-based approach of spoken Document retrieval, developed in the framework of the emerging MPEG-7 standard. The audio part of MPEG-7 encloses a SpokenContent tool that provides a standardized description of the content of spoken documents. In the context of MPEG-7, we propose an indexing and retrieval method that uses phonetic information only and a vector space IR model. Experiments are conducted on a database of German spoken documents, with 10 city name queries. Two phone-based retrieval approaches are presented and combined. The first one is based on the combination of phone N-grams of different lengths used as indexing terms. 
The other consists of expanding the document representation by means of phone confusion probabilities.} } @INPROCEEDINGS{0780Kim2004, AUTHOR = {Hyoung-Gook Kim and Juan José Burred and Thomas Sikora}, TITLE = {How efficient is MPEG-7 for General Sound Recognition?}, BOOKTITLE = {25th International AES Conference Metadata for Audio}, YEAR = {2004}, MONTH = jun, ADDRESS = {London, UK}, PDF = {http://elvera.nue.tu-berlin.de/files/0780Kim2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0780Kim2004.pdf}, ABSTRACT = {Our challenge is to analyze/classify video sound track content for indexing purposes. To this end we compare the performance of MPEG-7 Audio Spectrum Projection (ASP) features based on several basis decomposition algorithms vs. Mel-scale Frequency Cepstrum Coefficients (MFCC). For basis decomposition in the feature extraction we evaluate three approaches: Principal Component Analysis (PCA), Independent Component Analysis (ICA), and Non-negative Matrix Factorization (NMF). Audio features are computed from these reduced vectors and are fed into a continuous hidden Markov model (CHMM) classifier. Our conclusion is that established MFCC features yield better performance compared to MPEG-7 ASP in the general sound recognition under practical constraints.} } @ARTICLE{0777Burred2004, AUTHOR = {Juan José Burred and Alexander Lerch}, TITLE = {Hierarchical Automatic Audio Signal Classification}, JOURNAL = {Journal of the Audio Engineering Society}, YEAR = {2004}, MONTH = jul, PAGES = {724--739}, VOLUME = {52}, NUMBER = {7/8}, NOTE = {A. Lerch: zplane.development}, PDF = {http://elvera.nue.tu-berlin.de/files/0777Burred2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0777Burred2004.pdf}, ABSTRACT = {The design, implementation, and evaluation of a system for automatic audio signal classification is presented. The signals are classified according to audio type, differentiating between three speech classes, 13 musical genres, and background noise. 
A large number of audio features are evaluated for their suitability in such a classification task, including MPEG-7 descriptors and several new features. The selection of the features is carried out systematically with regard to their robustness to noise and bandwidth changes, as well as to their ability to distinguish a given set of audio types. Direct and hierarchical approaches for the feature selection and for the classification are evaluated and compared.} } @INPROCEEDINGS{0772Knorr2004, AUTHOR = {Sebastian Knorr and Carsten Clemens and Matthias Kunter and Thomas Sikora}, TITLE = {Robust Concealment for Erroneous Block Bursts in Stereoscopic Images}, BOOKTITLE = {2nd International Symposium on 3D Data Processing, Visualization, and Transmission (3DPVT'04)}, YEAR = {2004}, MONTH = sep, ADDRESS = {Thessaloniki, Greece}, PDF = {http://elvera.nue.tu-berlin.de/files/0772Knorr2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0772Knorr2004.pdf}, ABSTRACT = {With the increasing number of image communication applications especially in the low complexity domain, error concealment has become a very important field of research.Since many compression standards for images and videos are block-based a lot of methods were applied to conceal block losses in monocular images. The fast progress of capture, representation and display technologies for 3D image data advances the efforts on 3D concealment strategies. Because of their psycho-visual characteristics, stereoscopic images have to fulfill a very high quality demand. We propose an algorithm that makes use of the redundancies between two views of a stereo image pair. In many cases erroneous block bursts occur and can be highly disturbing, thus we will mainly concentrate on these errors. In addition, we focused on the quality assessment of several error concealment strategies. Beside the objective evaluation measures, we carried out a subjective quality test following the DSCQS methodology as proposed by MPEG. 
The results of this test demonstrate the efficiency of our approach.} } @INPROCEEDINGS{0775Moreau2004, AUTHOR = {Nicolas Moreau and Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Combination of Phone N-Grams for a MPEG-7-based Spoken Document Retrieval System}, BOOKTITLE = {EURASIP-EUSIPCO 2004}, YEAR = {2004}, MONTH = sep, ADDRESS = {Vienna, Austria}, PDF = {http://elvera.nue.tu-berlin.de/files/0775Moreau2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0775Moreau2004.pdf}, ABSTRACT = {In this paper, we present a phone-based approach of spoken document retrieval (SDR), developed in the framework of the emerging MPEG-7 standard. The audio part of MPEG-7 aims at standardizing the indexing of audio documents. It encloses a SpokenContent tool that provides a description framework of the semantic content of speech signals. In the context of MPEG-7, we propose an indexing and retrieval method that uses phonetic information only and a vector space IR model. Different strategies based on the use of phone N-gram indexing terms are experimented.} } @INPROCEEDINGS{0776Kim2004, AUTHOR = {Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Audio Spectrum Projection Based on Several Basis Decomposition Algorithms applied to General Sound Recognition and Audio Segmentation}, BOOKTITLE = {EURASIP-EUSIPCO 2004}, YEAR = {2004}, MONTH = sep, ADDRESS = {Vienna, Austria}, PDF = {http://elvera.nue.tu-berlin.de/files/0776Kim2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0776Kim2004.pdf}, ABSTRACT = {Our challenge is to analyze/classify video sound track content for indexing purposes. To this end we compare the performance of MPEG-7 Audio Spectrum Projection (ASP) features based on basis decomposition vs. Mel-scale Frequency Cepstrum Coefficients (MFCC). For basis decomposition in the feature extraction we have three choices: Principal Component Analysis (PCA), Independent Component Analysis (ICA), and Non-negative Matrix Factorization (NMF). 
Audio features are computed from these reduced vectors and are fed into hidden Markov model classifier. Experimental results show that the MFCC features yield better performance compared to MPEG-7 ASP in the sound recognition, and audio segmentation.} } @INPROCEEDINGS{0774Noll2004, AUTHOR = {Peter Noll and Markus Schwab and Wiryadi}, TITLE = {Sensing People - Localization with Microphone Arrays}, BOOKTITLE = {Elektronische Sprachsignalverarbeitung ESSV 2004}, YEAR = {2004}, MONTH = sep, ADDRESS = {Cottbus}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/0774Noll2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0774Noll2004.pdf}, ABSTRACT = {In this paper we present a real-time microphone array system which performs 3D source localization, multi channel speech enhancement and robust speech recognition. The acoustic source localization uses the SRP-PHAT method [1] to produce potential source locations. A clustering algorithm excludes outliers and enables a multi source tracking. The localizations are finally optimally filtered with an appropriate Kalman filter. The proposed speech enhancement, a weighted subarray Delay-and-Sum beamformer, is designed to cope with the problem of diffuse noise and changing speaker positions subject to minimization of the word error rate (WER) of an automatic speech recognition system (ASR). 
The proposed algorithm reduces the WER by more than 50 % compared to the WER of a single microphone signal.} } @INPROCEEDINGS{0768Batke2004, AUTHOR = {Jan-Mark Batke and Gunnar Eisenberg and Philipp Weishaupt and Thomas Sikora}, TITLE = {Evaluation of Distance Measures for MPEG-7 Melody Contours}, BOOKTITLE = {International Workshop on Multimedia Signal Processing}, YEAR = {2004}, MONTH = oct, ADDRESS = {Siena, Italy}, PDF = {http://elvera.nue.tu-berlin.de/files/0768Batke2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0768Batke2004.pdf}, ABSTRACT = {In Query by Humming (QBH) systems the melody contour is often used as a symbolic description of music. The MelodyContour Description Scheme (DS) defined by MPEG-7 is a standardized representation of melody contours. For melody comparison in a QBH system a distance measure is required.This paper evaluates different distance measures for the MPEG-7 MelodyContour DS. The usability of each measure is discussed.} } @INPROCEEDINGS{0766Quack2004, AUTHOR = {Till Quack and Ullrich Mönich and Lars Thiele and B. S. Manjunath}, TITLE = {Cortina: A System For Large-Scale, Content-Based Web Image Retrieval}, BOOKTITLE = {ACM Multimedia}, YEAR = {2004}, MONTH = oct, ADDRESS = {New York}, PDF = {http://elvera.nue.tu-berlin.de/files/0766Quack2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0766Quack2004.pdf}, ABSTRACT = {Recent advances in processing and networking capabilities of computers have led to an accumulation of immense amounts of multimedia data such as images. One of the largest repositories for such data is the World Wide Web (WWW). We present Cortina, a large-scale image retrieval system for the WWW. It handles over 3 million images to date. The system retrieves images based on visual features and collateral text. 
We show that a search process which consists of an initial query-by-keyword or query-by-image and followed by relevance feedback on the visual appearance of the results is possible for large-scale data sets. We also show that it is superior to the pure text retrieval commonly used in largescale systems. Semantic relationships in the data are explored and exploited by data mining, and multiple feature spaces are included in the search process.} } @INPROCEEDINGS{0770Kim2004, AUTHOR = {Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Speech Enhancement based on Smoothing of Spectral Noise Floor}, BOOKTITLE = {INTERSPEECH 2004 - ICSLP}, YEAR = {2004}, MONTH = oct, ADDRESS = {Jeju Island, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0770Kim2004.ps}, URL = {http://elvera.nue.tu-berlin.de/files/0770Kim2004.ps}, ABSTRACT = {This paper presents robust speech enhancement using noise estimation based on smoothing of spectral noise floor (SNF) for nonstationary noise environments. The spectral gain function is obtained by well-known log-spectral amplitude (LSA) estimation criterion associated with the speech presence uncertainty. The noise estimate is given by averaging actual spectral power values, using a smoothing parameter that depends on smoothing of spectral noise floor. The noise estimator is very simple but achieves a good tracking capability for a nonstationary noise. Its enhanced speech is free of musical tones and reverberation artifacts and sounds very natural compared to methods using other short-time spectrum attenuation techniques. 
The performance is measured by the segmental signal-to-noise ratio (SNR), the speech/speaker recognition accuracy and the speaker change detection rate for the audio segmentation using MFCC-features (Melscale Frequency Cepstral Coefficients) in comparison to other single microphone noise reduction methods.} } @INPROCEEDINGS{0769Moreau2004, AUTHOR = {Nicolas Moreau and Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Phonetic Confusion Based Document Expansion for Spoken Document Retrieval}, BOOKTITLE = {INTERSPEECH 2004 - ICSLP}, YEAR = {2004}, MONTH = oct, ADDRESS = {Jeju Island, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0769Moreau2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0769Moreau2004.pdf}, ABSTRACT = {This paper presents a phone-based approach of spoken document retrieval (SDR), developed in the framework of the emerging MPEG-7 standard. We describe an indexing and retrieval system that uses phonetic information only. The retrieval method is based on the vector space IR model, using phone N-grams as indexing terms. We propose a technique to expand the representation of documents by means of phone confusion probabilities in order to improve the retrieval performance. This method is tested on a collection of short German spoken documents, using 10 city names as queries.} } @INPROCEEDINGS{0767Eisenberg2004, AUTHOR = {Gunnar Eisenberg and Jan-Mark Batke and Thomas Sikora}, TITLE = {Efficiently Computable Similarity Measures for Query by Tapping Systems}, BOOKTITLE = {7th International Conference on Digital Audio Effects (DAFx)}, YEAR = {2004}, MONTH = oct, ADDRESS = {Naples, Italy}, PDF = {http://elvera.nue.tu-berlin.de/files/0767Eisenberg2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0767Eisenberg2004.pdf}, ABSTRACT = {A Query by Tapping system is a database which contains metadata descriptions of songs. The database can be scanned by tapping the melody line’s rhythm of a song requested on a MIDI keyboard or an e-drum. 
For the processing of queries the system computes the similarity of the query and the content inside the database by applying a similarity measure. Due to the high number of comparison processes in large databases efficiently computable similarity measures are needed. This paper presents two efficiently computable similarity measures which evaluate rhythmic properties of monophonic melodies represented in an MPEG-7 compliant manner. The usage and effectiveness is presented and evaluated with the real time capable Query by Tapping system BeatBank.} } @INPROCEEDINGS{0771Ekmekci2004, AUTHOR = {Sila Ekmekci and Thomas Sikora}, TITLE = {Recursive Decoder Distortion Estimation Based on AR(1) Modeling for Video}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'04)}, YEAR = {2004}, MONTH = oct, ADDRESS = {Singapore}, PDF = {http://elvera.nue.tu-berlin.de/files/0771Ekmekci2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0771Ekmekci2004.pdf} } @INPROCEEDINGS{0773Kunter2004, AUTHOR = {Matthias Kunter and Sebastian Knorr and Carsten Clemens and Thomas Sikora}, TITLE = {A Gradient Based Approach for Stereoscopic Error Concealment}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'04)}, YEAR = {2004}, MONTH = oct, ADDRESS = {Singapore}, PDF = {http://elvera.nue.tu-berlin.de/files/0773Kunter2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0773Kunter2004.pdf}, ABSTRACT = {Error concealment is an important field of research in image processing. Many methods were applied to conceal block losses in monocular images. In this paper we present a concealment strategy for block loss in stereoscopic image pairs. 
Unlike the error concealment techniques used for monocular images, the information of the associated image is utilized , i.e., by means of a projective transformation model, pixel values from the associated stereo image are warped to their corresponding positions in the lost block.The stereoscopic depth perception is much less affected in our approach than using monoscopic error concealment techniques. } } @INPROCEEDINGS{0765Onural2004, AUTHOR = {L. Onural and Thomas Sikora and Aljoscha Smolic}, TITLE = {"An Overview of a New European Consortium: Integrated Three-Dimensional Television - Capture, Transmission and Display (3DTV)"}, BOOKTITLE = {European Workshop on the Integration of Knowledge, Semantics and Digital Media Technology (EWIMT´04), Proceedings, London}, YEAR = {2004}, MONTH = nov, PDF = {http://elvera.nue.tu-berlin.de/files/0765Onural2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0765Onural2004.pdf}, ABSTRACT = {A new European consortium is formed as a Network of excellence to integrate the research works of 19 institutions in the field of 3DTV. The consortium is funded by EC under the FP6 thematic area Information Society Technologies within the strategic objective Cross-media Content for Leisure and Entertainment. The project will last 48 months, but the collaboration among the partners is expected to be longer. The technical focus of the consortium is 3DTV with all its aspects except audio. Various techniques of 3D scene capture will be investigated and compared. Representation of captured 3D content in abstract form using mainly computer graphics approaches is the key feature which decouples user from the input.. Compression of 3D scene information, and forming the bitstream structure for effective streaming are parts of the project. The user may interact with the captured scene and get a visual display based on the choice of display technology. 
A rich variety of different display techniques, including stereoscopy and holographic displays are among the main focus of the consortium. The plan covers various integration and dissemination activities.} } @INPROCEEDINGS{0763Günther2004, AUTHOR = {Karsten A. M. Günther and Carsten Clemens and Thomas Sikora}, TITLE = {A Fast Displacement-Estimation Based Approach For Stereoscopic Error Concealment}, BOOKTITLE = {Picture Coding Symposium}, YEAR = {2004}, MONTH = dec, ADDRESS = {San Francisco, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0763Guenther2004.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0763Guenther2004.pdf}, ABSTRACT = {In image and video processing error concealment is an important field of research. Video applications like teleconferencing or digital video broadcasting (DVB) require fast and robust signal processing algorithms to fulfil real-time conditions. Therefore many fast concealment methods were developed to handle block losses in monocular sequences. In this paper we present a fast concealment strategy for block losses in stereoscopic sequences. Pixel values from the associate stereo image are used to conceal the lost block at the corresponding position. In this approach, we focus on robustness and low complexity.} } @INCOLLECTION{0747Noll2005, AUTHOR = {Peter Noll and Tilman Liebchen}, TITLE = {Lossless and Perceptual Coding of Digital Audio}, YEAR = {2005}, BOOKTITLE = {Beiträge zur Geschichte und neueren Entwicklung der Sprachakustik und Informationsverarbeitung - Werner Endres zum 90. Geburtstag}, PUBLISHER = {w.e.b. Universitätsverlag}, ADDRESS = {Dresden}, PDF = {http://elvera.nue.tu-berlin.de/files/0747Noll2005.pdf}, ABSTRACT = {We have seen rapid progress in high-quality compression of wideband audio signals.Today’s coding algorithms can achieve substantially better compression than was thought possible only a few years ago. 
In the case of audio coding with its bandwidth of 20 kHz and more, the concept of perceptual coding has paved the way for significant bit rate reductions.However, multiple coding can reveal originally masked distortions. In addition, reproduction of critical music items shows that even the best systems can not be considered as truly transparent. Therefore lossless audio coding has become a topic of high interest both for professional and customer applications.This paper will explain approaches to lossless and lossy compression, both with emphasis on MPEG standards which have found a wide range of communications-based and storagebased applications. As an example for state-of the-art lossless coding, an overview of the forthcoming MPEG-4 Audio Lossless Coding (ALS) standard will be presented. On the other hand, it will be shown that the recent MPEG-4 Advanced Audio Coding (AAC) standard outperforms many other perceptual coding algorithms (including MP3 coders). Finally, we will address the current MPEG-4 speech and audio coding standardization work which merges the whole range of audio from high fidelity audio coding and speech coding down to synthetic audio, synthetic speech and text-to-speech conversion.1 Introduction Wideband (high fidelity) audio representations including multichannel audio need bandwidths of at least 20 kHz. The conventional digital format of digital audio is PCM, with sampling rates of 32, 44.1, or 48 kHz and an amplitude resolution (PCM bits per sample) of 16 bit. Typical application areas for digital audio are in the fields of audio production,program distribution and exchange, digital sound broadcasting, digital storage, and various multimedia applications. 
For archiving and processing of audio signals, highest quality formats with up to 192 kHz sampling and 24 to 32-bit Amplitude resolution are already used.Audio coding is employed in order to reduce bit rate compared to the PCM representation.In some applications coding will have to be lossless, with compression factors around two as will be shown shortly. For other applications, perceptually transparent coding will be sufficient, which allows to compress the audio data to less than a tenth of its original size.The Compact Disc (CD) is today's de facto standard for disc-based delivery of digital audio.The CD uses the PCM format with 16-bit amplitude resolution and 44.1 kHz sampling rate,} } @ARTICLE{0752Sikora2005, AUTHOR = {Thomas Sikora}, TITLE = {Trends and Perspectives in Image and Video Coding}, JOURNAL = {Proceedings of the IEEE}, YEAR = {2005}, MONTH = jan, PAGES = {6--17}, VOLUME = {93}, NUMBER = {1}, PDF = {http://elvera.nue.tu-berlin.de/files/0752Sikora2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0752Sikora2005.pdf}, ABSTRACT = {trends and future perspectives in image and video coding. Here, I review the rapid development in the field during the past 40 years and outline current state-of-the art strategies for coding images and videos. These and other coding algorithms are discussed in the context of international JPEG, JPEG 2000, MPEG-1/2/4, and H.261/3/4 standards. 
Novel techniques targeted at achieving higher compression gains, error robustness, and network/device adaptability are described and discussed.Keywords—Discrete cosine transform (DCT), distributed source coding, embedded coding, error concealment, image coding, International Telecommunication Union-Telecommunications (ITU-T),Joint Photographic Experts Group (JPEG), motion compensation, JPEG 2000, Motion Picture Experts Group (MPEG), standardization,video coding, wavelets.} } @INPROCEEDINGS{0750Kim2005, AUTHOR = {Hyoung-Gook Kim and Steffen Roeber and Amjad Samour and Thomas Sikora}, TITLE = {Detection Of Goal Events In Soccer Videos}, BOOKTITLE = {IS&T/SPIE's Electronic Imaging 2005}, YEAR = {2005}, MONTH = jan, ADDRESS = {San Jose, CA USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0750Kim2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0750Kim2005.pdf}, ABSTRACT = {In this paper, we present an automatic extraction of goal events in soccer videos by using audio track features alone without relying on expensive-to-compute video track features. The extracted goal events can be used for high-level indexing and selective browsing of soccer videos. The detection of soccer video highlights using audio contents comprises three steps: 1) extraction of audio features from a video sequence, 2) event candidate detection of highlight events based on the information provided by the feature extraction Methods and the Hidden Markov Model (HMM), 3) goal event selection to finally determine the video intervals to be included in the summary. For this purpose we compared the performance of the well known Mel-scale Frequency Cepstral Coefficients (MFCC) feature extraction method vs. MPEG-7 Audio Spectrum Projection feature (ASP) extraction method based on three different decomposition methods namely Principal Component Analysis( PCA), Independent Component Analysis (ICA) and Non-Negative Matrix Factorization (NMF). 
To evaluate our system we collected five soccer game videos from various sources. In total we have seven hours of soccer games consisting of eight gigabytes of data. One of five soccer games is used as the training data (e.g., announcers’ excited speech, Audience ambient speech noise, audience clapping,environmental sounds). Our goal event detection results are encouraging.Keywords: Goal score Detection, Highlight events Detection, MPEG-7, Mel-scale} } @INPROCEEDINGS{1063Glasberg2005, AUTHOR = {Ronald Glasberg and Khalid Elazouzi and Thomas Sikora}, TITLE = {Video-genre-classification: recognizing cartoons in real-time using visual-descriptors and a multilayer-perceptron}, BOOKTITLE = {Proc. of the 7th International Conference on Advanced Communication Technology (ICACT)}, YEAR = {2005}, MONTH = feb, PAGES = {1121--1124}, PDF = {http://elvera.nue.tu-berlin.de/files/1063Glasberg2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1063Glasberg2005.pdf}, ABSTRACT = {We present a new approach for classifying MPEG-2 video sequences as ‘cartoon’ or ‘non-cartoon’ by analyzing specific color, texture and motion features of consecutive frames in real-time. This is part of the well-known video-genre-classification problem, where popular TV broadcast genres like cartoon, commercial, music, news and sports are studied. Such applications have also been discussed in the context of MPEG-7. In our method the extracted features from the visual descriptors are nonlinear weighted with a sigmoid-function and afterwards combined using a multilayered perceptron to produce a reliable recognition. The results demonstrate a high identification rate based on a large collection of 100 representative video sequences (20 cartoons and 4*20 non-cartoons) gathered from free digital TV-broadcasting.} } @INPROCEEDINGS{1064Ekmekci2005, AUTHOR = {Sila Ekmekci and Pascal Frossard and Thomas Sikora}, TITLE = {Distortion Estimation for Temporal Layered Video Coding}, BOOKTITLE = {Proc. IEEE Int. Conf. 
on Acoustics, Speech, and Signal Processing (ICASSP)}, YEAR = {2005}, MONTH = mar, PAGES = {189--192}, PDF = {http://elvera.nue.tu-berlin.de/files/1064Ekmekci2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1064Ekmekci2005.pdf}, ABSTRACT = {We present a recursive block based decoder distortion estimation model for temporal layered video transmission, based on a DPCM structure. Each block in a video frame is modeled as a sample from an AR(1) source. The correlation coefficient of this source depends on the loop filtering effects, whereas the additional noise term on the motion compensated block difference and the quantization distortion of the block. Distortion estimations are compared to simulation results, and the model is shown to accurately capture the video distortion in various lossy streaming scenarios. The low implementation complexity, and high estimation accuracy of the proposed technique makes it particularly attractive for adaptive video communication applications, that try to optimize the streaming policy.} } @INPROCEEDINGS{0749Kim2005, AUTHOR = {Hyoung-Gook Kim and Daniel Ertelt and Thomas Sikora}, TITLE = {Hybrid Speaker-Based Segmentation System Using Model-Level Clustering}, BOOKTITLE = {ICASSP 2005}, YEAR = {2005}, MONTH = mar, ADDRESS = {Philadelphia, PA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0749Kim2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0749Kim2005.pdf}, ABSTRACT = {In this paper, we present a hybrid speaker-based segmentation, which combines metric-based and modelbased techniques. Without a priori information about number of speakers and speaker identities, the speech stream is segmented by three stages: (1) The most likely speaker changes are detected. (2) To group segments of identical speakers, a two-level clustering algorithm using a Bayesian Information Criterion (BIC) and HMM model scores is performed. Every cluster is assumed to contain only one speaker. 
(3) The speaker models are reestimated from each cluster by HMM. Finally a resegmentation step performs a more refined segmentation using these speaker models. For measuring the performance we compare the segmentation results of the proposed hybrid method versus metric-based segmentation. Results show that the hybrid approach using two-level clustering significantly outperforms direct metric based segmentation.} } @INPROCEEDINGS{0748Glasberg2005, AUTHOR = {Ronald Glasberg and Khalid Elazouzi and Thomas Sikora}, TITLE = {Cartoon-Recognition using Visual-Descriptors and a Multilayer-Perceptron}, BOOKTITLE = {WIAMIS}, YEAR = {2005}, MONTH = apr, ADDRESS = {Montreux}, PDF = {http://elvera.nue.tu-berlin.de/files/0748Glasberg2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0748Glasberg2005.pdf}, ABSTRACT = {sequences as ‘cartoon’ or ‘non-cartoon’ by analyzing specific color, texture and motion features of consecutive frames in real-time. This is part of the well-known videogenre-classification problem, where popular TVbroadcast genres like cartoon, commercial, music, news and sports are studied. Such applications have also been discussed in the context of MPEG-7 [12]. In our method the extracted features from the visual descriptors are nonlinear weighted with a sigmoid-function and afterwards combined using a multilayered perceptron to produce a reliable recognition. The results demonstrate a high identification rate based on a large collection of 200 representative video sequences (40 cartoons and 4*40 non-cartoons) gathered from free digital TV-broadcasting in Germany.} } @INPROCEEDINGS{0746Liebchen2005, AUTHOR = {Tilman Liebchen and Y. Reznik}, TITLE = {Improved Forward-Adaptive Prediction for MPEG-4 Audio Lossless Coding}, BOOKTITLE = {118th AES Convention}, YEAR = {2005}, MONTH = may, ADDRESS = {Barcelona}, NOTE = {Y. 
Reznik: RealNetworks Inc.}, PDF = {http://elvera.nue.tu-berlin.de/files/0746Liebchen2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0746Liebchen2005.pdf}, ABSTRACT = {MPEG-4 Audio Lossless Coding (ALS) is a new addition to the suite of MPEG-4 audio coding standards. The ALS codec is based on forward-adaptive linear prediction, which offers remarkable compression even with low predictor orders. Nevertheless, performance can be significantly improved by using higher predictor orders, more efficient quantization and encoding of the predictor coefficients, and adaptive block length switching. The paper describes the basic elements of the ALS codec with a focus on these recent improvements. It also presents the latest developments in the standardization process and describes several important applications of this new lossless audio format in practice.} } @INPROCEEDINGS{0745Kim2005, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Gaussian Scale-space Dense Disparity Estimation with Anisotropic Disparity-field Diffusion}, BOOKTITLE = {IEEE Int. Conf. on 3-D Digital Imaging and Modeling (3DIM '05)}, YEAR = {2005}, MONTH = jun, ADDRESS = {Ottawa, Ontario, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/0745Kim2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0745Kim2005.pdf}, ABSTRACT = {We present a new reliable dense disparity estimation algorithm which employs Gaussian scale-space with anisotropic disparity-field diffusion. This algorithm estimates edge-preserving dense disparity vectors using a diffusive method on iteratively Gaussianfiltered images with a scale, i.e. the Gaussian scalespace.While a Gaussian filter kernel generates a coarser resolution from stereo image pairs, only strong and meaningful boundaries are adaptively selected on the resolution of the filtered images. Then, coarse global disparity vectors are initialized using the boundary constraint. 
The per-pixel disparity vectors are iteratively obtained by the local adjustment of the global disparity vectors using an energy-minimization framework. The proposed algorithm preserves the boundaries while inner regions are smoothed using anisotropic disparity-field diffusion.In this work, the Gaussian scale-space efficiently avoids illegal matching on a large baseline by the restriction of the range. Moreover, it prevents the computation from iterating into local minima of illposed diffusion on large gradient areas e.g. shadow and texture region, etc. The experimental results prove the excellent localization performance preserving the disparity discontinuity of each object. } } @INBOOK{1048Schreer2005, TITLE = {3D Videocommunication: Algorithms, concepts and real-time systems in human centred communication}, YEAR = {2005}, EDITOR = {Schreer, Oliver and Kauff, Peter and Sikora, Thomas}, PUBLISHER = {John Wiley \& Sons}, PAGES = {364}, NOTE = {ISBN: 978-0-470-02271-9}, DOI = {10.1002/0470022736}, ABSTRACT = {The migration of immersive media towards telecommunication applications is advancing rapidly. Impressive progress in the field of media compression, media representation, and the larger and ever increasing bandwidth available to the customer, will foster the introduction of these services in the future. One of the key components for the envisioned applications is the development from two-dimensional towards three-dimensional audio-visual communications. 
With contributions from key experts in the field, 3D Videocommunication: * provides a complete overview of existing systems and technologies in 3D video communications and provides guidance on future trends and research; * considers all aspects of the 3D videocommunication processing chain including video coding, signal processing and computer graphics; * focuses on the current state-of-the-art and highlights the directions in which the technology is likely to move; * discusses in detail the relevance of 3D videocommunication for telepresence systems and immersive media; and * provides an exhaustive bibliography for further reading. Researchers and students interested in the field of 3D audio-visual communications will find 3D Videocommunication a valuable resource, covering a broad overview of the current state-of-the-art. Practical engineers from industry will also find it a useful tool in envisioning and building innovative applications.} } @INPROCEEDINGS{0744Karaman2005, AUTHOR = {Mustafa Karaman and Lutz Goldmann and Da Yu and Thomas Sikora}, TITLE = {Comparison of Static Background Segmentation Methods}, BOOKTITLE = {Visual Communications and Image Processing (VCIP '05)}, YEAR = {2005}, MONTH = jul, ADDRESS = { Beijing, China}, PDF = {http://elvera.nue.tu-berlin.de/files/0744Karaman2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0744Karaman2005.pdf}, ABSTRACT = {In the case of a static or motion compensated camera, static background segmentation methods can be applied to segment the interesting foreground objects from the Background.Although a lot of methods have been proposed,a general assessment of the state of the art is not available. An important issue is to compare various state of the art methods in terms of quality (accuracy) and computational complexity (time and memory consumption).A representative set of recent techniques is chosen, implemented and compared to each other. An extensive set of videos is used to achieve comprehensive results. 
Both indoor and outdoor videos with different environmental conditions are used. While visual analysis is used for subjective assessment of the quality, pixel based measures based on available ground truth data are used for the objective assessment. Furthermore the computational complexity is estimated by measuring the elapsed time and memory requirements of each algorithm. The paper summarizes the experiments and considers the assets and drawbacks of the various techniques. Moreover, it will give hints for selecting the optimal approach for a specific environment and directions for further research in this field.} } @INPROCEEDINGS{0741Moreau2005, AUTHOR = {Nicolas Moreau and Shan Jin and Thomas Sikora}, TITLE = {Comparison of Different Phone-based Spoken Document Retrieval Methods with Text and Spoken Queries}, BOOKTITLE = {ISCA 9th European Conference on Speech Communication and Technology, Interspeech 2005-Eurospeech}, YEAR = {2005}, MONTH = sep, ADDRESS = {Lisbon, Portugal}, PDF = {http://elvera.nue.tu-berlin.de/files/0741Moreau2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0741Moreau2005.pdf}, ABSTRACT = {This study compares four phone-based spoken document retrieval (SDR) approaches. In all cases, the indexing and retrieval system uses phonetic information only. The first retrieval method is based on the vector space model, using phone 3-grams as indexing terms. This approach is compared with 2 string-matching methods. A fourth method, combining the VSM approach with the slot detection step of string-matching techniques is proposed. 
This method is tested on a collection of short German spoken documents, using three different sets of queries: text queries, clean spoken queries and noisy spoken queries.} } @INPROCEEDINGS{0742Clüver2005, AUTHOR = {Kai Clüver and Thomas Sikora}, TITLE = {Multiple-Description Coding of Logarithmic PCM}, BOOKTITLE = {EUSIPCO}, YEAR = {2005}, MONTH = sep, ADDRESS = {Antalya, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0742Cluever2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0742Cluever2005.pdf}, ABSTRACT = {A practical approach for the design of multiple-description scalar quantization of speech is presented that conforms to standard G.711 PCM. The method chiefly consists of an index assignment algorithm that enables the side decoders to exhibit SNR characteristics comparable to those of the standard logarithmic quantizer. With two-channel transmission of multiple descriptions, an increase in robustness to lossy channels is obtained without violation of the standard coding method. The method found is suitable for the design of multiple descriptions of any given scalar quantizer, e. g. one within a complex speech coder.} } @INPROCEEDINGS{0743Glasberg2005, AUTHOR = {Ronald Glasberg and Amjad Samour and Khalid Elazouzi and Thomas Sikora}, TITLE = {Cartoon-Recognition using Video & Audio-Descriptors}, BOOKTITLE = {EUSIPCO}, YEAR = {2005}, MONTH = sep, ADDRESS = {Antalya, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0743Glasberg2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0743Glasberg2005.pdf}, ABSTRACT = {We present a new approach for classifying mpeg-2 video sequences as ‘cartoon’ or ‘non-cartoon’ by analyzing specific video and audio features of consecutive frames in real-time. This is part of the well-known video-genre-classification problem, where popular TV-broadcast genres like cartoon, commercial, music, news and sports are studied. Such applications have also been discussed in the context of MPEG-7 [12]. 
In our method the extracted features from the visual descriptors are non-linearly combined using a multilayered perceptron and then considered together with the output of the audio-descriptor to produce a reliable recognition. The results demonstrate a high identification rate based on a large collection of 100 representative video sequences (20 cartoons and 4*20 non-cartoons) gathered from free digital TV-broadcasting.} } @INPROCEEDINGS{0740Kim2005, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Hybrid Recursive Energy-based Method for Robust Optical Flow on Large Motion Fields}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP '05)}, YEAR = {2005}, MONTH = sep, ADDRESS = {Genova, Italy}, PDF = {http://elvera.nue.tu-berlin.de/files/0740Kim2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0740Kim2005.pdf}, ABSTRACT = {Abstract—We present a new reliable hybrid recursive method for optical flow estimation. The method efficiently combines the advantage of discrete motion estimation and optical flow estimation in a recursive block-to-pixel estimation scheme. Integrated local and global approaches using the robust statistic of anisotropic diffusion remove outliers from the estimated motion field. We separately describe the process with two frameworks i.e. an incremental updating framework and a robust energy minimization framework. With robust error norms of Perona and Marik anisotropic diffusion, the formulation usually leads to non-convex optimization problems. Thus, the solution has many local minima, and convergence to the global minima is not guaranteed. Our hybrid recursive energy-based method employs a hierarchical block-to-pixel estimation concept to prevent this problem. The experimental results prove the excellent performance on several large motion fields.} } @INPROCEEDINGS{0739Ekmekci2005, AUTHOR = {Sila Ekmekci and Thomas Sikora and P. 
Frossard}, TITLE = {Coding with Temporal Layers or Multiple Descriptions for Lossy Video Transmission}, BOOKTITLE = {International Workshop VLBV05}, YEAR = {2005}, MONTH = sep, ADDRESS = {Costa Rei, Sardinia, Italy}, NOTE = {Sila Ekmekci, P. Frossard: EPFL Lausanne}, PDF = {http://elvera.nue.tu-berlin.de/files/0739Ekmekci2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0739Ekmekci2005.pdf}, ABSTRACT = {In this paper, we compare temporal layered coding (TLC), as well as single-state coding (SSC), to multi-state video coding (MSVC) in the context of lossy video communications. MSVC is a MDC scheme where the video is coded into multiple independently decodable streams each with its own prediction process and state. The performance of these three coding schemes are analyzed at different loss rates and coding options, under the assumption that each packet contains the complete coded data for a frame, and the total bit rate is kept constant. To substitute the lost frames, MSVC employs state recovery based on motion compensated frame interpolation, whereas SSC and TLC repeat the last received frame. Results show that MSVC outperforms SSC and TLC for high motion sequences, and also for low motion sequences at high loss probabilities, due to increased state recovery ability of the system. Additionally, if one of the parallel channels of MSVC is in bad condition, unbalanced MSVC that allocates less bit rate to this channel, becomes favorable.
Finally, increased error resilience with intra-GOB or frame update improves the system performance for high motion sequences at high loss rates, whereas for low motion sequences, intra updates are disadvantageous due to the penalty on the source coding quality.} } @INPROCEEDINGS{0738Lerch2005, AUTHOR = {Alexander Lerch and Gunnar Eisenberg and Koen Tanghe}, TITLE = {FEAPI: A Low Level Feature Extraction Plugin API}, BOOKTITLE = {8th International Conference on Digital Audio Effects (DAFx)}, YEAR = {2005}, MONTH = sep, ADDRESS = {Madrid, Spain}, PDF = {http://elvera.nue.tu-berlin.de/files/0738Lerch2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0738Lerch2005.pdf}, ABSTRACT = {This paper presents FEAPI, an easy-to-use platform-independent plugin application programming interface (API) for the extraction of low level features from audio in PCM format in the context of music information retrieval software. The need for and advantages of using an open and well-defined plugin interface are outlined in this paper and an overview of the API itself and its usage is given.} } @INBOOK{1047Kim2005, AUTHOR = {Hyoung-Gook Kim and Nicolas Moreau and Thomas Sikora}, TITLE = {MPEG-7 Audio and Beyond: Audio Content Indexing and Retrieval}, YEAR = {2005}, PUBLISHER = {John Wiley \& Sons}, PAGES = {304}, NOTE = {ISBN: 978-0-470-09334-4}, DOI = {10.1002/0470093366}, ABSTRACT = {Advances in technology, such as MP3 players, the Internet and DVDs, have led to the production, storage and distribution of a wealth of audio signals, including speech, music and more general sound signals and their combinations. MPEG-7 audio tools were created to enable the navigation of this data, by providing an established framework for effective multimedia management.
MPEG-7 Audio and Beyond: Audio Content Indexing and Retrieval is a unique insight into the technology, covering the following topics: * the fundamentals of MPEG-7 audio, principally low-level descriptors and sound classification and similarity; * spoken content description, and timbre, melody and tempo music description tools; * existing MPEG-7 applications and those currently being developed; * examples of audio technology beyond the scope of MPEG-7. Essential reading for practising electronic and communications engineers designing and implementing MPEG-7 compliant systems, this book will also be a useful reference for researchers and graduate students working with multimedia database technology.} } @INPROCEEDINGS{0737Liebchen2005, AUTHOR = {Tilman Liebchen and T. Moriya and N. Harada and Y. Kamamoto and Y. Reznik}, TITLE = {The MPEG-4 Audio Lossless Coding (ALS) Standard - Technology and Applications}, BOOKTITLE = {119th AES Convention}, YEAR = {2005}, MONTH = oct, ADDRESS = {New York}, NOTE = {T. Moriya, N. Harada, Y. Kamamoto: NTT Communication Science Labs; Y. Reznik: RealNetworks Inc.}, PDF = {http://elvera.nue.tu-berlin.de/files/0737Liebchen2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0737Liebchen2005.pdf}, ABSTRACT = {MPEG-4 Audio Lossless Coding (ALS) is a new extension of the MPEG-4 audio coding family. The ALS core codec is based on forward-adaptive linear prediction, which offers remarkable compression together with low complexity. Additional features include long-term prediction, multichannel coding, and compression of floating-point audio material. In this paper authors who have actively contributed to the standard describe the basic elements of the ALS codec with a focus on prediction, entropy coding, and related tools. We also present latest developments in the standardization process and point out the most important applications of this new lossless audio format.} } @INPROCEEDINGS{0733Benetos2005, AUTHOR = {E. Benetos and M. Kotti and C. 
Kotropoulos and Juan José Burred and Gunnar Eisenberg and Martin Haller and Thomas Sikora}, TITLE = {Comparison of Subspace Analysis-Based and Statistical Model-Based Algorithms for Musical Instrument Classification}, BOOKTITLE = {2nd Workshop on Immersive Communication and Broadcast Systems (ICOB '05)}, YEAR = {2005}, MONTH = oct, ADDRESS = {Berlin, Germany}, NOTE = {E. Benetos, M. Kotti, C. Kotropoulos: Aristotle University of Thessaloniki}, PDF = {http://elvera.nue.tu-berlin.de/files/0733Benetos2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0733Benetos2005.pdf}, ABSTRACT = {In this paper, three classes of algorithms for automatic classification of individual musical instrument sounds are compared. The first class of classifiers is based on Non-negative Matrix Factorization, the second class of classifiers employs automatic feature selection and Gaussian Mixture Models and the third is based on continuous Hidden Markov Models. Several perceptual features used in general sound classification as well as MPEG-7 basic spectral and spectral basis descriptors were measured for 300 sound recordings consisting of 6 different musical instrument classes (piano,violin, cello, flute, bassoon, and soprano saxophone) from the University of Iowa database. The audio files were split using 70% of the available data for training and the remaining 30% for testing. Experimental results are presented to compare the classifier performance. The results indicate that all algorithm classes offer an accuracy of over 95% that outperforms the state-of-the-art performance reported for the aforementioned experiment.} } @INPROCEEDINGS{0735Tola2005, AUTHOR = {Engin Tola and Sebastian Knorr and Evren Imre and Aydin A. 
Alatan and Thomas Sikora}, TITLE = {Structure from Motion in Dynamic Scenes with Multiple Motions}, BOOKTITLE = {2nd Workshop On Immersive Communication and Broadcast Systems (ICOB '05)}, YEAR = {2005}, MONTH = oct, ADDRESS = {Berlin, Germany}, NOTE = {Engin Tola, Evren Imre, Aydin A. Alatan: Middle East Technical University, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0735Tola2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0735Tola2005.pdf}, ABSTRACT = {In this study, an algorithm is proposed to solve the multibody structure from motion (SfM) problem for the single camera case. The algorithm uses the epipolar criterion to segment the features belonging to independently moving objects. Once the features are segmented, corresponding objects are reconstructed individually by applying a sequential algorithm, which uses the previous structure to estimate the pose of the current frame. A tracker is utilized to increase the baseline and improve the F-matrix estimation, which is beneficial for both segmentation and 3D structure estimation. The experimental results on synthetic and real data demonstrate that our approach efficiently deals with the multi-body SfM problem.} } @INPROCEEDINGS{0736Kim2005, AUTHOR = {Jang-Heon Kim and Matthias Kunter and Thomas Sikora}, TITLE = {Depth Diffusion Objects (DeDiO) - A Seamless Object-based Approach for TV Applications}, BOOKTITLE = {2nd Workshop on Immersive Communication and Broadcast Systems (ICOB '05)}, YEAR = {2005}, MONTH = oct, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/0736Kim2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0736Kim2005.pdf}, ABSTRACT = {This paper proposes a novel and seamless object-based method for visual contents fusion called “Depth Diffusion Object (DEDIO)”. It employs depth-based object segmentation in multiview scenes and allows the composition of new scenes conveying a natural image impression. 
Using spatial anisotropic diffusion, the homogeneous regions of a scene which have similar depth are smoothly regularized following the spatial variation while discontinuities are preserved. The object’s shape is automatically extracted by the depth range. In order to fuse the object with new background images we use membrane harmonizing applying directional constrained diffusion to remove visual seams and blend object, i.e. the remaining background area in the segmented object near the shape boundary. Our system automatically maintains the seamless object composition quality, even if the object is merged with new scenes, having different lightening conditions.} } @INPROCEEDINGS{0986Goldmann2005, AUTHOR = {Lutz Goldmann and M. Krinidis and N. Nikolaidis and S. Asteriadis and Thomas Sikora}, TITLE = {An Integrated System for Face Detection and Tracking}, BOOKTITLE = {2nd Workshop on Immersive Communication and Broadcast Systems (ICOB '05)}, YEAR = {2005}, MONTH = oct, PDF = {http://elvera.nue.tu-berlin.de/files/0986Goldmann2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0986Goldmann2005.pdf} } @INPROCEEDINGS{1059Flierl2005, AUTHOR = {Sila Ekmekci Flierl and Thomas Sikora}, TITLE = {Multi-State Video Coding with Side Information}, BOOKTITLE = {Proc. of the Thirty-Ninth Asilomar Conference on Signals, Systems and Computers}, YEAR = {2005}, MONTH = oct, PAGES = {874--878}, ADDRESS = {Pacific Grove, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1059Flierl2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1059Flierl2005.pdf}, ABSTRACT = {Multi-State Video Coding (MSVC) is a multiple description scheme where the video is splitted into two or more subsequences. Each subsequence is encoded and transmitted separately and can be decoded independently. The prediction gain decreases due to sequence splitting but error resilience of the system increases since reconstruction capabilities improve. 
The lost frames in one subsequence are reconstructed by using state recovery, i.e., interpolation of the past and/ future frames from the other subsequence. Unbalanced Quantized MSVC is realized by using the same scheme but coding the subsequences with different quantization stepsizes yielding different bitrates. The advantage of unbalanced operation is the increased system performance in case of unbalanced transmission channel characteristics. In our previous work, we proposed an advanced reconstruction algorithm to support the unbalanced coding of the subsequences: State recovery is not only used for the lost frames but also for received frames when state recovery yields a higher frame PSNR than using the received packet and applying motion compensation. But to figure out which reconstruction method gives a higher frame PSNR a comparison with the original sequence is necessary. Therefore the method is applicable at the decoder only if a feedback mechanism between the encoder and decoder is present. In this work, we present an alternative way, MSVC with side information (MSVCSI), for guiding the optimized reconstruction stategy by estimating the reliabilities of several possible reconstruction alternatives. The reliabilty values are calculated recursively for each frame using the loss history of the frames and the side information representing the specific sequence characteristics. We show that under unbalanced transmission conditions, MSVCSI outperforms the original MSVC method (Approach 1) and the advanced MSVC (Approach 2) upto 1 dB depending on the loss rates of the transmission channels. The gain increases as the loss rates and the unbalance rate increase.} } @INPROCEEDINGS{0731Burred2005, AUTHOR = {Juan José Burred and Thomas Sikora}, TITLE = {On the Use of Auditory Representations for Sparsity-Based Sound Source Separation}, BOOKTITLE = {IEEE Fifth Int. Conf. 
on Information, Communications and Signal Processing (ICICS '05)}, YEAR = {2005}, MONTH = dec, ADDRESS = {Bangkok, Thailand}, PDF = {http://elvera.nue.tu-berlin.de/files/0731Burred2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0731Burred2005.pdf}, ABSTRACT = {Sparsity-based source separation algorithms often rely on a transformation into a sparse domain to improve mixture disjointness and therefore facilitate separation. To this end, the most commonly used time-frequency representation has been the Short Time Fourier Transform (STFT). The purpose of this paper is to study the use of auditory-based representations instead of the STFT. We first evaluate the STFT disjointness properties for the case of speech and music signals, and show that auditory representations based on the Equal Rectangular Bandwidth (ERB) and Bark frequency scales can improve the disjointness of the transformed mixtures.} } @INPROCEEDINGS{0732Kunter2005, AUTHOR = {Matthias Kunter and Jang-Heon Kim and Thomas Sikora}, TITLE = {Super-resolution Mosaicing using Embedded Hybrid Recursive Flow-based Segmentation}, BOOKTITLE = {IEEE Fifth Int. Conf. on Information, Communications and Signal Processing (ICICS '05)}, YEAR = {2005}, MONTH = dec, ADDRESS = { Bangkok, Thailand}, PDF = {http://elvera.nue.tu-berlin.de/files/0732Kunter2005.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0732Kunter2005.pdf}, ABSTRACT = {We present a new strategy for the generation of background super-resolution mosaics from videos with arbitrary camera pan, tilt, and zoom including freely moving foreground. Our main focus is directed to the automatic, embedded Presegmentation of foreground objects. The segmentation technique is based on efficient and robust computation of the optical flow between neighboring frames in a video scene using a hybrid recursive approach, i.e. a combination of block-flow methods and spatial-temporal anisotropic diffusion-based flow field regularization. 
Unlike in other related publications we are able to segment moving foreground objects before the actual image-tomosaic- registration is proceeded even if the foreground objects do not move relatively to the camera motion. Thus, every segmented background frame can be used to enhance the resolution of the composed mosaic due to an effective blending process. Additionally, the appearance of disturbing ghost objects is prevented.} } @ARTICLE{0996Flierl2006, AUTHOR = {Sila Ekmekci Flierl and Thomas Sikora and Pascal Frossard}, TITLE = {Unbalanced Quantized Multiple State Video Coding}, JOURNAL = {EURASIP Journal on Applied Signal Processing}, YEAR = {2006}, PAGES = {1--10}, VOLUME = {2006}, NOTE = {Received 18 March 2005; Revised 24 August 2005; Accepted 12 September 2005}, PDF = {http://elvera.nue.tu-berlin.de/files/0996Flierl2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0996Flierl2006.pdf}, ABSTRACT = {Multiple state video coding (MSVC) is a multiple description scheme based on frame-wise splitting of the video sequence into two or more subsequences. Each subsequence is encoded separately to generate descriptions which can be decoded independently. Due to subsequence splitting, the prediction gain decreases. But since reconstruction capabilities improve, error resilience of the system increases. Our focus is on multiple state video coding with unbalanced quantized descriptions, which is particularly interesting for video streaming applications over heterogeneous networks where path diversity is used and transmission channels have varying transmission characteristics. The total bitrate is kept constant, while the subsequences are quantized with different stepsizes depending on the sequence as well as on the transmission conditions. Our goal is to figure out under which transmission conditions unbalanced bitstreams lead to good system performance in terms of the average reconstructed PSNR. 
Besides, we investigate the effects of intra-coding on the error resilience of the system and show that the sequence characteristics, and in particular the degree of motion in the sequence, have an important impact on the decoding performance. Finally, we propose a distortion model that is the core of an optimized rate allocation strategy, which is dependent on the network characteristics and status as well as on the video sequence characteristics.} } @INPROCEEDINGS{0041Goldmann2006, AUTHOR = {Lutz Goldmann and Ullrich Moenich and Thomas Sikora}, TITLE = {Robust Face Detection Based on Components and Their Topology}, BOOKTITLE = {Visual Communications and Image Processing (VCIP), IS\&T/SPIE's Electronic Imaging 2006}, YEAR = {2006}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, DOI = {10.1117/12.643078}, ABSTRACT = {This paper presents a novel approach for automatic and robust object detection. It utilizes a component-based approach that combines techniques from both statistical and structural pattern recognition domain. While the component detection relies on Haar-like features and an AdaBoost trained classifier cascade, the topology verification is based on graph matching techniques. The system was applied to face detection and the experiments show its outstanding performance in comparison to other face detection approaches.
Especially in the presence of partial occlusions, uneven illumination and out-of-plane rotations it yields higher robustness.} } @INPROCEEDINGS{0040Karaman2006, AUTHOR = {Mustafa Karaman and Lutz Goldmann and Thomas Sikora}, TITLE = {A New Segmentation Approach Using Gaussian Color Model and Temporal Information}, BOOKTITLE = {Visual Communications and Image Processing (VCIP), IS\&T/SPIE's Electronic Imaging 2006}, YEAR = {2006}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0040Karaman2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0040Karaman2006.pdf}, ABSTRACT = {A new segmentation approach usable for fixed or motion compensated camera is described. Instead of the often used RGB color space we operate with the invariant Gaussian color model proposed by Geusebroek and temporal information which eliminates unsteady regions surrounded by the moving objects. The Gaussian color model has never been used in video segmentation. Comparison with some state of the art methods in which both subjective and objective evaluation are applied prove the good performance of the proposed method.} } @INPROCEEDINGS{0039Kim2006, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Anisotropic Scene Geometry Resampling with Occlusion Filling for 3DTV Applications}, BOOKTITLE = {Conf. on Stereoscopic Displays and Applications XVII, IS\&T/SPIE's Electronic Imaging 2006}, YEAR = {2006}, MONTH = jan, ORGANIZATION = {IS\&T/SPIE's Electronic Imaging, IS\&T}, ADDRESS = {San Jose, CA, USA}, URL = {http://bookstore.spie.org/index.cfm?fuseaction=detailpaper&cachedsearch=1&productid=642268&producttype=pdf&CFID=3001076&CFTOKEN=75227483}, ABSTRACT = {Image and video-based rendering technologies are receiving growing attention due to their photo-realistic rendering capability in free-viewpoint. However, two major limitations are ghosting and blurring due to their sampling-based mechanism.
The scene geometry which supports to select accurate sampling positions is proposed using global method (i.e. approximate depth plane) and local method (i.e. disparity estimation). This paper focuses on the local method since it can yield more accurate rendering quality without large number of cameras. The local scene geometry has two difficulties which are the geometrical density and the uncovered area including hidden information. They are the serious drawback to reconstruct an arbitrary viewpoint without aliasing artifacts. To solve the problems, we propose anisotropic diffusive resampling method based on tensor theory. Isotropic low-pass filtering accomplishes anti-aliasing in scene geometry and anisotropic diffusion prevents filtering from blurring the visual structures. Apertures in coarse samples are estimated following diffusion on the pre-filtered space, the nonlinear weighting of gradient directions suppresses the amount of diffusion. Aliasing artifacts from low density are efficiently removed by isotropic filtering and the edge blurring can be solved by the anisotropic method at one process. Due to difference size of sampling gap, the resampling condition is defined considering causality between filter-scale and edge. Using partial differential equation (PDE) employing Gaussian scale-space, we iteratively achieve the coarse-to-fine resampling. In a large scale, apertures and uncovered holes can be overcoming because only strong and meaningful boundaries are selected on the resolution. The coarse-level resampling with a large scale is iteratively refined to get detail scene structure. Simulation results show the marked improvements of rendering quality.} } {1105Liebchen2006, } {1074Burred2006, } @INPROCEEDINGS{1005Imre2006, AUTHOR = {Evren Imre and Sebastian Knorr and A. 
Aydın Alatan and Thomas Sikora}, TITLE = {Dinamik sahneler için önceliklendirilmiş sıralı 3B geri çatım}, BOOKTITLE = {SIU 2006}, YEAR = {2006}, MONTH = apr, ADDRESS = {Antalya}, NOTE = {Evren İmre, A. Aydın Alatan: Middle East Technical University, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/1005Imre2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1005Imre2006.pdf}, ABSTRACT = {In this study, an algorithm is proposed to solve the multiframe structure from motion (MFSfM) problem for monocular video sequences in dynamic scenes. The algorithm uses the epipolar criterion to segment the features belonging to the independently moving objects. Once the features are segmented, the corresponding objects are reconstructed individually by using a sequential algorithm, which is also capable prioritizing the frame pairs with respect to their reliability and information content, thus achieving a fast and accurate reconstruction through efficient processing of the available data. A tracker is utilized to increase the baseline distance between views and to improve the F-matrix estimation, which is beneficial to both the segmentation and the 3D structure estimation processes. The experimental results demonstrate that our approach has the potential to effectively deal with the multi-body MFSfM problem in a generic video sequence.} } @INPROCEEDINGS{0976Kim2006, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Color Image Noise Reduction using Perceptual Maximum Variation Modeling for Color Diffusion}, BOOKTITLE = {7th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Incheon, South Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0976Kim2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0976Kim2006.pdf}, ABSTRACT = {Diffusion is an efficient localized image regularization method based on the analysis of image structures such as direction and magnitude. 
However, the localization at weak features which have small brightness variations is fundamentally difficult. This often results in removal of weak features. We address this problem with perceptual maximum variation modeling. In our method, diffusion flow of color images is performed by evaluating the perceptual maximum variations which combine the small differences in both brightness and chromaticity, using a least squares optimization with principal component analysis (PCA). A consistency constraint is employed to avoid influence from global color distributions and to enhance homogeneous color regions. We apply our approach for de-noising of color images and obtain excellent improvements over existing methods.} } @INPROCEEDINGS{0987Goldmann2006, AUTHOR = {Lutz Goldmann and Mustafa Karaman and J. T. Saez Minquez and Thomas Sikora}, TITLE = {Appearance-Based Person Recognition for Surveillance Applications}, BOOKTITLE = {7th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Incheon, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0987Goldmann2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0987Goldmann2006.pdf}, ABSTRACT = {This paper presents an original system for recognizing persons based on their appearance. Thus, it is especially suitable to surveillance scenarios, where biometric information might not be available. Different visual low level features in combination with different supervised learning methods are examined in order to build a robust system. Furthermore, complementary features are fused using postmapping fusion concepts to improve the reliability.
The experiments show that the system is able to distinguish a large number of people and can be used for different applications.} } @INPROCEEDINGS{0988Goldmann2006, AUTHOR = {Lutz Goldmann and Amjad Samour and Thomas Sikora}, TITLE = {Multimodal Analysis for Universal Smart Room Applications}, BOOKTITLE = {7th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Incheon, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0988Goldmann2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0988Goldmann2006.pdf}, ABSTRACT = {This paper presents a multimodal system incorporating smart room technologies (SRT) for conference room applications. Although the audio-visual analysis requires only rather basic equipment, the system works reliably and supports various applications such as recognizing persons using different modalities, localizing visible speakers, controlling the camera view and summarizing the AV data.} } @INPROCEEDINGS{0997Kunter2006, AUTHOR = {Matthias Kunter and Thomas Sikora}, TITLE = {Super-resolution Mosaicing for Object based Video Format Conversion}, BOOKTITLE = {7th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Incheon, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/0997Kunter2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0997Kunter2006.pdf}, ABSTRACT = {This paper presents a new approach to spatial upsampling of digital video based on super-resolution mosaics. First, we robustly generate a background mosaic of higher resolution than the original video. In order to achieve that goal, we apply hierarchical global image registration estimating an optimal parabolic parameter set for each view of a scene shot.
The final mosaic is generated using statistical and projection grid distance measures to avoid the impact of foreground objects and to accomplish super-resolution respectively. Second, arbitrarily moving foreground objects are segmented using MRF-based change detection methods based on the calculated mosaic. For the foreground objects an optical flow field between adjacent frames is computed. Third, we create new views with higher spatial resolution fusing re-projected background content from the mosaic together with super-resolution foreground objects obtained using optical flow field calculation. Results show that this method is able to convert videos into higher spatial resolution with very high objective and subjective quality.} } @INPROCEEDINGS{1013Glasberg2006, AUTHOR = {Ronald Glasberg and Cengiz Tas and Thomas Sikora}, TITLE = {Commercial-Recognition for MPEG-2 Streams in Real-Time using Three Visual Descriptors}, BOOKTITLE = {7th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Incheon, Korea}, PDF = {http://elvera.nue.tu-berlin.de/files/1013Glasberg2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1013Glasberg2006.pdf} } @INPROCEEDINGS{1104Onural2006, AUTHOR = {L. Onural and T. Sikora and J. Ostermann and A. Smolic and M. R. Civanlar and J. 
Watson}, TITLE = {An Assessment of {3DTV} Technologies}, BOOKTITLE = {NAB Broadcast Engineering Conference Proceedings 2006, Las Vegas, USA}, YEAR = {2006}, MONTH = apr, PAGES = {456--467}, URL = {http://www.3dtv-research.org/publicDocs/showCase/onural.html} } @INPROCEEDINGS{0978Belkoura2006, AUTHOR = {Zouhair Belkoura and Thomas Sikora}, TITLE = {Towards Rate-Decoder Complexity Optimisation in Turbo-Coder based Distributed Video Coding}, BOOKTITLE = {Picture Coding Symposium (PCS 2006)}, YEAR = {2006}, MONTH = apr, ADDRESS = {Beijing}, PDF = {http://elvera.nue.tu-berlin.de/files/0978Belkoura2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0978Belkoura2006.pdf}, ABSTRACT = {Conventional hybrid video coding such as H.264 is compared to turbo-coder based Distributed Video Coding (DVC) from a complexity point of view. It is shown here that the overall workload in DVC can exceed that of H.264 by a substantial amount. Hence, DVC has the advantage of low-complexity encoding but at the price of high-complexity decoding, exceeding the encoder complexity of H.264. Given a turbo-coder based DVC setup, this work introduces a method to vary and possibly optimise decoder complexity while keeping encoder burden fixed at a low level. Using operational curves of the channel coding tools used in the DVC, a relation between bitrate and decoder complexity is given. It is demonstrated that in certain regions of these operational curves, large reductions in computations can be traded against relatively small increases in bitrate. Moreover, a variation of channel coder memory has influence on decoder complexity. 
Together with expected prediction error-rates, this permits to select an optimal operation point for given overall constraints.} } @INPROCEEDINGS{1042Batke2006, AUTHOR = {Jan-Mark Batke and Gunnar Eisenberg}, TITLE = {Evaluation of Query-by-Humming Systems using a Random Melody Database}, BOOKTITLE = {120th AES Convention}, YEAR = {2006}, MONTH = may, ADDRESS = {Paris, France}, PDF = {http://elvera.nue.tu-berlin.de/files/1042Batke2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1042Batke2006.pdf}, ABSTRACT = {The performance of melody retrieval using a query-by-humming (QBH) system depends on different parameters. For the query, parameters like length of the query and possibly contained errors influence the success of the retrieval. But also the size of the melody database inside the QBH-system has a certain impact on the query. This paper describes how the statistical parameters of a random melody database are modelled to get the same behaviour like a database containing authentic melodies. Databases containing random melodies are a testing facility to QBH-systems.} } {1044Sikora2006, } @INPROCEEDINGS{0989Knorr2006, AUTHOR = {Sebastian Knorr and Evren İmre and Burak Özkalaycı and A. Aydın Alatan and Thomas Sikora}, TITLE = {A Modular Scheme for 2D/3D Conversion of TV Broadcast}, BOOKTITLE = {3rd International Symposium on 3D Data Processing, Visualization, and Transmission (3DPVT'06)}, YEAR = {2006}, MONTH = jun, ADDRESS = {Chapel Hill, USA}, NOTE = {E. İmre, B. Özkalaycı, A. A. Alatan: Middle East Technical University, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0989Knorr2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0989Knorr2006.pdf}, ABSTRACT = {The 3D reconstruction from 2D broadcast video is a challenging problem with many potential applications, such as 3DTV, free-viewpoint video or augmented reality. In this paper, a modular system capable of efficiently reconstructing 3D scenes from broadcast video is proposed. 
The system consists of four constitutive modules: tracking and segmentation, self-calibration, sparse reconstruction and, finally, dense reconstruction. This paper also introduces some novel approaches for moving object segmentation and sparse and dense reconstruction problems. According to the simulations for both synthetic and real data, the system achieves a promising performance for typical TV content, indicating that it is a significant step towards the 3D reconstruction of scenes from broadcast video.} } @INPROCEEDINGS{1001Drelie2006, AUTHOR = {Elisa Gelasca Drelie and Mustafa Karaman and Tourajd Ebrahimi and Thomas Sikora}, TITLE = {A Framework for Evaluating Video Object Segmentation Algorithms}, BOOKTITLE = {CVPR 2006 Workshop (Perceptual Organization in Computer Vision POCV)}, YEAR = {2006}, MONTH = jun, EDITOR = {IEEE}, PAGES = {198--198}, ORGANIZATION = {IEEE}, ADDRESS = {New York}, PDF = {http://elvera.nue.tu-berlin.de/files/1001Drelie2006.pdf}, DOI = {10.1109/CVPRW.2006.15}, URL = {http://elvera.nue.tu-berlin.de/files/1001Drelie2006.pdf}, ABSTRACT = {Segmentation of moving objects in image sequences plays an important role in video processing and analysis. Evaluating the quality of segmentation results is necessary to allow the appropriate selection of segmentation algorithms and to tune their parameters for optimal performance. Many segmentation algorithms have been proposed along with a number of evaluation criteria. Nevertheless, no psychophysical experiments evaluating the quality of different video object segmentation results have been conducted. In this paper, a generic framework for segmentation quality evaluation is presented. A perceptually driven automatic method for segmentation evaluation is proposed and compared against an existing approach. 
Moreover, on the basis of subjective results, perceptual factors are introduced into the novel objective metric to meet the specificity of different segmentation applications such as video compression. Experimental results confirm the efficiency of the proposed evaluation criteria.} } @ARTICLE{1079Cooke2006, AUTHOR = {E. Cooke and P. Kauff and T. Sikora}, TITLE = {Multi-view synthesis: A novel view creation approach for free viewpoint video}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2006}, MONTH = jul, PAGES = {476--492}, VOLUME = {21}, NUMBER = {6} } @INPROCEEDINGS{0985Haller2006, AUTHOR = {Martin Haller and Hyoung-Gook Kim and Thomas Sikora}, TITLE = {Audiovisual Anchorperson Detection for Topic-oriented Navigation in Broadcast News}, BOOKTITLE = {IEEE 7th International Conference on Multimedia & Expo (ICME 2006)}, YEAR = {2006}, MONTH = jul, PAGES = {1817--1820}, ADDRESS = {Toronto, ON, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/0985Haller2006.pdf}, DOI = {10.1109/ICME.2006.262906}, URL = {http://elvera.nue.tu-berlin.de/files/0985Haller2006.pdf}, ABSTRACT = {This paper presents a content-based audiovisual video analysis technique for anchorperson detection in broadcast news. For topic-oriented navigation in newscasts, a segmentation of the topic boundaries is needed. As the anchorperson gives a strong indication for such boundaries, the presented technique automatically determines that high-level information for video indexing from MPEG-2 videos and stores the results in an MPEG-7 conform format. 
The multimodal analysis process is carried out separately in the auditory and visual modality, and the decision fusion forms the final anchorperson segments.} } @INPROCEEDINGS{1012Glasberg2006, AUTHOR = {Ronald Glasberg and Cengiz Tas and Thomas Sikora}, TITLE = {Recognizing Commercials in Real-Time using three Visual Descriptors and a Decision-Tree}, BOOKTITLE = {IEEE 7th International Conference on Multimedia & Expo (ICME 2006)}, YEAR = {2006}, MONTH = jul, ADDRESS = {Toronto, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/1012Glasberg2006.pdf}, DOI = {10.1109/ICME.2006.262822}, URL = {http://elvera.nue.tu-berlin.de/files/1012Glasberg2006.pdf}, ABSTRACT = {We present a new approach for classifying mpeg-2 video sequences as "commercial" or "non-commercial" by analyzing specific color, texture and motion features of consecutive frames in real-time. This is part of the well-known video-genre-classification problem, where popular TV-broadcast genres like cartoon, commercial, music, news and sports are studied. Such applications have also been discussed in the context of MPEG-7. In our method the extracted features from three visual descriptors are logically combined using a decision tree to produce a reliable recognition. The results demonstrate a high identification rate based on a large collection of 200 representative video sequences (40 "commercials" and 4*40 "non-commercials") gathered from free digital TV-broadcasting in Germany.} } @INPROCEEDINGS{1016Yuan2006, AUTHOR = {Y. Yuan and B. 
Cockburn and Thomas Sikora and Mrinal Mandal}, TITLE = {A GoP Based FEC Technique for Packet Based Video Streaming}, BOOKTITLE = {10th WSEAS International Conference on Communications}, YEAR = {2006}, MONTH = jul, PAGES = {187--192}, PDF = {http://elvera.nue.tu-berlin.de/files/1016Yuan2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1016Yuan2006.pdf}, ABSTRACT = {In this paper, we propose an efficient forward error correction (FEC) technique for video transmission in a lossy network. Here, the FEC is applied on source packets at group of pictures level assuming an MPEG-like compression scheme. We also derive analytically an estimate of the playable frame rate for the proposed technique. It is shown, by both analysis and simulation, that the proposed FEC technique provides a better playable frame rate than the classical frame-level FEC techniques.} } @INPROCEEDINGS{1017Jia2006, AUTHOR = {Leslie Jia and Mrinal Mandal and Thomas Sikora}, TITLE = {Efficient Disparity Estimation Using Region based Segmentation and Multistage Feedback}, BOOKTITLE = {10th WSEAS International Conference on Communications}, YEAR = {2006}, MONTH = jul, PAGES = {582--589}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/1017Jia2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1017Jia2006.pdf}, ABSTRACT = {Stereoscopic analysis is widely used in machine vision applications. Local and global methods are two main branches of stereoscopic analysis. The global methods typically minimize a cost function over the entire scene. Although these methods provide high estimation accuracy, because of its high complexity, they are not suitable for real-time implementation. The local methods typically use window-correlation approaches, and the associated complexity is generally low. However, the estimation accuracy is sensitive to the selected window size. 
In this paper, we propose a multistage local method that operates on image segments instead of traditional rectangular windows. This new approach exploits the unique characteristics of image segments, and reduces occlusion through a feedback system. Experimental results show that it is very effective for natural images. In addition, it has a low computational complexity which may be suitable for real-time implementation.} } @INPROCEEDINGS{1020Goldmann2006, AUTHOR = {Lutz Goldmann and Lars Thiele and Thomas Sikora}, TITLE = {Online Image Retrieval System Using Long Term Relevance Feedback}, BOOKTITLE = {International Conference on Image and Video Retrieval (CIVR)}, YEAR = {2006}, MONTH = jul, ADDRESS = {Tempe, AZ, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1020Goldmann2006.pdf}, DOI = {10.1007/11788034_43}, URL = {http://elvera.nue.tu-berlin.de/files/1020Goldmann2006.pdf}, ABSTRACT = {This paper describes an original system for content based image retrieval. It is based on MPEG-7 descriptors and a novel approach for long term relevance feedback using a Bayesian classifier. Each image is represented by a special model that is adapted over multiple feedback rounds and even multiple sessions or users. The experiments show its outstanding performance in comparison to often used short term relevance feedback and the recently proposed FIRE system.} } @ARTICLE{1018Jia2006, AUTHOR = {L. Jia and Mrinal Mandal and Thomas Sikora}, TITLE = {Efficient Disparity Estimation using Region based Segmentation and Multistage Feedback}, JOURNAL = {WSEAS Transactions on Communications}, YEAR = {2006}, MONTH = sep, PAGES = {1577--1584}, VOLUME = {5}, NUMBER = {9}, PDF = {http://elvera.nue.tu-berlin.de/files/1018Jia2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1018Jia2006.pdf}, ABSTRACT = {Stereoscopic analysis is widely used in machine vision applications. Local and global methods are two main branches of stereoscopic analysis. 
The global methods typically minimize a cost function over the entire scene. Although these methods provide high estimation accuracy, because of its high complexity, they are not suitable for real-time implementation. The local methods typically use window-correlation approaches, and the associated complexity is generally low. However, the estimation accuracy is sensitive to the selected window size. In this paper, we propose a multistage local method that operates on image segments instead of traditional rectangular windows. This new approach exploits the unique characteristics of image segments, and reduces occlusion through a feedback system. Experimental results show that it is very effective for natural images. In addition, it has a low computational complexity which may be suitable for real-time implementation.} } @INPROCEEDINGS{1019Schwab2006, AUTHOR = {Markus Schwab and Peter Noll and Thomas Sikora}, TITLE = {Noise Robust Relative Transfer Function Estimation}, BOOKTITLE = {EUSIPCO 2006}, YEAR = {2006}, MONTH = sep, PDF = {http://elvera.nue.tu-berlin.de/files/1019Schwab2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1019Schwab2006.pdf}, ABSTRACT = {Microphone arrays are suitable for a large range of applications. Two important applications are speaker localization and speech enhancement. For both of these the transfer functions from one microphone to the other microphones are needed to form potential algorithms for these applications. In this paper we present a new transfer function estimator optimized for speech sources in a noisy environment. To achieve this, we integrate a new covariance matrix estimation algorithm for the noisy speech as well as for the adaptive and correlated noise signals as received by the microphones. Results indicate that our algorithm outperforms other state-of-the-art algorithms.} } @INPROCEEDINGS{0990Knorr2006, AUTHOR = {Sebastian Knorr and Evren İmre and A. 
Aydın Alatan and Thomas Sikora}, TITLE = {A Geometric Segmentation Approach for the 3D Reconstruction of Dynamic Scenes in 2D Video Sequences}, BOOKTITLE = {EUSIPCO}, YEAR = {2006}, MONTH = sep, ADDRESS = {Florence, Italy}, NOTE = {E. İmre, A. A. Alatan: Middle East Technical University, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0990Knorr2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0990Knorr2006.pdf}, ABSTRACT = {In this paper, an algorithm is proposed to solve the multi-frame structure from motion (MFSfM) problem for monocular video sequences with multiple rigid moving objects. The algorithm uses the epipolar criterion to segment feature trajectories belonging to the background scene and each of the independently moving objects. As a large baseline length is essential for the reliability of the epipolar geometry, the geometric robust information criterion is employed for a key-frame selection within the sequences. Once the features are segmented, corresponding objects are reconstructed individually using a sequential algorithm that is capable of prioritizing the frame pairs with respect to their reliability and information content. 
The experimental results on synthetic and real data demonstrate that our approach has the potential to effectively deal with the multi-body MFSfM problem.} } @INPROCEEDINGS{1252Ide2006, AUTHOR = {Kai Ide and Seung Eun Lee and Young Chun Kim and Dong Kwon Kim and O'Dae Kwon}, TITLE = {A simple and sturdy butt coupling and PQR hole laser intensity profile analysis}, BOOKTITLE = {APOC 2006}, YEAR = {2006}, MONTH = sep, ADDRESS = {Kimdaejung Convention Center Gwangju, Korea} } @INPROCEEDINGS{1050Eisenberg2006, AUTHOR = {Gunnar Eisenberg and Thomas Sikora}, TITLE = {Granular Resynthesis for Sound Unmixing}, BOOKTITLE = {9th International Conference on Digital Audio Effects (DAFx-06)}, YEAR = {2006}, MONTH = sep, ADDRESS = {Montréal, Québec, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/1050Eisenberg2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1050Eisenberg2006.pdf}, ABSTRACT = {In modern music genres like Pop, Rap, Hip-Hop or Techno many songs are built in a way that a pool of small musical pieces, so called loops, are used as building blocks. These loops are usually one, two or four bars long and build the accompaniment for the lead melody or singing voice. Very often the accompanying loops can be heard solo in a song at least once. This can be used as a-priori knowledge for removing these loops from the mixture. In this paper an algorithm based on granular resynthesis and spectral subtraction is presented which makes use of this a-priori knowledge. 
The algorithm uses two different synthesis strategies and is capable of removing known loops from mixtures even if the loop signal contained in the mixture signal is slightly different from the solo loop signal.} } @INPROCEEDINGS{1009Krutz2006, AUTHOR = {Andreas Krutz and Michael Frater and Thomas Sikora}, TITLE = {Improved Image Registration using the Up-sampled Domain}, BOOKTITLE = {International Workshop on Multimedia Signal Processing (MMSP) 2006}, YEAR = {2006}, MONTH = oct, ADDRESS = {Victoria, BC, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/1009Krutz2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1009Krutz2006.pdf} } @INPROCEEDINGS{1015Burred2006, AUTHOR = {Juan José Burred and Thomas Sikora}, TITLE = {Comparison of frequency-warped representations for source separation of stereo mixtures}, BOOKTITLE = {121st AES Convention}, YEAR = {2006}, MONTH = oct, ADDRESS = {San Francisco, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1015Burred2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1015Burred2006.pdf}, ABSTRACT = {We evaluate the use of different frequency-warped, nonuniform time-frequency representations for the purpose of blind sound source separation from stereo mixtures. Such transformations enhance resolution in spectral areas relevant for the discrimination of the different sources, improving sparsity and mixture disjointness. In this paper, we study the effect of using such representations on the localization and detection of the sources, as well as on the quality of the separated signals. Specifically, we evaluate a constant-Q and several auditory warpings in combination with a shortest path separation algorithm and show that they improve detection and separation quality in comparison to using the Short Time Fourier Transform.} } @INPROCEEDINGS{0991Imre2006, AUTHOR = {Evren Imre and Sebastian Knorr and A. 
Aydın Alatan and Thomas Sikora}, TITLE = {Prioritized Sequential 3D Reconstruction in Video Sequences of Dynamic Scenes}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'06)}, YEAR = {2006}, MONTH = oct, ADDRESS = {Atlanta, GA, USA}, NOTE = {Evren İmre, A. Aydın Alatan: Middle East Technical University, Turkey}, PDF = {http://elvera.nue.tu-berlin.de/files/0991Imre2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0991Imre2006.pdf}, ABSTRACT = {In this study, an algorithm is proposed to solve the multi-frame structure from motion (MFSfM) problem for monocular video sequences in dynamic scenes. The algorithm uses the epipolar criterion to segment the features belonging to the independently moving objects. Once the features are segmented, corresponding objects are reconstructed individually by using a sequential algorithm, which is also capable prioritizing the frame pairs with respect to their reliability and information content, thus achieving a fast and accurate reconstruction through efficient processing of the available data. A tracker is utilized to increase the baseline distance between views and to improve the F-matrix estimation, which is beneficial to both the segmentation and the 3D structure estimation processes. The experimental results demonstrate that our approach has the potential to effectively deal with the multi-body MFSfM problem in a generic video sequence.} } @INPROCEEDINGS{0992Dröse2006, AUTHOR = {Michael Dröse and Carsten Clemens and Thomas Sikora}, TITLE = {Extending Single-view Scalable Video Coding to Multi-view based on H.264/AVC}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'06)}, YEAR = {2006}, MONTH = oct, ADDRESS = {Atlanta, GA, USA}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/0992Droese2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0992Droese2006.pdf}, ABSTRACT = {An extension of single-view scalable video coding to multi-view is presented in this paper. 
Scalable video coding is recently developed in the Joint Video Team of ISO/IEC MPEG and ITU-T VCEG named Joint Scalable Video Model. The model includes temporal, spatial and quality scalability enhancing a H.264/AVC base layer. To remove redundancy between views a hierarchical decomposition in a similar way to the temporal direction is applied. The codec is based on this technology and supports open-loop as well as closed-loop controlled encoding.} } @INPROCEEDINGS{0995Krutz2006, AUTHOR = {Andreas Krutz and Michael Frater and Matthias Kunter and Thomas Sikora}, TITLE = {Windowed Image Registration for Robust Mosaicing of Scenes with Large Background Occlusions}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'06)}, YEAR = {2006}, MONTH = oct, ADDRESS = {Atlanta, GA, USA}, NOTE = {M. Frater: University of New South Wales, Canberra, Australia}, PDF = {http://elvera.nue.tu-berlin.de/files/0995Krutz2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0995Krutz2006.pdf}, ABSTRACT = {We propose an enhanced window-based approach to local image registration for robust video mosaicing in scenes with arbitrarily moving foreground objects. Unlike other approaches, we estimate accurately the image transformation without any pre-segmentation even if large background regions are occluded. We apply a windowed hierarchical frame-to-frame registration based on image pyramid decomposition. In the lowest resolution level phase correlation for initial parameter estimation is used while in the next levels robust Newton-based energy minimization of the compensated image mean-squared error is conducted. To overcome the degradation error caused by spatial image interpolation due to the warping process, i.e. aliasing effects from under-sampling, final pixel values are assigned in an up-sampled image domain using a Daubechies bi-orthogonal synthesis filter. Experimental results show the excellent performance of the method compared to recently published methods. 
The image registration is sufficiently accurate to allow open-loop parameter accumulation for long-term motion estimation.} } @INPROCEEDINGS{0998Kim2006, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Robust Anisotropic Disparity Estimation with Perceptual Maximum Variation Modeling}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP'06)}, YEAR = {2006}, MONTH = oct, ADDRESS = {Atlanta, GA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/0998Kim2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/0998Kim2006.pdf}, ABSTRACT = {We present a robust anisotropic dense disparity estimation algorithm which employs perceptual maximum variation modeling. Edge-preserving dense disparity vectors are estimated using a coarse-to-fine diffusive method on iteratively filtered images, i.e. the scale-space. While an energy-minimization framework adjusts local disparity, the edges are efficiently preserved by anisotropic disparity-field diffusion. However, the localization at weak image edges which have small brightness variations is fundamentally difficult. In this paper, perceptual maximum variation modeling prevents the delocalization flow over edges computed by evaluating small variations. We perform disparity-field diffusion on a perceptually optimized color space, which combines the small differences in both brightness and chromaticity. Additionally a consistency constraint is employed in the modeling to avoid the influence of global color distributions and to enhance important edges as the human vision system does. The experimental results show the excellent localization performance preserving the disparity discontinuity of each object.} } @INPROCEEDINGS{1039Goldmann2006, AUTHOR = {Lutz Goldmann and Amjad Samour and Mustafa Karaman and Thomas Sikora}, TITLE = {Extracting High Level Semantics by Means of Speech, Audio, and Image Primitives in Surveillance Applications}, BOOKTITLE = {IEEE Int. Conf. 
on Image Processing (ICIP'06)}, YEAR = {2006}, MONTH = oct, PAGES = {2397--2400}, ADDRESS = {Atlanta, GA, USA}, NOTE = {invited paper, ISBN: 1-4244-1437-7 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1039Goldmann2006.pdf}, DOI = {10.1109/ICIP.2006.312945}, URL = {http://elvera.nue.tu-berlin.de/files/1039Goldmann2006.pdf}, ABSTRACT = {Traditional surveillance systems are usually based on visual information only. With the emerging multimedia analysis techniques, interests are changing towards systems that incorporate multiple sensors and different modalities, which leads to new ways of analyzing this multimedia data and more sophisticated applications. This paper shortly reviews the ideas of traditional surveillance systems and explains actual research interests in this domain. Then, it focuses on the typical structure, goals, and applications of multimedia surveillance systems. These issues are supported by short descriptions of selected analysis steps of such a system currently under development. Some experimental results are given to illustrate the extracted semantics and to assess the performance of the individual steps.} } @INPROCEEDINGS{1041Wilkins2006, AUTHOR = {P. Wilkins and T. Adamek and P. Ferguson and M. Hughes and G. J. F. Jones and G. Keenan and K. McGuinness and J. Malobabic and N. E. O'Connor and D. Sadlier and A. F. Smeaton and R. Benmokhtar and E. Dumont and B. Huet and B. Merialdo and E. Spyrou and G. Koumoulos and Y. Avrithis and R. Moerzinger and P. Schallauer and W. Bailer and Q. Zhang and T. Piatrik and K. Chandramouli and E. Izquierdo and Lutz Goldmann and Martin Haller and Thomas Sikora and P. Praks and J. Urban and X. Hilaire and J. M. 
Jose}, TITLE = {K-Space at TRECVid 2006}, BOOKTITLE = {Proceedings of the TRECVid Workshop}, YEAR = {2006}, MONTH = nov, ADDRESS = {Gaithersburg, Maryland, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1041Wilkins2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1041Wilkins2006.pdf}, ABSTRACT = {In this paper we describe the K-Space participation in TRECVid 2006. K-Space participated in two tasks, high-level feature extraction and search. We present our approaches for each of these activities and provide a brief analysis of our results. Our high-level feature submission made use of support vector machines (SVMs) created with low-level MPEG-7 visual features, fused with specific concept detectors. Search submissions were both manual and automatic and made use of both low- and high-level features. In the high-level feature extraction submission, four of our six runs achieved performance above the TRECVid median, whilst our search submission performed around the median. The K-Space team consisted of eight partner institutions from the EU-funded K-Space Network, and our submissions made use of tools and techniques from each partner. 
As such this paper will provide overviews of each partner’s contributions and provide appropriate references for specific descriptions of individual components.} } @INPROCEEDINGS{1033Burred2006, AUTHOR = {Juan José Burred and Axel Röbel and Xavier Rodet}, TITLE = {An Accurate Timbre Model for Musical Instruments and its Application to Classification}, BOOKTITLE = {First Workshop on Learning the Semantics of Audio Signals}, YEAR = {2006}, MONTH = dec, ADDRESS = {Athens}, NOTE = {Axel Röbel, Xavier Rodet: IRCAM, Paris}, PDF = {http://elvera.nue.tu-berlin.de/files/1033Burred2006.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1033Burred2006.pdf}, ABSTRACT = {A compact, general and accurate model of the timbral characteristics of musical instruments can be used as a source of a priori knowledge for music content analysis applications such as transcription and instrument classification, as well as for source separation. We develop a timbre model based on the spectral envelope that meets these requirements and relies on additive analysis, Principal Component Analysis and database training. We put special emphasis on the issue of frequency misalignment when training an instrument model with notes of different pitches, and show that a spectral representation involving frequency interpolation results in an improved model. Finally, we show the performance of the developed model when applied to musical instrument classification.} } @ARTICLE{1055Imre2007, AUTHOR = {Evren Imre and Sebastian Knorr and Burak Ozkalayci and Ugur Topay and A. Aydin Alatan and Thomas Sikora}, TITLE = {Towards 3-D Scene Reconstruction from Broadcast Video}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2007}, MONTH = jan, NOTE = {E. Imre, B. Özkalayci, U. Topay, A. A. 
Alatan: Middle East Technical University, Turkey}, URL = {http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6V08-4MRNT3C-1&_user=1773525&_coverDate=01%2F05%2F2007&_rdoc=13&_fmt=summary&_orig=browse&_srch=doc-info(%23toc%235640%239999%23999999999%2399999%23FLA%23display%23Articles)&_cdi=5640&_sort=d&_docanchor=&view=c&_ct=24&_acct=C000054491&_version=1&_urlVersion=0&_userid=1773525&md5=75ad72b359f51cb59b0ad44a0251f975}, ABSTRACT = {Three-dimensional (3-D) scene reconstruction from broadcast video is a challenging problem with many potential applications, such as 3-D TV, free-view TV, augmented reality or three-dimensionalization of two-dimensional (2-D) media archives. In this paper, a flexible and effective system capable of efficiently reconstructing 3-D scenes from broadcast video is proposed, with the assumption that there is relative motion between camera and scene/objects. The system requires no a priori information and input, other than the video sequence itself, and capable of estimating the internal and external camera parameters and performing a 3-D motion-based segmentation, as well as computing a dense depth field. The system also serves as a showcase to present some novel approaches for moving object segmentation, sparse and dense reconstruction problems. 
According to the simulations for both synthetic and real data, the system achieves a promising performance for typical TV content, indicating that it is a significant step towards the 3-D reconstruction of scenes from broadcast video.} } @INPROCEEDINGS{1040Samour2007, AUTHOR = {Amjad Samour and Mustafa Karaman and Lutz Goldmann and Thomas Sikora}, TITLE = {Video to the Rescue of Audio: Shot Boundary Assisted Speaker Change Detection}, BOOKTITLE = {Multimedia Content Access: Algorithms and Systems, IS&T/SPIE's Electronic Imaging 2007}, YEAR = {2007}, MONTH = jan, EDITOR = {Alan Hanjalic; Raimondo Schettini; Nicu Sebe}, PUBLISHER = {SPIE}, ORGANIZATION = {SPIE}, ADDRESS = {San Jose, CA, USA}, NOTE = {ISBN: 9780819466198}, PDF = {http://elvera.nue.tu-berlin.de/files/1040Samour2007.pdf}, DOI = {10.1117/12.703114}, URL = {http://elvera.nue.tu-berlin.de/files/1040Samour2007.pdf}, ABSTRACT = {Speaker change detection (SCD) is a preliminary step for many audio applications such as speaker segmentation and recognition. Thus, its robustness is crucial to achieve a good performance in the later steps. Especially, misses (false negatives) affect the results. For some applications, domain-specific characteristics can be used to improve the reliability of the SCD. In broadcast news and discussions, the cooccurrence of shot boundaries and change points provides a robust clue for speaker changes. In this paper, two multimodal approaches are presented that utilize the results of a shot boundary detection (SBD) step to improve the robustness of the SCD. 
Both approaches clearly outperform the audio-only approach and are exclusively applicable for TV broadcast news and plenary discussions.} } @INPROCEEDINGS{1049Kunter2007, AUTHOR = {Matthias Kunter and Andreas Krutz and Mrinal Mandal and Thomas Sikora}, TITLE = {Optimal multiple sprite generation based on physical camera parameter estimation}, BOOKTITLE = {Visual Communications and Image Processing, VCIP, IS&T/SPIE's Electronic Imaging 2007}, YEAR = {2007}, MONTH = jan, ORGANIZATION = {IS&T/SPIE}, ADDRESS = {San José, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1049Kunter2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1049Kunter2007.pdf} } @INPROCEEDINGS{1096Knörig2007, AUTHOR = {Rüdiger Knörig and Thomas Sikora}, TITLE = {MDC image coding using Cascaded Correlating Transforms}, BOOKTITLE = {Electronic Imaging}, YEAR = {2007}, MONTH = jan, ORGANIZATION = {IS&T / SPIE}, ADDRESS = {San Jose, California USA}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/1096Knoerig2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1096Knoerig2007.pdf}, ABSTRACT = {This paper describes a joint source-channel coding framework combining cascaded correlating transforms as proposed by Goyal with an optimal estimation algorithm in the MSE sense. The cascaded correlating transform, an extension of the well-known pairwise correlating transform to transforms of higher order, can be seen as a detunable decorrelating transform. By reducing the transforms ability to decorrelate, a higher amount of source correlation ”survives” in the signal. This increased redundancy will be used for concealing channel errors. Since the detuning can be performed stepless an arbitrary amount of redundancy can be selected, allowing fine-tuned trade-offs between coding effciency and robustness to channel errors. 
This is an advantage over the classic approach by combining source- and channel coders since even shortened convolution coders offer only a discrete and therefore not stepless set of coding rates. Moreover, our approach affects only the transform and the inverse transform stages and will be transparent to other stages of the coding system (e.g. quantization or entropy coding).} } @INPROCEEDINGS{1127Koutsia2007, AUTHOR = {A. Koutsia and N. Grammalidis and K. Dimitropoulos and M. Karaman and L. Goldmann}, TITLE = {Football Player Tracking from Multiple Cameras}, BOOKTITLE = {International Conference on Computer Vision Theory and Applications}, YEAR = {2007}, MONTH = mar, EDITOR = {A. Ranchordas, H. Araujo, J. Vitria}, PUBLISHER = {Insticc Press}, PAGES = {523--526}, ADDRESS = {Barcelona, Spain}, NOTE = {ISBN: 978-972-8865-74-0}, PDF = {http://elvera.nue.tu-berlin.de/files/1127Koutsia2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1127Koutsia2007.pdf}, ABSTRACT = {In this work, our aim is to develop an automated system which provides data useful for football game analysis. Information from multiple cameras is used to perform player detection, classification and tracking. A background segmentation approach, which operates with the invariant Gaussian colour model and uses temporal information, is used to achieve more accurate results. 
Information derived and matched from all cameras is then used to perform tracking, using an advanced Multiple Hypothesis Tracking algorithm.} } {1090Sikora2007, } @INPROCEEDINGS{1052Weil2007, AUTHOR = {Jan Weil and Kai Clüver and Thomas Sikora}, TITLE = {Real-Time Multiple-Description Coding of Speech Signals}, BOOKTITLE = {5th International Linux Audio Conference, LAC2007}, YEAR = {2007}, MONTH = mar, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/1052Weil2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1052Weil2007.pdf}, ABSTRACT = {When sending speech data over lossy networks like the internet, multiple-description (MD) coding is a means to improve the perceived quality by dividing the data into multiple descriptions which are then sent as separate packets. In doing so the speech signal can still be decoded even if only parts of these descriptions are received. The present paper describes the structure of a software which demonstrates the benefits of this coding scheme using a client-server architecture.} } {1089Sikora2007, } @ARTICLE{1072Hoene2007, AUTHOR = {Christian Hoene and Kai Clüver and Jan Weil}, TITLE = {An Architecture for a Next Generation VoIP Transmission System}, JOURNAL = {PIK, Special Issue on Current Trends in Network and Service Management}, YEAR = {2007}, MONTH = apr, PAGES = {76--81}, VOLUME = {30}, NUMBER = {2}, NOTE = {C. Hoene: University of Tübingen}, URL = {http://www.saur.de/index.cfm?lang=DE&id=0000001475}, ABSTRACT = {Packetized speech transmission systems implemented with Voice over IP are gaining momentum against the traditional circuit switched systems despite the fact that packet switched VoIP is two to three times less efficient than its circuit switched counterpart. At the same time, it only supports a rather bad “toll” quality. We believe that it is time for a new architecture developed from scratch - an architecture that includes an Internet enabled speech codec and its transport system. 
This architecture manages the perceptual service quality while using the available transmission resources to its best. The transmission of speech is managed and controlled with respect to its speech quality, mouth-to-ear delay, bit-rate, frame-rate, and loss robustness. Beside the architecture, we describe the requirements for the Internet speech codec and its transport protocol and present an interface between the speech codec and the transport protocol.} } @INPROCEEDINGS{1053Krutz2007, AUTHOR = {Andreas Krutz and Michael Dröse and Matthias Kunter and Mrinal Mandal and Michael Frater and Thomas Sikora}, TITLE = {Low Bit-Rate Object-Based Multi-View Video Coding using MVC}, BOOKTITLE = {3DTV-Conference}, YEAR = {2007}, MONTH = may, ORGANIZATION = {IEEE, 3DTV}, ADDRESS = {Kos Island, Greece}, NOTE = {M. Mandal: Department of Electr. & Comp. Eng., University of Alberta, Edmonton, Canada; M. Frater: Australian Defence Force Academy, Canberra, Australia}, PDF = {http://elvera.nue.tu-berlin.de/files/1053Krutz2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1053Krutz2007.pdf} } @INPROCEEDINGS{1054Knorr2007, AUTHOR = {Sebastian Knorr and Aljoscha Smolic and Thomas Sikora}, TITLE = {From 2D- to Stereo- to Multi-view Video}, BOOKTITLE = {3DTV-Conference}, YEAR = {2007}, MONTH = may, ADDRESS = {Kos Island, Greece}, PDF = {http://elvera.nue.tu-berlin.de/files/1054Knorr2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1054Knorr2007.pdf} } @INPROCEEDINGS{1056Krutz2007, AUTHOR = {Andreas Krutz and Matthias Kunter and Mrinal Mandal and Michael Frater and Thomas Sikora}, TITLE = {Motion-based Object Segmentation using Sprites and Anisotropic Diffusion}, BOOKTITLE = {8th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS)}, YEAR = {2007}, MONTH = jun, ADDRESS = {Santorini, Greece}, PDF = {http://elvera.nue.tu-berlin.de/files/1056Krutz2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1056Krutz2007.pdf} } @INPROCEEDINGS{1121Kim2007, 
AUTHOR = {Woong Hee Kim and Thomas Sikora}, TITLE = {Image Denoising Method Using Diffusion Equation and Edge Map Estimated with K-Means Clustering Algorithm}, BOOKTITLE = {8th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS)}, YEAR = {2007}, MONTH = jun, ADDRESS = {Santorini, Greece}, PDF = {http://elvera.nue.tu-berlin.de/files/1121Kim2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1121Kim2007.pdf} } @INPROCEEDINGS{1125Glasberg2007, AUTHOR = {Ronald Glasberg and Pascal Kelm and Hao Qin and Thomas Sikora,}, TITLE = {Extensible Platform for Multimedia Analysis (XPMA)}, BOOKTITLE = {2007 IEEE International Conference on Multimedia and Expo}, YEAR = {2007}, MONTH = jul, PUBLISHER = {IEEE}, PAGES = {5--5}, ORGANIZATION = {IEEE}, ADDRESS = {BEIJING, CHINA}, NOTE = {Demo}, PDF = {http://elvera.nue.tu-berlin.de/files/1125Glasberg2007.pdf}, DOI = {10.1109/ICME.2007.4284567}, URL = {http://elvera.nue.tu-berlin.de/files/1125Glasberg2007.pdf}, ABSTRACT = {We present a new software-platform with an open and programming-language-independent structure, to improve the reusability of developed multimedia analysis components. A user can design his own, applicationoriented multimedia system by combining these components via XML and evaluate the experimental results.} } @INPROCEEDINGS{1083Knorr2007, AUTHOR = {Sebastian Knorr and Matthias Kunter and Thomas Sikora}, TITLE = {Super-Resolution Stereo- and Multi-View Synthesis from Monocular Video Sequences}, BOOKTITLE = {3-D Digital Imaging and Modeling (3DIM 2007)}, YEAR = {2007}, MONTH = aug, ADDRESS = {Montréal, Québec, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/1083Knorr2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1083Knorr2007.pdf}, ABSTRACT = {This paper presents a new approach for generation of super-resolution stereoscopic and multi-view video from monocular video. 
Such multi-view video is used for instance with multi-user 3D displays or auto-stereoscopic displays with head-tracking to create a depth impression of the observed scenery. Our approach is an extension of the realistic stereo view synthesis (RSVS) approach which is based on structure from motion techniques and image-based rendering to generate the desired stereoscopic views for each point in time. The extension relies on an additional super-resolution mode which utilizes a number of frames of the original video sequence to generate a virtual stereo frame with higher resolution. The algorithm is tested on several TV broadcast videos, as well as on sequences captured with a single handheld camera and sequences from the well known BBC documentation “Planet Earth”. Finally, some simulation results will show that RSVS is quite suitable for super-resolution 2D-3D conversion.} } {1095Knorr2007, } {1333Knorr2007, } {1126Glasberg2007, } @ARTICLE{1102Goldmann2007, AUTHOR = {Lutz Goldmann and Ullrich J. Mönich and Thomas Sikora}, TITLE = {Components and Their Topology for Robust Face Detection in the Presence of Partial Occlusions}, JOURNAL = {IEEE Transactions on Information Forensics and Security, Special Issue on Human Detection and Recognition}, YEAR = {2007}, MONTH = sep, PAGES = {559--569}, VOLUME = {2}, NUMBER = {3/2}, NOTE = {ISSN: 1556-6013}, PDF = {http://elvera.nue.tu-berlin.de/files/1102Goldmann2007.pdf}, DOI = {10.1109/TIFS.2007.902019}, URL = {http://elvera.nue.tu-berlin.de/files/1102Goldmann2007.pdf}, ABSTRACT = {This paper presents a novel approach for automatic and robust object detection. It utilizes a component-based approach that combines techniques from both statistical and structural pattern recognition domain. While the component detection relies on Haar-like features and an AdaBoost trained classifier cascade, the topology verification is based on graph matching techniques. 
The system was applied to face detection and the experiments show its outstanding performance in comparison to conventional face detection approaches. Especially in the presence of partial occlusions, uneven illumination, and out-of-plane rotations, it yields higher robustness. Furthermore, this paper provides a comprehensive review of recent approaches for object detection and gives an overview of available databases for face detection.} } @INPROCEEDINGS{1081Clüver2007, AUTHOR = {Kai Clüver and Jan Weil and Thomas Sikora}, TITLE = {Multiple-Description Coding of Speech using Forward Error Correction Codes}, BOOKTITLE = {15th European Signal Processing Conference (EUSIPCO 2007)}, YEAR = {2007}, MONTH = sep, PAGES = {1377--1381}, ADDRESS = {Poznań, Poland}, PDF = {http://elvera.nue.tu-berlin.de/files/1081Cluever2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1081Cluever2007.pdf}, ABSTRACT = {A flexible framework is presented which performs multiple-description coding of speech signals with two or more channels. The use of forward error correction codes together with a layered speech codec permits encoding into more than two descriptions without excessive increase in complexity. Results of a formal MOS listening test reveal considerable improvements in robustness as long as base layer quality and the number of descriptions are chosen appropriately. A modification of the original encoding scheme allows trading off bit rate savings against robustness to extreme channel conditions. 
Different coding schemes can easily be compared using a real-time demonstrator software.} } @INPROCEEDINGS{1082Haller2007, AUTHOR = {Martin Haller and Andreas Krutz and Thomas Sikora}, TITLE = {A Generic Approach for Motion-based Video Parsing}, BOOKTITLE = {Proceedings of the 15th European Signal Processing Conference (EUSIPCO 2007)}, YEAR = {2007}, MONTH = sep, PAGES = {713--717}, ADDRESS = {Poznań, Poland}, NOTE = {invited paper, ISBN 978-83-921340-2-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1082Haller2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1082Haller2007.pdf}, ABSTRACT = {Motion-based video parsing methods segment video streams according to changes of camera motion types. They rely usually on compressed video streams, where motion vector fields are provided. Camera parameters can be derived from these motion vectors. There are a number of relevant video codecs where no motion information is included. For such video streams, camera parameters have to be estimated using a frame-to-frame image registration method. In our approach, we provide both techniques to estimate camera parameters. Enhanced feature extraction algorithms take advantage of estimated parameters. For classification, the method uses three multi-class Support Vector Machines (M-SVMs) to independently detect pan, tilt, and zoom camera motion as well as the direction of motion. 
Experimental results show a promising performance of our generic approach with test video streams from the TRECVid 2005 BBC rushes video corpus.} } @INPROCEEDINGS{1103Samour2007, AUTHOR = {Amjad Samour and Lutz Goldmann and Thomas Sikora}, TITLE = {Towards Person Google: Multimodal Person Search and Retrieval}, BOOKTITLE = {K-Space PhD Jamboree (KSPJ)}, YEAR = {2007}, MONTH = sep, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/1103Samour2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1103Samour2007.pdf} } @INPROCEEDINGS{1075Kim2007, AUTHOR = {Jang-Heon Kim and Thomas Sikora}, TITLE = {Confocal Disparity Estimation and Recovery of Pinhole Image in Real-aperture 3-D Camera system}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP\'07)}, YEAR = {2007}, MONTH = sep, ADDRESS = {San Antonio, Texas, USA}, URL = {http://ieeexplore.ieee.org/iel5/4378863/4379738/04379807.pdf?tp=&arnumber=4379807&isnumber=4379738} } @INPROCEEDINGS{1076Knorr2007, AUTHOR = {Sebastian Knorr and Thomas Sikora}, TITLE = {An Image-based Rendering (IBR) Approach for Realistic Stereo View Synthesis of TV Broadcast Based on Structure From Motion}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP)}, YEAR = {2007}, MONTH = sep, ADDRESS = {San Antonio, Texas, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1076Knorr2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1076Knorr2007.pdf}, ABSTRACT = {In the past years, the 3D display technology has become a booming branch of research with fast technical progress. Hence, the 3D conversion of already existing 2D video material increases more and more in popularity. In this paper, a new approach for realistic stereo view synthesis (RSVS) of existing 2D video material is presented. The intention of our work is not a real-time conversion of existing video material with a deduction in stereo perception, but rather a more realistic off-line conversion with high accuracy. 
Our approach is based on structure from motion techniques and uses image-based rendering to reconstruct the desired stereo views for each video frame. The algorithm is tested on several TV broadcast videos, as well as on sequences captured with a single handheld camera. Finally, some simulation results will show the remarkable performance of this approach.} } @INPROCEEDINGS{1077Krutz2007, AUTHOR = {Andreas Krutz and Michael Frater and Thomas Sikora}, TITLE = {Window-Based Image Registration Using Variable Window Sizes}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP 2007)}, YEAR = {2007}, MONTH = sep, ADDRESS = {San Antonio, Texas, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1077Krutz2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1077Krutz2007.pdf} } @INPROCEEDINGS{1078Kunter2007, AUTHOR = {Matthias Kunter and Andreas Krutz and Michael Dröse and Michael Frater and Thomas Sikora}, TITLE = {Object-based Multiple Sprite Coding of unsegmented Videos using H.264/AVC}, BOOKTITLE = {IEEE Int. Conf. on Image Processing (ICIP 2007)}, YEAR = {2007}, MONTH = sep, ADDRESS = {San Antonio, Texas, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1078Kunter2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1078Kunter2007.pdf} } @INPROCEEDINGS{1111Kurutepe2007, AUTHOR = {Engin Kurutepe and M. Reha Civanlar and A. Murat Tekalp}, TITLE = {Selective Streaming of Multi-View Video for Head-Tracking 3D Displays}, BOOKTITLE = {International Conference on Image Processing}, YEAR = {2007}, MONTH = sep, ORGANIZATION = {IEEE}, ADDRESS = {San Antonio, TX, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1111Kurutepe2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1111Kurutepe2007.pdf}, ABSTRACT = {We present a novel client-driven multi-view video streaming system that allows a user watch 3-D video interactively with significantly reduced bandwidth requirements by transmitting a small number of views selected according to his/her head position. 
The proposed scheme can be used to efficiently stream a dense set of multi-view sequences (light-fields) or wider baseline multi-view sequences together with depth information. The user's head position is tracked and predicted into the future to select the views that best match the user's current viewing angle dynamically. Prediction of future head positions is needed so that views matching the predicted head positions can be requested from the server ahead of time in order to account for delays due to network transport and stream switching. Highly compressed, lower quality versions of some other views are also requested in order to provide protection against having to display the wrong view when the current user viewpoint differs from the predicted viewpoint. The proposed system makes use of multi-view coding (MVC) and scalable video coding (SVC) concepts together to obtain improved compress ion efficiency while providing flexibility in bandwidth allocation to the selected views. Rate-distortion performance of the proposed system is demonstrated under different experimental conditions.} } @INPROCEEDINGS{1085Martins2007, AUTHOR = {Luis Gustavo Martins and Juan José Burred and George Tzanetakis and Mathieu Lagrange}, TITLE = {Polyphonic Instrument Recognition Using Spectral Clustering}, BOOKTITLE = {International Conference on Music Information Retrieval (ISMIR 2007)}, YEAR = {2007}, MONTH = sep, ADDRESS = {Vienna, Austria}, NOTE = {L.G. Martins: INESC Porto, Portugal G. Tzanetakis, M. Lagrange: University of Victoria, Canada}, PDF = {http://elvera.nue.tu-berlin.de/files/1085Martins2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1085Martins2007.pdf}, ABSTRACT = {The identification of the instruments playing in a polyphonic music signal is an important and unsolved problem in Music Information Retrieval. In this paper, we propose a framework for the sound source separation and timbre classification of polyphonic, multi-instrumental music signals. 
The sound source separation method is inspired by ideas from Computational Auditory Scene Analysis and formulated as a graph partitioning problem. It utilizes a sinusoidal analysis front-end and makes use of the normalized cut, applied as a global criterion for segmenting graphs. Timbre models for six musical instruments are used for the classification of the resulting sound sources. The proposed framework is evaluated on a dataset consisting of mixtures of a variable number of simultaneous pitches and instruments, up to a maximum of four concurrent notes. The overall instrument classification success rate is of 47%.} } @INPROCEEDINGS{1086Burred2007, AUTHOR = {Juan José Burred and Thomas Sikora}, TITLE = {Monaural Source Separation from Musical Mixtures Based on Time-Frequency Timbre Models}, BOOKTITLE = {International Conference on Music Information Retrieval (ISMIR 2007)}, YEAR = {2007}, MONTH = sep, ADDRESS = {Vienna, Austria}, PDF = {http://elvera.nue.tu-berlin.de/files/1086Burred2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1086Burred2007.pdf}, ABSTRACT = {We present a system for source separation from monaural musical mixtures based on Sinusoidal Modeling and on a library of timbre models trained a priori. The models, which rely on Principal Component Analysis, serve as time-frequency probabilistic templates of the spectral envelope. They are used to match groups of sinusoidal tracks and assign them to a source, as well as to reconstruct overlapping partials. The proposed method does not make any assumptions on the harmonicity of the sources, and does not require a previous multipitch estimation stage. Since the timbre matching stage detects the instruments present on the mixture, the system can also be used for classification and segmentation.} } @ARTICLE{1114Kurutepe2007, AUTHOR = {Engin Kurutepe and M. Reha Civanlar and A. 
Murat Tekalp}, TITLE = {Client-Driven Selective Streaming of Multiview Video for Interactive 3DTV}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology}, YEAR = {2007}, MONTH = nov, PAGES = {1558--1565}, VOLUME = {17}, NUMBER = {11}, PDF = {http://elvera.nue.tu-berlin.de/files/1114Kurutepe2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1114Kurutepe2007.pdf} } @ARTICLE{1115Tekalp2007, AUTHOR = {A. Murat Tekalp and E. Kurutepe and M. Reha Civanlar}, TITLE = {3DTV over IP}, JOURNAL = {IEEE Signal Processing Magazine}, YEAR = {2007}, MONTH = nov, PAGES = {77--87}, VOLUME = {24}, NUMBER = {6}, PDF = {http://elvera.nue.tu-berlin.de/files/1115Tekalp2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1115Tekalp2007.pdf} } @INPROCEEDINGS{1107Wilkins2007, AUTHOR = {P. Wilkins and T. Adamek and D. Byrne and G. J.F.Jones and H. Lee and G. Keenan and K. McGuinness and N. E. O’Connor and A. F. Smeaton and A. Amin and Z. Obrenovic and R. Benmokhtar and E. Galmar and B. Huet and S. Essid and R. Landais and F. Vallet and G. Th. Papadopoulos and S. Vrochidis and V. Mezaris and I. Kompatsiaris and E. Spyrou and Y. Avrithis and R. Mörzinger and P. Schallauer and W. Bailer and T. Piatrik and K. Chandramouli and E. Izquierdo and Martin Haller and Lutz Goldmann and Amjad Samour and Andreas Cobet and Thomas Sikora and P. Praks}, TITLE = {K-Space at TRECVid 2007}, BOOKTITLE = {Proceedings of the TRECVid Workshop}, YEAR = {2007}, MONTH = nov, PAGES = {205--216}, PDF = {http://elvera.nue.tu-berlin.de/files/1107Wilkins2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1107Wilkins2007.pdf}, ABSTRACT = {In this paper we describe K-Space participation in TRECVid 2007. K-Space participated in two tasks, high-level feature extraction and interactive search. We present our approaches for each of these activities and provide a brief analysis of our results. 
Our high-level feature submission utilized multi-modal low-level features which included visual, audio and temporal elements. Specific concept detectors (such as Face detectors) developed by K-Space partners were also used. We experimented with different machine learning approaches including logistic regression and support vector machines (SVM). Finally we also experimented with both early and late fusion for feature combination. This year we also participated in interactive search, submitting 6 runs. We developed two interfaces which both utilized the same retrieval functionality. Our objective was to measure the effect of context, which was supported to different degrees in each interface, on user performance. The first of the two systems was a ‘shot’ based interface, where the results from a query were presented as a ranked list of shots. The second interface was ‘broadcast’ based, where results were presented as a ranked list of broadcasts. Both systems made use of the outputs of our high-level feature submission as well as low-level visual features.} } @INPROCEEDINGS{1100Krutz2007, AUTHOR = {Andreas Krutz and Matthias Kunter and Michael Dröse and Michael Frater and Thomas Sikora}, TITLE = {Content-adaptive Video Coding Combining Object-based Coding and H.264/AVC}, BOOKTITLE = {Picture Coding Symposium}, YEAR = {2007}, MONTH = nov, ADDRESS = {Lisbon, Portugal}, PDF = {http://elvera.nue.tu-berlin.de/files/1100Krutz2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1100Krutz2007.pdf} } @INPROCEEDINGS{1110Kurutepe2007, AUTHOR = {Engin Kurutepe and Anil Aksay and Cagdas Bilen and C. Goktug Gurler and Thomas Sikora and Gozde Bozdagi Akar and A. 
Murat Tekalp}, TITLE = {A Standards-Based Flexible End-to-End Multi-View Video Streaming Architecture}, BOOKTITLE = {Packet Video Workshop 2007}, YEAR = {2007}, MONTH = nov, ADDRESS = {Lausanne}, PDF = {http://elvera.nue.tu-berlin.de/files/1110Kurutepe2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1110Kurutepe2007.pdf} } @INPROCEEDINGS{1101Goldmann2007, AUTHOR = {Lutz Goldmann and Amjad Samour and Thomas Sikora}, TITLE = {Towards Person Google: Multimodal Person Search and Retrieval}, BOOKTITLE = {International Conference on Semantics And digital Media Technologies (SAMT'07)}, YEAR = {2007}, MONTH = dec, ADDRESS = {Genoa, Italy}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/1101Goldmann2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1101Goldmann2007.pdf}, ABSTRACT = {Content based multimedia retrieval systems have been proposed to allow for automatic and efficient indexing and retrieval of the increasing amount of audiovisual data (image, video and audio clips). The search for specic persons within this data is an important subtopic due to its large range of applications. This article describes an original system for multimodal person search and provides some initial performance results that demonstrate the efficiency of the system.} } @INPROCEEDINGS{1122Declerck2007, AUTHOR = {Thierry Declerck and Andreas Cobet}, TITLE = {Towards a cross-media Analysis of spatially co-located Image and Text Regions in TV-News}, BOOKTITLE = {Proceedings of the 2nd International Conference on Semantics And digital Media Technologies (SAMT'07)}, YEAR = {2007}, MONTH = dec, PAGES = {188--191}, PDF = {http://elvera.nue.tu-berlin.de/files/1122Declerck2007.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1122Declerck2007.pdf}, ABSTRACT = {We describe in this poster/short paper on-going work on the extraction and semantic interpretation of text regions in television news programmes. 
We present some of the data we consider in this work, the actual technologies in use and where they have to be improved. Finally we briefly discuss a possible innovative and valuable approach to the establishment of a cross-media analysis framework.} } @INPROCEEDINGS{1123Nemrava2007, AUTHOR = {J. Nemrava and P. Buitelaar and N. Simou and D. Sadlier and V. Svatek and T. Declerck and A. Cobet and T. Sikora and N. O'Connor and V. Tzouvaras and H. Zeiner and J. Petrak}, TITLE = {An Architecture for Mining Resources Complementary to Audio-Visual Streams}, BOOKTITLE = {International Workshop on Knowledge Acquisition from Multimedia Content (KAMC '07)}, YEAR = {2007}, MONTH = dec, ABSTRACT = {In this paper we attempt to characterize resources of information complementary to audio-visual (A/V) streams and propose their usage for enriching A/V data with semantic concepts in order to bridge the gap between low-level video detectors and high-level analysis. Our aim is to extract cross-media feature descriptors from semantically enriched and aligned resources so as to detect finer-grained events in video.We introduce an architecture for complementary resource analysis and discuss domain dependency aspects of this approach related to our domain of soccer broadcasts.} } @INPROCEEDINGS{1124Nemrava2007, AUTHOR = {J. Nemrava and P. Buitelaar and T. Declerck and V. Svátek and J. Petrák and A. Cobet and H. Zeiner and D. Sadlier and N. O’Connor}, TITLE = {Enhancing Video Analysis Results using Complementary Textual Resources}, BOOKTITLE = {Proceedings of the 2nd International Conference on Semantics And digital Media Technologies (SAMT'07)}, YEAR = {2007}, MONTH = dec, ABSTRACT = {In this document we deal with different sources of information complementary to audio-visual streams and propose their usage for enriching this data with semantic concepts in order to bridge the gap between low-level video analysis and high-level analysis. 
Our aim is to extract cross-media feature descriptors from semantically enriched and aligned resources so as to detect finer-grained events in video. We introduce an architecture for complementary resources analysis and discuss domain dependency aspects of this approach connected to our initial domain of football broadcasts.} } {1134Krutz2007, } @ARTICLE{1133Izquierdo2007, AUTHOR = {Ebroul Izquierdo and Hyoung Joong Kim and Thomas Sikora}, TITLE = {Knowledge-Assisted Media Analysis for Interactive Multimedia Applications}, JOURNAL = {EURASIP Journal on Advances in Signal Processing}, YEAR = {2007}, MONTH = dec, PAGES = {2}, VOLUME = {Vol. 2007}, NOTE = {Article ID 36404}, DOI = {doi:10.1155/2007/36404}, URL = {http://www.hindawi.com/GetPDF.aspx?doi=10.1155/2007/36404} } @INBOOK{1058Burred2008, AUTHOR = {Juan José Burred and Martin Haller and Shan Jin and Amjad Samour and Thomas Sikora}, TITLE = {Audio Content Analysis}, YEAR = {2008}, BOOKTITLE = {Semantic Multimedia and Ontologies: Theory and Applications}, EDITOR = {Yiannis Kompatsiaris, Paola Hobson}, PUBLISHER = {Springer Verl.}, PAGES = {123--161}, CHAPTER = {Part II: Chapter 5}, EDITION = {1.}, ADDRESS = {London, UK}, NOTE = {ISBN 978-1-84800-075-9}, PDF = {http://elvera.nue.tu-berlin.de/files/1058Burred2008.pdf}, ABSTRACT = {Taking a step-by-step approach, and drawing on the expertise of key researchers in the multimedia and knowledge domains, this book guides the reader through the fundamental enabling technologies of ontologies (for example MPEG-7 and OWL), analysis, context and reasoning, to commercially interesting applications including personalised content summarisation, 3D modelling and management of scientific data. All relevant topics are covered; including ontologies for low level multimedia feature representation, higher level multimedia systems representations, application of multimedia ontologies for visual analysis, and usage of multimedia and knowledge technologies for applications. 
The authors aggregate relevant disciplines including knowledge representation, multidimensional signal processing, logic, artificial intelligence and machine learning to provide a coherent picture of the different strands of research that need to be combined in order to achieve semantic multimedia applications. "Semantic Multimedia and Technologies" will serve as an excellent reference and guide to exploring how knowledge technologies can be exploited in the creation of new multimedia applications, and how these technologies can provide new contexts for the successful use of knowledge technologies.} } @ARTICLE{1251Ide2008, AUTHOR = {Kai Ide and Seung Eun Lee and Young Chun Kim and Dong Kwon Kim and O'Dae Kwon}, TITLE = {LaGuerre–Gaussian Emission Properties of Photonic Quantum Ring Hole-Type Lasers}, JOURNAL = {IEEE Transactions on Nanotechnology}, YEAR = {2008}, MONTH = mar, PAGES = {185--188}, VOLUME = {7}, NUMBER = {2}, DOI = {10.1109/TNANO.2007.908168} } @ARTICLE{1135Pereira2008, AUTHOR = {Fernando Pereira and Anthony Vetro and Thomas Sikora}, TITLE = {Multimedia Retrieval and Delivery: Essential Metadata Challenges and Standards}, JOURNAL = {Proceedings of the IEEE}, YEAR = {2008}, MONTH = apr, PAGES = {721--744}, VOLUME = {96}, NUMBER = {4}, NOTE = {ISSN: 0018-9219}, PDF = {http://elvera.nue.tu-berlin.de/files/1135Pereira2008.pdf}, DOI = {10.1109/JPROC.2008.916384}, URL = {http://elvera.nue.tu-berlin.de/files/1135Pereira2008.pdf}, ABSTRACT = {Multimedia information retrieval (MIR) and delivery plays an important role in many application domains due to the increasing need to identify, filter, and manage growing amounts of data, notably multimedia information. To efficiently manage and exchange multimedia information, interoperability between coded data and metadata is required and standardization is central to achieving the necessary level of interoperability. 
In the context of this paper, the term retrieval refers to the process by which a user, human or machine, identifies the content it needs, and the term delivery refers to the adaptive transport and consumption of the identified content in a particular context or usage environment. Both the retrieval and delivery processes may require content and context metadata. This paper will argue that maximum quality of experience depends not only on the content itself (and thus content metadata) but also on the consumption conditions (thus context metadata). Additionally, the rights and protection conditions have become critically important in recent years, especially with the explosion of electronic music commerce and different “shopping” conditions. This paper will review existing multimedia standards related to information retrieval and adaptive delivery of multimedia content, emphasizing the need for such standards, and will show how these standards can help the development, dissemination, and valorization of MIR research results. Moreover, it will also discuss limitations of the current standards and anticipate what future standardization activities are relevant and needed. 
Due to space limitations, the paper will mainly concentrate on MPEG standards although many other relevant standards are also reviewed and discussed.} } @INPROCEEDINGS{1180Glasberg2008, AUTHOR = {Ronald Glasberg and Sebastian Schmiedeke and Pascal Kelm and Thomas Sikora}, TITLE = {An automatic system for real-time video-genres detection using high-level-descriptors and a set of classifiers}, BOOKTITLE = {12th Annual IEEE International Symposium on Consumer Electronics, ISCE 2008, Algarve, Portugal}, YEAR = {2008}, MONTH = apr, EDITOR = {The International Symposium on Consumer Electronics (ISCE), Antonio Navarro}, PUBLISHER = {IEEE Press}, PAGES = {1--4}, ORGANIZATION = {IEEE}, ADDRESS = {Algarve, Portugal}, NOTE = {oral presentation; eingereicht; ISBN 978-1-4244-2422-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1180Glasberg2008.pdf}, DOI = {10.1109/ISCE.2008.4559449}, URL = {http://elvera.nue.tu-berlin.de/files/1180Glasberg2008.pdf}, ABSTRACT = {We present a new approach for classifying mpeg-2 video sequences as ‘cartoon’, ‘commercial’, ‘music’, ‘news’ or ‘sport’ by analyzing specific, high-level audio-visual features of consecutive frames in real-time. This is part of the well-known video-genre-classification problem, where popular TV-broadcast genres are studied. Such applications have also been discussed in the context of MPEG-7 [1]. In our method the extracted features are logically combined using a set of classifiers to produce a reliable recognition. 
The results demonstrate a high identification rate based on a large representative collection of 100 video sequences (20 sequences per genre) gathered from free digital TV-broadcasting in Europe.} } @INPROCEEDINGS{1145Krutz2008, AUTHOR = {Andreas Krutz and Alexander Glantz and Martin Haller and Michael Droese and Thomas Sikora}, TITLE = {Multiple Background Sprite Generation using Camera Motion Characterization for Object-based Video Coding}, BOOKTITLE = {3DTV Conference 2008, The True Vision Capture, Transmission and Display of 3D Video, May 2008, Istanbul, Turkey}, YEAR = {2008}, MONTH = may, EDITOR = {Tolga K. Capin}, PUBLISHER = {IEEE Press}, PAGES = {313--316}, ORGANIZATION = {3DTV Network of Excellence, Middle East Technical University, Bilkent University}, ADDRESS = {Ankara, Turkey}, NOTE = {ISBN 978-1-4244-1760-5, CD ISBN 978-1-4244-1755-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1145Krutz2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1145Krutz2008.pdf}, ABSTRACT = {Recent work has shown that object-based video coding can provide higher coding gain than common H. 264/AVC for single-view and the MVC standard based on H. 264 for multi-view (MVC). The use of background sprites outperformes the AVC/MVC especially in sequences containing rotating camera motion and moving foreground objects. The coding performance strongly relies on the preprocessing steps, e.g. sprite generation and object segmentation. In this paper, we present an enhanced background sprite generation algorithm for object-based single-and multi-view video coding (OBVC/OBMVC). It is a new feature for our OBVC/OBMVC recently proposed. We produce multiple background sprites based on camera motion characterization and physical camera parameter estimation. 
Experimental results show how these multiple sprites increase the coding performance for single-and multi-view sequences.} } @INPROCEEDINGS{1162Kurutepe2008, AUTHOR = {Engin Kurutepe and Thomas Sikora}, TITLE = {Feasibility of Multi-View Video Streaming over P2P Networks}, BOOKTITLE = {3DTV Conference 2008, The True Vision Capture, Transmission and Display of 3D Video, May 2008, Istanbul, Turkey}, YEAR = {2008}, MONTH = may, EDITOR = {Tolga K. Capin}, PUBLISHER = {IEEE Press}, PAGES = {157--160}, ORGANIZATION = {3DTV Network of Excellence, Middle East Technical University, Bilkent University}, ADDRESS = {Ankara, Turkey}, NOTE = {CD ISBN 978-1-4244-1755-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1162Kurutepe2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1162Kurutepe2008.pdf}, ABSTRACT = {We propose to stream multi-view video over a multi-tree peerto-peer (P2P) network using the NUEPMuT protocol. Each view of the multi-view video is streamed over an independent P2P streaming tree and each peer only contributes upload capacity in a single tree, in order to limit the adverse effects of ungraceful peer departures. 
Additionally, we investigate the feasibility of using the proposed P2P networking architecture, NUEPMuT, for the streaming of multi-view video content with the currently available Internet connection bandwidths.} } @COMMENT{1143Knorr2008: incomplete entry, bibliographic data missing} @PROCEEDINGS{1177Burred2008, TITLE = {LSAS, Second International Workshop on Learning Semantics of Audio Signals, Proceedings, June 21, 2008, IRCAM, Paris, France}, EDITOR = {Juan José Burred and Andreas Nürnberger and Geoffroy Peeters and Sebastian Stober}, YEAR = {2008}, MONTH = jun, PUBLISHER = {IRCAM}, SERIES = {Proceedings}, ADDRESS = {Paris, France}, NOTE = {ISBN: 978-3-9804874-7-4 ; TAGU-//-BURR-08}, ABSTRACT = {The workshop managed to gather a multidisciplinary group of researchers and included presentations covering signal-processing, social, musicological and usability aspects of semantic audio analysis. Topics: mapping between audio features and contextual interpretation, description of music contents, models describing how music is perceived, methods for extraction, analysis and representation of linguistic descriptions of music, audio features and analysis of music structure, personalization.} } @INPROCEEDINGS{1151Kim2008, AUTHOR = {Woong Hee Kim and Thomas Sikora}, TITLE = {Noise Filtering Method for Color Images based on LDA and Nonlinear Diffusion}, BOOKTITLE = {IEEE Int. Conference on Multimedia \& Expo, ICME 2008, Proceedings, Hannover, Germany}, YEAR = {2008}, MONTH = jun, EDITOR = {Jörn Ostermann and Touradj Ebrahimi and Oscar Au}, PUBLISHER = {IEEE Press}, PAGES = {1017--1020}, ORGANIZATION = {IEEE ICME}, ADDRESS = {Hannover, Germany}, NOTE = {ISBN 978-1-4244-2571-6}, PDF = {http://elvera.nue.tu-berlin.de/files/1151Kim2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1151Kim2008.pdf}, ABSTRACT = {The purpose of noise filtering for images is to preserve features such as edge or corners in images, while reducing noise. 
Recent noise filtering algorithms based on diffusion equation shows the satisfactory results to some extent, if the noise is additive Gaussian noise. However, if the noise is not additive Gaussian noise, the filtering result is not satisfactory. In this paper, we propose a noise filtering method for color images based on LDA and nonlinear diffusion, which makes use of a common diffusion control. Experimental results with images degraded by additive Gaussian noise, salt and pepper noise, and multiplicative noise are presented.} } @INPROCEEDINGS{1161Durrieu2008, AUTHOR = {Jean-Louis Durrieu and Jan Weil}, TITLE = {Automatic Beat-synchronous Generation of Music Lead Sheets}, BOOKTITLE = {2nd K-Space PhD Jamboree Workshop 2008, Proceedings, Paris, France}, YEAR = {2008}, MONTH = jul, EDITOR = {Francesca De Simone and Jan Nemrava}, PUBLISHER = {Information Society Technologies}, ORGANIZATION = {TELECOM ParisTech, K-Space}, ADDRESS = {Paris, France}, NOTE = {Jean-Louis Durrieu: TELECOM ParisTech}, PDF = {http://elvera.nue.tu-berlin.de/files/1161Durrieu2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1161Durrieu2008.pdf}, ABSTRACT = {Most of the popular music scores are written in a specific format, the lead sheet format. It sums up a song by representing the notes of the main melody, along with the chord sequence together with other cues such as style, tempo and time signature. This sort of representation is very common in jazz and pop music, where the accompaniment playing the chord sequence usually is improvised. The aim of our study is to bring together two techniques, a chord detection system and a lead melody transcriber, in order to produce a lead sheet. In addition to the respective issues inherent to each problem, we also need to address tempo estimation, time signature estimation, and, based on these estimations, time quantification of both the chord sequence and the melody line. 
We propose a tempo tracker that aligns the beats to the audio, and adapt the chord detection and melody extraction systems so as to take into account this new piece of information. Future works include cover song detection based on lead sheet representation, query-by-similarity applications and so on.} } @INPROCEEDINGS{1181Glasberg2008, AUTHOR = {Ronald Glasberg and Sebastian Schmiedeke and Martin Mocigemba and Thomas Sikora}, TITLE = {Real-Time Approaches for Video-Genre-Classification using new High-Level Descriptors and a Set of Classifiers}, BOOKTITLE = {2008 IEEE International Conference on Semantic Computing}, YEAR = {2008}, MONTH = aug, PUBLISHER = {IEEE}, PAGES = {120--127}, ORGANIZATION = {IEEE}, ADDRESS = {Santa Clara/ USA}, NOTE = {oral presentation; eingereicht; ISBN 978-0-7695-3279-0}, PDF = {http://elvera.nue.tu-berlin.de/files/1181Glasberg2008.pdf}, DOI = {10.1109/ICSC.2008.92}, URL = {http://elvera.nue.tu-berlin.de/files/1181Glasberg2008.pdf}, ABSTRACT = {In this paper we describe in detail the recent publications related to video-genre-classification and present our improved approaches for classifying video sequences in real-time as ‘cartoon’, ‘commercial’, ‘music’, ‘news’ or ‘sport’ by analyzing the content with high-level audio-visual descriptors and classification methods. Such applications have also been discussed in the context of MPEG-7 [1]. 
The results demonstrate identification rates of more than 90% based on a large representative collection of 100 videos gathered from free digital TV and Internet.} } @INPROCEEDINGS{1147Krutz2008, AUTHOR = {Andreas Krutz and Alexander Glantz and Michael Frater and Thomas Sikora}, TITLE = {Local Background Sprite Generation}, BOOKTITLE = {International Workshop on Local and Non-Local Approximation in Image Processing, LNLA 2008, Lausanne, Switzerland}, YEAR = {2008}, MONTH = aug, PUBLISHER = {IEEE, EURASIP}, ORGANIZATION = {IEEE, EURASIP, Tampere International Center for Signal Processing (TICSP), EUSIPCO2008}, ADDRESS = {Lausanne, Switzerland}, PDF = {http://elvera.nue.tu-berlin.de/files/1147Krutz2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1147Krutz2008.pdf}, ABSTRACT = {Background modeling of video sequences can be used in many different applications. For video object segmentation it is often applied in a background subtraction method. When conventional sprites like single or multiple sprites are used a background sequence has to be reconstructed from the model. The double mapping into the coordinate system of the sprite and back can lead to severe distortion of the background model and therefore to erroneous segmentation masks. We present a novel background modeling approach that lessens distortion. These so-called local background sprites are built for every reference frame independently and fit its original size. Experimental results show that this new approach clearly outperforms conventional background sprites in terms of PSNR.} } @INPROCEEDINGS{1140Wegener2008, AUTHOR = {Sebastian Wegener and Martin Haller and Juan José Burred and Thomas Sikora and Slim Essid and Gaël Richard}, TITLE = {On the Robustness of Audio Features for Musical Instrument Classification}, BOOKTITLE = {16th European Signal Processing Conference, EUSIPCO 2008, Proceedings, Lausanne, Switzerland}, YEAR = {2008}, MONTH = aug, EDITOR = {J.-Ph. Thiran, P. Vandergheynst, R. 
Reilly}, PUBLISHER = {EURASIP}, ORGANIZATION = {European Association for Signal Processing (EURASIP), Swiss Federal Institute of Technology Lausanne (EPFL)}, ADDRESS = {Lausanne, Switzerland}, NOTE = {Oral presentation}, PDF = {http://elvera.nue.tu-berlin.de/files/1140Wegener2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1140Wegener2008.pdf}, ABSTRACT = {We examine the robustness of several audio features applied exemplarily to musical instrument classification. For this purpose we study the robustness of 15 MPEG-7 Audio Low- Level Descriptors and 13 further spectral, temporal, and perceptual features against four types of signal modifications: low-pass filtering, coding artifacts, white noise, and reverberation. The robustness of the 120 feature coefficients obtained is evaluated with three different methods: comparison of rankings obtained by feature selection techniques, qualitative evaluation of changes in statistical parameters, and classification experiments using Gaussian Mixture Models(GMMs). These experiments are performed on isolated notes of 14 musical instrument classes.} } @INPROCEEDINGS{1148Krutz2008, AUTHOR = {Andreas Krutz and Alexander Glantz and Thomas Sikora and Paulo Nunes and Fernando Pereira}, TITLE = {Automatic Object Segmentation Algorithms for Sprite Coding using MPEG-4}, BOOKTITLE = {50th International Symposium ELMAR-2008, September, Proceedings, Zadar, Croatia, Vol. 
2 of 2}, YEAR = {2008}, MONTH = sep, EDITOR = {Mislav Grgic, Sonja Grgic}, PUBLISHER = {ELMAR, Croatian Society Electronics in Marine, Zadar}, PAGES = {459--462}, ORGANIZATION = {ELMAR, Zadar}, ADDRESS = {Department of Wireless Communications, Faculty of Electrical Engineering and Computering, University of Zagreb, Croatia}, PDF = {http://elvera.nue.tu-berlin.de/files/1148Krutz2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1148Krutz2008.pdf}, ABSTRACT = {Object-based video coding, as standardized in MPEG-4 Part 2, can result in superior performance in comparison to common hybrid motion-compensated DCT-based approaches. We consider sprite coding which increases significantly the objective as well as the subjective quality of the coded video. The main challenge of this approach is the pre-segmentation of the video and the video content itself. To apply sprite coding, the input video has to be firstly segmented into foreground and background objects. We evaluate automatic object segmentation methods based on global motion estimation and background sprite generation. These algorithms are evaluated using the standardized MPEG-4 Visual Main Profile (sprite coding).} } @INPROCEEDINGS{1154Rodriguez2008, AUTHOR = {Daniel Rodriguez and Lutz Goldmann and Surachai Ongkittikul and Mustafa Karaman and Thomas Sikora}, TITLE = {A System for Personalized Human Computer Interaction}, BOOKTITLE = {50th International Symposium ELMAR-2008, September, Proceedings, Zadar, Croatia, Vol. 
2 of 2}, YEAR = {2008}, MONTH = sep, EDITOR = {Mislav Grgic, Sonja Grgic}, PUBLISHER = {ELMAR, Croatian Society Electronics in Marine, Zadar}, PAGES = {439--442}, ORGANIZATION = {ELMAR, Zadar}, ADDRESS = {Department of Wireless Communications, Faculty of Electrical Engineering and Computering, University of Zagreb, Croatia}, URL = {http://www.elmar-zadar.org/2008/}, ABSTRACT = {This paper describes an advanced user-interface that detects and identifies people from video data, tracks their body and body part movements, and recognizes a set of gesture-based user commands. In this way an exemplar implementation for interacting with an intelligent cash machine is presented. The system combines person identification and gesture recognition for increased performance with accurate skin detection and high gesture recognition rates for a database of 10 users of varying skin tone and clothing color.} } @INPROCEEDINGS{1155Rama2008, AUTHOR = {Antonio Rama and Lutz Goldmann and Francesc Tarres and Thomas Sikora}, TITLE = {More Robust Face Recognition by Considering Occlusion Information}, BOOKTITLE = {8th IEEE International Conference on Automatic Face and Gesture Recognition (FG) 2008, Amsterdam, The Netherlands}, YEAR = {2008}, MONTH = sep, EDITOR = {Jeffrey F. Cohn, Thomas S. Huang, Maja Pantic, Nicu Sebe, Ferdinand Beljaars}, PUBLISHER = {IEEE Press}, ORGANIZATION = {IEEE FG}, ADDRESS = {Amsterdam, Netherlands}, NOTE = {Poster?}, URL = {http://www.fg2008.nl/}, ABSTRACT = {This paper addresses one of the main challenges of face recognition (FR): facial occlusions. Currently, the human brain is the most robust known FR approach towards partially occluded faces. Nevertheless, it is still not clear if humans recognize faces using a holistic or a component-based strategy, or even a combination of both. In this paper, three different approaches based on Principal Component Analysis (PCA) are analyzed. The first one, a holistic approach, is the well-known Eigenface approach. 
The second one, a component-based method, is a variation of the Eigen-features approach, and finally, the third one, a near-holistic method, is an extension of the Lophoscopic Principal Component Analysis (LPCA). So the main contributions of this paper are: The three different strategies are compared and analyzed for identifying partially occluded faces and furthermore it explores how a priori knowledge about present occlusions can be used to improve the recognition performance.} } @ARTICLE{1160Knorr2008, AUTHOR = {Sebastian Knorr and Matthias Kunter and Thomas Sikora}, TITLE = {Stereoscopic 3D from 2D Video with Super-Resolution Capability}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2008}, MONTH = oct, PAGES = {665--676}, VOLUME = {Vol. 23}, NUMBER = {9}, NOTE = {http://dx.doi.org/10.1016/j.image.2008.07.004 ; ISSN: 0923-5965}, PDF = {http://elvera.nue.tu-berlin.de/files/1160Knorr2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1160Knorr2008.pdf}, ABSTRACT = {This paper presents a new approach for generation of super-resolution stereoscopic and multi-view video from monocular video. Such multi-view video is used for instance with multi-user 3D displays or auto-stereoscopic displays with head-tracking to create a depth impression of the observed scenery. Our approach is an extension of the realistic stereo view synthesis (RSVS) approach which is based on structure from motion techniques and image-based rendering to generate the desired stereoscopic views for each point in time. Subjective quality measurements with 25 real and 3 synthetic sequences were carried out to test the performance of RSVS against simple time-shift and depth image-based rendering (DIBR). Our approach heavily enhances the stereoscopic depth perception and gives a more realistic impression of the observed scenery. 
Simulation results applying super-resolution show that the image quality can further be improved by reducing motion blur and compression artifacts.} } @INPROCEEDINGS{1152Goldmann2008, AUTHOR = {Lutz Goldmann and Antonio Rama and Thomas Sikora and Francesc Tarres}, TITLE = {On the Detection and Localization of Facial Occlusions and its Use within Different Scenarios}, BOOKTITLE = {IEEE 10th International Workshop on Multimedia Signal Processing, MMSP 8-10 October 2008, Proceedings, Cairns, Australia}, YEAR = {2008}, MONTH = oct, EDITOR = {David Feng, Thomas Sikora, W. C. Siu, Jian Zhang, Ling Guan, Jean Luc Dugelay, Qiang Wu, Wanqing Li}, PUBLISHER = {IEEE}, PAGES = {592--597}, ORGANIZATION = {IEEE MMSP}, ADDRESS = {Australia}, NOTE = {ISBN 978-1-4244-2295-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1152Goldmann2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1152Goldmann2008.pdf}, ABSTRACT = {In recent years advanced video codecs have been developed, such as standardized in MPEG-4. The latest video codec H.264/AVC provides compression performance superior to previous standards, but is based on the same basic motioncompensated-DCT architecture. However, for certain types of video, it has been shown that it is possible to outperform the H.264/AVC using an object-based video codec. Towards a general-purpose object-based video coding system we present an automated approach to separate a video sequences into subsequences regarding its camera motion type. Then, the subsequences are coded either with an object-based codec or the common H.264/AVC. Applying different video codecs for different kinds of camera motion, we achieve a higher overall coding gain for the video sequence. 
In first experimental evaluations, we demonstrate the excellent performance of this approach on two test sequences.} } @INPROCEEDINGS{1170Krutz2008, AUTHOR = {Andreas Krutz and Sebastian Knorr and Matthias Kunter and Thomas Sikora}, TITLE = {Camera Motion-Constraint Video Codec Selection}, BOOKTITLE = {IEEE 10th International Workshop on Multimedia Signal Processing, MMSP 8-10 October 2008, Proceedings, Cairns, Australia}, YEAR = {2008}, MONTH = oct, EDITOR = {David Feng and Thomas Sikora and W. C. Siu and Jian Zhang and Ling Guan and Jean Luc Dugelay and Qiang Wu and Wanqing Li}, PUBLISHER = {IEEE}, PAGES = {58--63}, ORGANIZATION = {IEEE MMSP}, ADDRESS = {Cairns, Queensland, Australia}, NOTE = {invited paper, Special Session on Global Motion Estimation and Mosaicing for Applications in Video Analysis and Coding ; ISBN 978-1-4244-2295-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1170Krutz2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1170Krutz2008.pdf}, ABSTRACT = {In recent years advanced video codecs have been developed, such as standardized in MPEG-4. The latest video codec H.264/AVC provides compression performance superior to previous standards, but is based on the same basic motion-compensated DCT architecture. However, for certain types of video, it has been shown that it is possible to outperform the H.264/AVC using an object-based video codec. Towards a general-purpose object-based video coding system we present an automated approach to separate a video sequence into subsequences regarding its camera motion type. Then, the subsequences are coded either with an object-based codec or the common H.264/AVC. Applying different video codecs for different kinds of camera motion, we achieve a higher overall coding gain for the video sequence. 
In first experimental evaluations, we demonstrate the excellence performance of this approach on two test sequences.} } @INPROCEEDINGS{1171Farin2008, AUTHOR = {Dirk Farin and Martin Haller and Andreas Krutz and Thomas Sikora}, TITLE = {Recent Developments in Panoramic Image Generation and Sprite Coding}, BOOKTITLE = {Proceedings of the IEEE International Workshop on Multimedia Signal Processing (MMSP 2008)}, YEAR = {2008}, MONTH = oct, EDITOR = {David Feng, Thomas Sikora, W. C. Siu, Jian Zhang, Ling Guan, Jean Luc Dugelay, Qiang Wu, Wanqing Li}, PUBLISHER = {IEEE}, PAGES = {64--69}, ORGANIZATION = {IEEE MMSP}, ADDRESS = {Cairns, Queensland, Australia}, NOTE = {invited paper, Special Session on Global Motion Estimation and Mosaicing for Applications in Video Analysis and Coding ; ISBN 978-1-4244-2294-4}, PDF = {http://elvera.nue.tu-berlin.de/files/1171Farin2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1171Farin2008.pdf}, ABSTRACT = {The composition of panoramic images has recently received considerable attention. While panoramic images were first used mainly as a flexible visualization technique, they also found application in video coding, video enhancement, format conversion, and content analysis. The topic has enlarged and diverged into many specialized research directions, which makes it difficult to stay in touch with recent developments. This paper intends to give an overview of the current state of research, including recent developments. 
Two of the applications of sprite coding and global-motion estimation are presented in more detail to provide some insights into the system aspects.} } @INPROCEEDINGS{1149Kunter2008, AUTHOR = {Matthias Kunter and Philipp Krey and Andreas Krutz and Thomas Sikora}, TITLE = {Extending H.264/AVC with a Background Sprite Prediction Mode}, BOOKTITLE = {15th IEEE International Conference on Image Processing, Proceedings ICIP 2008, October, San Diego, California, USA}, YEAR = {2008}, MONTH = oct, EDITOR = {Rama Chellappa, Bernd Girod, Gang Qian}, PUBLISHER = {IEEE}, PAGES = {2128--2131}, ORGANIZATION = {IEEE Signal Processing Society}, ADDRESS = {United States}, NOTE = {ISBN 978-1-4244-1764-3 ; ISSN 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1149Kunter2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1149Kunter2008.pdf}, ABSTRACT = {The latest standardized hybrid video codec, H.264/AVC, significantly outperforms earlier video coding standards. Despite combining improved and new algorithms within this codec, it is still possible to find methods which lead to a higher coding efficiency. We tackle the prediction problem adding a new prediction mode to the codec. It has been shown that the generation of a background sprite image containing all the background information of a certain sequence is very useful e.g. for object-based video coding. We use a pre-generated background sprite image for creating a new prediction mode in the encoder loop. For the current frame to be compensated, blocks reconstructed from the background sprite are used beside the remaining modes to calculate the residual. The rate-distortion optimization decides which mode is taken. 
Experimental results show the improvement using the new sprite prediction (SP) mode with the considered test sequences.} } @INPROCEEDINGS{1163Kurutepe2008, AUTHOR = {Engin Kurutepe and Thomas Sikora}, TITLE = {Multi-View Video Streaming over P2P Networks With Low Start-Up Delay}, BOOKTITLE = {15th IEEE International Conference on Image Processing, Proceedings ICIP 2008, October, San Diego, California, USA}, YEAR = {2008}, MONTH = oct, EDITOR = {Rama Chellappa, Bernd Girod, Gang Qian}, PUBLISHER = {IEEE}, PAGES = {3088--3091}, ORGANIZATION = {IEEE Signal Processing Society}, ADDRESS = {San Diego, USA}, NOTE = {ISBN 978-1-4244-1764-3 ; ISSN 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1163Kurutepe2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1163Kurutepe2008.pdf}, ABSTRACT = {We propose to stream multi-view video over a multi-tree peerto-peer (P2P) network using the NUEPMuT protocol. Each view of the multi-view video is streamed over an independent P2P streaming tree and each peer only contributes upload capacity in a single tree, in order to limit the adverse effects of ungraceful peer departures. Additionally, we introduce a quick join procedure to reduce the start-up delay for the first data packet after a join request. 
Continuity index and decoded video quality performance for simulcast and MVC encoding in a large topology under different settings are reported, in addition to the improvements achieved by the quick join procedure.} } @INPROCEEDINGS{1153Goldmann2008, AUTHOR = {Lutz Goldmann and Tomasz Adamek and Peter Vajda and Mustafa Karaman and Roland Mörzinger and Eric Galmar and Thomas Sikora and Noel O'Connor and Thien Ha-Minh and Touradj Ebrahimi and Peter Schallauer and Benoit Huet}, TITLE = {Towards Fully Automatic Image Segmentation Evaluation}, BOOKTITLE = {Advanced Concepts for Intelligent Vision Systems, ACIVS, 10th International Conference, Proceedings, October 2008, Juan-les-Pins, France}, YEAR = {2008}, MONTH = oct, EDITOR = {Jacques Blanc-Talon, Salah Bourennane, Wilfried Philips, Dan Popescu, Paul Scheunders}, PUBLISHER = {Berlin, Heidelberg: Springer Verlag}, PAGES = {566--577}, ORGANIZATION = {ACIVS}, NOTE = {ISBN 978-3-540-88457-6 ; TAGU-//-ACIVS-08}, URL = {http://acivs.org/acivs2008/} } @INPROCEEDINGS{1150Dumont2008, AUTHOR = {Emilie Dumont and Bernard Merialdo and Slim Essid and Werner Bailer and Herwig Rehatschek and Daragh Byrne and Hervé Bredin and Noel E. O'Connor and Gareth J.F. Jones and Alan F. 
Smeaton and Martin Haller and Andreas Krutz and Thomas Sikora and Tomas Piatrik}, TITLE = {Rushes Video Summarization using a Collaborative Approach}, BOOKTITLE = {TRECVID BBC Rushes Summarization Workshop (TVS 2008) at ACM Multimedia 2008, Vancouver, Canada}, YEAR = {2008}, MONTH = oct, EDITOR = {ACM, Paul Over, Alan Smeaton, Wessel Kraaij}, PUBLISHER = {National Institute of Standards and Technology (NIST), Washington, DC, USA}, PAGES = {90--94}, ORGANIZATION = {ACM}, ADDRESS = {Vancouver, BC, Canada}, NOTE = {ISBN 978-1-60558-303-7}, PDF = {http://elvera.nue.tu-berlin.de/files/1150Dumont2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1150Dumont2008.pdf}, ABSTRACT = {This paper describes the video summarization system developed by the partners of the K-Space European Network of Excellence for the TRECVID 2008 BBC rushes summarization evaluation. We propose an original method based on individual content segmentation and selection tools in a collaborative system. Our system is organized in several steps. First, we segment the video, secondly we identify relevant and redundant segments, and finally, we select a subset of segments to concatenate and build the final summary with video acceleration incorporated. 
We analyze the performance of our system through the TRECVID evaluation.} } @INPROCEEDINGS{1182Glasberg2008, AUTHOR = {Ronald Glasberg and Sebastian Schmiedeke and Hüseyin Oguz and Pascal Kelm and Thomas Sikora}, TITLE = {Real-Time Detection of Sport in MPEG-2 Sequences using High-Level AV-Descriptors and SVM}, BOOKTITLE = {Third International Conference on Digital Information Management, ICDIM November 13-16, 2008, London, UK}, YEAR = {2008}, MONTH = nov, EDITOR = {IEE, Richard Chbeir}, PUBLISHER = {IEEE}, ORGANIZATION = {IEEE}, ADDRESS = {London, UK}, NOTE = {oral presentation ; eingereicht ; ISBN 978-1-4244-2917-2 ; TACD//-ICDIM-08}, PDF = {http://elvera.nue.tu-berlin.de/files/1182Glasberg2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1182Glasberg2008.pdf}, ABSTRACT = {We present a new approach for classifying mpeg-2 video sequences as ‘sport’ or ‘non-sport’ by analyzing new high-level audiovisual features of consecutive frames in real-time. This is part of the well-known video-genre-classification problem, where popular TV-broadcast genres like cartoon, commercial, music video, news and sports are studied. Such applications have also been discussed in the context of MPEG-7 [1]. In our method the extracted features are logically combined by a support vector machine [2] to produce a reliable detection. The results demonstrate a high identification rate of 98.5% based on a large balanced database of 100 representative video sequences gathered from free digital TV-broadcasting and world wide web.} } @INPROCEEDINGS{1191Wilkins2008, AUTHOR = {P. Wilkins and D. Byrne and Gareth J.F.Jones and H. Lee and G. Keenan and K. McGuinness and N. E. O'Connor and N. O'Hare and A. F. Smeaton and T. Adamek and R. Troncy and A. Amin and R. Benmokhtar and E. Dumont and B. Huet and B. Merialdo and G. Tolias and E. Spyrou and Y. Avrithis and G. Th. Papadopoulous and V. Mezaris and I. Kompatsiaris and R. Mörzinger and P. Schallauer and W. Bailer and K. Chandramouli and E. 
Izquierdo and Lutz Goldmann and Martin Haller and Amjad Samour and Andreas Cobet and Thomas Sikora and P. Praks and D. Hannah and M. Halvey and F. Hopfgartner and R. Villa and P. Punitha and A. Goyal and J. M. Jose}, TITLE = {K-Space at TRECVid 2008}, BOOKTITLE = {Proceedings of the TRECVID Workshop 2008}, YEAR = {2008}, MONTH = nov, ADDRESS = {Gaithersburg, Maryland USA}, NOTE = {Oral Presentation}, PDF = {http://elvera.nue.tu-berlin.de/files/1191Wilkins2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1191Wilkins2008.pdf}, ABSTRACT = {In this paper we describe K-Space’s participation in TRECVid 2008 in the interactive search task. For 2008 the K-Space group performed one of the largest interactive video information retrieval experiments conducted in a laboratory setting. We had three institutions participating in a multi-site multi-system experiment. In total 36 users participated, 12 each from Dublin City University (DCU, Ireland), University of Glasgow (GU, Scotland) and Centrum Wiskunde & Informatica (CWI, the Netherlands). Three user interfaces were developed, two from DCU which were also used in 2007 as well as an interface from GU. All interfaces leveraged the same search service. Using a latin squares arrangement, each user conducted 12 topics, leading in total to 6 runs per site, 18 in total. We officially submitted for evaluation 3 of these runs to NIST with an additional expert run using a 4th system. Our submitted runs performed around the median. In this paper we will present an overview of the search system utilized, the experimental setup and a preliminary analysis of our results.} } @INPROCEEDINGS{1173Dumont2008, AUTHOR = {Emilie Dumont and Bernard Merialdo and Slim Essid and Werner Bailer and Daragh Byrne and Hervé Bredin and Noel E. 
O'Connor and Gareth Jones and Martin Haller and Andreas Krutz and Thomas Sikora and Tomas Piatrik}, TITLE = {A Collaborative Approach to Video Summarization}, BOOKTITLE = {3rd International Conference on Semantic and Digital Media Technologies (SAMT 2008)}, YEAR = {2008}, MONTH = dec, ADDRESS = {Koblenz, Germany}, NOTE = {Poster}, PDF = {http://elvera.nue.tu-berlin.de/files/1173Dumont2008.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1173Dumont2008.pdf} } @INPROCEEDINGS{1175Kunter2009, AUTHOR = {Matthias Kunter and Sebastian Knorr and Andreas Krutz and Thomas Sikora}, TITLE = {Unsupervised object segmentation for 2D to 3D conversion}, BOOKTITLE = {IS&T/SPIE's Electronic Imaging}, YEAR = {2009}, MONTH = jan, ADDRESS = {San Jose, California, USA}, NOTE = {Matthias Kunter, Sebastian Knorr: imcube Media}, PDF = {http://elvera.nue.tu-berlin.de/files/1175Kunter2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1175Kunter2009.pdf} } @INPROCEEDINGS{1192Karaman2009, AUTHOR = {Mustafa Karaman and Lutz Goldmann and Thomas Sikora}, TITLE = {Improving object segmentation by reflection detection and removal}, BOOKTITLE = {Visual Communications and Image Processing (VCIP), IS&T/SPIE's Electronic Imaging 2009}, YEAR = {2009}, MONTH = jan, ADDRESS = {San Jose, CA, USA}, ABSTRACT = {For object analysis in videos such as in video surveillance systems, the preliminary segmentation step is very important. Many segmentation methods using static camera have been proposed in the last decade, but they all suffer in occurrence of object reflection especially on the ground, i.e. reflected regions are also segmented as foregrounds. We present a new method which detects the border between the real object and its reflection. Experiments show that an outstanding improvement of segmentation results is obtained by removing the reflection part of the over-segmented objects.} } @ARTICLE{1190Triantafyllidis2009, AUTHOR = {G. A. Triantafyllidis and A. 
Enis Çetin and Aljoscha Smolic and Levent Onural and Thomas Sikora and John Watson}, TITLE = {3DTV: Capture, Transmission, and Display of 3D Video}, JOURNAL = {EURASIP Journal on Advances in Signal Processing}, YEAR = {2009}, MONTH = jan, PAGES = {2 Seiten, Artikel ID 585216}, VOLUME = {2009}, DOI = {10.1155/2009/585216}, URL = {http://www.hindawi.com/journals/asp/volume-2009/} } @INPROCEEDINGS{1193Lardeur2009, AUTHOR = {Maxime Lardeur and Slim Essid and Gael Richard and Martin Haller and Thomas Sikora}, TITLE = {Incorporating prior knowledge on the digital media creation process into audio classifiers}, BOOKTITLE = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2009)}, YEAR = {2009}, MONTH = apr, PAGES = {1653--1656}, ADDRESS = {Taipei, Taiwan}, NOTE = {ISBN 978-1-4244-2354-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1193Lardeur2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1193Lardeur2009.pdf}, ABSTRACT = {In the process of music content creation, a wide range of typical audio effects such as reverberation, equalization or dynamic compression are very commonly used. Despite the fact that such effects have a clear impact on the audio features, they are rarely taken into account when building an automatic audio classifier. In this paper, it is shown that the incorporation of prior knowledge of the digital media creation chain can clearly improve the robustness of the audio classifiers, which is demonstrated on a task of musical instrument recognition. The proposed system is based on a robust feature selection strategy, on a novel use of the virtual support vector machines technique and a specific equalization used to normalize the signals to be classified. 
The robustness of the proposed system is experimentally evidenced using a rather large and varied sound database.} } @INPROCEEDINGS{1194Krutz2009, AUTHOR = {Andreas Krutz and Alexander Glantz and Thilo Borgmann and Michael Frater and Thomas Sikora}, TITLE = {Motion-Based Object Segmentation using Local Background Sprites}, BOOKTITLE = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2009)}, YEAR = {2009}, MONTH = apr, PUBLISHER = {IEEE}, PAGES = {1221--1224}, ORGANIZATION = {IEEE}, ADDRESS = {Taipei, Taiwan}, NOTE = {ISBN: 978-1-4244-2354-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1194Krutz2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1194Krutz2009.pdf}, ABSTRACT = {It is well known that video material with a static background allows easier segmentation than that with a moving background. One approach to segmentation of sequences with a moving background is to use preprocessing to create a static background, after which conventional background subtraction techniques can be used for segmenting foreground objects. It has been recently shown that global motion estimation and/or background sprite generation techniques are reliable. We propose a new background modeling technique for object segmentation using local background sprite generation. 
Experimental results show the excellent performance of this new method compared to recent algorithms proposed.} } @INPROCEEDINGS{1202Burred2009, AUTHOR = {Juan-Jose Burred and Axel Röbel and Thomas Sikora}, TITLE = {Polyphonic Musical Instrument Recognition Based on a Dynamic Model of the Spectral Envelope}, BOOKTITLE = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2009)}, YEAR = {2009}, MONTH = apr, ADDRESS = {Taipei, Taiwan}, NOTE = {Axel Röbel: IRCAM} } @INPROCEEDINGS{1195Haller2009, AUTHOR = {Martin Haller and Andreas Krutz and Thomas Sikora}, TITLE = {Evaluation of pixel- and motion vector-based Global Motion Estimation for camera motion characterization}, BOOKTITLE = {Proceedings of the International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2009)}, YEAR = {2009}, MONTH = may, PAGES = {49--52}, ADDRESS = {London, UK}, NOTE = {ISBN 978-1-4244-3610-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1195Haller2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1195Haller2009.pdf}, ABSTRACT = {Pixel-based and motion vector-based global motion estimation (GME) techniques are evaluated in this paper with an automatic system for camera motion characterization. First, the GME techniques are compared with a frame-by-frame PSNR measurement using five video sequences. The best motion vector-based GME method is then evaluated together with a common and a simplified pixel-based GME technique for camera motion characterization. For this, selected unedited videos from the TRECVid 2005 BBC rushes corpus are used. We evaluate how the estimation accuracy of global motion parameters affects the results for camera motion characterization in terms of retrieval measures. 
The results for this characterization show that the simplified pixel-based GME technique obtains results that are comparable with the common pixel-based GME method, and outperforms significantly the results of an earlier proposed motion vector-based GME approach.} } @INPROCEEDINGS{1196Arvanitidou2009, AUTHOR = {Marina Georgia Arvanitidou and Alexander Glantz and Andreas Krutz and Thomas Sikora and Marta Mrak and Ahmet Kondoz}, TITLE = {Global motion estimation using variable block sizes and its application to object segmentation}, BOOKTITLE = {Proceedings of the International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2009)}, YEAR = {2009}, MONTH = may, PAGES = {173--176}, ADDRESS = {London, UK}, NOTE = {ISBN 978-1-4244-3610-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1196Arvanitidou2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1196Arvanitidou2009.pdf} } @INPROCEEDINGS{1213Jin2009, AUTHOR = {Shan Jin and Hemant Misra and Thomas Sikora and Joemon Jose}, TITLE = {Automatic Topic Detection Strategy for information retrieval in Spoken Document}, BOOKTITLE = {Wiamis 2009}, YEAR = {2009}, MONTH = may, PDF = {http://elvera.nue.tu-berlin.de/files/1213Jin2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1213Jin2009.pdf}, ABSTRACT = {This paper suggests an alternative solution for the task of spoken document retrieval (SDR). The proposed system runs retrieval on multi-level transcriptions (word and phone) produced by word and phone recognizers respectively, and their outputs are combined. We propose to use latent Dirichlet allocation (LDA) model for capturing the semantic information on word transcription. The LDA model is employed for estimating topic distribution in queries and word transcribed spoken documents, and the matching is performed at the topic level. Acoustic matching between query words and phonetically transcribed spoken documents is performed using phone-based matching algorithm. 
The results of acoustic and topic level matching methods are compared and shown to be complementary.} } @INPROCEEDINGS{1229Kelm2009, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {Feature-Based Video Key Frame Extraction for Low Quality Video}, BOOKTITLE = {Proceedings of the International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS 2009)}, YEAR = {2009}, MONTH = may, PAGES = {25--28}, ADDRESS = {London, UK}, NOTE = {ISBN: 978-1-4244-3609-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1229Kelm2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1229Kelm2009.pdf} } @INPROCEEDINGS{1209Ide2009, AUTHOR = {Kai Ide and Matthias Kunter and Thomas Sikora}, TITLE = {Fast Generation of Cylindrical Panoramic Views From Free-Hand Video Sequences}, BOOKTITLE = {IMMERSCOM 2009}, YEAR = {2009}, MONTH = may, PDF = {http://elvera.nue.tu-berlin.de/files/1209Ide2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1209Ide2009.pdf}, ABSTRACT = {We report on a fast algorithm for the generation of cylindrical panoramic views from hand-held video sequences. Due to its high processing speed the algorithm is suited for hardware implementation into next generation video- and photo cameras. This enables the user to easily create immersive views from simple pan shots of variable quality. The individual processing steps within the algorithm are described in detail. 
Final results of the video to panorama conversion process along with an outlook on how to further improve the method when implemented in consumer grade video- and photo cameras are given at the end of this paper.} } @ARTICLE{1226Rasamimanana2009, AUTHOR = {Nicolas Rasamimanana and Florian Kaiser and Frederic Bevilacqua}, TITLE = {Perspectives on gesture-sound relationships informed from acoustic instrument studies}, JOURNAL = {Organised Sound}, YEAR = {2009}, MONTH = aug, PAGES = {208--216}, VOLUME = {14}, NUMBER = {02}, URL = {http://journals.cambridge.org/action/displayAbstract?fromPage=online&aid=5882420&fulltextType=RA&fileId=S1355771809000314}, ABSTRACT = {We present an experimental study on articulation in bowed strings that provides important elements for a discussion about sound synthesis control. The study focuses on bow acceleration profiles and transient noises, measured for different players for the bowing techniques detach. We found that maximum of these profiles are not synchronous, and temporal shifts are dependent on the bowing techniques. These results allow us to bring out important mechanisms in sound and gesture articulation. In particular, the results reveal a potential shortcoming of mapping strategies using simple frame-by-frame data-stream procedures. 
We propose instead to consider input control data as time functions, and consider gesture co-articulation processes.} } @INPROCEEDINGS{1214Jin2009, AUTHOR = {Shan Jin and Thomas Sikora}, TITLE = {Combining Confusion Networks with probabilistic phone matching for open-vocabulary keyword spotting in spontaneous speech signal}, BOOKTITLE = {17th in a series of conferences organised by the European Association for Signal, Speech, and Image Processing (EUSIPCO 2009)}, YEAR = {2009}, MONTH = aug, ORGANIZATION = {The European Association for Signal Processing (EURASIP)}, ADDRESS = {Glasgow, Scotland}, ABSTRACT = {In this paper, we study several methods for keyword spotting in spontaneous speech signal. Novel method combining probabilistic phone matching (PSM) approach with word confusion networks (WCN) is proposed for open-vocabulary keyword spotting task. This method runs keyword spotting on multi-level transcriptions (WCN and phone-onebest). We propose to use classical string matching for word spotting on WCN. At the same time probabilistic string matching is used for acoustic word spotting on phone-onebest transcription. It is verified that the novel hybrid method outperforms WCN-based and PSM-based approaches in-vocabulary and out-of-vocabulary (OOV) keywords.} } @INPROCEEDINGS{1283Pathan2009, AUTHOR = {Saira Saleem Pathan and Ayoub Al-Hamadi and Tobias Senst and Bernd Michaelis}, TITLE = {Multi-Object Tracking Using Semantic Analysis and Kalman Filter}, BOOKTITLE = {Image and Signal Processing and Analysis (ISPA)}, YEAR = {2009}, MONTH = sep, PAGES = {271--276}, ADDRESS = {Salzburg, Austria}, NOTE = {ISSN: 1845-5921 Print ISBN: 978-953-184-135-1}, URL = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5297722&tag=1}, ABSTRACT = {A generic approach for tracking humans and objects under occlusion using semantic analysis is presented. 
The aim is to exploit knowledge representation schemes, precisely semantic logic where each detected object is represented by a node and the association among the nodes is interpreted as flow paths. Besides, maximum likelihood is computed using our CWHI technique and Bhattacharyya coefficient. These likelihood weights are mapped onto the semantic network to efficiently infer the multiple possibilities of tracking by the manipulation of ``propositional logic'' at a time window. The logical propositions are built by formularizing facts, semantic rules and constraints associated with tracking. Currently, we are able to handle tracking under normal, occlusion, and split conditions. The experimental results show that the proposed approach enables accurate and reliable tracking by resolving the ambiguities of online data association under occlusions.} } @ARTICLE{1216Liebchen2009, AUTHOR = {Tilman Liebchen}, TITLE = {MPEG-4 ALS – The Standard for Lossless Audio Coding}, JOURNAL = {Journal of the Acoustical Society of Korea}, YEAR = {2009}, MONTH = oct, PAGES = {618--629}, VOLUME = {28}, NUMBER = {7}, PDF = {http://elvera.nue.tu-berlin.de/files/1216Liebchen2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1216Liebchen2009.pdf}, ABSTRACT = {The MPEG-4 Audio Lossless Coding (ALS) standard belongs to the family of MPEG-4 audio coding standards. In contrast to lossy codecs such as AAC, which merely strive to preserve the subjective audio quality, lossless coding preserves every single bit of the original audio data. The ALS core codec is based on forward-adaptive linear prediction, which combines remarkable compression with low complexity. Additional features include long-term prediction, multichannel coding, and compression of floating-point audio material. 
This paper describes the basic elements of the ALS codec with a focus on prediction, entropy coding, and related tools and points out the most important applications of this standardized lossless audio format.} } @INPROCEEDINGS{1215Weil2009, AUTHOR = {Jan Weil and Jean-Louis Durrieu and Gaël Richard and Thomas Sikora}, TITLE = {Automatic Generation of Lead Sheets from Polyphonic Music Signals}, BOOKTITLE = {10th International Society for Music Information Retrieval Conference (ISMIR 2009)}, YEAR = {2009}, MONTH = oct, PAGES = {603--608}, ORGANIZATION = {Society for Music Information Retrieval}, ADDRESS = {Kobe, Japan}, NOTE = {Jean-Louis Durrieu, Gaël Richard: Télécom ParisTech}, PDF = {http://elvera.nue.tu-berlin.de/files/1215Weil2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1215Weil2009.pdf}, ABSTRACT = {A lead sheet is a type of music notation which summarizes the content of a song. The usual elements that are reproduced are the melody, the chords, the tempo, the time signature, the style and the lyrics, if any. In this paper we propose a system that aims at transcribing both the melody and the associated chords in a beat-synchronous framework. A beat tracker identifies the pulse positions and thus defines a beat grid on which the chord sequence and the melody notes are mapped. The harmonic changes are used to estimate the time signature and the down beats as well as the key of the piece. 
The different modules perform very well on each of the different tasks, and the lead sheets that were rendered show the potential of the approaches adopted in this paper.} } @INPROCEEDINGS{1210Glantz2009, AUTHOR = {Alexander Glantz and Andreas Krutz and Martin Haller and Thomas Sikora}, TITLE = {Video Coding using Global Motion Temporal Filtering}, BOOKTITLE = {Proceedings of the 16th IEEE International Conference on Image Processing (ICIP 2009)}, YEAR = {2009}, MONTH = nov, PUBLISHER = {IEEE}, PAGES = {1053--1056}, ORGANIZATION = {IEEE Signal Processing Society}, ADDRESS = {Cairo, Egypt}, NOTE = {ISBN: 978-1-4244-5655-0 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1210Glantz2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1210Glantz2009.pdf}, ABSTRACT = {Recent deblocking techniques are based on spatial filtering. We present a new deblocking technique based on temporal filtering of spatially aligned frames. This approach is used in an H.264/AVC coding environment. The algorithm estimates the ideal amount of frames used for temporal filtering at the encoder side. In that way it is assured that the receiver is presented with the best possible visual quality in terms of structural similarity. Theoretical consideration of the problem proves the concept of the new approach. 
Experimental evaluation shows that the new temporal deblocking filter significantly improves visual quality and reduces bit rate compared to common H.264/AVC deblocking by up to 18%.} } @INPROCEEDINGS{1211Krutz2009, AUTHOR = {Andreas Krutz and Alexander Glantz and Michael Frater and Thomas Sikora}, TITLE = {Rate-Distortion Optimization for Automatic Sprite Video Coding using H.264/AVC}, BOOKTITLE = {Proceedings of the 16th IEEE International Conference on Image Processing (ICIP 2009)}, YEAR = {2009}, MONTH = nov, PUBLISHER = {IEEE}, PAGES = {2297--2300}, ORGANIZATION = {IEEE Signal Processing Society}, ADDRESS = {Cairo, Egypt}, NOTE = {ISBN: 978-1-4244-5655-0 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1211Krutz2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1211Krutz2009.pdf} } {1228Glantz2009, } {1230Ide2009, } @INPROCEEDINGS{1231Ide2009, AUTHOR = {Kai Ide and Steffen Siering and Thomas Sikora}, TITLE = {Automating Multi-Camera Self-Calibration}, BOOKTITLE = {IEEE Winter Vision Meetings, WACV}, YEAR = {2009}, MONTH = dec, ORGANIZATION = {IEEE}, ADDRESS = {Snowbird, Utah}, PDF = {http://elvera.nue.tu-berlin.de/files/1231Ide2009.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1231Ide2009.pdf} } @INPROCEEDINGS{1241Ramzan2010, AUTHOR = {Naeem Ramzan and Martha Larson and Frédéric Dufaux and Kai Clüver}, TITLE = {The Participation Payoff: Challenges and Opportunities for Multimedia Access in Networked Communities}, BOOKTITLE = {11th ACM SIGMM Int. Conf. on Multimedia Information Retrieval (MIR 2010)}, YEAR = {2010}, MONTH = mar, ADDRESS = {Philadelphia PA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1241Ramzan2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1241Ramzan2010.pdf}, ABSTRACT = {Increasingly, multimedia collections are associated with networked communities consisting of interconnected groups of users who create, annotate, browse, search, share, view, critique and remix collection content. 
Information arises within networked communities via connections among users and in the course of interactions between users and content. Community-derived information can be exploited to improve user access to multimedia. This paper provides a survey of techniques that make use of a combination of three information sources: community-contributed information (e.g., tags and ratings), network structure and techniques for multimedia content analysis. This triple synergy offers a wide range of opportunities for improving access to multimedia in networked communities. We focus our survey on three areas important for multimedia access: annotation, distribution and retrieval. The picture that emerges is promising: information derived from the social community is remarkably effective in improving access to multimedia content, and participation in networked communities has a high payoff.} } @INPROCEEDINGS{1235Esche2010, AUTHOR = {Marko Esche and Mustafa Karaman and Thomas Sikora}, TITLE = {Semi-Automatic Object Tracking in Video Sequences by Extension of the MRSST Algorithm}, BOOKTITLE = {International Workshop on Image Analysis for Multimedia Active Services (WIAMIS)}, YEAR = {2010}, MONTH = apr, ADDRESS = {Desenzano del Garda, Italy}, PDF = {http://elvera.nue.tu-berlin.de/files/1235Esche2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1235Esche2010.pdf}, ABSTRACT = {The objective of this work is to investigate a new approach for object segmentation in videos. While some amount of user interaction is still necessary for most algorithms in this field, these can be reduced making use of certain properties of graph-based image segmentation algorithms. Based on one of these algorithms a framework is proposed, that tracks individual foreground objects through arbitrary video sequences and partly automates the necessary corrections required from the user. 
Experimental results suggest, that the proposed algorithm performs well on both low- and high-resolution video sequences and can even cope with motion blur.} } @INPROCEEDINGS{1236Senst2010, AUTHOR = {Tobias Senst and Rubén Heras Evangelio and Volker Eiselein and Michael Pätzold and Thomas Sikora}, TITLE = {TOWARDS DETECTING PEOPLE CARRYING OBJECTS: A Periodicity Dependency Pattern Approach}, BOOKTITLE = {International Conference on Computer Vision Theory and Applications (VISAPP)}, YEAR = {2010}, MONTH = may, EDITOR = {INSTICC Press}, PAGES = {524--529}, ADDRESS = {Angers, France}, NOTE = {ISBN:978-989-674-029-0}, PDF = {http://elvera.nue.tu-berlin.de/files/1236Senst2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1236Senst2010.pdf}, ABSTRACT = {Detecting people carrying objects is a commonly formulated problem which results can be used as a first step in order to monitor interactions between people and objects in computer vision applications. In this paper we propose a novel method for this task. By using gray-value information instead of the contours obtained by a segmentation process we build up a system that is robust against segmentation errors. 
Experimental results show the validity of the method.} } {1239Knorr2010, } @INPROCEEDINGS{1232Krutz2010, AUTHOR = {Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {Background Modeling for Video Coding: From Sprites to Global Motion Temporal Filtering}, BOOKTITLE = {Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS 2010)}, YEAR = {2010}, MONTH = may, PUBLISHER = {IEEE}, PAGES = {2179--2182}, ORGANIZATION = {IEEE Circuits and Systems Society}, ADDRESS = {Paris, France}, NOTE = {ISBN: 978-1-4244-5309-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1232Krutz2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1232Krutz2010.pdf} } @INPROCEEDINGS{1250Ide2010, AUTHOR = {Kai Ide and Thomas Sikora}, TITLE = {Adaptive Parallax For 3D Television}, BOOKTITLE = {3DTV Conference 2010, Capture, Transmission and Display of 3D Video}, YEAR = {2010}, MONTH = jun, EDITOR = {IEEE}, ADDRESS = {Tampere, Finland}, NOTE = {ISBN: 978-1-4244-6378-7} } @INPROCEEDINGS{1240Senst2010, AUTHOR = {Tobias Senst and Volker Eiselein and Thomas Sikora}, TITLE = {II-LK-A Real-Time Implementation for sparse Optical Flow}, BOOKTITLE = {International Conference on Image Analysis and Recognition (ICIAR)}, YEAR = {2010}, MONTH = jun, EDITOR = {A. Campilho and M. Kamel (Eds.)}, PUBLISHER = {Springer Verlag}, PAGES = {240--249}, ADDRESS = {Povoa de Varzim, Portugal}, NOTE = {ISBN: 978-3-642-13771-6}, DOI = {10.1007/978-3-642-13772-3_25}, ABSTRACT = {In this paper we present an approach to speed up the computation of sparse optical flow fields by means of integral images and provide implementation details. Proposing a modification of the Lucas-Kanade energy Functional allows us to use integral images and thus to speed up the method notably while affecting only slightly the quality of the computed optical flow. 
The approach is combined with an efficient scanline algorithm to reduce the computation of integral images to those areas where there are features to be tracked. The proposed method can speed up current surveillance algorithms used for scene description and crowd analysis.} } @INPROCEEDINGS{1246Kaiser2010, AUTHOR = {Florian Kaiser and Thomas Sikora}, TITLE = {Music Structure Discovery in Popular Music using Non-negative Matrix Factorization}, BOOKTITLE = {11th International Society for Music Information Retrieval Conference (ISMIR 2010)}, YEAR = {2010}, MONTH = aug, ORGANIZATION = {Society for Music Information Retrieval}, ADDRESS = {Utrecht, Netherlands}, NOTE = {ISBN 978-90-393-53813}, URL = {http://ismir2010.ismir.net/proceedings/ismir2010-73.pdf} } @INPROCEEDINGS{1257Pätzold2010, AUTHOR = {Michael Pätzold and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Counting people in crowded environments by fusion of shape and motion information}, BOOKTITLE = {Proceedings of the IEEE International Conference on Advanced Video and Signal Based Surveillance, PETS 2010 Workshop}, YEAR = {2010}, MONTH = aug, EDITOR = {IEEE Computer Society}, PAGES = {157--164}, ORGANIZATION = {IEEE}, ADDRESS = {Boston, USA}, NOTE = {ISBN: 978-0-7695-4264-5}, DOI = {10.1109/AVSS.2010.92}, ABSTRACT = {Knowing the number of people in a crowded scene is of big interest in the surveillance scene. In the past, this problem has been tackled mostly in an indirect, statistical way. This paper presents a direct, counting by detection, method based on fusing spatial information received from an adapted Histogram of Oriented Gradients algorithm (HOG) with temporal information by exploiting distinctive motion characteristics of different human body parts. For that purpose, this paper defines a measure for uniformity of motion. Furthermore, the system performance is enhanced by validating the resulting human hypotheses by tracking and applying a coherent motion detection. 
The approach is illustrated with an experimental evaluation.} } @ARTICLE{1227Glantz2010, AUTHOR = {Alexander Glantz and Andreas Krutz and Thomas Sikora and Paulo Nunes and Fernando Pereira}, TITLE = {Automatic MPEG-4 Sprite Coding - Comparison of Integrated Object Segmentation Algorithms}, JOURNAL = {Multimedia Tools and Applications, Special Issue on "Advances in Image and Video Processing Techniques"}, YEAR = {2010}, MONTH = sep, PAGES = {483--512}, VOLUME = {49}, NUMBER = {3}, NOTE = {ISSN: 1380-7501 (Print) ISSN: 1573-7721 (Online)}, PDF = {http://elvera.nue.tu-berlin.de/files/1227Glantz2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1227Glantz2010.pdf}, ABSTRACT = {Sprite coding, as standardized in MPEG-4 Visual, can result in superior performance compared to common hybrid video codecs. We consider sprite coding, which significantly increases the objective as well as the subjective quality of coded video content. The main challenge of this approach is the segmentation of the fore- ground objects in a preprocessing step. We evaluate automatic object segmentation methods based on global motion estimation and background sprite generation. 
The objects are coded using the MPEG-4 Visual Main Profile and compared with the Advanced Simple Profile.} } @INPROCEEDINGS{1243Haller2010, AUTHOR = {Martin Haller and Andreas Krutz and Thomas Sikora}, TITLE = {Robust Global Motion Estimation using Motion Vectors of Variable Size Blocks and Automatic Motion Model Selection}, BOOKTITLE = {Proceedings of the 17th IEEE International Conference on Image Processing (ICIP 2010)}, YEAR = {2010}, MONTH = sep, ADDRESS = {Hong Kong}, NOTE = {ISBN: 978-1-4244-7993-1 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1243Haller2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1243Haller2010.pdf} } @INPROCEEDINGS{1244Glantz2010, AUTHOR = {Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Global Motion Temporal Filtering for In-loop Deblocking}, BOOKTITLE = {Proceedings of the 17th IEEE International Conference on Image Processing (ICIP 2010)}, YEAR = {2010}, MONTH = sep, ADDRESS = {Hong Kong}, NOTE = {ISBN: 978-1-4244-7993-1 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1244Glantz2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1244Glantz2010.pdf} } @INPROCEEDINGS{1245Tok2010, AUTHOR = {Michael Tok and Alexander Glantz and Marina Georgia Arvanitidou and Andreas Krutz and Thomas Sikora}, TITLE = {Compressed Domain Global Motion Estimation using the Helmholtz Tradeoff Estimator}, BOOKTITLE = {Proceedings of the 17th IEEE International Conference on Image Processing (ICIP 2010)}, YEAR = {2010}, MONTH = sep, ADDRESS = {Hong Kong}, NOTE = {ISBN: 978-1-4244-7993-1 ISSN: 1522-4880}, PDF = {http://elvera.nue.tu-berlin.de/files/1245Tok2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1245Tok2010.pdf}, ABSTRACT = {Several algorithms for global motion estimation in video sequences using pixel- or block-based approaches have been published. 
Most known pixel-based methods lack in performance while when using block-based algorithms working on motion vectors, robustness to outliers and accuracy is missing. In this paper we present the fundamentals of a significantly improved, robust block-based method for global motion estimation in compressed domain following the generic Helmholtz principle. To this aim, we use motion vector fields as provided by MPEG data streams. Background PSNR values for four motion compensated test sequences show that our new method delivers results comparable to more complex algorithms.} } @INBOOK{1269Anstädt2010, AUTHOR = {Torsten Anstädt and Ivo Keller and Harald Lutz}, TITLE = {Intelligente Videoanalyse}, YEAR = {2010}, BOOKTITLE = {Intelligente Videoanalyse Handbuch}, PUBLISHER = {Wiley-Vch Verl. GmbH}, PAGES = {151}, EDITION = {1.}, NOTE = {978-3-527-40976-1} } @INPROCEEDINGS{1445Kelm2010, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {Video2GPS: Geotagging using collaborative systems, textual and visual features}, BOOKTITLE = {Video2GPS: Geotagging using collaborative systems, textual and visual features}, YEAR = {2010}, MONTH = oct, PDF = {http://elvera.nue.tu-berlin.de/files/1445Kelm2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1445Kelm2010.pdf} } @INPROCEEDINGS{1284Lee2010, AUTHOR = {Jong-Seok Lee and Francesca De Simone and Naeem Ramzan and Zhijie Zhao and Engin Kurutepe and Thomas Sikora and Jörn Ostermann and Ebroul Izquierdo and Touradj Ebrahimi}, TITLE = {Subjective Evaluation of Scalable Video Coding for Content Distribution}, BOOKTITLE = {ACM Multimedia 2010}, YEAR = {2010}, MONTH = oct, EDITOR = {ACM}, ORGANIZATION = {ACM}, PDF = {http://elvera.nue.tu-berlin.de/files/1284Lee2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1284Lee2010.pdf}, ABSTRACT = {This paper investigates the influence of the combination of the scalability parameters in scalable video coding (SVC) schemes on the subjective visual quality. 
We aim at providing guidelines for an adaptation strategy of SVC that can select the optimal scalability options for resource-constrained networks. Extensive subjective tests are conducted by using two different scalable video codecs and high definition contents. The results are analyzed with respect to five dimensions, namely, codec, content, spatial resolution, temporal resolution, and frame quality.} } @INPROCEEDINGS{1282Pätzold2010, AUTHOR = {Michael Pätzold and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Counting People in Crowded Environments: An Overview}, BOOKTITLE = {Hands-on Image Processing 2010 (HOIP10). Security, Surveillance and Identification in Everyday Life}, YEAR = {2010}, MONTH = nov, ORGANIZATION = {TECNALIA}, ADDRESS = {TECNALIA. Technology Park building 202; 48170 Zamudio (Bizkaia)}, NOTE = {invited paper}, URL = {http://es.slideshare.net/TECNALIA/hoip10-articulo-1702univberlin}, ABSTRACT = {Counting the number of persons in a crowded scene is of big interest in many applications. Most of the proposed approaches in the literature tackle the task of counting people in an indirect, statistical way. Recently, we presented a direct, counting-by-detection method based on fusing shape information obtained from an adapted Histogram of Oriented Gradients algorithm (HOG) with temporal information. The use of temporal information reduces false positives by considering the characteristics of motion of different human body parts. A subsequent tracking and coherent motion detection of the human hypotheses enhance the performance of this system additionally. The performance obtained by this system is comparable to state-of-the-art systems while allowing not only counting people but also providing valuable information for a tracking approach. 
In this paper we present an overview of relevant state-of-the-art methods for counting people in crowded environments, paying special attention to the method proposed by our group and showing results based on standard video sequences.} } @INPROCEEDINGS{1280Sikora2010, AUTHOR = {Florian Kaiser and Thomas Sikora}, TITLE = {Détection de structure en musique par décomposition non-négative des matrices de similarités}, BOOKTITLE = {Journées Jeunes Chercheurs en Audition, Acoustique musicale et Signal audio (JJCAAS)}, YEAR = {2010}, MONTH = nov, ADDRESS = {IRCAM - Paris}, NOTE = {Workshop on Acoustics and Audio Signal Processing} } @INPROCEEDINGS{1258Glantz2010, AUTHOR = {Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Adaptive Global Motion Temporal Prediction for Video Coding}, BOOKTITLE = {Proceedings of the 28th IEEE Picture Coding Symposium (PCS 2010)}, YEAR = {2010}, MONTH = dec, ADDRESS = {Nagoya, Japan}, NOTE = {ISBN: 978-1-4244-7135-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1258Glantz2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1258Glantz2010.pdf} } @INPROCEEDINGS{1259Krutz2010, AUTHOR = {Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {Recent Advances in Video Coding using Static Background Models}, BOOKTITLE = {Proceedings of the 28th IEEE Picture Coding Symposium (PCS 2010)}, YEAR = {2010}, MONTH = dec, ADDRESS = {Nagoya, Japan}, NOTE = {ISBN: 978-1-4244-7135-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1259Krutz2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1259Krutz2010.pdf} } @INPROCEEDINGS{1260Esche2010, AUTHOR = {Marko Esche and Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {A Novel In-loop Filter for Video-Compression based on Temporal Pixel Trajectories}, BOOKTITLE = {Proceedings of the 28th IEEE Picture Coding Symposium (PCS 2010)}, YEAR = {2010}, MONTH = dec, ADDRESS = {Nagoya, Japan}, NOTE = {ISBN: 978-1-4244-7135-5}, PDF = 
{http://elvera.nue.tu-berlin.de/files/1260Esche2010.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1260Esche2010.pdf} } @BOOK{1287Schallauer2011, AUTHOR = {Peter Schallauer and Werner Bailer and Raphael Troncy and Florian Kaiser}, TITLE = {Multimedia Metadata Standards}, YEAR = {2011}, BOOKTITLE = {Multimedia Semantics: Metadata, Analysis and Interaction}, EDITOR = {Raphael Troncy, Benoir Huet and Simon Schenk}, PUBLISHER = {John Wiley and Sons} } @INPROCEEDINGS{1277Senst2011, AUTHOR = {Tobias Senst and Volker Eiselein and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Robust Modified L2 Local Optical Flow Estimation and Feature Tracking}, BOOKTITLE = {IEEE Workshop on Motion and Video Computing (WMVC)}, YEAR = {2011}, MONTH = jan, EDITOR = {Eric Mortensen}, PAGES = {685--690}, ADDRESS = {Kona, USA}, NOTE = {IEEE Catalog Number: CFP11082-CDR ISBN: 978-1-4244-9495-8 DOI: 10.1109/WACV.2011.5711571}, PDF = {http://elvera.nue.tu-berlin.de/files/1277Senst2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1277Senst2011.pdf}, ABSTRACT = {This paper describes a robust method for the local optical flow estimation and the KLT feature tracking performed on the GPU. Therefore we present an estimator based on the L2 norm with robust characteristics. In order to increase the robustness at discontinuities we propose a strategy to adapt the used region size. The GPU implementation of our approach achieves real-time (>25fps) performance for High Definition (HD) video sequences while tracking several thousands of points. 
The benefit of the suggested enhancement is illustrated on the Middlebury optical flow benchmark.} } @INPROCEEDINGS{1278Senst2011, AUTHOR = {Tobias Senst and Ruben Heras Evangelio and Thomas Sikora}, TITLE = {Detecting People Carrying Objects based on an Optical Flow Motion Model}, BOOKTITLE = {IEEE Workshop on Applications of Computer Vision (WACV)}, YEAR = {2011}, MONTH = jan, EDITOR = {Eric Mortensen}, PAGES = {301--306}, ADDRESS = {Kona, USA}, NOTE = {IEEE Catalog Number: CFP11082-CDR ISBN: 978-1-4244-9495-8 DOI:10.1109/WACV.2011.5711518}, PDF = {http://elvera.nue.tu-berlin.de/files/1278Senst2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1278Senst2011.pdf}, ABSTRACT = {Detecting people carrying objects is a commonly formulated problem as a first step to monitor interactions between people and objects. Recent work relies on a precise foreground object segmentation, which is often difficult to achieve in video surveillance sequences due to a bad contrast of the foreground objects with the scene background, abrupt changing light conditions and small camera vibrations. In order to cope with these difficulties we propose an approach based on motion statistics. Therefore we use a Gaussian mixture motion model (GMMM) and, based on that model, we define a novel speed and direction independent motion descriptor in order to detect carried baggage as those regions not fitting in the motion description model of an average walking person. 
The system was tested with the public dataset PETS2006 and a more challenging dataset including abrupt lighting changes and bad color contrast and compared with existing systems, showing very promising results.} } @INPROCEEDINGS{1279Evangelio2011, AUTHOR = {Rubén Heras Evangelio and Tobias Senst and Thomas Sikora}, TITLE = {Detection of Static Objects for the Task of Video Surveillance}, BOOKTITLE = {IEEE Workshop on Applications of Computer Vision (WACV)}, YEAR = {2011}, MONTH = jan, EDITOR = {IEEE Computer Society}, PAGES = {27--32}, ORGANIZATION = {IEEE}, ADDRESS = {Kona, USA}, DOI = {10.1109/WACV.2011.5711550}, ABSTRACT = {Detecting static objects in video sequences has a high relevance in many surveillance scenarios like airports and railway stations. In this paper we propose a system for the detection of static objects in crowded scenes that, based on the detection of two background models learning at different rates, classifies pixels with the help of a finite-state machine. The background is modelled by two mixtures of Gaussians with identical parameters except for the learning rate. The state machine provides the meaning for the interpretation of the results obtained from background subtraction and can be used to incorporate additional information cues, obtaining thus a flexible system specially suitable for real-life applications. 
The system was built in our surveillance application and successfully validated with several public datasets.} } @INPROCEEDINGS{1281Evangelio2011, AUTHOR = {Rubén Heras Evangelio and Michael Pätzold and Thomas Sikora}, TITLE = {A System for Automatic and Interactive Detection of Static Objects}, BOOKTITLE = {IEEE Workshop on Applications of Computer Vision (POV)}, YEAR = {2011}, MONTH = jan, EDITOR = {IEEE Computer Society}, PAGES = {534--540}, ORGANIZATION = {IEEE}, ADDRESS = {Kona, USA}, NOTE = {IEEE Catalog Number: CFP11082-CDR ISBN: 978-1-4244-9495-8}, DOI = {10.1109/POV.2011.5712365}, ABSTRACT = {Designing static object detection systems that are able to incorporate user interaction conveys a great benefit in many surveillance applications, since some correctly detected static objects can be considered to have no interest by a human operator. Interactive systems allow the user to include these decisions into the system, making automated surveillance systems more attractive and comfortable to use. In this paper we present a system for the detection of static objects that, based on the detection of a dual background model, classifies pixels by means of a finite-state machine. The state machine provides the meaning for the interpretation of the results obtained from background subtraction and it can be optionally used to integrate user input. The system can thus be used both in an automatic and an interactive manner without requiring any expert knowledge from the user. We successfully validated the system with several public datasets.} } @INPROCEEDINGS{1296Kaiser2011, AUTHOR = {Florian Kaiser}, TITLE = {Audio Signal Representations for Temporal Structure Segmentation}, BOOKTITLE = {Dagstuhl Seminar on Multimodal Music Processing}, YEAR = {2011}, MONTH = jan, URL = {http://drops.dagstuhl.de/opus/volltexte/2011/3145/} } @INPROCEEDINGS{1301Sikora2011, AUTHOR = {Prof. Dr.-Ing. 
Thomas Sikora}, TITLE = {Social Media - A Signal Processing Perspective}, BOOKTITLE = {The Eighth IASTED International Conference on Signal Processing, Pattern Recognition and Applications SPPRA 2011}, YEAR = {2011}, MONTH = feb, EDITOR = {IASTED}, ADDRESS = {Innsbruck, Austria}, NOTE = {invited paper, Key-Note Vortrag} } @ARTICLE{1297Evangelio2011, AUTHOR = {Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Static Object Detection Based on a Dual Background Model and a Finite-State Machine}, JOURNAL = {EURASIP Journal on Image and Video Processing}, YEAR = {2011}, MONTH = mar, PAGES = {11 pages}, VOLUME = {Vol. 2011}, NOTE = {Article ID 858502, doi:10.1155/2011/858502}, PDF = {http://elvera.nue.tu-berlin.de/files/1297Evangelio2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1297Evangelio2011.pdf}, ABSTRACT = {Detecting static objects in video sequences has a high relevance in many surveillance applications, such as the detection of abandoned objects in public areas. In this paper, we present a system for the detection of static objects in crowded scenes. Based on the detection of two background models learning at different rates, pixels are classified with the help of a finite-state machine. The background is modelled by two mixtures of Gaussians with identical parameters except for the learning rate. The state machine provides the meaning for the interpretation of the results obtained from background subtraction; it can be implemented as a lookup table with negligible computational cost and it can be easily extended. Due to the definition of the states in the state machine, the system can be used either full automatically or inter-actively, making it extremely suitable for real-life surveillance applications. 
The system was successfully validated with several public datasets.} } @COMMENT{1305Knorr2011: incomplete entry (no entry type or fields); needs re-export} @ARTICLE{1307Zhang2011, AUTHOR = {Liang Zhang and Carlos Vázquez and Sebastian Knorr}, TITLE = {3D-TV Content Creation: Automatic 2D-to-3D Video Conversion}, JOURNAL = {IEEE Transactions on Broadcasting, Special Issue on 3D-TV}, YEAR = {2011}, MONTH = mar, PAGES = {372--383}, VOLUME = {57}, NUMBER = {2}, NOTE = {Received the Scott Helt Memorial Award for the best paper published in the IEEE Transactions on Broadcasting in 2011.}, PDF = {http://elvera.nue.tu-berlin.de/files/1307Zhang2011.pdf}, DOI = {10.1109/TBC.2011.2122930}, URL = {http://elvera.nue.tu-berlin.de/files/1307Zhang2011.pdf}, ABSTRACT = {Three-dimensional television (3D-TV) is the next major revolution in television. A successful rollout of 3D-TV will require a backward-compatible transmission/distribution system, inexpensive 3D displays, and an adequate supply of high-quality 3D program material. With respect to the last factor, the conversion of 2D images/videos to 3D will play an important role. This paper provides an overview of automatic 2D-to-3D video conversion with a specific look at a number of approaches for both the extraction of depth information from monoscopic images and the generation of stereoscopic images. 
Some challenging issues for the success of automatic 2D-to-3D video conversion are pointed out as possible research topics for the future.} } @ARTICLE{1308Smolic2011, AUTHOR = {Aljoscha Smolic and Peter Kauff and Sebastian Knorr and Alexander Hornung and Matthias Kunter and Marcus Müller and Manuel Lang}, TITLE = {Three-Dimensional Video Postproduction and Processing}, JOURNAL = {Proceedings of the IEEE}, YEAR = {2011}, MONTH = apr, PAGES = {607--625}, VOLUME = {99}, NUMBER = {4}, PDF = {http://elvera.nue.tu-berlin.de/files/1308Smolic2011.pdf}, DOI = {10.1109/JPROC.2010.2098350}, URL = {http://elvera.nue.tu-berlin.de/files/1308Smolic2011.pdf}, ABSTRACT = {This paper gives an overview of the state-of-the-art in 3-D video postproduction and processing as well as an outlook to remaining challenges and opportunities. First, fundamentals of stereography are outlined that set the rules for proper 3-D content creation. Manipulation of the depth composition of a given stereo pair via view synthesis is identified as the key functionality in this context. Basic algorithms are described to adapt and correct fundamental stereo properties such as geometric distortions, color alignment, and stereo geometry. Then, depth image-based rendering is explained as the widely applied solution for view synthesis in 3-D content creation today. Recent improvements of depth estimation already provide very good results. However, in most cases, still interactive workflows dominate. Warping-based methods may become an alternative for some applications in the future, which do not rely on dense and accurate depth estimation. Finally, 2-D to 3-D conversion is covered, which is an important special area for reuse of existing legacy 2-D content in 3-D. 
Here various advanced algorithms are combined in interactive workflows.} } @INPROCEEDINGS{1293Kelm2011, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {Multi-modal, Multi-resource Methods for Placing Flickr Videos on the Map}, BOOKTITLE = {ACM International Conference on Multimedia Retrieval (ICMR)}, YEAR = {2011}, MONTH = apr, PAGES = {8}, DOI = {10.1145/1991996.1992048}, URL = {http://dl.acm.org/ft_gateway.cfm?id=1992048&ftid=980972&dwn=1&CFID=57606239&CFTOKEN=98200645}, ABSTRACT = {We present three approaches for placing videos in Flickr on the world map. The toponym extraction and geo lookup ap- proach makes use of external resources to identify toponyms in the metadata and associate them with geo-coordinates. The metadata-based region model approach uses a k-nearest- neighbour classifier trained over geographical regions. Videos are represented using their metadata in a text space with re- duced dimensionality. The visual region model approach uses a support vector machine also trained over geographical re- gions. Videos are represented using low-level feature vectors from multiple key frames. Voting methods are used to form a single decision for each video. We compare the approaches experimentally, highlighting the importance of using appro- priate metadata features and suitable regions as the basis of the region model. 
The best performance is achieved by the geo-lookup approach used with fallback to the visual region model when the video metadata contains no toponym.} } @COMMENT{1306Knorr2011: incomplete entry (no entry type or fields); needs re-export} @INPROCEEDINGS{1286Tok2011, AUTHOR = {Michael Tok and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Feature-Based Global Motion Estimation Using the Helmholtz Principle}, BOOKTITLE = {Proceedings of the IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP 2011)}, YEAR = {2011}, MONTH = may, ADDRESS = {Prague, Czech Republic}, NOTE = {ISSN: 1520-6149 E-ISBN: 978-1-4577-0537-3 Print ISBN: 978-1-4577-0538-0}, PDF = {http://elvera.nue.tu-berlin.de/files/1286Tok2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1286Tok2011.pdf}, ABSTRACT = {Global motion estimation is an important task for various video processing techniques. The estimation itself has to be robust in presence of arbitrarily moving foreground objects. For that task, two different kinds of estimation methods exist. On the one hand, pixel-based approaches deliver more precise results and work more robust on video sequences with foreground objects. On the other hand, when working on encoded video streams, block-based methods can be used for a much faster but often less precise estimation. We propose a two step estimation method based on the determination and tracking of feature points of video frames and robust motion model estimation using the Helmholtz principle. Therefore, good trackable features are detected and tracked in video sequences. 
Subsequently, a perspective motion model is derived from the resulting correspondencies by removing feature pairs not belonging to global motion.} } @INPROCEEDINGS{1294Kaiser2011, AUTHOR = {Florian Kaiser and Marina Georgia Arvanitidou and Thomas Sikora}, TITLE = {Audio Similarity Matrices Enhancement in an Image Processing Framework}, BOOKTITLE = {9th International Workshop on Content-Based Multimedia Indexing (CBMI)}, YEAR = {2011}, MONTH = jun, ADDRESS = {Madrid, Spain}, PDF = {http://elvera.nue.tu-berlin.de/files/1294Kaiser2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1294Kaiser2011.pdf} } @INPROCEEDINGS{1336Kim2011, AUTHOR = {Woong Hee Kim and Jongwoon Hwang and Thomas Sikora}, TITLE = {Document Capturing Method with a Camera using Robust Feature Points Detection}, BOOKTITLE = {IWSSIP 2011, 18th International Conference on Systems, Signals and Image Processing}, YEAR = {2011}, MONTH = jun, EDITOR = {IEEE, EURASIP}, ORGANIZATION = {Faculty of Electrical Engineering University of Sarajevo Bosnia and Herzegovina}, ADDRESS = {Zmaja od Bosne 4, 71000 Sarajevo, Bosnia and Herzegovina}, NOTE = {5 Seiten}, PDF = {http://elvera.nue.tu-berlin.de/files/1336Kim2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1336Kim2011.pdf}, ABSTRACT = {This paper presents a method of capturing a document with a single camera based on the Hough transform with reduced angle domain. Document image capturing with a camera has some advantages over traditional flatbed scanners. A problem of capturing a document with a camera is to find a paper region in an image and find robust features to remove perspective distortions. If we have boundary infor-mation of a paper with a document to extract feature points, a homography could be estimated to remove per- spective distortions. 
In this paper, text regions in a document are removed with edge plane decomposition to detect the region and the boundary of a paper in an image from a camera, because characters in a document make outliers, which make it difficult to find features. After removing text regions in a document, feature points to remove perspective distortions are extracted by the Hough transform in reduced angle domain from each decomposed edge plane. The experimental results show that the proposed method successfully removes text regions, which are out- liers of finding line features to eliminate perspective distortions. A document region is extracted from the image successfully.} } @INPROCEEDINGS{1290Arvanitidou2011, AUTHOR = {Marina Georgia Arvanitidou and Michael Tok and Andreas Krutz and Thomas Sikora}, TITLE = {Short-Term Motion-Based Object Segmentation}, BOOKTITLE = {Proceedings of the IEEE International Conference on Multimedia & Expo (ICME)}, YEAR = {2011}, MONTH = jul, ADDRESS = {Barcelona, Spain}, PDF = {http://elvera.nue.tu-berlin.de/files/1290Arvanitidou2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1290Arvanitidou2011.pdf}, ABSTRACT = {In this paper we present an automatic motion-based object segmentation algorithm for video sequences with moving camera. For every frame, two error frames are generated using short-term motion compensation, they are combined, and a thresholding segmentation algorithm is applied. Recent advances in global motion estimation eliminate outliers in the background area, and thus enable more precise definition of the foreground. 
We show that employing only the motion information of two adjacent frames we achieve improved results compared with a previously proposed short-term motion-based method and we provide subjective and objective evaluation.} } @INPROCEEDINGS{1313Senst2011, AUTHOR = {Tobias Senst and Michael Pätzold and Rubén Heras Evangelio and Volker Eiselein and Ivo Keller and Thomas Sikora}, TITLE = {On Building Decentralized Wide-Area Surveillance Networks based on ONVIF}, BOOKTITLE = {Workshop on Multimedia Systems for Surveillance (MMSS) in conjunction with 8th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2011}, MONTH = aug, PAGES = {420--423}, ORGANIZATION = {IEEE}, ADDRESS = {Klagenfurt, Austria}, NOTE = {ISBN: 978-1-4577-0844-2 DOI: 10.1109/AVSS.2011.6027365}, PDF = {http://elvera.nue.tu-berlin.de/files/1313Senst2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1313Senst2011.pdf}, ABSTRACT = {In this paper we present a decentralized surveillance network composed of IP video cameras, analysis devices and a central node which collects information and displays it in a 3D model of the complete area. The exchange of information between all components in the surveillance network takes place according to the ONVIF specification, therefore ensuring interoperability between products complying with the specification and flexibility regarding the integration of new devices and services. 
The collected information is displayed in a 3D model of the surveilled area, therefore providing a comfortable overview of the activity in large environments and offering the user an intuitive way to eventually interact with network devices.} } @INPROCEEDINGS{1319Sikora2011, AUTHOR = {Michael Pätzold and Thomas Sikora}, TITLE = {Real-time person counting by propagating networks flows}, BOOKTITLE = {8th IEEE International Conference on Advanced Video and Signal-Based Surveillance (AVSS)}, YEAR = {2011}, MONTH = aug, PAGES = {5}, ORGANIZATION = {IEEE}, ADDRESS = {Klagenfurt, Austria}, NOTE = {ISBN: 978-1-4577-0844-2}, DOI = {10.1109/AVSS.2011.6027296}, ABSTRACT = {In this paper we present a system that tracks multiple persons by detection in real-time. We introduce a measure for similarity of detections which segments significant information from background clutter by using statistical information obtained during the learning phase of the detector. In order to track multiple persons we map the detections into flow networks utilizing this measure. A continuous real-time processing of video streams is accomplished by analyzing only small chunks of detections consecutively using different networks. By propagating the result of one network into the subsequent one a temporal consistent association is achieved. The system was evaluated using a standard video sequence with a crowded scene and an own dataset containing very long sequences. 
The results demonstrate that the system performs comparable to other systems while meeting real-time requirements.} } @INPROCEEDINGS{1323Evangelio2011, AUTHOR = {Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Complementary Background Models for the Detection of Static and Moving Objects in Crowded Environments}, BOOKTITLE = {8th IEEE International Conference on Advanced Video and Signal-Based Surveillance (AVSS)}, YEAR = {2011}, MONTH = aug, PAGES = {71--76}, ORGANIZATION = {IEEE}, ADDRESS = {Klagenfurt, Austria}, NOTE = {E-ISBN : 978-1-4577-0843-5 Print ISBN: 978-1-4577-0844-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1323Evangelio2011.pdf}, DOI = {10.1109/AVSS.2011.6027297}, URL = {http://elvera.nue.tu-berlin.de/files/1323Evangelio2011.pdf}, ABSTRACT = {In this paper we propose the use of complementary background models for the detection of static and moving objects in crowded video sequences. One model is devoted to accurately detect motion, while the other aims to achieve a representation of the empty scene. The differences in foreground detection of the complementary models are used to identify new static regions. A subsequent analysis of the detected regions is used to ascertain if an object was placed in or removed from the scene. Static objects are prevented from being incorporated into the empty scene model. Removed objects are rapidly dropped from both models. In this way, we build a very precise model of the empty scene and improve the foreground segmentation results of a single background model. 
The system was validated with several public datasets, showing many advantages over state-of-the-art static objects and foreground detectors.} } @INPROCEEDINGS{1337Rae2011, AUTHOR = {Adam Rae and Vanesa Murdock and Pavel Serdyukov and Pascal Kelm}, TITLE = {Working Notes for the Placing Task at MediaEval 2011}, BOOKTITLE = {Multimedia Benchmark Workshop 2011}, YEAR = {2011}, MONTH = sep, PDF = {http://elvera.nue.tu-berlin.de/files/1337Rae2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1337Rae2011.pdf}, ABSTRACT = {This paper provides a description of the MediaEval 2011 Placing Task. The task requires participants to automatically assign latitude and longitude coordinates to each of the provided test videos. This kind of geographical location tag, or geotag, helps users localise videos, allowing their media to be anchored to real world locations. Currently, however, most videos online are not labelled with this kind of data. This task encourages participants to find innovative ways of doing this labelling automatically. The data comes from Flickr---an example of a photo sharing website that allows users to both encode their photos and videos with geotags, as well as use them when searching and browsing. This paper describes the task, the data sets provided and how the individual participants' results are evaluated.} } @INPROCEEDINGS{1338Larson2011, AUTHOR = {Martha Larson and Maria Eskevich and Roeland Ordelman and Christoph Kofler and Sebastian Schmiedeke and Gareth J. F. 
Jones}, TITLE = {Overview of MediaEval 2011 Rich Speech Retrieval Task and Genre Tagging Task}, BOOKTITLE = {Multimedia Benchmark Workshop 2011}, YEAR = {2011}, MONTH = sep, PUBLISHER = {CEUR-WS}, URL = {http://ceur-ws.org/Vol-807/} } @INPROCEEDINGS{1339Schmiedeke2011, AUTHOR = {Sebastian Schmiedeke and Pascal Kelm and Thomas Sikora}, TITLE = {TUB @ MediaEval 2011 Genre Tagging Task: Prediction using Bag-of-(visual)-Words Approaches}, BOOKTITLE = {Multimedia Benchmark Workshop 2011}, YEAR = {2011}, MONTH = sep, PUBLISHER = {CEUR-WS}, URL = {http://ceur-ws.org/Vol-807/} } @INPROCEEDINGS{1317Knorr2011, AUTHOR = {Sebastian Knorr and Kai Ide and Matthias Kunter and Thomas Sikora}, TITLE = {Basic rules for good 3D and the avoidance of visual discomfort in stereoscopic vision}, BOOKTITLE = {International Broadcasting Convention (IBC)}, YEAR = {2011}, MONTH = sep, ADDRESS = {Amsterdam} } @INPROCEEDINGS{1302Krutz2011, AUTHOR = {Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {Theoretical Consideration of Global Motion Temporal Filtering}, BOOKTITLE = {Proceedings of the 18th IEEE International Conference on Image Processing (IEEE ICIP2011)}, YEAR = {2011}, MONTH = sep, PUBLISHER = {IEEE}, PAGES = {3534--3537}, ORGANIZATION = {IEEE}, ADDRESS = {Brussels, Belgium}, NOTE = {IEEE catalog number: CFP11CIP-USB ISBN: 978-1-4577-1302-6}, PDF = {http://elvera.nue.tu-berlin.de/files/1302Krutz2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1302Krutz2011.pdf}, ABSTRACT = {A widely used technique to reduce the noise variance of a signal is a temporal overlapping of several noisy versions of it. It will be shown that the same idea can be applied for video sequences. Several versions of the current frame can be aligned using motion compensation that adjacent frames represent a noisy version of the current frame. 
In a first theoretical calculation of this concept combining the temporal overlapping of several noisy versions of the same signal and a rate-distortion equation, it has been shown that a theoretical bit rate reduction of 1/2 log2 (N) is possible. Here, the concept will be advanced to be closer to practice by adding a model for the motion estimation error. It will be shown that the derived theoretical equation confirms the practice and models the behavior of a video encoding environment using parametric motion compensated temporal filtering very well.} } @INPROCEEDINGS{1303Esche2011, AUTHOR = {Marko Esche and Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {Temporal trajectory filtering for bi-directional predicted frames}, BOOKTITLE = {Proceedings of the 18th IEEE International Conference on Image Processing (IEEE ICIP2011)}, YEAR = {2011}, MONTH = sep, PUBLISHER = {IEEE}, PAGES = {1669--1672}, ORGANIZATION = {IEEE}, ADDRESS = {Brussels, Belgium}, NOTE = {IEEE catalog number: CFP11CIP-USB ISBN: 978-1-4577-1302-6}, PDF = {http://elvera.nue.tu-berlin.de/files/1303Esche2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1303Esche2011.pdf}, ABSTRACT = {In this work the application of a temporal in-loop filtering approach for B-frames in video compression based on the Temporal Trajectory Filter (TTF) is investigated. The TTF constructs temporal pixel trajectories for individual image points in the P-frames of a video sequence, which can be utilized to improve the quality of the reconstructed frames used for prediction. It is shown, how this concept can be adapted to B-frames despite the fact that these already use temporal motion information to a great extent through the flexible choice of reference frames and prediction modes. The proposed filter has been integrated into the H.264/AVC encoder using the extended profile with hierarchical B-frames and was tested on a wide range of sequences. 
The filter produces bit rate reductions of up to -4\% with an average of -1.6\% over all tested sequences while also improving the subjective quality of the decoded video.} } @INPROCEEDINGS{1304Glantz2011, AUTHOR = {Alexander Glantz and Michael Tok and Andreas Krutz and Thomas Sikora}, TITLE = {A Block-adaptive Skip Mode for Inter Prediction based on Parametric Motion Models}, BOOKTITLE = {Proceedings of the 18th IEEE International Conference on Image Processing (IEEE ICIP2011)}, YEAR = {2011}, MONTH = sep, PUBLISHER = {IEEE}, PAGES = {1225--1228}, ORGANIZATION = {IEEE}, ADDRESS = {Brussels, Belgium}, NOTE = {IEEE catalog number: CFP11CIP-USB ISBN: 978-1-4577-1302-6}, PDF = {http://elvera.nue.tu-berlin.de/files/1304Glantz2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1304Glantz2011.pdf}, ABSTRACT = {Motion compensated prediction (MCP) in hybrid video coding estimates a translational motion vector for a given block which is then used for residual computation. However, when complex motion like zoom, rotation, and perspective transformation occur, the translational model assumption does not always hold. This may result in higher residual energy and splitting of blocks, respectively. This paper proposes a skip mode based on higher-order parametric motion models. Often, these models provide a better prediction quality resulting in lower residual energy and larger block sizes. The proposed technique estimates a higher-order motion model between two given pictures. The encoder decides in terms of rate-distortion optimization whether to use the new skip mode for a block and therefore not to transfer any additional information like coefficient data. 
Experimental evaluation shows that the proposed technique can improve the coding performance of next generation video coding standards significantly.} } @INPROCEEDINGS{1309Senst2011, AUTHOR = {Tobias Senst and Volker Eiselein and Michael Pätzold and Thomas Sikora}, TITLE = {Efficient Real-Time Local Optical Flow Estimation by Means of Integral Projections}, BOOKTITLE = {Proceedings of the 18th IEEE International Conference on Image Processing (IEEE ICIP2011)}, YEAR = {2011}, MONTH = sep, PAGES = {2393--2396}, ADDRESS = {Brussels, Belgium}, NOTE = {ISSN : 1522-4880 E-ISBN : 978-1-4577-1302-6 Print ISBN: 978-1-4577-1304-0 DOI: 10.1109/ICIP.2011.6116111}, PDF = {http://elvera.nue.tu-berlin.de/files/1309Senst2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1309Senst2011.pdf}, ABSTRACT = {In this paper we present an approach for the efficient computation of optical flow fields in real-time and provide implementation details. Proposing a modification of the popular Lucas-Kanade energy functional based on integral projections allows us to speed up the method notably. We show the potential of this method which can compute dense flow fields of 640x480 pixels at a speed of 4 fps in a GPU implementation based on the OpenCL framework. Working on sparse optical flow fields of up to 17,000 points, we reach execution times of 70 fps. 
Optical flow methods are used in many different areas, the proposed method speeds up current surveillance algorithms used for scene description and crowd analysis or Augmented Reality and robot navigation applications.} } {1327Sikora2011, } @INPROCEEDINGS{1318Sikora2011, AUTHOR = {Florian Kaiser and Thomas Sikora}, TITLE = {Multi-Probe Histograms: A Mid-Level Harmonic Feature for Music Structure Segmentation}, BOOKTITLE = {14th International Conference on Digital Audio Effects (DAFx)}, YEAR = {2011}, MONTH = sep, ADDRESS = {Paris, France} } @INPROCEEDINGS{1324Kelm2011, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Kai Clüver and Thomas Sikora}, TITLE = {Automatic Geo-referencing of Flickr Videos}, BOOKTITLE = {NEM Summit 2011}, YEAR = {2011}, MONTH = sep, EDITOR = {Eurescom – the European Institute for Research and Strategic Studies in Telecommunications – GmbH}, PUBLISHER = {Sigma Orionis}, PAGES = {76--80}, ORGANIZATION = {NEM Summit}, ADDRESS = {Torino, Italy, September 27-29 2011}, NOTE = {Copyright © 2011 – Eurescom GmbH – On behalf of NEM Initiative All rights on Proceedings of 2011 NEM Summit (Torino, Italy, September 27-29 2011) reserved. All rights on individual papers, published in the proceedings, remain unaffected. 
ISBN 978-3-00-035465-6 Publisher Eurescom – the European Institute for Research and Strategic Studies in Telecommunications – GmbH Wieblinger Weg 19/4 - 69123 Heidelberg - Germany Phone: +49 6221 989 0 - Fax: +49 6221 989 209 - http://www.eurescom.eu For publisher: Halid Hrasnica On behalf of NEM Initiative – http://www.nem-initiative.org eBook and USB produced by Sigma Orionis 1240, route des dolines - BP287 Valbonne - France Phone: +33 (0) 493 001 550 - Fax: +33 (0) 493 001 560 - http://www.sigma-orionis.com For producer: Florent Genoux On behalf of NEM Initiative – http://www.nem-initiative.org}, PDF = {http://elvera.nue.tu-berlin.de/files/1324Kelm2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1324Kelm2011.pdf}, ABSTRACT = {We present a hierarchical, multi-modal approach for geo-referencing Flickr videos. Our approach makes use of external resources to identify toponyms in the metadata and of visual features to identify similar content. We use a database of more than 3.6 million Flickr images to group them into geographical areas and to build a hierarchical model. First, the geographical boundaries extraction method identifies the country and its dimension. Then, a visual method is used to classify the videos’ location into plausible areas. Next, the visually nearest neighbour method is used to find correspondences with the training images within the pre-classified areas. As the processed video sequences are represented using low-level feature vectors from multiple key frames, we also present techniques for video to image matchings. The Flickr videos are tagged with the geo-information of the visually most similar training item within the areas previously filtered in the pre-classification step. 
The results show that we are able to tag one third of our videos correctly within an error margin of 1 km.} } @ARTICLE{1320Krutz2011, AUTHOR = {Andreas Krutz and Alexander Glantz and Michael Frater and Thomas Sikora}, TITLE = {Rate-Distortion Optimized Video Coding using Automatic Sprites}, JOURNAL = {IEEE Journal of Selected Topics in Signal Processing}, YEAR = {2011}, MONTH = nov, PAGES = {1309--1321}, VOLUME = {5}, NUMBER = {7}, NOTE = {ISSN 1932-4553}, DOI = {10.1109/JSTSP.2011.2166247}, ABSTRACT = {Object-based video coding has been of interest for many years. Recent work by the authors has shown that a video coding system using background Sprites with automatic segmentation and background subtraction can indeed perform better than the H.264/AVC coder. In this paper, we extend this work by developing a rate-distortion optimization approach for an object-based coder. A key issue addressed by this approach is the joint choice of quantization parameters for the foreground and background. The performance of the resulting rate-distortionoptimized coder is far superior to that of H.264/AVC for source material with dominant global motion, across a range of bit rates.} } {1340Knorr2011, } @INPROCEEDINGS{1325Kelm2011, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {A Hierarchical, Multi-modal Approach for Placing Videos on the Map using Millions of Flickr Photographs}, BOOKTITLE = {ACM Multimedia 2011 (Workshop on Social and Behavioral Networked Media Access - SBNMA)}, YEAR = {2011}, MONTH = nov, ORGANIZATION = {ACM}, NOTE = {invited paper}, PDF = {http://elvera.nue.tu-berlin.de/files/1325Kelm2011.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1325Kelm2011.pdf}, ABSTRACT = {We present a hierarchical, multi-modal approach for placing Flickr videos on the map. Our approach makes use of external resources to identify toponyms in the metadata and of visual and textual features to identify similar content. 
First, the geographical boundaries extraction method identifies the country and its dimension. We use a database of more than 3.6 million Flickr images to group them together into geographical regions and to build a hierarchical model. A fusion of visual and textual methods is used to classify the videos’ location into possible regions. Next, the visually nearest neighbour method uses a nearest neighbour approach to find correspondences with the training images within the preclassified regions. The video sequences are represented us- ing low-level feature vectors from multiple key frames. The Flickr videos are tagged with the geo-information of the visually most similar training item within the regions that is previously filtered by the pre-classification step for each test video. The results show that we are able to tag one third of our videos correctly within an error of 1 km.} } @INPROCEEDINGS{1335Senst2012, AUTHOR = {Tobias Senst and Brigitte Unger and Ivo Keller and Thomas Sikora}, TITLE = {Performance Evaluation of Feature Detection for Local Optical Flow Tracking}, BOOKTITLE = {International Conference on Pattern Recognition Applications and Methods (ICPRAM 2012)}, YEAR = {2012}, MONTH = feb, PAGES = {303--309}, ADDRESS = {Vilamoura, Portugal}, NOTE = {DOI: 10.5220/0003731103030309}, PDF = {http://elvera.nue.tu-berlin.de/files/1335Senst2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1335Senst2012.pdf}, ABSTRACT = {Due to its high computational efficiency the Kanade Lucas Tomasi feature tracker still remains as a widely accepted and utilized method to compute sparse motion fields or trajectories in video sequences. This method consists of a Good Feature To Track feature detection and a pyramidal Lucas Kanade feature tracking algorithm. It is well known that the Good Feature To Track concerns the Aperture Problem, but it does not consider the Generalized Aperture Problem. 
In this paper we want to provide an evaluation of a set of alternative feature detection methods. These methods are taken from feature matching techniques as FAST, SIFT and MSER. The evaluation is based on the Middlebury dataset and performed by using an improved pyramidal Lucas Kanade method, called RLOF feature tracker. To compare the results of the feature detector and RLOF pair, we propose a methodology based on accuracy, efficiency and covering measurements.} } {1361Knorr2012, } @ARTICLE{1328Esche2012, AUTHOR = {Marko Esche and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Adaptive Temporal Trajectory Filtering for Video Compression}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)}, YEAR = {2012}, MONTH = may, PAGES = {659--670}, VOLUME = {22}, NUMBER = {5}, DOI = {10.1109/TCSVT.2011.2177142}, ABSTRACT = {Most in-loop filters currently being employed in video compression algorithms use spatial information from a single frame of the video sequence only. In this paper a new filter is introduced and investigated that combines both spatial and temporal information to provide subjective and objective quality improvement. The filter only requires a small overhead on slice level while using the temporal information conveyed in the bit stream to reconstruct the individual motion trajectory of every pixel in a frame at both encoder and decoder. This information is then used to perform pixel-wise adaptive motion compensated temporal filtering. It is shown that the filter performs better than the state-of-the-art codec H.264/AVC over a large range of sequences and bit rates. 
Additionally, the filter is compared with another, Wiener-based in-loop filtering approach and a complexity analysis of both algorithms is conducted.} } @INPROCEEDINGS{1345Krutz2012, AUTHOR = {Andreas Krutz and Alexander Glantz and Michael Tok and Thomas Sikora}, TITLE = {Adaptive Global Motion Temporal Filtering}, BOOKTITLE = {Proceedings of the 29th IEEE Picture Coding Symposium (PCS 2012)}, YEAR = {2012}, MONTH = may, ORGANIZATION = {IEEE}, ADDRESS = {Kraków, Poland}, NOTE = {ISBN: 978-1-4577-2048-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1345Krutz2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1345Krutz2012.pdf}, ABSTRACT = {The emerging standardization on high efficiency video coding (HEVC) has brought a huge improvement in terms of coding performance comparing to existing standards and approaches. One tool that provides a significant portion of the coding gain so far is loop filtering. In H.264/AVC, a deblocking filter has been used to reduce blocking artifacts. HEVC added loop filter approaches to this deblocking method that further reduce noise in the decoded video frame. All these techniques work spatially. Besides that, work has been done to further improve the quality of decoded frames by applying a temporal filtering approach. In this work, we propose adaptive global motion temporal filtering (AGMTF) that reduces noise along temporal trajectories. 
Experimental results show that the coding performance of the current HEVC test model HM 4.0 can be improved by up to 8.8% and 3.7% in average over a large bit rate range using this technique.} } @INPROCEEDINGS{1346Tok2012, AUTHOR = {Michael Tok and Andreas Krutz and Alexander Glantz and Thomas Sikora}, TITLE = {Lossy Parametric Motion Model Compression for Global Motion Temporal Filtering}, BOOKTITLE = {Proceedings of the 29th IEEE Picture Coding Symposium (PCS 2012)}, YEAR = {2012}, MONTH = may, ORGANIZATION = {IEEE}, ADDRESS = {Kraków, Poland}, NOTE = {ISBN: 978-1-4577-2048-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1346Tok2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1346Tok2012.pdf}, ABSTRACT = {It has been shown that techniques using higher- order motion parameters outperform common translational mo- tion compensated prediction for hybrid video coders. A critical issue is the transmission of accurate higher-order motion pa- rameters with as little additional bits as possible to maximize the compression gain of the whole system. For that, we propose a compression scheme for perspective motion models using trans- formation before quantization and temporal redundancy reduc- tion and integrate this scheme into a video coding environment using adaptive global motion temporal filtering. 
Experimental results show that using the proposed compression scheme for the perspective motion models, the BD-rate can be improved up to 3.5% in average in the higher bit rate range and up to 6.3% in average in the lower bit rate range compared to the latest version of the HEVC test model HM 4.0.} } @INPROCEEDINGS{1347Tok2012, AUTHOR = {Michael Tok and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Parametric Motion Vector Prediction for Hybrid Video Coding}, BOOKTITLE = {Proceedings of the 29th IEEE Picture Coding Symposium (PCS 2012)}, YEAR = {2012}, MONTH = may, ORGANIZATION = {IEEE}, ADDRESS = {Kraków, Poland}, NOTE = {ISBN: 978-1-4577-2048-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1347Tok2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1347Tok2012.pdf}, ABSTRACT = {Motion compensated prediction still is the main technique for redundancy reduction in modern hybrid video codecs. However, the resulting motion vector fields are highly redundant as well. Thus, motion vector prediction and difference coding are used for compressing. One drawback of all common motion vector prediction techniques is, that they are not able to predict complex motion as rotation and zoom efficiently. We present a novel parametric motion vector predictor (PMVP), based on higher-order motion models to overcome this issue. To transmit the needed motion models, an efficient compression scheme is utilized. This scheme is based on transformation, quan- tization and difference coding. 
By incorporating this predictor into the HEVC test model HM 3.2 gains of up to 2.42% are achieved.} } @INPROCEEDINGS{1348Esche2012, AUTHOR = {Marko Esche and Alexander Glantz and Andreas Krutz and Michael Tok and Thomas Sikora}, TITLE = {Weighted Temporal Long Trajectory Filtering for Video Compression}, BOOKTITLE = {Proceedings of the 29th IEEE Picture Coding Symposium (PCS 2012)}, YEAR = {2012}, MONTH = may, ORGANIZATION = {IEEE}, ADDRESS = {Kraków, Poland}, NOTE = {ISBN: 978-1-4577-2048-2}, PDF = {http://elvera.nue.tu-berlin.de/files/1348Esche2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1348Esche2012.pdf}, ABSTRACT = {In the context of the HEVC standardization activity, in-loop filters such as the adaptive loop filter and the deblocking filter are currently under investigation. Both filters work in the spatial domain only, despite the temporal correlation within video sequences. In this work a previously introduced filter, that uses temporal information for deblocking and denoising instead, is integrated into the HEVC test model HM 3.0. It is shown how the filter is to be adapted to work in combination with the adaptive loop filter for the HEVC low-delay profile. In addition, an optimal weighting function for the filtered luma samples based on the quantization parameter is derived. Bit rate reductions of up to 7.6% are reported for individual sequences.} } @INCOLLECTION{1291Esche2012, AUTHOR = {Marko Esche and Mustafa Karaman and Thomas Sikora}, TITLE = {Semi-Automatic Object Tracking in Video Sequences by Extension of the MRSST Algorithm}, YEAR = {2012}, BOOKTITLE = {Analysis, Retrieval and Delivery of Multimedia Contents}, EDITOR = {Adami, Nicola and Cavallaro, Andrea and Leonardi, Riccardo and Migliorati, Pierangelo}, PUBLISHER = {Springer}, ADDRESS = {Berlin}, SERIES = {LNEE}, ABSTRACT = {The objective of this work is to investigate a new approach for segmentation of real-world objects in video sequences. 
While some amount of user interaction is still necessary for most algorithms in this field, in order for them to produce adequate results, these can be reduced making use of certain properties of graph-based image segmentation algorithms. Based on one of these algorithms a framework is proposed that tracks individual foreground objects through arbitrary video sequences and partly automates the necessary corrections required from the user. Experimental results suggest that the proposed algorithm performs well on both low- and high-resolution video sequences and can even, to a certain extent, cope with motion blur and gradual object deformations.} } @INPROCEEDINGS{1356Schmiedeke2012, AUTHOR = {Schmiedeke, Sebastian and Kelm, Pascal and Sikora, Thomas}, TITLE = {Cross-Modal Categorisation of User-Generated Video Sequences}, BOOKTITLE = {Proceedings of the 2nd ACM International Conference on Multimedia Retrieval}, YEAR = {2012}, MONTH = jun, PUBLISHER = {ACM}, PAGES = {251--258}, ADDRESS = {New York, NY, USA}, NOTE = {isbn: 978-1-4503-1329-2 articleno: 25 numpages: 8 location: Hong Kong, China}, DOI = {10.1145/2324796.2324828}, URL = {http://dl.acm.org/citation.cfm?id=2324828}, ABSTRACT = {This paper describes the possibilities of cross-modal classification of multimedia documents in social media platforms. Our framework predicts the user-chosen category of consumer-produced video sequences based on their textual and visual features. These text resources---includes metadata and automatic speech recognition transcripts---are represented as bags of words and the video content is represented as a bag of clustered local visual features. The contribution of the different modalities is investigated and how they should be combined if sequences lack certain resources. Therefore, several classification methods are evaluated, varying the resources. 
The paper shows an approach that achieves a mean average precision of 0.3977 using user-contributed metadata in combination with clustered SURF.} } {1371Kelm2012, } @INPROCEEDINGS{1393Sikora2012, AUTHOR = {Marina Georgia Arvanitidou and Thomas Sikora}, TITLE = {Motion saliency for spatial pooling of objective video quality metrics}, BOOKTITLE = {Proceedings of the EuroITV 2012 Conference, Workshop on Quality of Experience for Multimedia Content Sharing (QoEMCS)}, YEAR = {2012}, MONTH = jul, ADDRESS = {Berlin}, PDF = {http://elvera.nue.tu-berlin.de/files/1393Sikora2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1393Sikora2012.pdf}, ABSTRACT = {In this paper we propose a novel motion saliency estimation method for video sequences considering the motion between successive frames and their corresponding parametric camera motion representation. Background motion is compensated for every pair of frames, revealing areas that contain relative motion. Considering that these areas will likely attract the attention of the viewer and in line with properties of the human visual system, regarding spatially invariant focus distribution, we augment their effect on the quality estimation. 
The generated saliency maps are thus incorporated in the spatial pooling stage of several video quality metrics, and experimental evaluation on the LIVE video database shows that this strategy enhances their performance.} } @ARTICLE{1349Senst2012, AUTHOR = {Tobias Senst and Volker Eiselein and Thomas Sikora}, TITLE = {Robust Local Optical Flow for Feature Tracking}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)}, YEAR = {2012}, MONTH = sep, PAGES = {1377--1387}, VOLUME = {22}, NUMBER = {9}, NOTE = {ISSN={1051-8215}, DOI=10.1109/TCSVT.2012.2202070}, PDF = {http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1349Senst2012.pdf}, ABSTRACT = {The presented work is motivated by the problem of local motion estimation via robust regression with linear models. In order to increase the robustness of the motion estimates we propose a novel Robust Local Optical Flow approach based on a modified Hampel estimator. We show the deficiencies of the least squares estimator used by the standard KLT tracker when the assumptions made by Lucas/Kanade are violated. We propose a strategy to adapt the window sizes to cope with the Generalized Aperture Problem. Finally we evaluate our method on the Middlebury and MIT dataset and show that the algorithm provides excellent feature tracking performance with only slightly increased computational complexity compared to KLT. 
To facilitate further development the presented algorithm can be downloaded from http://www.nue.tu-berlin.de/menue/forschung/ projekte/rlof/.} } @INPROCEEDINGS{1373Acar2012, AUTHOR = {Esra Acar and Tobias Senst and Alexander Kuhn and Ivo Keller and Holger Theisel and Sahin Albayrak and Thomas Sikora}, TITLE = {Human Action Recognition using Lagrangian Descriptors}, BOOKTITLE = {IEEE Workshop on Multimedia Signal Processing (MMSP 2012)}, YEAR = {2012}, MONTH = sep, PAGES = {360--365}, ADDRESS = {Banff, Canada}, NOTE = {IEEE Catalog Number: CFP12MSP-USB E-ISBN : 978-1-4673-4571-2 Print ISBN: 978-1-4673-4570-5 INSPEC Accession Number: 13116360 DOI:10.1109/MMSP.2012.6343469}, PDF = {http://elvera.nue.tu-berlin.de/files/1373Acar2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1373Acar2012.pdf} } @INPROCEEDINGS{1374Kuhn2012, AUTHOR = {Alexander Kuhn and Tobias Senst and Ivo Keller and Thomas Sikora and Holger Theisel}, TITLE = {A Lagrangian Framework for Video Analytics}, BOOKTITLE = {IEEE Workshop on Multimedia Signal Processing (MMSP 2012)}, YEAR = {2012}, MONTH = sep, PAGES = {387--392}, ADDRESS = {Banff, Canada}, NOTE = {IEEE Catalog Number: CFP12MSP-USB E-ISBN : 978-1-4673-4571-2 Print ISBN: 978-1-4673-4570-5 INSPEC Accession Number: 13116365 DOI: 10.1109/MMSP.2012.6343474}, PDF = {http://elvera.nue.tu-berlin.de/files/1374Kuhn2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1374Kuhn2012.pdf} } @INPROCEEDINGS{1363Evangelio2012, AUTHOR = {Rubén Heras Evangelio and Michael Pätzold and Thomas Sikora}, TITLE = {Splitting Gaussians in Mixture Models}, BOOKTITLE = {9th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2012}, MONTH = sep, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-4673-2499-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1363Evangelio2012.pdf}, DOI = {10.1109/AVSS.2012.69}, URL = {http://elvera.nue.tu-berlin.de/files/1363Evangelio2012.pdf}, ABSTRACT = {Gaussian mixture models have been extensively used 
and enhanced in the surveillance domain because of their ability to adaptively describe multimodal distributions in real-time with low memory requirements. Nevertheless, they still often suffer from the problem of converging to poor solutions if the main mode stretches and thus over-dominates weaker distributions. Based on the results of the Split and Merge EM algorithm, in this paper we propose a solution to this problem. Therefore, we define an appropriate splitting operation and the corresponding criterion for the selection of candidate modes, for the case of background subtraction. The proposed method achieves better background models than state-of-the-art approaches and is low demanding in terms of processing time and memory requirements, therefore making it especially appealing in the surveillance domain.} } @INPROCEEDINGS{1364Eiselein2012, AUTHOR = {Volker Eiselein and Daniel Arp and Michael Pätzold and Thomas Sikora}, TITLE = {Real-time Multi-Human Tracking using a Probability Hypothesis Density Filter and multiple detectors}, BOOKTITLE = {9th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2012}, MONTH = sep, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-4673-2499-1}, DOI = {10.1109/AVSS.2012.59} } @INPROCEEDINGS{1366Senst2012, AUTHOR = {Tobias Senst and Rubén Heras Evangelio and Ivo Keller and Thomas Sikora}, TITLE = {Clustering Motion for Real-Time Optical Flow based Tracking}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signal-Based Surveillance (AVSS 2012)}, YEAR = {2012}, MONTH = sep, PAGES = {410--415}, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-4673-2499-1 DOI: 10.1109/AVSS.2012.20}, PDF = {http://elvera.nue.tu-berlin.de/files/1366Senst2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1366Senst2012.pdf}, ABSTRACT = {The selection of regions or sets of points to track is a key task in motion-based video analysis, which has significant performance effects in terms of accuracy and 
computational efficiency. Computational efficiency is an unavoidable requirement in video surveillance applications. Well established methods, e.g., Good Features to Track, select points to be tracked based on appearance features such as cornerness, therefore neglecting the motion exhibited by the selected points. In this paper, we propose an interest point selection method that takes the motion of the tracked points into account in order to constrain the number of point trajectories needed. By defining pair-wise temporal affinities between trajectories and representing them in a minimum spanning tree, we achieve a very efficient clustering. The number of trajectories assigned to each motion cluster is adapted by initializing and removing tracked points by means of feed-back. Compared to the KLT tracker, we save up to 65% of the points to track, therefore gaining in efficiency while not scarifying accuracy.} } @INPROCEEDINGS{1368Pätzold2012, AUTHOR = {Michael Pätzold and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Boosting Multi-Hypothesis Tracking by means of Instance-specific Models}, BOOKTITLE = {9th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2012}, MONTH = sep, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-4673-2499-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1368Pätzold2012.pdf}, DOI = {10.1109/AVSS.2012.18}, URL = {http://elvera.nue.tu-berlin.de/files/1368Pätzold2012.pdf}, ABSTRACT = {In this paper we present a visual person tracking-by-detection system based on on-line-learned instance-specific information along with the kinematic relation of measurements provided by a generic person-category detector. The proposed system is able to initialize tracks on individual persons and start learning their appearance even in crowded situations and does not require that a person enters the scene separately. 
For that purpose we integrate the process of learning instance-specific models into a standard MHT-framework. The capability of the system to eliminate detections-to-object association ambiguities occurring from missed detections or false ones is demonstrated by experiments for counting and tracking applications using very long video sequences on challenging outdoor scenarios.} } @INPROCEEDINGS{1372Senst2012, AUTHOR = {Tobias Senst and Alexander Kuhn and Holger Theisel and Thomas Sikora}, TITLE = {Detecting People Carrying Objects utilizing Lagrangian Dynamics}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signal-Based Surveillance (AVSS 2012)}, YEAR = {2012}, MONTH = sep, PAGES = {398--403}, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-4673-2499-1 DOI: 10.1109/AVSS.2012.34}, PDF = {http://elvera.nue.tu-berlin.de/files/1372Senst2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1372Senst2012.pdf} } @INPROCEEDINGS{1358Sikora2012, AUTHOR = {Marko Esche and Alexander Glantz and Andreas Krutz and Michael Tok and Thomas Sikora}, TITLE = {Quadtree-based Temporal Trajectory Filtering}, BOOKTITLE = {Proceedings of the 19th IEEE International Conference on Image Processing (ICIP)}, YEAR = {2012}, MONTH = sep, PUBLISHER = {IEEE}, ORGANIZATION = {IEEE}, ADDRESS = {Orlando, Florida}, PDF = {http://elvera.nue.tu-berlin.de/files/1358Sikora2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1358Sikora2012.pdf}, ABSTRACT = {In both the HEVC draft and in H.264/AVC, in-loop filters are employed to improve the subjective and the objective quality of compressed video sequences. These filters use spatial information from a single frame only. Temporal Trajectory Filtering (TTF) constitutes an alternative approach which performs filtering in the temporal domain instead. In this work, a combination of the TTF with a quadtree partitioning algorithm for applying different filter parameters to different image regions is proposed and investigated. 
Experiments were conducted in the environment of the HEVC test model HM 3.0. Bit rate reductions of up to 9% for the low delay high efficiency setting of HEVC are reported.} } @ARTICLE{1392Knorr2012, AUTHOR = {Sebastian Knorr and Kai Ide and Matthias Kunter and Thomas Sikora}, TITLE = {The Avoidance of Visual Discomfort and Basic Rules for Producing ``Good {3D}'' Pictures}, JOURNAL = {SMPTE Motion Imaging Journal}, YEAR = {2012}, MONTH = oct, PAGES = {72--79} } @INPROCEEDINGS{1377Rae2012, AUTHOR = {Adam Rae and Pascal Kelm}, TITLE = {Working Notes for the Placing Task at MediaEval 2012}, BOOKTITLE = {Working Notes Proceedings of the MediaEval 2012 Workshop}, YEAR = {2012}, MONTH = oct, EDITOR = {Larson, Martha A. and Schmiedeke, Sebastian and Kelm, Pascal and Rae, Adam and Mezaris, Vasileios and Piatrik, Tomas and Soleymani, Mohammad and Metze, Florian and Jones, Gareth J. F.}, PUBLISHER = {CEUR-WS.org}, PAGES = {1--2}, ADDRESS = {Santa Croce in Fossabanda Piazza Santa Croce, 5 - 56125 - Pisa - Toscana - Italia}, NOTE = {ISSN 1613-0073}, PDF = {http://elvera.nue.tu-berlin.de/files/1377Rae2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1377Rae2012.pdf}, ABSTRACT = {This paper provides a description of the MediaEval 2012 Placing Task. The task requires participants to automatically assign latitude and longitude coordinates to each of the provided test videos. This kind of geographical location tag, or geotag, helps users localise videos, allowing their media to be anchored to real world locations. Currently, however, most videos online are not labelled with this kind of data. This task encourages participants to find innovative ways of doing this labelling automatically. The data comes from Flickr—an example of a photo sharing website that allows users to both encode their photos and videos with geotags, as well as use them when searching and browsing. 
This paper describes the task, the data sets provided and how the individual participants results are evaluated.} } @INPROCEEDINGS{1378Schmiedeke2012, AUTHOR = {Sebastian Schmiedeke and Christoph Kofler and Isabelle Ferrane}, TITLE = {Overview of the MediaEval 2012 Tagging Task}, BOOKTITLE = {Working Notes Proceedings of the MediaEval 2012 Workshop}, YEAR = {2012}, MONTH = oct, EDITOR = {Martha A. Larson, Sebastian Schmiedeke, Pascal Kelm, Adam Rae, Vasileios Mezaris, Tomas Piatrik, Mohammad Soleymani, Florian Metze, Gareth J.F. Jones}, PUBLISHER = {CEUR-WS.org}, PAGES = {75--76}, ADDRESS = {Santa Croce in Fossabanda Piazza Santa Croce, 5 - 56125 - Pisa - Toscana - Italia}, NOTE = {ISSN 1613-0073}, PDF = {http://elvera.nue.tu-berlin.de/files/1378Schmiedeke2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1378Schmiedeke2012.pdf}, ABSTRACT = {The MediaEval 2012 Tagging Task is a follow-up task of the MediaEval 2011 Genre Tagging Task and the MediaEval 2010 Wild Wild Web Tagging Task to test and evaluate retrieval techniques for video content as it occurs on the Internet, i.e., for semi-professional user generated content that is associated with annotations existing on the social Web. The task uses the MediaEval 2012 Tagging Task (ME12TT) dataset, an extension of the original MediaEval 2010 Wild Wild Web Tagging Task dataset used in previous tasks. In this task overview paper, we describe the principal characteristics of the data set, the task itself, and the evaluation metrics used to test the participants´ results.} } @INPROCEEDINGS{1386Schmiedeke2012, AUTHOR = {Sebastian Schmiedeke and Pascal Kelm and Thomas Sikora}, TITLE = {TUB @ MediaEval 2012 Tagging Task: Feature Selection Methods for Bag-of-(visual)-Words Approaches}, BOOKTITLE = {Working Notes Proceedings of the MediaEval 2012 Workshop}, YEAR = {2012}, MONTH = oct, EDITOR = {Martha A. 
Larson, Sebastian Schmiedeke, Pascal Kelm, Adam Rae, Vasileios Mezaris, Tomas Piatrik, Mohammad Soleymani, Florian Metze, Gareth J.F. Jones}, PUBLISHER = {CEUR-WS.org}, PAGES = {79--80}, ADDRESS = {Santa Croce in Fossabanda Piazza Santa Croce, 5 - 56125 - Pisa - Toscana - Italia}, NOTE = {ISSN 1613-0073}, PDF = {http://elvera.nue.tu-berlin.de/files/1386Schmiedeke2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1386Schmiedeke2012.pdf}, ABSTRACT = {This paper describes our participation in the Genre Tagging Task of MediaEval 2012, which aims to predict the videos’ category label. In last year’s participation, we performed experiments with bag-of-words (BoW) approaches in which different constellations in respect of modalities, features, and methods were investigated. This year, we focus on feature selection methods to improve the classification performance in terms of mean average precision (mAP) and classification accuracy (CA). We investigated the effectiveness of selection methods based on scores calculated using mutual informa- tion (MI) or term frequency (TF) and the effectiveness of transformation methods like the principle component anal- ysis (PCA).} } @INPROCEEDINGS{1387Kelm2012, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {How Spatial Segmentation improves the Multimodal Geo-Tagging}, BOOKTITLE = {Working Notes Proceedings of the MediaEval 2012 Workshop}, YEAR = {2012}, MONTH = oct, EDITOR = {Martha A. Larson, Sebastian Schmiedeke, Pascal Kelm, Adam Rae, Vasileios Mezaris, Tomas Piatrik, Mohammad Soleymani, Florian Metze, Gareth J.F. 
Jones}, PUBLISHER = {CEUR-WS.org}, PAGES = {9--10}, ADDRESS = {Santa Croce in Fossabanda Piazza Santa Croce, 5 - 56125 - Pisa - Toscana - Italia}, NOTE = {ISSN 1613-0073}, PDF = {http://elvera.nue.tu-berlin.de/files/1387Kelm2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1387Kelm2012.pdf}, ABSTRACT = {In this paper we present a hierarchical, multi-modal ap- proach in combination with different granularity levels for the Placing Task at the MediaEval benchmark 2012. Our approach makes use of external resources like gazetteers to extract toponyms in the metadata and of visual and textual features to identify similar content. First, the bounderies detection recognizes the country and its dimension to speed up the estimation and to eliminate geographical ambiguity. Next, we prepared a training database to group them to- gether into geographical regions and to build a hierarchical model. The fusion of visual and textual methods for differ- ent granularities is used to classify the videos’ location into possible regions. At the end the Flickr videos are tagged with the geo-information of the most similar training image within the regions that is previously filtered by the proba- bilistic model for each test video.} } @PROCEEDINGS{1394Larson2012, TITLE = {Working Notes Proceedings of the MediaEval 2012 Workshop}, EDITOR = {Martha A. Larson, Sebastian Schmiedeke, Pascal Kelm, Adam Rae, Vasileios Mezaris, Tomas Piatrik, Mohammad Soleymani, Florian Metze, Gareth J.F. 
Jones}, YEAR = {2012}, MONTH = oct, PUBLISHER = {CEUR-WS.org}, VOLUME = {927}, NOTE = {ISSN 1613-0073 URN: urn:nbn:de:0074-927-7} } @INPROCEEDINGS{1379Gottlieb2012, AUTHOR = {Luke Gottlieb and Jaeyoung Choi and Gerald Friedland and Pascal Kelm and Thomas Sikora}, TITLE = {Pushing the limits of Mechanical Turk; Qualifying the crowd for geolocation}, BOOKTITLE = {ACM Multimedia 2012 Workshop on Crowdsourcing for Multimedia}, YEAR = {2012}, MONTH = oct, ORGANIZATION = {ACM MM 2012}, PDF = {http://elvera.nue.tu-berlin.de/files/1379Gottlieb2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1379Gottlieb2012.pdf}, ABSTRACT = {In this article we review the methods we have developed for finding Mechanical Turk participants for the manual annota- tion of the geo-location of random videos from the web. We require high quality annotations for this project, as we are attempting to establish a human baseline for future comparison to machine systems. This task is different from a standard Mechanical Turk task in that it is difficult for both humans and machines, whereas a standard Mechanical Turk task is usually easy for humans and difficult or impossible for machines. This article discusses the varied difficulties we encountered while qualifying annotators and the steps that we took to select the individuals most likely to do well at our annotation task in the future.} } @INPROCEEDINGS{1391Kelm2012, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Thomas Sikora}, TITLE = {Multimodal Geo-tagging in Social Media Websites using Hierarchical Spatial Segmentation}, BOOKTITLE = {Proceedings of the 20th ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems}, YEAR = {2012}, MONTH = nov, PAGES = {8}, ORGANIZATION = {ACM SIGSPATIAL}, PDF = {http://elvera.nue.tu-berlin.de/files/1391Kelm2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1391Kelm2012.pdf} } @INPROCEEDINGS{1451Zilly2012, AUTHOR = {F. Zilly and C. Riechert and M. Müller and W. 
Waizenegger and T. Sikora and P. Kauff}, TITLE = {Multi-Camera Rectification using Linearized Trifocal Tensor}, BOOKTITLE = {ICPR 2012, 21st International Conference on Pattern Recognition}, YEAR = {2012}, MONTH = nov, EDITOR = {IEEE Computer Society}, PUBLISHER = {IEEE}, ORGANIZATION = {IAPR, Science Council of Japan} } @BOOK{1384Kelm2012, AUTHOR = {Pascal Kelm and Vanessa Murdock and Sebastian Schmiedeke and Steven Schockaert and Pavel Serdyukov and Olivier Van Laere}, TITLE = {Georeferencing in Social Networks}, YEAR = {2012}, BOOKTITLE = {Social Media Retrieval}, EDITOR = {Naeem Ramzan, Roelof van Zwol, Jong-Seok Lee, Kai Clüver, Xian-Sheng Hua}, PUBLISHER = {Springer}, NOTE = {ISBN 978-1-4471-4554-7}, ABSTRACT = {Social media is now ubiquitous on the internet, generating both new possibilities and new challenges in information analysis and retrieval. This comprehensive text/reference examines in depth the synergy between multimedia content analysis, ersonalization, and next-generation networking. The book demonstrates how this integration can result in robust, personalized services that provide users with an improved multimedia-centric quality of experience. Each chapter offers a practical step-by-step walkthrough for a variety of concepts, components and technologies relating to the development of applications and services. 
Topics and features: - Provides contributions from an international and interdisciplinary selection of experts in their fields Introduces the fundamentals of social media retrieval, presenting the most important areas of research in this domain - Examines the important topic of multimedia tagging in social environments, including geo-tagging Discusses issues of personalization and privacy in social media - Reviews advances in encoding, compression and network architectures for the exchange of social media information - Describes a range of applications related to social media Researchers and students interested in social media retrieval will find this book a valuable resource, covering a broad overview of state-of-the-art research and emerging trends in this area. The text will also be of use to practicing engineers involved in envisioning and building innovative social media applications and services.} } @ARTICLE{1385Krutz2012, AUTHOR = {Andreas Krutz and Alexander Glantz and Michael Tok and Marko Esche and Thomas Sikora}, TITLE = {Adaptive Global Motion Temporal Filtering for High Efficiency Video Coding}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)}, YEAR = {2012}, MONTH = dec, DOI = {10.1109/TCSVT.2012.2223012}, ABSTRACT = {Coding artifacts in video codecs can be reduced using several spatial in-loop filters which are part of the emerging video coding standard HEVC. In this paper, we introduce the concept of global motion temporal filtering (GMTF). A theoretical framework for a concept combining the temporal overlapping of several noisy versions of the same signal is introduced. This includes a model of the motion estimation error. As an important result it is shown that an optimum number of frames N for filtering exists. 
An implementation of the concept based on several versions of the HEVC test model using global motion compensated temporal filtering shows that significant gains can be achieved.} } @INPROCEEDINGS{1403Ide2012, AUTHOR = {Kai Ide and Thomas Sikora}, TITLE = {Real-Time Active Multiview 3D Reconstruction}, BOOKTITLE = {International Conference on Computer Vision in Remote Sensing}, YEAR = {2012}, MONTH = dec, ADDRESS = {Xiamen University, Xiamen, China}, PDF = {http://elvera.nue.tu-berlin.de/files/1403Ide2012.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1403Ide2012.pdf} } @INBOOK{1416Vazquez2013, AUTHOR = {Carlos Vazquez and Liang Zhang and Filippo Speranza and Nils Plath and and Sebastian Knorr}, TITLE = {2D-to-3D Video Conversion: Overview and Perspectives}, YEAR = {2013}, BOOKTITLE = {Emerging Technologies for 3D Video: Creation, Coding, Transmission and Rendering}, EDITOR = {Frédéric Dufaux, Béatrice Pesquet-Popescu, Marco Cagnazzo}, PUBLISHER = {Wiley}, CHAPTER = {3} } @ARTICLE{1418Plath2013, AUTHOR = {Nils Plath and Sebastian Knorr and Lutz Goldmann and Thomas Sikora}, TITLE = {Adaptive Image Warping for Hole Prevention in 3D View Synthesis}, JOURNAL = {IEEE Transactions on Image Processing, Special Issue on 3D Video Representation, Compression and Rendering}, YEAR = {2013} } @INPROCEEDINGS{1405Gottlieb2013, AUTHOR = {Luke Gottlieb and Jaeyoung Choi and Gerald Friedland and Pascal Kelm and Thomas Sikora}, TITLE = {On Pushing the Limits of Mechanical Turk: Qualifying the Crowd for Video Geolocation}, BOOKTITLE = {MULTIMEDIA COMMUNICATIONS TECHNICAL COMMITTEE IEEE COMMUNICATIONS SOCIETY}, YEAR = {2013}, MONTH = jan, PUBLISHER = {IEEE Communications Society}, PAGES = {27--29}, ORGANIZATION = {IEEE}, NOTE = {Volume 8, Number 1 - January 2013 EMERGING TOPICS: SPECIAL ISSUE ON "Multimedia and Cloud Computing" INDUSTRIAL COLUMN: SPECIAL ISSUE ON "Crowdsourcing-based Multimedia Systems"}, PDF = {http://elvera.nue.tu-berlin.de/files/1405Gottlieb2013.pdf}, DOI = 
{http://committees.comsoc.org/mmc/eletters.asp}, URL = {http://elvera.nue.tu-berlin.de/files/1405Gottlieb2013.pdf}, ABSTRACT = {This work was first appeared in Gottlieb et al. [1]. In this article we summarize the methods we took for finding skilled Mechanical Turk participants for our annotation task, which will be to determine the geolocation of random videos from the web. The task itself is unlike the standard setup for a Mechanical Turk task, in that it is difficult for both humans and machines, whereas a standard Mechanical Turk task is usually easy for humans and difficult or impossible for machines. There are several notable challenges to finding skilled workers for this task: First, we must find what we termed “honest operators”, i.e., people who will seriously attempt to do the task and not just click quickly through it to collect the bounty. Second, we need to develop meaningful qualification test set(s) that are challenging enough to allow us to qualify people for the real task, but were also solvable by individuals regardless of their culture or location, although English language understanding was required for instructions.} } @INPROCEEDINGS{1399Eiselein2013, AUTHOR = {Volker Eiselein and Tobias Senst and Ivo Keller and Thomas Sikora}, TITLE = {A motion-enhanced Hybrid Probability Hypothesis Density filter for real-time Multi-Human Tracking in video surveillance scenarios}, BOOKTITLE = {15th IEEE International Workshop on Performance Evaluation of Tracking and Surveillance (PETS 2013)}, YEAR = {2013}, MONTH = jan, PAGES = {6--13}, ADDRESS = {Clearwater Beach, USA}, DOI = {10.1109/PETS.2013.6523789} } @INPROCEEDINGS{1404Borgmann2013, AUTHOR = {Thilo Borgmann and Thomas Sikora}, TITLE = {Image Guided Cost Aggregation for Hierarchical Depth Map Fusion}, BOOKTITLE = {International Conference on Computer Vision Theory and Applications (VISAPP)}, YEAR = {2013}, MONTH = feb, ORGANIZATION = {INSTICC}, PDF = 
{http://elvera.nue.tu-berlin.de/files/1404Borgmann2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1404Borgmann2013.pdf}, ABSTRACT = {Estimating depth from a video sequence is still a challenging task in computer vision with numerous applica- tions. Like other authors we utilize two major concepts developed in this field to achieve that task which are the hierarchical estimation of depth within an image pyramid as well as the fusion of depth maps from different views. We compare the application of various local matching methods within such a combined approach and can show the relative performance of local image guided methods in contrast to commonly used fixed–window aggregation. Since efficient implementations of these image guided methods exist and the available hardware is rapidly enhanced, the disadvantage of their more complex but also parallel computation vanishes and they will become feasible for more applications.} } @INPROCEEDINGS{1402Schmiedeke2013, AUTHOR = {Sebastian Schmiedeke and Peng Xu and Isabelle Ferrane and Maria Eskevich and Christoph Kofler and Martha A. Larson and Yannick Esteve and Lori Lamel and Gareth J.F. 
Jones and Thomas Sikora}, TITLE = {Blip10000: A social Video Dataset containing SPUG Content for Tagging and Retrieval}, BOOKTITLE = {Proceedings of the 4th ACM Multimedia Systems Conference}, YEAR = {2013}, MONTH = feb, PUBLISHER = {ACM}, PAGES = {96--101}, ORGANIZATION = {ACM, New York, NY, USA}, ADDRESS = {Oslo, Norway}, NOTE = {IDSN = 978-1-4503-1894-5}, DOI = {10.1145/2483977.2483988}, URL = {http://doi.acm.org/10.1145/2483977.2483988} } @INPROCEEDINGS{1400Sikora2013, AUTHOR = {Michael Tok and Marko Esche and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {A Parametric Merge Candidate for High Efficiency Video Coding}, BOOKTITLE = {Data Compression Conference}, YEAR = {2013}, MONTH = mar, ORGANIZATION = {IEEE}, ADDRESS = {Snowbird, Utah}, PDF = {http://elvera.nue.tu-berlin.de/files/1400Sikora2013.pdf}, DOI = {10.1109/DCC.2013.11}, URL = {http://elvera.nue.tu-berlin.de/files/1400Sikora2013.pdf} } @INPROCEEDINGS{1401Sikora2013, AUTHOR = {Marko Esche and Michael Tok and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Efficient Quadtree Compression for Temporal Trajectory Filtering}, BOOKTITLE = {Data Compression Conference}, YEAR = {2013}, MONTH = mar, ADDRESS = {Snowbird, Utah}, DOI = {10.1109/DCC.2013.118} } @ARTICLE{1375Tok2013, AUTHOR = {Michael Tok and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Monte-Carlo-based Parametric Motion Estimation using a Hybrid Model Approach}, JOURNAL = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)}, YEAR = {2013}, MONTH = apr, PDF = {http://elvera.nue.tu-berlin.de/files/1375Tok2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1375Tok2013.pdf}, ABSTRACT = {Parametric motion estimation is an important task for various video processing applications such as analysis, segmentation or coding. The process for such an estimation has to satisfy three requirements. 
It has to be fast, accurate, and robust in presence of arbitrarily moving foreground objects at the same time. We introduce a two-step simplification scheme, suitable for Monte-Carlo-based perspective motion model estimation. For complexity reduction, the Helmholtz Tradeoff Estimator as well as Random Sample Consensus are enhanced with this scheme and applied on KLT features as well as on video stream macroblock motion vector fields. For the feature-based estimation, good trackable features are detected and tracked on raw video sequences. For the block-based approach, motion vector fields from encoded H.264/AVC video streams are used. Results indicate that the complexity of the whole estimation process can be reduced by a factor of up to 10,000 compared to state-of-the-art methods without loosing estimation precision.} } @INPROCEEDINGS{1409Senst2013, AUTHOR = {Tobias Senst and Volker Eiselein and Atta Badii and Mathieu Einig and Ivo Keller,Thomas Sikora}, TITLE = {A decentralized Privacy-sensitive Video Surveillance Framework}, BOOKTITLE = {IEEE International Conference on Digital Signal Processing (DSP 2013)}, YEAR = {2013}, MONTH = jul, ADDRESS = {Greece, Santorini}, NOTE = {DOI:10.1109/ICDSP.2013.6622765}, PDF = {http://elvera.nue.tu-berlin.de/files/1409Senst2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1409Senst2013.pdf}, ABSTRACT = {With the increasing spread of accurate and robust video surveillance, applications such as crowd monitoring, people counting and abnormal behavior recognition become ubiquitous.This leads to needs of interactive systems taking into account a high degree of interoperability as well as privacy protection concerns. 
In this paper we propose a framework based on the ONVIF specification to support the work of video operators while implementing a privacy-by-design concept.We use an OpenGL-based 3D model of the CCTV site where we display the results of the video analytics in an avatar-based manner and give an example application on mugging detection.To place the automatically detected scene information, such as people detections and event, a automatic camera calibration is used which effective reduces the deployment effort.} } @INPROCEEDINGS{1410Fradi2013, AUTHOR = {Hajer Fradi and Volker Eiselein and Ivo Keller and Jean-Luc Dugelay and Thomas Sikora}, TITLE = {Crowd Context-Dependent Privacy Protection Filters}, BOOKTITLE = {IEEE International Conference on Digital Signal Processing (DSP 2013)}, YEAR = {2013}, MONTH = jul, ADDRESS = {Greece, Santorini}, DOI = {10.1109/ICDSP.2013.6622808} } @INPROCEEDINGS{1411Evangelio2013, AUTHOR = {Rubén Heras Evangelio and Tobias Senst and Ivo Keller and Thomas Sikora}, TITLE = {Video Indexing and Summarization as a Tool for Privacy Protection}, BOOKTITLE = {IEEE International Conference on Digital Signal Processing (DSP 2013)}, YEAR = {2013}, MONTH = jul, ADDRESS = {Greece, Santorini}, PDF = {http://elvera.nue.tu-berlin.de/files/1411Evangelio2013.pdf}, DOI = {10.1109/ICDSP.2013.6622770}, URL = {http://elvera.nue.tu-berlin.de/files/1411Evangelio2013.pdf}, ABSTRACT = {The ever increasing number of surveillance camera networks being deployed all over the world has resulted in a high interest in the development of algorithms to automatically analyze the video footage, but has also opened new questions as how to efficiently manage the vast amount of information generated and, more important, how to protect the privacy of the individuals being recorded in their daily life. In this paper, we present a survey on video summarization techniques developed in order to efficiently access to the points of interest in the video footage. 
Thereby, we emphasize on the links that these techniques show with the task of privacy protection and draw lines of future research directions to incorporate indexing and summarization as tools for privacy protection by design.} } @ARTICLE{1429Zilly2013, AUTHOR = {Frederik Zilly and Christian Riechert and Marcus Müller and Peter Eisert and Thomas Sikora and Peter Kauff}, TITLE = {Real-time generation of multi-view video plus depth content using mixed narrow and wide baseline}, JOURNAL = {Journal of Visual Communication and Image Representation}, YEAR = {2013}, MONTH = jul, PAGES = {???}, VOLUME = {Volume 24,}, NUMBER = {Issue 8}, NOTE = {ISSN 1047-3203}, DOI = {http://dx.doi.org/10.1016/j.jvcir.2013.07.002}, URL = {http://www.sciencedirect.com/science/article/pii/S104732031300134X}, ABSTRACT = {Content production for stereoscopic 3D-TV displays has become mature in the past years while huge progress has also been achieved in the improvement of the image quality of glasses-free auto-stereoscopic displays and light-field displays. Concerning the latter two display families, the content production workflow is less elaborated and more complex, as the number of required views not only differs considerably but is also likely to increase in the near future. As a co-existence of all 3D display families can be expected for the next years, one aims to establish an efficient content production workflow which yields to high quality content for all 3D-TV displays. Against this background we present a real-time capable multi-view video plus depth (MVD) content production workflow based on a four-camera rig with mixed narrow and wide baseline. Results show the suitability of the approach to simultaneously produce high quality MVD4 and native stereoscopic 3D content} } @ARTICLE{1428Sun2013, AUTHOR = {J. Sun and J. Xie and J. 
Liu and Thomas Sikora}, TITLE = {Image Adaptation and Dynamic Browsing Based on Two-Layer Saliency Combination}, JOURNAL = {Broadcasting, IEEE Transactions on}, YEAR = {2013}, MONTH = jul, PAGES = {1}, VOLUME = {vol.PP}, NUMBER = {No.99}, DOI = {doi: 10.1109/TBC.2013.2272172}, URL = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6572828&isnumber=4359969}, ABSTRACT = {In recent years, image adaptation has attracted more and more attention during the evolution of the integration of broadcasting, Internet and telecommunications. The diversity of display devices requires images to be resized for optimal display on different terminals. In this paper, an image adaptation scheme and a dynamic browsing strategy are proposed, which are based on the visual attention model (VAM) with two-layer saliency optimization. First, a VAM is constructed by optimizing the image saliency according to the global and local layers by simulated annealing and the saliency of focus of attention (FOA) can be ranked by the obtained saliency map. Then an image adaptation scheme is designed based on the obtained saliency map. In the proposed adaptation scheme, each predominant FOA is modeled as a rectangular region, and the image is adjusted corresponding to the display terminal and the rank of FOA. Based on the principle of global features precedence, a dynamic browsing strategy is developed for browsing large images on small display devices. Experiments on saliency map show that the cross-layer saliency detection algorithm has advantages on detecting salient regions accurately and reflecting the shift of FOA implicitly, which are useful for image adaptation. 
In addition, subjective evaluation experiments demonstrate the proposed image adaptation scheme and dynamic browsing strategy are feasible and have promising applications in practice.} } @INPROCEEDINGS{1421Eiselein2013, AUTHOR = {Volker Eiselein and Hajer Fradi and Ivo Keller and Thomas Sikora and Jean-Luc Dugelay}, TITLE = {Enhancing Human Detection using Crowd Density Measures and an adaptive Correction Filter}, BOOKTITLE = {10th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2013}, MONTH = aug, ORGANIZATION = {IEEE}, ADDRESS = {Kraków, Polen}, URL = {http://ieeexplore.ieee.org/document/6636610/}, ABSTRACT = {In this paper we present a method of improving a human detector by means of crowd density information. Human detection is especially challenging in crowded scenes which makes it important to introduce additional knowledge into the detection process. We compute crowd density maps in order to estimate the spatial distribution of people in the scene and show how it is possible to enhance the detection results of a state-of-the-art human detector by this information. The proposed method applies a self-adaptive, dynamic parametrization and as an additional contribution uses scene-adaptive learning of the human aspect ratio in order to reduce false positive detections in crowded areas. 
We evaluate our method on videos from different datasets and demonstrate how our system achieves better results than the baseline algorithm.} } @INPROCEEDINGS{1425Evangelio2013, AUTHOR = {Rubén Heras Evangelio and Ivo Keller and Thomas Sikora}, TITLE = {Multiple Cue Indexing and Summarization of Surveillance Video}, BOOKTITLE = {10th IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2013}, MONTH = aug, ADDRESS = {Kraków, Poland}, PDF = {http://elvera.nue.tu-berlin.de/files/1425Evangelio2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1425Evangelio2013.pdf}, ABSTRACT = {In this paper we propose a system for the summarization of safety and security surveillance video. By combining the information provided by multiple analysis cues, we improve the quality of the information extracted out of the analyzed video sequences with respect to the state-of-the-art approaches, therefore, being able to generate summaries that better align with the content of the original video. 
The proposed system has been tested using an extensive set of surveillance sequences, showing compression ratios ranging from 11 to 114, depending on the video content and the configuration of the system.} } @INPROCEEDINGS{1414Tok2013, AUTHOR = {Michael Tok and Marko Esche and Thomas Sikora}, TITLE = {A Dynamic Model Buffer for Parametric Motion Vector Prediction in Random-Access Coding Scenarios}, BOOKTITLE = {20th IEEE International Conference on Image Processing}, YEAR = {2013}, MONTH = sep, PUBLISHER = {IEEE}, ORGANIZATION = {IEEE}, ADDRESS = {Melbourne, Australia}, PDF = {http://elvera.nue.tu-berlin.de/files/1414Tok2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1414Tok2013.pdf} } @INPROCEEDINGS{1415Esche2013, AUTHOR = {Marko Esche and Michael Tok and Thomas Sikora}, TITLE = {Adaptive Dense Vector Field Interpolation for Temporal Filtering}, BOOKTITLE = {20th IEEE International Conference on Image Processing}, YEAR = {2013}, MONTH = sep, PUBLISHER = {IEEE}, ORGANIZATION = {IEEE}, ADDRESS = {Melbourne, Australia}, DOI = {10.1109/ICIP.2013.6738395} } @INPROCEEDINGS{1419Ide2013, AUTHOR = {Kai Ide and Ivo Keller and Thomas Sikora}, TITLE = {Consensus-Based Multiview Texturing and Depth-Map Completion}, BOOKTITLE = {20th IEEE International Conference on Image Processing}, YEAR = {2013}, MONTH = sep, ORGANIZATION = {IEEE}, ADDRESS = {Melbourne, Australia}, PDF = {http://elvera.nue.tu-berlin.de/files/1419Ide2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1419Ide2013.pdf}, ABSTRACT = {We describe an active depth imaging system based on phase measuring triangulation. Typically depth-maps generated with such 3D scanning systems suffer from occlusions and imperfections, especially in the vicinity of depth discontinuities. Applying multiple color images, captured with a camera array, for view synthesis from the erroneous depth-maps can result in severe texturing artifacts. 
Our consensus-based approach greatly reduces these artifacts by comparing the similarity of the multiview texture images during the blending process to detect outliers in the form of foreground texture projected on background surfaces and specular ambiguity. Additionally, the approach is applied to dramatically improve the depth-maps by generating multiple depth-map hypotheses and selecting the areas of each that have the highest consensus with the set of multiview texture images. Our approach yields accurate and occlusion-free depth-maps in real-time.} } @INPROCEEDINGS{1422Senst2013, AUTHOR = {Tobias Senst and Jonas Geistert and Ivo Keller and Thomas Sikora}, TITLE = {Robust Local Optical Flow Estimation using Bilinear Equations for Sparse Motion Estimation}, BOOKTITLE = {20th IEEE International Conference on Image Processing}, YEAR = {2013}, MONTH = sep, PAGES = {2499--2503}, ADDRESS = {Melbourne, Australia}, NOTE = {DOI:10.1109/ICIP.2013.6738515}, PDF = {http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1422Senst2013.pdf}, ABSTRACT = {This article presents a theoretical framework to decrease the computation effort of the Robust Local Optical Flow method which is based on the Lucas Kanade method. We show mathematically,how to transform the iterative scheme of the feature tracker into a system of bilinear equations and thus estimate the motion vectors directly by analyzing its zeros. Furthermore, we show that it is possible to parallelise our approach efficiently on a GPU, thus, outperforming the current OpenCV-OpenCL implementation of the pyramidal Lucas Kanade method in terms of runtime and accuracy. 
Finally, an evaluation is given for the Middlebury Optical Flow and the KITTI datasets.} } @INPROCEEDINGS{1426Schmiedeke2013, AUTHOR = {Sebastian Schmiedeke and Pascal Kelm and Thomas Sikora}, TITLE = {DCT-based Features for Categorisation of Social Media in Compressed Domain}, BOOKTITLE = {15th IEEE International Workshop on Multimedia Signal Processing}, YEAR = {2013}, MONTH = sep, EDITOR = {IEEE}, PUBLISHER = {IEEE}, PAGES = {295--300}, ORGANIZATION = {IEEE}, NOTE = {ISBN 978-1-4799-0125-8 Copyright and Reprint Permission: Abstracting is permitted with credit to the source. libraries are permitted to photocopy beyond the limit of U.S. copyright law for private use of patrons those articles in this volumen that carry a code at the bottom of the first page, provided the per-copy fee indicated in the code is paid through Copyright Clearance Center, 222 Rosewood Drive, Danvers, MA 01923. For other copying, reprint or republication permission, write to IEEE Copyrights Manager, IEEE Operations Center, 445 Hoes lane, Piscataway, NJ 08854. All rights reserved. Copyright ©2013 IEEE.}, PDF = {http://elvera.nue.tu-berlin.de/files/1426Schmiedeke2013.pdf}, DOI = {10.1109/MMSP.2013.6659304}, URL = {http://elvera.nue.tu-berlin.de/files/1426Schmiedeke2013.pdf}, ABSTRACT = {These days the sharing of videos is very popular in social networks. Many of these social media websites such as Flickr, Facebook and YouTube allows the user to manually label their uploaded videos with textual information. However, the manually labelling for a large set of social media is still boring and error-prone. For this reason we present a fast algorithm for categorisation of videos in social media platforms without decoding them. The paper shows a data-driven approach which makes use of global and local features from the compressed domain and achieves a mean average precision of 0.2498 on the Blip10k dataset. 
In comparison with existing retrieval approaches at the MediaEval Tagging Task 2012 we will show the effectiveness and high accuracy relative to the state-of-the art solutions.} } @INPROCEEDINGS{1432Schmiedeke2013, AUTHOR = {Sebastian Schmiedeke and Pascal Kelm and and Thomas Sikora}, TITLE = {TUB @ MediaEval 2013 Visual Privacy Task: Reversible Scrambling with colour-preservative Characteristic}, BOOKTITLE = {Proceedings of the MediaEval 2013 Multimedia Benchmark Workshop}, YEAR = {2013}, MONTH = oct, EDITOR = {Martha Larson, Xavier Anguera, Timo Reuter, Gareth J.F. Jones, Bogdan Ionescu, Markus Schedl, Tomas Piatrik, Claudia Hauff, Mohammad Soleymani}, PUBLISHER = {CEUR-WS}, PAGES = {128--129}, ADDRESS = {Barcelona, Spain}, NOTE = {URN: urn:nbn:de:0074-1043-4 ARCHIVE: ftp://SunSITE.Informatik.RWTH-Aachen.DE/pub/publications/CEUR-WS/Vol-1043.zip ISSN 1613-0073}, PDF = {http://elvera.nue.tu-berlin.de/files/1432Schmiedeke2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1432Schmiedeke2013.pdf}, ABSTRACT = {This paper describes our participation in the Visual Privacy Task of MediaEval 2013, which aims to obscure human occurrence in image sequences. As a result the recorded person should be unrecognisable. We use an approach which pseudo-randomly scrambles pixels within specified regions. This technique is reversible and preserves the colour characteristic of each region. So, colour-based approaches will still be able to automatically distinguish between differently dressed individuals. The evaluations of our results show that the privacy aspect got a very high score in both objective and subjective metrics. 
Our approach has a lack of intelligibility since it was measured by applying the Histogram of Oriented Gradients which might be fail on scrambled areas since edges are not preserved.} } @INPROCEEDINGS{1469Sikora2013, AUTHOR = {Volker Eiselein and Tobias Senst and Ivo Keller and Thomas Sikora}, TITLE = {TUB @ MediaEval 2013 Visual Privacy Task: Using Adaptive Edge Detection for Privacy in Surveillance Videos}, BOOKTITLE = {Proceedings of the MediaEval 2013 Multimedia Benchmark Workshop}, YEAR = {2013}, MONTH = oct, ADDRESS = {Barcelona, Spain}, NOTE = {DBLP:conf/mediaeval/2013}, DOI = {10.13140/2.1.3640.1445}, URL = {http://ceur-ws.org/Vol-1043/mediaeval2013_submission_76.pdf}, ABSTRACT = {n this paper we present a system for preserving the privacy of individuals in a video surveillance scenario. While a person’s privacy should not be revealed to a viewer of the video without special needs, it is still important that the action in a scene as the semantic content of a video remain perceivable by a human observer. The proposed system uses edge detection and adaptive thresholding in order to estimate the persons’ silhouettes in a video scene and thus rendering most of their actions visible, while hiding sensitive personal information. 
In order to obtain a more complete contour around a person, an adaptive thresholding scheme using edge histograms is used as well as background subtraction which limits the edge extraction to foreground masks and thus avoids distraction of the viewer’s eyes to background structures.} } @INPROCEEDINGS{1423Choi2013, AUTHOR = {Jaeyoung Choi and Venkatesan Ekambaram and Howard Lei and Pascal Kelm and Luke Gottlieb and Thomas Sikora and Kannan Ramchandran and Gerald Friedland}, TITLE = {Human vs Machine: Establishing a Human Baseline for Multimodal Location Estimation}, BOOKTITLE = {Human vs Machine: Establishing a Human Baseline for Multimodal Location Estimation}, YEAR = {2013}, MONTH = oct, ORGANIZATION = {ACM}, ABSTRACT = {Over the recent years, the problem of video location estimation (i.e., estimating the longitude/latitude coordinates of a video without GPS information) has been approached with diverse methods and ideas in the research community and significant improvements have been made. So far, however, systems have only been compared against each other and no systematic study on human performance has been conducted. Based on a human-subject study with over 11,000 experiments, this article presents a human baseline for location estimation for different combinations of modalities (au- dio, audio/video, audio/video/text). Furthermore, this article reports on the comparison of the accuracy of state-of-the-art location estimation systems with the human baseline. Although the overall performance of humans’ multimodal video location estimation is better than current machine learning approaches, the difference is quite small: For 41 % of the test set, the machine’s accuracy was superior to the humans. We present case studies and discuss why machines did better for some videos and not for others. 
Our analysis suggests new directions and priorities for future work on the improvement of location inference algorithms.} } @INPROCEEDINGS{1427Kelm2013, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Jaeyoung Choi and Gerald Friedland and Venkatesan Nallampatti Ekambaram and Kannan Ramchandran and Thomas Sikora}, TITLE = {A Novel Fusion Method for Integrating Multiple Modalities and Knowledge for Multimodal Location Estimation}, BOOKTITLE = {Proceedings of the 2nd ACM International Workshop on Geotagging and Its Applications in Multimedia}, YEAR = {2013}, MONTH = oct, PUBLISHER = {ACM, New York, NY, USA}, PAGES = {7--12}, ADDRESS = {Barcelona, Spain}, NOTE = {isbn = 978-1-4503-2391-8 acmid = 2509238}, DOI = {10.1145/2509230.2509238}, URL = {http://doi.acm.org/10.1145/2509230.2509238}, ABSTRACT = {This article describes a novel fusion approach for multiple modalities and knowledge sources that improves the accuracy of multimodal location estimation algorithms. The problem of "multimodal location estimation" or "placing" consists of associating geo-locations to consumer-produced multimedia data such as videos or photos that have not been tagged using GPS. Our algorithm effectively integrates the visual and textual modalities with external geographical knowledge bases by building a hierarchical model that combines both data-driven as well as semantic methods to group visual and textual features together into geographical regions. 
We evaluate our algorithm on the MediaEval 2010 Placing Task data set and show that our system outperforms the state of the art significantly, achieving to locate about 40% of the videos within an accuracy radius of 100m.} } @INPROCEEDINGS{1449Cao2013, AUTHOR = {Liangliang Cao and Gerald Friedland and Pascal Kelm}, TITLE = {Second ACM Multimedia Workshop on Geotagging and Its Applications in Multimedia (GeoMM 2013)}, BOOKTITLE = {ACM Multimedia}, YEAR = {2013}, MONTH = oct, PDF = {http://elvera.nue.tu-berlin.de/files/1449Cao2013.pdf}, DOI = {http://dx.doi.org/10.1145/2502081.2503829}, URL = {http://elvera.nue.tu-berlin.de/files/1449Cao2013.pdf}, ABSTRACT = {The Workshop on Geotagging and Its Applications in Mul- timedia (GeoMM 2013) focuses on new applications and methods of geotagging and in geo-location support systems. As the location based multimedia becomes more and more popular in the era of Web and mobile applications, the in- crease in the use of geotagging and improvements in geo- location support systems open up a new dimension for the description, organization and manipulation of multimedia data. This new dimension radically expands the usefulness of multimedia data both for daily users of the Internet and social networking sites as well as for experts in particular application scenarios. 
The workshop serves as a venue for the premier research in geotagging and multimedia, and continues to attract submissions from a diverse set of researchers, who address newly arising problems within this emerging field.} } @ARTICLE{1430Arvanitidou2013, AUTHOR = {Marina Georgia Arvanitidou and Michael Tok and Alexander Glantz and Andreas Krutz and Thomas Sikora}, TITLE = {Motion-based object segmentation using hysteresis and bidirectional inter-frame change detection in sequences with moving camera}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2013}, MONTH = nov, PAGES = {1420--1434}, VOLUME = {28}, NUMBER = {10}, PDF = {http://elvera.nue.tu-berlin.de/files/1430Arvanitidou2013.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1430Arvanitidou2013.pdf}, ABSTRACT = {We present an unsupervised motion-based object segmentation algorithm for video sequences with moving camera, employing bidirectional inter-frame change detection. For every frame, two error frames are generated using motion compensation. They are combined and a segmentation algorithm based on thresholding is applied. We employ a simple and effective error fusion scheme and consider spatial error localization in the thresholding step. We find the optimal weights for the weighted mean thresholding algorithm that enables unsupervised robust moving object segmentation. Further, a post processing step for improving the temporal consistency of the segmentation masks is incorporated and thus we achieve improved performance compared to previously proposed methods. 
The experimental evaluation and compari- son with other methods demonstrates the validity of the proposed method.} } @INPROCEEDINGS{1438Skupin2014, AUTHOR = {Robert Skupin and Thilo Borgmann and Thomas Sikora}, TITLE = {Multiview Point Cloud Filtering for Spatiotemporal Consistency}, BOOKTITLE = {International Conference on Computer Vision Theory and Applications (VISAPP)}, YEAR = {2014}, MONTH = jan, PUBLISHER = {SCITEPRESS Digital Library}, PAGES = {531--538}, ORGANIZATION = {INSTICC}, PDF = {http://elvera.nue.tu-berlin.de/files/1438Skupin2014.pdf}, DOI = {10.5220/0004681805310538}, URL = {http://elvera.nue.tu-berlin.de/files/1438Skupin2014.pdf}, ABSTRACT = {This work presents algorithms to resample and filter point cloud data reconstructed from multiple cameras and multiple time instants. In an initial resampling stage, a voxel or a surface mesh based approach resamples the point cloud data into a common sampling grid. Subsequently, the resampled data undergoes a filtering stage based on clustering to remove artifacts and achieve spatiotemporal consistency across cameras and time instants. The presented algorithms are evaluated in a view synthesis scenario. Results show that view synthesis with enhanced depth maps as produced by the algorithms leads to less artifacts than synthesis with the original source data.} } @INPROCEEDINGS{1434Esche2014, AUTHOR = {Marko Esche and Michael Tok and Thomas Sikora}, TITLE = {Theoretical Considerations Concerning Pixelwise Temporal Filtering}, BOOKTITLE = {Data Compression Conference}, YEAR = {2014}, MONTH = mar, EDITOR = {IEEE}, PAGES = {73--82}, ADDRESS = {Snowbird, Utah}, DOI = {10.1109/DCC.2014.20}, ABSTRACT = {Temporal inloop filters present one possible way to reduce noise introduced in compressed video sequences at low bit rates. Some of these filtering approaches make use of the quantized and generally noisy motion information conveyed in the bit stream generated by the encoder. 
One key feature of such filters is an adaptive filter length depending on the image content and the quality of the motion field. This paper derives mathematical equations to model the behavior of one such filter in the presence of noisy motion vectors. The predicted optimal filter lengths are demonstrated to have a global optimum. They also show strong correlation with a real-world implementation of the previously introduced Temporal Trajectory Filter based on the HEVC main profile.} } @ARTICLE{1441Evangelio2014, AUTHOR = {Rubén Heras Evangelio and Michael Pätzold and Ivo Keller and Thomas Sikora}, TITLE = {Adaptively Splitted GMM with Feedback Improvement for the Task of Background Subtraction}, JOURNAL = {IEEE Transactions on Information Forensics & Security}, YEAR = {2014}, MONTH = may, PAGES = {863--874}, VOLUME = {9}, NUMBER = {5}, DOI = {10.1109/TIFS.2014.2313919}, URL = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6778782}, ABSTRACT = {Per pixel adaptive Gaussian Mixture Models (GMM) have become a popular choice for the detection of change in the video surveillance domain because of their ability to cope with many challenges characteristic for surveillance systems in real-time with low memory requirements. Since their first introduction in the surveillance domain, GMM have been enhanced in many directions. In this paper, we present a study of some relevant GMM approaches and analyze their underlying assumptions and design decisions. Based on this study, we show how these systems can be further improved by means of a variance controlling scheme and the incorporation of region analysis based feedback. The proposed system has been thoroughly evaluated using the extensive dataset of the IEEE Workshop on Change Detection, showing an outranking performance in comparison with state-of-the-art methods.} } @ARTICLE{1450Zilly2014, AUTHOR = {F. Zilly and C. Riechert and M. Müller and P. Eisert and T. Sikora and P. 
Kauff}, TITLE = {Real-time generation of multi-view video plus depth content using mixed narrow and wide baseline}, JOURNAL = {Journal of Visual Communication and Image Representation, Special Issue on 3D Video Processing}, YEAR = {2014}, MONTH = may, PAGES = {632--648}, VOLUME = {25}, NUMBER = {4}, DOI = {10.1016/j.jvcir.2013.07.002}, ABSTRACT = {Content production for stereoscopic 3D-TV displays has become mature in the past years while huge progress has also been achieved in the improvement of the image quality of glasses-free auto-stereoscopic displays and light-field displays. Concerning the latter two display families, the content production workflow is less elaborated and more complex, as the number of required views not only differs considerably but is also likely to increase in the near future. As a co-existence of all 3D display families can be expected for the next years, one aims to establish an efficient content production workflow which yields to high quality content for all 3D-TV displays. Against this background we present a real-time capable multi-view video plus depth (MVD) content production workflow based on a four-camera rig with mixed narrow and wide baseline. 
Results show the suitability of the approach to simultaneously produce high quality MVD4 and native stereoscopic 3D content.} } @INPROCEEDINGS{1443Adderley2014, AUTHOR = {Richard Adderley and Atta Badii and Ruben Heras Evangelio and Matteo Raffaelli and Patrick Seidler and Marco Tiemann}, TITLE = {MOSAIC: A Multi-modal Surveillance System to Enhance Situation Awareness and Decision Making}, BOOKTITLE = {HCI International 2014}, YEAR = {2014}, MONTH = jun, PUBLISHER = {Springer International Publishing}, PAGES = {141--146}, ADDRESS = {Heraklion, Crete, Greece}, NOTE = {Print ISBN 978-3-319-07856-4 Online ISBN 978-3-319-07857-1}, DOI = {10.1007/978-3-319-07857-1_25}, URL = {http://link.springer.com/chapter/10.1007%2F978-3-319-07857-1_25}, ABSTRACT = {With increasing complexity of systems under surveillance, demand grows for automated video-based surveillance systems which are able to support end users in making sense of situational context from the amount of available data and incoming data streams. Traditionally, those systems have been developed based on techniques derived from the fields of image processing and pattern recognition. 
This paper presents MOSAIC (Multi-Modal Situation Assessment and Analytics Platform), a system which aims at exploiting multi-modal data analysis comprising advanced tools for video analytics, text mining, social network analysis, and decision support in order to provide from a richer context an understanding of behaviour of the system under surveillance and to support police personnel in decision making processes.} } @INPROCEEDINGS{1452García-Martín2014, AUTHOR = {Álvaro García-Martín and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {A Multi-configuration Part-based Person Detector}, BOOKTITLE = {International Conference on Signal Processing and Multimedia Applications, SIGMAP}, YEAR = {2014}, MONTH = aug, PUBLISHER = {SCITEPRESS Digital Library}, PAGES = {321--328}, ADDRESS = {Vienna, Austria}, DOI = {10.5220/0005126703210328}, URL = {http://www.scitepress.org/DigitalLibrary/Link.aspx?doi=10.5220/0005126703210328}, ABSTRACT = {People detection is a task that has generated a great interest in the computer vision and specially in the surveillance community. One of the main problems of this task in crowded scenarios is the high number of occlusions deriving from persons appearing in groups. In this paper, we address this problem by combining individual body part detectors in a statistical driven way in order to be able to detect persons even in case of failure of any detection of the body parts, i.e., we propose a generic scheme to deal with partial occlusions. We demonstrate the validity of our approach and compare it with other state of the art approaches on several public datasets. In our experiments we consider sequences with different complexities in terms of occupation and therefore with different number of people present in the scene, in order to highlight the benefits and difficulties of the approaches considered for evaluation. 
The results show that our approach improves the results provided by state of the art approaches specially in the case of crowded scenes.} } @INPROCEEDINGS{1462Badii2014, AUTHOR = {Atta Badii and Marco Tiemann and Richard Adderley and Patrick Seidler and Ruben Heras Evangelio and Tobias Senst and Thomas Sikora and Luca Panattoni and Matteo Raffaelli and Matthew Cappel-Porter and Zsolt L. Husz and Thomas Hecker and Ines Peters}, TITLE = {MOSAIC - Multimodal Analytics for the Protection of Critical Assets}, BOOKTITLE = {International Conference on Signal Processing and Multimedia Applications, SIGMAP}, YEAR = {2014}, MONTH = aug, PUBLISHER = {SCITEPRESS Digital Library}, PAGES = {311--320}, ADDRESS = {Vienna, Austria}, DOI = {10.5220/0005126503110320}, URL = {http://www.scitepress.org/DigitalLibrary/Link.aspx?doi=10.5220/0005126503110320}, ABSTRACT = {This paper presents an overview of the MOSAIC architecture and the validated Demonstrator resulting from an EU-co-funded research project concerned with the development of an advanced system for the use and integration of multimodal analytics for the protection of critical assets. The paper motivates the MOSAIC vision and describes the major components of the integrated solution; including the ontological framework, the data representation, text mining, data mining, video analytics, social network analysis and decision support. 
In the descriptions of these components, it is illustrated how MOSAIC can be used to improve the protection of critical assets without necessitating data gathering that goes beyond what is already currently being gathered by relevant security organisations such as police forces by improving data analytics techniques, integration of analysis outputs and decision support mechanisms.} } @INPROCEEDINGS{1453García-Martín2014, AUTHOR = {Álvaro García-Martín and Rubén Heras Evangelio and Thomas Sikora}, TITLE = {Multi-configurations for Part-based Person Detectors}, BOOKTITLE = {Third International Workshop on Parts and Attributes, in Conjunction with the European Conference on Computer Vision (ECCV 2014)}, YEAR = {2014}, MONTH = sep, ADDRESS = {Zurich, Switzerland}, URL = {https://filebox.ece.vt.edu/~parikh/PnA2014/posters/Posters/GarciaPnA2014.pdf} } @INPROCEEDINGS{1457Schmiedeke2014, AUTHOR = {Sebastian Schmiedeke and Pascal Kelm and Lutz Goldmann and Thomas Sikora}, TITLE = {TUB @ MediaEval 2014 Visual Privacy Task: Reversible Scrambling on Foreground Masks}, BOOKTITLE = {Proceedings of the MediaEval 2014 Multimedia Benchmark Workshop}, YEAR = {2014}, MONTH = oct, EDITOR = {Martha Larson, Bogdan Ionescu, Xavier Anguera, Maria Eskevich, Pavel Korshunov, Markus Schedl, Mohammad Soleymani, Georgios Petkos, Richard Sutcliffe, Jaeyoung Choi, Gareth J.F. 
Jones}, PUBLISHER = {CEUR-WS}, PAGES = {73--74}, PDF = {http://elvera.nue.tu-berlin.de/files/1457Schmiedeke2014.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1457Schmiedeke2014.pdf} } @INPROCEEDINGS{1519Badii2014, AUTHOR = {Atta Badii and Touradj Ebrahimi and Christian Fedorczak and Pavel Koshunov and Tomas Piatrik and Volker Eiselein and Ahmed Al-Obaidi}, TITLE = {Overview of the MediaEval 2014 Visual Privacy Task}, BOOKTITLE = {MediaEval 2014 Workshop}, YEAR = {2014}, MONTH = oct, URL = {http://ceur-ws.org/Vol-1263/mediaeval2014_submission_37.pdf} } @INPROCEEDINGS{1456Eiselein2014, AUTHOR = {Volker Eiselein and Gleb Sternharz and Tobias Senst and Ivo Keller and Thomas Sikora}, TITLE = {Person re-Identification using Region Covariance in a Multi-feature Approach}, BOOKTITLE = {International Conference on Image Analysis and Recognition (ICIAR) 2014}, YEAR = {2014}, MONTH = oct, URL = {http://link.springer.com/chapter/10.1007%2F978-3-319-11755-3_9#page-1}, ABSTRACT = {Person re-identification is an important requirement for modern video surveillance systems and relevant for human tracking, especially over camera networks. Many different approaches have been proposed but a robust identification under real-life conditions still remains hard. In this paper we investigate the fusion of multiple person descriptors in order to increase the performance using complementary feature vectors. As an additional improvement to state-of-the-art region covariance descriptors, an extension of the comparison metric is proposed which increases the robustness and performance of the system in cases of rank deficiency. 
The proposed system is evaluated on the well-known benchmarks CAVIAR4REID, VIPeR, ETHZ and PRID 2011 and shows significant improvements over existing re-identification algorithms.} } @INPROCEEDINGS{1447Senst2014, AUTHOR = {Tobias Senst and Volker Eiselein and Ivo Keller and Thomas Sikora}, TITLE = {Crowd Analysis in non-static Cameras using Feature Tracking and Multi-Person Density}, BOOKTITLE = {21st IEEE International Conference on Image Processing}, YEAR = {2014}, MONTH = oct, PAGES = {6041--6045}, ADDRESS = {Paris, France}, NOTE = {ISBN: 978-1-4799-5750-7 DOI:10.1109/ICIP.2014.7026219}, PDF = {http://elvera.nue.tu-berlin.de/files/1447Senst2014.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1447Senst2014.pdf}, ABSTRACT = {We propose a new methodology for crowd analysis by introducing the concept of Multi-Person Density. Using a state-of-the-art feature tracking algorithm, representative low-level features and their long-term motion information are extracted and combined into a human detection model. In contrast to previously proposed techniques, the proposed method takes small camera motion into account and is not affected by camera shaking. This increases the robustness of separating crowd features from background and thus opens a whole new field for application of these techniques in non-static CCTV cameras. 
We show the effectiveness of our approach on various test videos and compare it to state-of-the-art people counting methods.} } @INPROCEEDINGS{1448Senst2014, AUTHOR = {Tobias Senst and Thilo Borgmann and Ivo Keller and Thomas Sikora}, TITLE = {Cross based Robust Local Optical Flow}, BOOKTITLE = {21st IEEE International Conference on Image Processing}, YEAR = {2014}, MONTH = oct, PAGES = {1967--1971}, ADDRESS = {Paris, France}, NOTE = {ISBN: 978-1-4799-5750-7 DOI:10.1109/ICIP.2014.7025394}, PDF = {http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1448Senst2014.pdf}, ABSTRACT = {In many computer vision applications local optical flow methods are still widely used. Such methods, like the Pyramidal Lucas Kanade and the Robust Local Optical Flow, have to address the trade-off between run time and accuracy. In this work we propose an extension to these methods that improves the accuracy especially at object boundaries. This extension makes use of the cross based variable support region generation proposed in Zhang2009 accounting for local intensity discontinuities. In the evaluation using Middlebury data set we prove the ability of the proposed extension to increase the accuracy by a slight increase of run time.} } @INPROCEEDINGS{1466Verhack2014, AUTHOR = {Ruben Verhack and Andreas Krutz and Peter Lambert and Rik Van de Walle and Thomas Sikora}, TITLE = {[Top 10\% Paper] LOSSY IMAGE CODING IN THE PIXEL DOMAIN USING A SPARSE STEERING KERNEL SYNTHESIS APPROACH}, BOOKTITLE = {21st IEEE International Conference on Image Processing}, YEAR = {2014}, MONTH = oct, PAGES = {4807--4811}, ADDRESS = {Paris, France}, NOTE = {ISBN: 978-1-4799-5750-7}, PDF = {http://elvera.nue.tu-berlin.de/files/1466Verhack2014.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1466Verhack2014.pdf}, ABSTRACT = {Kernel regression has been proven successful for image denoising, deblocking and reconstruction. 
These techniques lay the foundation for new image coding opportunities. In this pa- per, we introduce a novel compression scheme: Sparse Steer- ing Kernel Synthesis Coding (SSKSC). This pre- and post- processor for JPEG performs non-uniform sampling based on the smoothness of an image, and reconstructs the miss- ing pixels using adaptive kernel regression. At the same time, the kernel regression reduces the blocking artifacts from the JPEG coding. Crucial to this technique is that non-uniform sampling is performed while maintaining only a small over- head for signalization. Compared to JPEG, SSKSC achieves a compression gain for low bits-per-pixel regions of 50% or more for PSNR and SSIM. A PSNR gain is typically in the 0.0 - 0.5 bpp range, and an SSIM gain can mostly be achieved in the 0.0 - 1.0 bpp range.} } @ARTICLE{1461Gottlieb2014, AUTHOR = {Luke Gottlieb and Gerald Friedland and Jaeyoung Choi and Pascal Kelm and Thomas Sikora}, TITLE = {Creating Experts From the Crowd: Techniques for Finding Workers for Difficult Tasks}, JOURNAL = {IEEE Transactions on Multimedia}, YEAR = {2014}, MONTH = nov, PAGES = {2075--2079}, VOLUME = {16}, NUMBER = {7}, DOI = {10.1109/TMM.2014.2347268}, URL = {http://dx.doi.org/10.1109/TMM.2014.2347268}, ABSTRACT = {Crowdsourcing is currently used for a range of applications, either by exploiting unsolicited user-generated content, such as spontaneously annotated images, or by utilizing explicit crowdsourcing platforms such as Amazon Mechanical Turk to mass-outsource artificial-intelligence-type jobs. However, crowdsourcing is most often seen as the best option for tasks that do not require more of people than their uneducated intuition as a human being. This article describes our methods for identifying workers for crowdsourced tasks that are difficult for both machines and humans. 
It discusses the challenges we encountered in qualifying annotators and the steps we took to select the individuals most likely to do well at these tasks.} } @INPROCEEDINGS{1464Sikora2014, AUTHOR = {Nils Plath and Lutz Goldmann and Alexander Nitsch and Sebastian Knorr and Thomas Sikora}, TITLE = {Line-preserving hole-filling for 2D-to-3D conversion}, BOOKTITLE = {European Conference on Visual Media Production}, YEAR = {2014}, MONTH = nov, PUBLISHER = {ACM New York, NY, USA}, ADDRESS = {London}, NOTE = {ISBN: 978-1-4503-3185-2}, DOI = {10.1145/2668904.2668931}, ABSTRACT = {Many 2D-to-3D conversion techniques rely on image-based rendering methods in order to synthesize 3D views from monoscopic images. This leads to holes in the generated views due to previously occluded objects becoming visible for which no texture information is available. Approaches attempting to alleviate the effects of these artifacts are referred to as hole-filling. This paper proposes a method which determines a non-uniform deformation of the stereoscopic view such that no holes are visible. Additionally, an energy term is devised, which prevents straight lines in the input image from being bent due to the non-uniform image warp. This is achieved by constructing a triangle mesh, which approximates the depth map of the input image and by integrating a set of detected lines into it. The line information is incorporated into the underlying optimization problem in order to prevent bending of the lines. 
The evaluation of the proposed algorithm on a comprehensive dataset with a variety of scenes shows that holes are efficiently filled without obvious background distortions.} } @INBOOK{1458Kelm2014, AUTHOR = {Pascal Kelm and Sebastian Schmiedeke and Steven Schockaert and Thomas Sikora and Michele Trevisiol and Olivier Van Laere}, TITLE = {Georeferencing Flickr resources based on multimodal features}, YEAR = {2014}, BOOKTITLE = {Multimodal Location Estimation of Videos and Images}, EDITOR = {Jaeyoung Choi, Gerald Friedland}, PUBLISHER = {Springer International Publishing}, PAGES = {127--152}, CHAPTER = {8}, DOI = {10.1007/978-3-319-09861-6}, ABSTRACT = {The popularity of social media, and location based services in particular, has led to a vast increase in the number of georeferenced resources on the web. Examples are the large numbers of Flickr photos, Twitter posts (tweets), and Wikipedia articles for which explicit geographic coordinates are currently available. This trend has also led to an upsurge in research into methods for automatically assigning coordinates to web content. Being able to associate coordinates to web content is important in applications such as geographic information retrieval, where search results are adapted to the location of the user, and in applications which rely on characterizing places, e.g., for offering personalized travel recommendations.} } @INBOOK{1459Larson2014, AUTHOR = {Martha Larson and Pascal Kelm and Adam Rae and Claudia Hauff and Bart Thomee and Michele Trevisiol and Jaeyoung Choi and Olivier Van Laere and Steven Schockaert and Gareth J.F. 
Jones and Pavel Serdyukov and Vanessa Murdock and Gerald Friedland}, TITLE = {The Benchmark as a Research Catalyst: Charting the Progress of Geo-prediction for Social Multimedia}, YEAR = {2014}, BOOKTITLE = {Multimodal Location Estimation of Videos and Images}, EDITOR = {Jaeyoung Choi, Gerald Friedland}, PUBLISHER = {Springer International Publishing}, PAGES = {5--40}, CHAPTER = {2}, DOI = {10.1007/978-3-319-09861-6_2}, ABSTRACT = {Benchmarks have the power to bring research communities together to focus on specific research challenges. They drive research forward by making it easier to systematically compare and contrast new solutions, and evaluate their performance with respect to the existing state of the art. In this chapter, we present a retrospective on the Placing Task, a yearly challenge offered by the MediaEval Multimedia Benchmark. The Placing Task, launched in 2010, is a benchmarking task that requires participants to develop algorithms that automatically predict the geolocation of social multimedia (videos and images). This chapter covers the editions of the Placing Task offered in 2010–2013, and also presents an outlook onto 2014. We present the formulation of the task and the task dataset for each year, tracing the design decisions that were made by the organizers, and how each year built on the previous year. 
Finally, we provide a summary of future directions and challenges for multimodal geolocation, and concluding remarks on how benchmarking has catalyzed research progress in the research area of geolocation prediction for social multimedia.} } @INBOOK{1460Choi2014, AUTHOR = {Jaeyoung Choi and Howard Lei and Venkatesan Ekambaram and Pascal Kelm and Luke Gottlieb and Thomas Sikora and Kannan Ramchandran and Gerald Friedland}, TITLE = {Human Versus Machine: Establishing a Human Baseline for Multimodal Location Estimation}, YEAR = {2014}, BOOKTITLE = {Multimodal Location Estimation of Videos and Images}, EDITOR = {Jaeyoung Choi, Gerald Friedland}, PUBLISHER = {Springer International Publishing}, PAGES = {153--171}, CHAPTER = {9}, DOI = {10.1007/978-3-319-09861-6_9}, ABSTRACT = {In recent years, the problem of video location estimation (i.e., estimating the longitude/latitude coordinates of a video without GPS information) has been approached with diverse methods and ideas in the research community and significant improvements have been made. So far, however, systems have only been compared against each other and no systematic study on human performance has been conducted. Based on a human-subject study with 11,900 experiments, this article presents a human baseline for location estimation for different combinations of modalities (audio, audio/video, audio/video/text). Furthermore, this article compares state-of-the-art location estimation systems with the human baseline. Although the overall performance of humans’ multimodal video location estimation is better than current machine learning approaches, the difference is quite small: For 41 % of the test set, the machine’s accuracy was superior to the humans. We present case studies and discuss why machines did better for some videos and not for others. 
Our analysis suggests new directions and priorities for future work on the improvement of location inference algorithms.} } @ARTICLE{1463Fradi2015, AUTHOR = {Hajer Fradi and Volker Eiselein and Jean-Luc Dugelay and Ivo Keller and Thomas Sikora}, TITLE = {Spatio-Temporal Crowd Density Model in a Human Detection and Tracking Framework}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2015}, PAGES = {100--111}, VOLUME = {31}, NOTE = {ISSN: 0923-5965}, DOI = {10.1016/j.image.2014.11.006}, URL = {http://www.sciencedirect.com/science/article/pii/S0923596514001647}, ABSTRACT = {Recently significant progress has been made in the field of person detection and tracking. However, crowded scenes remain particularly challenging and can deeply affect the results due to overlapping detections and dynamic occlusions. In this paper, we present a method to enhance human detection and tracking in crowded scenes. It is based on introducing additional information about crowds and integrating it into the state-of-the-art detector. This additional information cue consists of modeling time-varying dynamics of the crowd density using local features as an observation of a probabilistic function. It also involves a feature tracking step which allows excluding feature points attached to the background. This process is favorable for the later density estimation since the influence of features irrelevant to the underlying crowd density is removed. Our proposed approach applies a scene-adaptive dynamic parametrization using this crowd density measure. It also includes a self-adaptive learning of the human aspect ratio and perceived height in order to reduce false positive detections. The resulting improved detections are subsequently used to boost the efficiency of the tracking in a tracking-by-detection framework. 
Our proposed approach for person detection is evaluated on videos from different datasets, and the results demonstrate the advantages of incorporating crowd density and geometrical constraints into the detection process. Also, its impact on tracking results have been experimentally validated showing good results.} } @INPROCEEDINGS{1475Verhack2015, AUTHOR = {Ruben Verhack and Lieven Lange and Peter Lambert and Rik Van de Walle and Thomas Sikora}, TITLE = {Lossless Image Compression based on Kernel Least Mean Squares}, BOOKTITLE = {31st IEEE Picture Coding Symposium, Cairns, Australia}, YEAR = {2015}, MONTH = may, PDF = {http://elvera.nue.tu-berlin.de/files/1475Verhack2015.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1475Verhack2015.pdf} } @INPROCEEDINGS{1470Tok2015, AUTHOR = {Michael Tok and Volker Eiselein and Thomas Sikora}, TITLE = {Motion Modeling for Motion Vector Coding in HEVC}, BOOKTITLE = {31st IEEE Picture Coding Symposium, Cairns, Australia}, YEAR = {2015}, MONTH = may, PDF = {http://elvera.nue.tu-berlin.de/files/1470Tok2015.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1470Tok2015.pdf} } @INPROCEEDINGS{1471Borgmann2015, AUTHOR = {Thilo Borgmann and Michael Tok and Thomas Sikora}, TITLE = {Image Guided Phase Unwrapping for real-time 3D-Scanning}, BOOKTITLE = {31st IEEE Picture Coding Symposium, Cairns, Australia}, YEAR = {2015}, MONTH = may, PDF = {http://elvera.nue.tu-berlin.de/files/1471Borgmann2015.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1471Borgmann2015.pdf} } @INPROCEEDINGS{1476Sikora2015, AUTHOR = {Thomas Sikora}, TITLE = {A Novel Kernel PCA/KLT Approach for Transform Coding of Waveforms}, BOOKTITLE = {31st IEEE Picture Coding Symposium, Cairns, Australia}, YEAR = {2015}, MONTH = may, PAGES = {174--178}, PDF = {http://elvera.nue.tu-berlin.de/files/1476Sikora2015.pdf}, DOI = {10.1109/PCS.2015.7170070}, URL = {http://elvera.nue.tu-berlin.de/files/1476Sikora2015.pdf}, ABSTRACT = {A novel Kernel PCA/Kernel KLT transform (S-KPCA) is 
introduced which incorporates higher order statistics into the design of the transform matrix using a Reproducing Kernel Hilbert Space (RKHS) approach. The goal is to arrive at an orthonormal transform matrix E with column eigenvectors that allow reconstruction of an input vector with few coefficients and superior signal fidelity. In contrast to the well known Kernel PCA the number of the generated transform coefficients is not dependent on the size of the training set and the “pre-image problem” is avoided completely. Results indicate that the derived transform is more compact than the standard PCA/KLT in terms of fidelity measures in RKHS.} } @INPROCEEDINGS{1479Senst2015, AUTHOR = {Tobias Senst and Volker Eiselein and Thomas Sikora}, TITLE = {[Best Paper Award] A Local Feature based on Lagrangian Measures for Violent Video Classification}, BOOKTITLE = {6th IET International Conference on Imaging for Crime Detection and Prevention}, YEAR = {2015}, MONTH = jul, EDITOR = {Georgios Chaitas and Sergio A Velastin}, PUBLISHER = {IET Digital Library}, PAGES = {1--6}, ADDRESS = {London, UK}, NOTE = {ISBN: 978-1-78561-131-5}, PDF = {http://elvera.nue.tu-berlin.de/files/1479Senst2015.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1479Senst2015.pdf}, ABSTRACT = {Lagrangian theory provides a diverse set of tools for continuous motion analysis. Existing work shows the applicability of Lagrangian method for video analysis in several aspects. In this paper we want to utilize the concept of Lagrangian measures to detect violent scenes. Therefore we propose a local feature based on the SIFT algorithm that incorporates appearance and Lagrangian based motion models. We will show that the temporal interval of the used motion information is a crucial aspect and study its influence on the classification performance. The proposed LaSIFT feature outperforms other state-of-the-art local features, in particular in uncontrolled realistic video data. 
We evaluate our algorithm with a bag-of-word approach. The experimental results show a significant improvement over the state-of-the-art on current violent detection datasets, i.e. Crowd Violence, Hockey Fight.} } @ARTICLE{1480Sikora2015, AUTHOR = {Stephan A. Rein and Frank H.P. Fitzek and Clemens Gühmann and Thomas Sikora}, TITLE = {Evaluation of the wavelet image two-line coder: A low complexity scheme for image compression}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2015}, MONTH = sep, PAGES = {58--74}, VOLUME = {37}, DOI = {10.1016/j.image.2015.07.010}, ABSTRACT = {This paper introduces the wavelet image two-line (Wi2l) coding algorithm for low complexity compression of images. The algorithm recursively encodes an image backwards reading only two lines of a wavelet subband, which are read in blocks of 512 bytes from flash memory. It thus only requires very little memory, i.e., a memory array for two wavelet subband lines, an array to store intermediate tree level data, and an array for writing binary data. A picture of 256×256 pixels would require 1152 bytes of memory. Computation time for the coding is derived analytically and measured on a real system. The times on a low-cost microcontroller for 256×256 grayscale pictures are measured as 0.25–0.6 s for encoding and 0.22–0.77 s for decoding. The algorithm can thus realize a low complexity system for compression of images when combined with a customized scheme for the wavelet transform; low complexity here refers to low memory, minimum write access to flash memory, usage of integer operations only, and low conceptual complexity (ease of implementation). As demonstrated in this paper, a compression performance similar to JPEG 2000 and the more recent Google WebP picture compression is achieved. The compression system uses flash memory (SD or MMC card) and a small camera sensor thus building an image communication system. It is also suitable for mobile devices or satellite communication. 
The underlying C source code is made publicly available.} } @INPROCEEDINGS{1518Badii2015, AUTHOR = {Atta Badii and Pavel Koshunov and Hamid Oudi and Touradj Ebrahimi and Tomas Piatrik and Volker Eiselein and Natacha Ruchaud and Christian Fedorczak and Jean-Luc Dugelay and Diego F. Vazquez}, TITLE = {Overview of the MediaEval 2015 Drone Protect Task}, BOOKTITLE = {MediaEval 2015 Workshop}, YEAR = {2015}, MONTH = sep, URL = {http://ceur-ws.org/Vol-1436/Paper7.pdf} } @INPROCEEDINGS{1491Bochinski2016, AUTHOR = {Erik Bochinski and Volker Eiselein and Thomas Sikora}, TITLE = {Training a Convolutional Neural Network for Multi-Class Object Detection Using Solely Virtual World Data}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2016}, MONTH = aug, PAGES = {278--285}, ADDRESS = {Colorado Springs, CO, USA}, NOTE = {Electronic ISBN: 978-1-5090-3811-4 Print on Demand(PoD) ISBN: 978-1-5090-3812-1 DOI: 10.1109/AVSS.2016.7738056}, PDF = {http://elvera.nue.tu-berlin.de/files/1491Bochinski2016.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1491Bochinski2016.pdf}, ABSTRACT = {Convolutional neural networks are a popular choice for current object detection and classification systems. Their performance improves constantly but for effective training, large, hand-labeled datasets are required. We address the problem of obtaining customized, yet large enough datasets for CNN training by synthesizing them in a virtual world, thus eliminating the need for tedious human interaction for ground truth creation. 
We developed a CNN-based multi-class detection system that was trained solely on virtual world data and achieves competitive results compared to state-of-the-art detection systems.} } @INPROCEEDINGS{1496Senst2016, AUTHOR = {Tobias Senst and Jonas Geistert and Thomas Sikora}, TITLE = {Robust local optical flow: Long-range motions and varying illuminations}, BOOKTITLE = {IEEE International Conference on Image Processing}, YEAR = {2016}, MONTH = sep, PUBLISHER = {IEEE}, PAGES = {4478--4482}, ADDRESS = {Phoenix, AZ, USA}, NOTE = {IEEE Catalog Number: CFP16CIP-USB ISBN: 978-1-4673-9960-9 DOI:10.1109/ICIP.2016.7533207}, PDF = {http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1496Senst2016.pdf}, ABSTRACT = {Sparse motion estimation with local optical flow methods is fundamental for a wide range of computer vision application. Classical approaches like the pyramidal Lucas-Kanade method (PLK) or more sophisticated approaches like the Robust Local Optical Flow (RLOF) fail when it comes to environments with illumination changes and/or long-range motions. In this work we focus on these limitations and propose a novel local optical flow framework taking into account an illumination model to deal with varying illumination and a prediction step based on a perspective global motion model to deal with long-range motions. Experimental results shows tremendous improvements, e.g. 
56% smaller error for dense motion fields on the KITTI and an about 76% smaller error for sparse motion fields on the Sintel dataset.} } @INPROCEEDINGS{1501Verhack2016, AUTHOR = {Ruben Verhack and Thomas Sikora and Lieven Lange and Glenn Van Wallendael and Peter Lambert}, TITLE = {A Universal Image Coding Approach using Sparse Steered Mixture-of-Experts Regression}, BOOKTITLE = {IEEE International Conference on Image Processing}, YEAR = {2016}, MONTH = sep, PAGES = {2142--2146}, ADDRESS = {Phoenix, AZ, USA}, NOTE = {IEEE Catalog Number: CFP16CIP-USB ISBN: 978-1-4673-9960-9}, DOI = {10.1109/ICIP.2016.7532737}, ABSTRACT = {Our challenge is the design of a “universal” bit-efficient image compression approach. The prime goal is to allow reconstruction of images with high quality. In addition, we attempt to design the coder and decoder “universal”, such that MPEG-7-like low-and mid-level descriptors are an integral part of the coded representation. To this end, we introduce a sparse Mixture-of-Experts regression approach for coding images in the pixel domain. The underlying stochastic process of the pixel amplitudes are modelled as a 3-dimensional and multi-modal Mixture-of-Gaussians with K modes. This closed form continuous analytical model is estimated using the Expectation-Maximization algorithm and describes segments of pixels by local 3-D Gaussian steering kernels with global support. As such, each component in the mixture of experts steers along the direction of highest correlation. The conditional density then serves as the regression function. 
Experiments show that a considerable compression gain is achievable compared to JPEG for low bitrates for a large class of images, while forming attractive low-level descriptors for the image, such as the local segmentation boundaries, direction of intensity flow and the distribution of these parameters over the image.} } @INPROCEEDINGS{1498Geistert2016, AUTHOR = {Jonas Geistert and Tobias Senst and Thomas Sikora}, TITLE = {Robust Local Optical Flow: Dense Motion Vector Field Interpolation}, BOOKTITLE = {Picture Coding Symposium}, YEAR = {2016}, MONTH = dec, PUBLISHER = {IEEE}, PAGES = {1--5}, ADDRESS = {Nuremberg, Germany}, NOTE = {In IEEE-Explore zugefügt am 24 April 2017! Electronic ISSN: 2472-7822 DOI: 10.1109/PCS.2016.7906352}, PDF = {http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1498Geistert2016.pdf}, ABSTRACT = {Optical flow methods integrating sparse point correspondences have made significant contribution in the field of optical flow estimation. Especially for the goal of estimating motion accurately and efficiently sparse-to-dense interpolation schemes for feature point matches have shown outstanding performances. Concurrently, local optical flow methods have been significantly improved with respect to long-range motion estimation in environments with varying illumination. This motivates us to propose a sparse-to-dense approach based on the Robust Local Optical Flow method. We study the performance of different efficient motion vector interpolation techniques for recent optical low benchmarks. 
Compared state-of-the-art method the proposed approach is significantly faster while remaining competitive accuracy on Middlebury, KITTI 2015 and MPI-Sintel data-set.} } @INPROCEEDINGS{1502Lange2016, AUTHOR = {Lieven Lange and Ruben Verhack and Thomas Sikora}, TITLE = {Video Representation and Coding Using a Sparse Steered Mixture-of-Experts Network}, BOOKTITLE = {Picture Coding Symposium}, YEAR = {2016}, MONTH = dec, PUBLISHER = {IEEE}, PAGES = {1--5}, ADDRESS = {Nuremberg, Germany}, NOTE = {In IEEE-Explore zugefügt am 24 April 2017! Electronic ISSN: 2472-7822 DOI: 10.1109/PCS.2016.7906369}, PDF = {http://elvera.nue.tu-berlin.de/files/1502Lange2016.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1502Lange2016.pdf}, ABSTRACT = {In this paper, we introduce a novel approach for video compression that explores spatial as well as temporal redundancies over sequences of many frames in a unified framework. Our approach supports “compressed domain vision” capabilities. To this end, we developed a sparse Steered Mixture of- Experts (SMoE) regression network for coding video in the pixel domain. This approach drastically departs from the established DPCM/Transform coding philosophy. Each kernel in the Mixture-of-Experts network steers along the direction of highest correlation, both in spatial and temporal domain, with local and global support. Our coding and modeling philosophy is embedded in a Bayesian framework and shows strong resemblance to Mixture-of-Experts neural networks. 
Initial experiments show that at very low bit rates the SMoE approach can provide competitive performance to H.264.} } @ARTICLE{1510Axenopoulos2017, AUTHOR = {Apostolos Axenopoulos and Volker Eiselein and Antonio Penta and Eugenia Koblents and Ernesto La Mattina and Petros Daras}, TITLE = {A framework for large-scale analysis of video "in the Wild" to assist digital forensic examination}, JOURNAL = {IEEE Security & Privacy Magazine, Special Issue on Digital Forensics}, YEAR = {2017}, VOLUME = {15}, NUMBER = {6}, URL = {https://www.iti.gr/iti/files/document/publications/LASIE_DF_SI_CR.pdf} } @INPROCEEDINGS{1511Verhack2017, AUTHOR = {Ruben Verhack and Simon Van de Keer and Glenn Van Wallendael and Peter Lambert and Thomas Sikora}, TITLE = {Color prediction in image coding using Steered Mixture-of-Experts}, BOOKTITLE = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2017}, YEAR = {2017}, MONTH = mar, PUBLISHER = {IEEE}, ADDRESS = {New Orleans, LA, USA}, NOTE = {Electronic ISBN: 978-1-5090-4117-6 USB ISBN: 978-1-5090-4116-9 Print on Demand(PoD) ISBN: 978-1-5090-4118-3 Electronic ISSN: 2379-190X}, DOI = {10.1109/ICASSP.2017.7952364}, ABSTRACT = {We propose a novel approach for modeling and coding color in images and video. Luminance is linearly correlated with chrominance locally, as such we can predict color given the luma value. Using the Steered Mixture-of-Experts (SMoE) approach, the image is viewed as a stochastic process over 5 random variables including the 2-D pixel locations, 1 luminance and 2 chrominance values. We model this process as a continuous joint density function by fitting a K-modal 5-D Gaussian Mixture Model (GMM). As such, the chroma values are predicted as the expectation of the conditional density. To validate, the technique was integrated within JPEG showing PSNR gains in the lower bitrate regions. 
A deeper analysis of the tolerance of the activation function is given through recycling color models in video sequences, yielding a high quality reconstruction over a considerable range of frames.} } @INPROCEEDINGS{1520Verhack2017, AUTHOR = {Ruben Verhack and Thomas Sikora and Lieven Lange and Rolf Jongebloed and Glenn Van Wallendael and Peter Lambert}, TITLE = {[Best Student Paper] Steered Mixture-of-Experts for Light Field Coding, Depth Estimation, and Processing}, BOOKTITLE = {IEEE International Conference on Multimedia and Expo}, YEAR = {2017}, MONTH = jul, PAGES = {1183--1188}, NOTE = {ISBN:978-1-5090-6067-2/17}, PDF = {http://elvera.nue.tu-berlin.de/files/1520Verhack2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1520Verhack2017.pdf}, ABSTRACT = {The proposed framework, called Steered Mixture-of-Experts (SMoE), enables a multitude of processing tasks on light fields using a single unified Bayesian model. The underlying assumption is that light field rays are instantiations of a non-linear or non-stationary random process that can be modeled by piecewise stationary processes in the spatial domain. As such, it is modeled as a space-continuous Gaussian Mixture Model. Consequently, the model takes into account different regions of the scene, their edges, and their development along the spatial and disparity dimensions. Applications presented include light field coding, depth estimation, edge detection, segmentation, and view interpolation. The representation is compact, which allows for very efficient compression yielding state-of-the-art coding results for low bit-rates. Furthermore, due to the statistical representation, a vast amount of information can be queried from the model even without having to analyze the pixel values. 
This allows for “blind” light field processing and classification.} } @INPROCEEDINGS{1509Eiselein2017, AUTHOR = {Volker Eiselein and Erik Bochinski and Thomas Sikora}, TITLE = {Assessing Post-Detection Filters for a Generic Pedestrian Detector in a Tracking-By-Detection Scheme}, BOOKTITLE = {Analysis of video and audio "in the Wild" workshop at IEEE AVSS 2017}, YEAR = {2017}, MONTH = aug, PAGES = {1--6}, ADDRESS = {Lecce, Italy}, NOTE = {ISBN:978-1-5386-2939-0/17}, PDF = {http://elvera.nue.tu-berlin.de/files/1509Eiselein2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1509Eiselein2017.pdf}, ABSTRACT = {Tracking-by-detection becomes more and more popular for visual pedestrian tracking applications. However, it requires accurate and reliable detections in order to obtain good results. In this work, we propose two different post-detection filters designed to enhance the performance of custom person detectors. Using a popular deformable-parts-based pedestrian detector as a baseline, a detailed comparison over multiple test videos is performed and the gain of both algorithms is proven. Further analysis shows that the improved detection outcomes also lead to improved tracking results. 
We thus found that the usage of the proposed post-detection filters is recommendable as they do not impose a high computational load and are not limited to a specific detector method.} } @INPROCEEDINGS{1515Kutschbach2017, AUTHOR = {Tino Kutschbach and Erik Bochinski and Volker Eiselein and Thomas Sikora}, TITLE = {Sequential Sensor Fusion Combining Probability Hypothesis Density and Kernelized Correlation Filters for Multi-Object Tracking in Video Data}, BOOKTITLE = {International Workshop on Traffic and Street Surveillance for Safety and Security at IEEE AVSS 2017}, YEAR = {2017}, MONTH = aug, PAGES = {1--5}, ADDRESS = {Lecce, Italy}, NOTE = {ISBN:978-1-5386-2939-0/17}, PDF = {http://elvera.nue.tu-berlin.de/files/1515Kutschbach2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1515Kutschbach2017.pdf}, ABSTRACT = {This work applies the Gaussian Mixture Probability Hypothesis Density (GMPHD) Filter to multi-object tracking in video data. In order to take advantage of additional visual information, Kernelized Correlation Filters(KCF) are evaluated as a possible extension of the GMPHD tracking-by-detection scheme to enhance its performance. 
The baseline GMPHD filter and its extension are evaluated on the UA-DETRAC benchmark, showing that combining both methods leads to a higher recall and a better quality of object tracks to the cost of increased computational complexity and increased sensitivity to false-positives.} } @INPROCEEDINGS{1517Bochinski2017, AUTHOR = {Erik Bochinski and Volker Eiselein and Thomas Sikora}, TITLE = {[Challenge winner IWOT4S] High-Speed Tracking-by-Detection Without Using Image Information}, BOOKTITLE = {International Workshop on Traffic and Street Surveillance for Safety and Security at IEEE AVSS 2017}, YEAR = {2017}, MONTH = aug, PAGES = {1--6}, ADDRESS = {Lecce, Italy}, NOTE = {ISBN:978-1-5386-2939-0/17}, PDF = {http://elvera.nue.tu-berlin.de/files/1517Bochinski2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1517Bochinski2017.pdf}, ABSTRACT = {Tracking-by-detection is a common approach to multi-object tracking. With ever increasing performances of object detectors, the basis for a tracker becomes much more reliable. In combination with commonly higher frame rates, this poses a shift in the challenges for a successful tracker. That shift enables the deployment of much simpler tracking algorithms which can compete with more sophisticated approaches at a fraction of the computational cost. We present such an algorithm and show with thorough experiments its potential using a wide range of object detectors. 
The proposed method can easily run at 100K fps while outperforming the state-of-the-art on the DETRAC vehicle tracking dataset.} } @INPROCEEDINGS{1524Lyu2017, AUTHOR = {Siwei Lyu and Ming-Ching Chang and Dawei Du and Longyin Wen and Honggang Qi and Yuezun Li and Yi Wei and Lipeng Ke and Tao Hu and Marco Del Coco and Pierluigi Carcagnì and Dmitriy Anisimov and Erik Bochinski and Fabio Galasso and Filiz Bunyak and Guang Han and Hao Ye and Hong Wang and Kannappan Palaniappan and Koray Ozcan and Li Wang and Liang Wang and Martin Lauer and Nattachai Watcharapinchai and Nenghui Song and Noor M Al-Shakarji and Shuo Wang and Sikandar Amin and Sitapa Rujikietgumjorn and Tatiana Khanova and Thomas Sikora and Tino Kutschbach and Volker Eiselein and Wei Tian and Xiangyang Xue and Xiaoyi Yu and Yao Lu and Yingbin Zheng and Yongzhen Huang and Yuqi Zhang}, TITLE = {UA-DETRAC 2017: Report of AVSS2017 \& IWT4S Challenge on Advanced Traffic Monitoring}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signal-Based Surveillance}, YEAR = {2017}, MONTH = aug, PAGES = {1--7}, ADDRESS = {Lecce, Italy}, NOTE = {ISBN:978-1-5386-2939-0/17}, DOI = {10.1109/AVSS.2017.8078560} } @INPROCEEDINGS{1506Krusch2017, AUTHOR = {Patrick Krusch and Erik Bochinski and Volker Eiselein and Thomas Sikora}, TITLE = {A Consistent Two-Level Metric for Evaluation of Automated Abandoned Object Detection Methods}, BOOKTITLE = {24th IEEE International Conference on Image Processing (ICIP)}, YEAR = {2017}, MONTH = sep, PAGES = {4352--4356}, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-5090-2174-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1506Krusch2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1506Krusch2017.pdf}, ABSTRACT = {Scientific interest in automated abandoned object detection algorithms using visual information is high and many related systems have been published in recent years. However, most evaluation techniques rely only on statistical evaluation on the object level. 
Therefore and due to benchmarks with commonly only few abandoned objects and a non-standardized evaluation procedure, an objective performance comparison between different methods is generally hard. We propose a new evaluation metric which is focused on an end-user application case and an evaluation protocol which eliminates uncertainties in previous performance assessments. Using two variants of an abandoned object detection method, we show the features of the novel metric on multiple datasets proving its advantages over previously used measures.} } @INPROCEEDINGS{1507Bochinski2017, AUTHOR = {Erik Bochinski and Tobias Senst and Thomas Sikora}, TITLE = {Hyper-Parameter Optimization for Convolutional Neural Network Committees Based on Evolutionary Algorithms}, BOOKTITLE = {24th IEEE International Conference on Image Processing (ICIP)}, YEAR = {2017}, MONTH = sep, PAGES = {3924--3928}, ADDRESS = {Beijing, China}, NOTE = {ISBN: 978-1-5090-2174-1}, PDF = {http://elvera.nue.tu-berlin.de/files/1507Bochinski2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1507Bochinski2017.pdf}, ABSTRACT = {In a broad range of computer vision tasks, convolutional neural networks (CNNs) are one of the most prominent techniques due to their outstanding performance. Yet it is not trivial to find the best performing network structure for a specific application because it is often unclear how the network structure relates to the network accuracy. We propose an evolutionary algorithm-based framework to automatically optimize the CNN structure by means of hyper-parameters. Further, we extend our framework towards a joint optimization of a committee of CNNs to leverage specialization and cooperation among the individual networks. Experimental results show a significant improvement over the state-of-the-art on the well-established MNIST dataset for hand-written digits recognition.} } @INPROCEEDINGS{1514Arvanitidou2017, AUTHOR = {M. Arvanitidou and T. 
Sikora}, TITLE = {Motion-Aware Video Quality Assessment}, BOOKTITLE = {51st Asilomar Conference on Signals, Systems, and Computers 2017}, YEAR = {2017}, MONTH = oct, PUBLISHER = {IEEE}, ORGANIZATION = {IEEE Signal Processing Society}, ADDRESS = {Pacific Grove, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1514Arvanitidou2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1514Arvanitidou2017.pdf}, ABSTRACT = {This work focuses on considering motion towards improving video quality assessment algorithms. The improvement refers to improving computational video quality assessment algorithms in order to be in closer agreement with the subjective evaluation of video quality. We propose a motion saliency model that exploits motion features on spatial level and also an approach for consideration of global motion in the temporal dimension, leading to further improvements in the accuracy of video quality assessment. We perform evaluation by integrating our approaches in existing objective quality models and also by comparing them to existing related state-of-the-art video quality assessment methods.} } @ARTICLE{1512Senst2017, AUTHOR = {Tobias Senst and Volker Eiselein and Alexander Kuhn and Thomas Sikora}, TITLE = {Crowd Violence Detection Using Global Motion-Compensated Lagrangian Features and Scale-Sensitive Video-Level Representation}, JOURNAL = {IEEE Transactions on Information Forensics and Security}, YEAR = {2017}, MONTH = dec, PAGES = {2945--2956}, VOLUME = {12}, NUMBER = {12}, NOTE = {Print ISSN: 1556-6013 Online ISSN: 1556-6021 www.doi.org/10.1109/TIFS.2017.2725820}, PDF = {http://elvera.nue.tu-berlin.de/files/1512Senst2017.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1512Senst2017.pdf}, ABSTRACT = {Lagrangian theory provides a rich set of tools for analyzing non-local, long-term motion information in computer vision applications. 
Based on this theory, we present a specialized Lagrangian technique for the automated detection of violent scenes in video footage. We present a novel feature using Lagrangian direction fields that is based on a spatio-temporal model and uses appearance, background motion compensation, and long-term motion information. To ensure appropriate spatial and temporal feature scales, we apply an extended bag-of-words procedure in a late-fusion manner as classification scheme on a per-video basis.We demonstrate that the temporal scale, captured by the Lagrangian integration time parameter, is crucial for violence detection and show how it correlates to the spatial scale of characteristic events in the scene. The proposed system is validated on multiple public benchmarks and non-public, real-world data from the London Metropolitan Police. Our experiments confirm that the inclusion of Lagrangian measures is a valuable cue for automated violence detection and increases the classification performance considerably compared to stateof- the-art methods.} } @INPROCEEDINGS{1553Walles2018, AUTHOR = {Tim Walles and Ghassen Bacha and Erik Bochinski and Volker Eiselein and Jens Nejstgaard}, TITLE = {High-resolution automated approaches to study in situ mesozooplankton abundance and migration.}, BOOKTITLE = {Geophysical Research Abstracts}, YEAR = {2018}, MONTH = apr, PAGES = {764}, NOTE = {eISSN: 1607-7962}, URL = {http://adsabs.harvard.edu/abs/2018EGUGA..20..764W} } @INPROCEEDINGS{1533Küchhold2018, AUTHOR = {Markus Küchhold and Maik Simon and Thomas Sikora}, TITLE = {Restricted Boltzmann Machine Image Compression}, BOOKTITLE = {Picture Coding Symposium (PCS 2018)}, YEAR = {2018}, MONTH = jun, ORGANIZATION = {IEEE}, ADDRESS = {San Francisco, CA, USA}, NOTE = {DOI: 10.1109/PCS.2018.8456279}, PDF = {http://elvera.nue.tu-berlin.de/files/1533Küchhold2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1533Küchhold2018.pdf}, ABSTRACT = {We propose a novel lossy block-based image 
compression approach. Our approach builds on non-linear autoencoders that can, when properly trained, explore non-linear statistical dependencies in the image blocks for redundancy reduction. In contrast the DCT employed in JPEG is inherently restricted to exploration of linear dependencies using a second-order statistics framework. The coder is based on pre-trained class-specific Restricted Boltzmann Machines (RBM). These machines are statistical variants of neural network autoencoders that directly map pixel values in image blocks into coded bits. Decoders can be implemented with low computational complexity in a codebook design. Experimental results show that our RBM-codec outperforms JPEG at high compression rates, both in terms of PSNR, SSIM and subjective results.} } @INPROCEEDINGS{1534Tok2018, AUTHOR = {Michael Tok and Rolf Jongebloed and Lieven Lange and Erik Bochinski and Thomas Sikora}, TITLE = {An Mse Approach for Training and Coding Steered Mixtures of Experts}, BOOKTITLE = {Picture Coding Symposium (PCS)}, YEAR = {2018}, MONTH = jun, ADDRESS = {San Francisco, California USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1534Tok2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1534Tok2018.pdf} } @INPROCEEDINGS{1536Jongebloed2018, AUTHOR = {Rolf Jongebloed and Ruben Verhack and Lieven Lange and Thomas Sikora}, TITLE = {Hierarchical Learning of Sparse Image Representations using Steered Mixture-of-Experts}, BOOKTITLE = {2018 IEEE International Conference on Multimedia Expo Workshops (ICMEW)}, YEAR = {2018}, MONTH = jul, ORGANIZATION = {IEEE}, ADDRESS = {San Diego, CA, USA}, PDF = {http://elvera.nue.tu-berlin.de/files/1536Jongebloed2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1536Jongebloed2018.pdf}, ABSTRACT = {Previous research showed highly efficient compression results for low bit-rates using Steered Mixture-of-Experts (SMoE), higher rates still pose a challenge due to the non- convex optimization problem that becomes more difficult when 
increasing the number of components. Therefore, a novel estimation method based on Hidden Markov Random Fields is introduced taking spatial dependencies of neighboring pixels into account combined with a tree-structured splitting strategy. Experimental evaluations for images show that our approach outperforms state-of-the-art techniques using only one robust parameter set. For video and light field modeling even more gain can be expected.} } @INPROCEEDINGS{1540Bochinski2018, AUTHOR = {Erik Bochinski and Ghassen Bacha and Volker Eiselein and Tim J. W. Walles and Jens C. Nejstgaard and Thomas Sikora}, TITLE = {Deep Active Learning for In Situ Plankton Classification}, BOOKTITLE = {Computer Vision for Analysis of Underwater Imagery (CVAUI) IPCRW}, YEAR = {2018}, MONTH = aug, EDITOR = {Z. Zhang and others}, PUBLISHER = {Springer International Publishing}, PAGES = {5--15}, ADDRESS = {Beijing, China}, NOTE = {DOI: 10.1007/978-3-030-05792-3 eBook ISBN: 978-3-030-05792-3 Softcover ISBN: 978-3-030-05791-6}, PDF = {http://elvera.nue.tu-berlin.de/files/1540Bochinski2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1540Bochinski2018.pdf} } @INPROCEEDINGS{1537Croci2018, AUTHOR = {Simone Croci and Mairead Grogan and Sebastian Knorr and Aljosa Smolic}, TITLE = {Colour Correction for Stereoscopic Omnidirectional Images}, BOOKTITLE = {Irish Machine Vision and Image Processing Conference}, YEAR = {2018}, MONTH = aug, ADDRESS = {Belfast, North Ireland}, NOTE = {ISBN: 978-0-9934207-3-3}, PDF = {http://elvera.nue.tu-berlin.de/files/1537Croci2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1537Croci2018.pdf}, ABSTRACT = {Stereoscopic omnidirectional images (ODI) when viewed with a head-mounted display are a way to generate an immersive experience. Unfortunately, their creation is not an easy process, and different problems can be present in the ODI that can reduce the quality of experience. 
A common problem is colour mismatch, which occurs when the colours of the objects in the scene are different between the two stereoscopic views. In this paper we propose a novel method for the correction of colour mismatch based on the subdivision of ODIs into patches, where local colour correction transformations are fitted and then globally combined. The results presented in the paper show that the proposed method is able to reduce the colour mismatch in stereoscopic ODIs.} } @INPROCEEDINGS{1549Zhu2018, AUTHOR = {Pengfei Zhu and Longyin Wen and Dawei Du and Xiao Bian and Haibin Ling and Qinghua Hu and Hao Cheng and Chengfeng Liu and Xiaoyu Liu and Wenya Ma and Qinqin Nie and Haotian Wu and Lianjie Wang and Arne Schumann and Dan Wang and Diego Ortego and Elena Luna and Emmanouil Michail and Erik Bochinski and Feng Ni and Filiz Bunyak and Gege Zhang and Guna Seetharaman and Guorong Li and Hongyang Yu and Ioannis Kompatsiaris and Jianfei Zhao and Jie Gao and Jose Martinez and Juan Miguel and Kannappan Palaniappan and Konstantinos Avgerinakis and Lars Sommer and Martin Lauer and Mengkun Liu and Noor Al-Shakarji and Oliver Acatay and Panagiotis Giannakeris and Qijie Zhao and Qinghua Ma and Qingming Huang and Stefanos Vrochidis and Thomas Sikora and Tobias Senst and Wei Song and Wei Tian and Wenhua Zhang and Yanyun Zhao and Yidong Bai and Yinan Wu and Yongtao Wang and Yuxuan Li and Zhaoliang Pi and Zhiming Ma}, TITLE = {VisDrone-VDT2018: The Vision Meets Drone Video Detection and Tracking Challenge Results}, BOOKTITLE = {ECCV 2018 Workshops}, YEAR = {2018}, MONTH = sep, EDITOR = {Leal-Taixé, Laura and Roth, Stefan}, PUBLISHER = {Springer International Publishing}, PAGES = {496--518}, ADDRESS = {Munich, Germany}, NOTE = {Softcover ISBN: 978-3-030-11020-8 eBook ISBN: 978-3-030-11021-5 DOI: 10.1007/978-3-030-11021-5} } @INPROCEEDINGS{1532Küchhold2018, AUTHOR = {Markus Küchhold and Maik Simon and Volker Eiselein and Thomas Sikora}, TITLE = {Scale-Adaptive Real-Time Crowd 
Detection and Counting for Drone Images}, BOOKTITLE = {25th IEEE International Conference on Image Processing (ICIP)}, YEAR = {2018}, MONTH = oct, ORGANIZATION = {IEEE}, ADDRESS = {Athens, Greece}, PDF = {http://elvera.nue.tu-berlin.de/files/1532Küchhold2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1532Küchhold2018.pdf}, ABSTRACT = {DOI: 10.1109/ICIP.2018.8451289} } @INPROCEEDINGS{1535Bochinski2018, AUTHOR = {Erik Bochinski and Rolf Jongebloed and Michael Tok and Thomas Sikora}, TITLE = {Regularized Gradient Descent Training of Steered Mixture of Experts for Sparse Image Representation}, BOOKTITLE = {IEEE International Conference on Image Processing (ICIP)}, YEAR = {2018}, MONTH = oct, ADDRESS = {Athens, Greece}, NOTE = {DOI: 10.1109/ICIP.2018.8451823 Electronic ISBN: 978-1-4799-7061-2 Print on Demand(PoD) ISBN: 978-1-4799-7062-9 Electronic ISSN: 2381-8549}, PDF = {http://elvera.nue.tu-berlin.de/files/1535Bochinski2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1535Bochinski2018.pdf} } @INPROCEEDINGS{1547Bochinski2018, AUTHOR = {Erik Bochinski and Tobias Senst and Thomas Sikora}, TITLE = {Extending IOU Based Multi-Object Tracking by Visual Information}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signals-based Surveillance}, YEAR = {2018}, MONTH = nov, PAGES = {441--446}, NOTE = {ISBN: 978-1-5386-9294-3/18}, PDF = {http://elvera.nue.tu-berlin.de/files/1547Bochinski2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1547Bochinski2018.pdf} } @INPROCEEDINGS{1548Schröder2018, AUTHOR = {Gregory Schröder and Tobias Senst and Erik Bochinski and Thomas Sikora}, TITLE = {Optical Flow Dataset and Benchmark for Visual Crowd Analysis}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signals-based Surveillance}, YEAR = {2018}, MONTH = nov, PAGES = {7--11}, ADDRESS = {Auckland, New Zealand}, NOTE = {ISBN: 978-1-5386-9294-3/18}, PDF = {http://elvera.nue.tu-berlin.de/files/1548Schröder2018.pdf}, URL = 
{http://elvera.nue.tu-berlin.de/files/1548Schröder2018.pdf} } @INPROCEEDINGS{1560Lyu2018, AUTHOR = {Siwei Lyu and Ming-Ching Chang and Dawei Du and Wenbo Li and Yi Wei and Marco Del Coco and Pierluigi Carcagnì and Arne Schumann and Bharti Munjal and Dinh-Quoc-Trung Dang and Doo-Hyun Choi and Erik Bochinski and Fabio Galasso and Filiz Bunyak and Guna Seetharaman and Jang-Woon Baek and Jong Taek Lee and Kannappan Palaniappan and Kil-Taek Lim and Kiyoung Moon and Kwang-Ju Kim and Lars Sommer and Meltem Brandlmaier and Min-Sung Kang and Moongu Jeon and Noor M. Al-Shakarji and Oliver Acatay and Pyong-Kun Kim and Sikandar Amin and Thomas Sikora and Tien Dinh and Tobias Senst and Vu-Gia-Hy Che and Young-Chul Lim and Young-min Song and Yun-Su Chung}, TITLE = {UA-DETRAC 2018: Report of AVSS2018 \& IWT4S Challenge on Advanced Traffic Monitoring}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signals-based Surveillance}, YEAR = {2018}, MONTH = nov, ADDRESS = {Auckland, New Zealand}, NOTE = {ISBN: 978-1-5386-9294-3/18} } @INPROCEEDINGS{1542Fearghail2018, AUTHOR = {Colm O Fearghail and Cagri Ozcinar and Sebastian Knorr and Aljosa Smolic}, TITLE = {Director’s Cut - Analysis of Aspects of Interactive Storytelling for VR Films}, BOOKTITLE = {International Conference for Interactive Digital Storytelling}, YEAR = {2018}, MONTH = dec, ADDRESS = {Dublin, Ireland}, NOTE = {Received the runner up best full paper award}, PDF = {http://elvera.nue.tu-berlin.de/files/1542Fearghail2018.pdf}, DOI = {10.1007/978-3-030-04028-4_34}, URL = {http://elvera.nue.tu-berlin.de/files/1542Fearghail2018.pdf}, ABSTRACT = {To explore methods that are currently used by professional virtual reality (VR) filmmakers to tell their stories and guide users, we analyze how end-users view 360◦ video in the presence of directional cues and evaluate if they are able to follow the actual story of narrative 360◦ films. In this context, we first collected data from five professional VR filmmakers. 
The data contains eight 360◦ videos, the director's cut, which is the intended viewing direction of the director, plot points and directional cues used for user guidance. Then, we performed a subjective experiment with 20 test subjects viewing the videos while their head orientation was recorded. Finally, we present and discuss the experimental results and show, among others, that visual discomfort and disorientation on part of the viewer not only lessen the immersive quality of the films but also cause difficulties in the viewer gaining a full understanding of the narrative that the director wished them to view.} } @INPROCEEDINGS{15432018, AUTHOR = {Declan Dowling and Colm O Fearghail and Aljosa Smolic and Sebastian Knorr}, TITLE = {Faoladh : A Case Study in Cinematic VR Storytelling and Production}, BOOKTITLE = {International Conference for Interactive Digital Storytelling}, YEAR = {2018}, MONTH = dec, ADDRESS = {Dublin, Ireland}, PDF = {http://elvera.nue.tu-berlin.de/files/15432018.pdf}, DOI = {10.1007/978-3-030-04028-4_42}, URL = {http://elvera.nue.tu-berlin.de/files/15432018.pdf}, ABSTRACT = {Portraying traditional cinematic narratives in virtual reality (VR) is an emerging practice where often the methods normally associated with cinematic storytelling need to be adapted to the 360◦ format. In this paper we investigate some proposed cinematic practices for narrative storytelling in a cinematic VR film set in late 9th century Ireland that follows the perilous journey of a young Celt as he evades being captured by Viking raiders. 
From this we will analyze the fidelity of those practices with results collected from YouTube Analytics.} } @INPROCEEDINGS{1546Knorr2018, AUTHOR = {Sebastian Knorr and Matis Hudong and Julian Cabrera and Thomas Sikora and Aljosa Smolic}, TITLE = {[Lumiere Award] DeepStereoBrush: Interactive Depth Map Creation}, BOOKTITLE = {International Conference on 3D Immersion}, YEAR = {2018}, MONTH = dec, ADDRESS = {Brussels, Belgium}, NOTE = {Received the Lumiere Award for the best scientific paper}, PDF = {http://elvera.nue.tu-berlin.de/files/1546Knorr2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1546Knorr2018.pdf}, ABSTRACT = {In this paper, we introduce a novel interactive depth map creation approach for image sequences which uses depth scribbles as input at user-defined keyframes. These scribbled depth values are then propagated within these keyframes and across the entire sequence using a 3-dimensional geodesic distance transform (3D-GDT). In order to further improve the depth estimation of the intermediate frames, we make use of a convolutional neural network (CNN) in an unconventional manner. Our process is based on online learning which allows us to specifically train a disposable network for each sequence individually using the user generated depth at keyframes along with corresponding RGB images as training pairs. Thus, we actually take advantage of one of the most common issues in deep learning: over-fitting. 
Furthermore, we integrated this approach into a professional interactive depth map creation application and compared our results against the state of the art in interactive depth map creation.} } @INPROCEEDINGS{1558Fearghail2018, AUTHOR = {Colm O Fearghail and Cagri Ozcinar and Sebastian Knorr and Aljosa Smolic}, TITLE = {Director’s Cut - Analysis of VR Film Cuts for Interactive Storytelling}, BOOKTITLE = {International Conference on 3D Immersion}, YEAR = {2018}, MONTH = dec, ADDRESS = {Brussels, Belgium}, PDF = {http://elvera.nue.tu-berlin.de/files/1558Fearghail2018.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1558Fearghail2018.pdf} } @INPROCEEDINGS{1544Knorr2018, AUTHOR = {Sebastian Knorr and Cagri Ozcinar and Colm O Fearghail and Aljosa Smolic}, TITLE = {Director’s Cut - A Combined Dataset for Visual Attention Analysis in Cinematic VR Content}, BOOKTITLE = {The 15th ACM SIGGRAPH European Conference on Visual Media Production}, YEAR = {2018}, MONTH = dec, ADDRESS = {London, UK}, PDF = {http://elvera.nue.tu-berlin.de/files/1544Knorr2018.pdf}, DOI = {10.1145/3278471.3278472}, URL = {http://elvera.nue.tu-berlin.de/files/1544Knorr2018.pdf}, ABSTRACT = {Methods of storytelling in cinema have well established conventions that have been built over the course of its history and the development of the format. In 360◦ film many of the techniques that have formed part of this cinematic language or visual narrative are not easily applied or are not applicable due to the nature of the format i.e. not contained within the border of the screen. In this paper, we analyze how end-users view 360◦ video in the presence of directional cues and evaluate if they are able to follow the actual story of narrative 360◦ films. We first let filmmakers create an intended scan-path, the so called director’s cut, by setting position markers in the equirectangular representation of the omnidirectional content for eight short 360◦ films. 
Alongside this the filmmakers provided additional information regarding directional cues and plot points. Then, we performed a subjective test with 20 participants watching the films with a head-mounted display and recorded the center position of the viewports. The resulting scan-paths of the participants are then compared against the director’s cut using different scan-path similarity measures. In order to better visualize the similarity between the scan-paths, we introduce a new metric which measures and visualizes the viewport overlap between the participants’ scan-paths and the director’s cut. Finally, the entire dataset, i.e. the director’s cuts including the directional cues and plot points as well as the scan-paths of the test subjects, is publicly available with this paper.} } @INPROCEEDINGS{1571Jongebloed2019, AUTHOR = {Rolf Jongebloed and Erik Bochinski and Lieven Lange and Thomas Sikora}, TITLE = {Quantized and Regularized Optimization for Coding Images Using Steered Mixtures-of-Experts}, BOOKTITLE = {2019 Data Compression Conference (DCC)}, YEAR = {2019}, PDF = {http://elvera.nue.tu-berlin.de/files/1571Jongebloed2019.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1571Jongebloed2019.pdf}, ABSTRACT = {Compression algorithms that employ Mixtures-of-Experts depart drastically from standard hybrid block-based transform domain approaches as in JPEG and MPEG coders. In previous works we introduced the concept of Steered Mixtures-of-Experts (SMoEs) to arrive at sparse representations of signals. SMoEs are gating networks trained in a machine learning approach that allow individual experts to explain and harvest directional long-range correlation in the N-dimensional signal space. Previous results showed excellent potential for compression of images and videos but the reconstruction quality was mainly limited to low and medium image quality. In this paper we provide evidence that SMoEs can compete with JPEG2000 at mid- and high-range bit-rates. 
To this end we introduce a SMoE approach for compression of color images with specialized gates and steering experts. A novel machine learning approach is introduced that optimizes RD-performance of quantized SMoEs towards SSIM using fake quantization. We drastically improve our previous results and outperform JPEG by up to 42\%.} } @ARTICLE{1566Dudek2019, AUTHOR = {Roman Dudek and Simone Croci and Aljosa Smolic and Sebastian Knorr}, TITLE = {Robust Global and Local Color Matching in Stereoscopic Omnidirectional Content}, JOURNAL = {Signal Processing: Image Communication}, YEAR = {2019}, MONTH = may, PAGES = {231--241}, VOLUME = {74}, PDF = {http://elvera.nue.tu-berlin.de/files/1566Dudek2019.pdf}, DOI = {10.1016/j.image.2019.02.013}, URL = {http://elvera.nue.tu-berlin.de/files/1566Dudek2019.pdf}, ABSTRACT = {Shooting a live-action immersive 360-degree experience, i.e. omnidirectional content (ODC) is a technological challenge as there are many technical limitations which need to be overcome, especially for capturing and post-processing in stereoscopic 3D (S3D). In this paper, we introduce a novel approach and entire system for stitching and color mismatch correction and detection in S3D omnidirectional content, which consists of three main modules: pre-processing, spherical color correction and color mismatch evaluation. The system and its individual modules are evaluated on two datasets, including a new dataset which will be publicly available with this paper. 
We show that our system outperforms the state of the art in color correction of S3D ODC and demonstrate that our spherical color correction module even further improves the results of the state of the art approaches.} } @INPROCEEDINGS{1579Knorr2019, AUTHOR = {Sebastian Knorr and Matthias Knoblauch and Thomas Sikora}, TITLE = {Creation of 360° Light Fields using Concentric Mosaics with Varying Slit Widths}, BOOKTITLE = {European Light Field Imaging Workshop}, YEAR = {2019}, MONTH = jun, ADDRESS = {Borovets, Bulgaria}, PDF = {http://elvera.nue.tu-berlin.de/files/1579Knorr2019.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1579Knorr2019.pdf}, ABSTRACT = {In this paper, we present a detailed procedure to capture high spatial resolution 360° cylindrical light fields using the approach of concentric mosaics. We therefore use a DSLR camera which is mounted off-centered on a horizontal bar and rotated with a fixed radius. Due to the off-centered arrangement, a 360° image can be created by stitching together predefined columns of each image captured at a certain angle, the so called slit images. By changing the height of the camera after a full rotation, an array of 360° images with horizontal and vertical angular resolution can be created. 
We then evaluate the impact of varying slit image widths with respect to storage requirements, capture time and resulting image quality.} } @INPROCEEDINGS{1568Croci2019, AUTHOR = {Simone Croci and Sebastian Knorr and Aljosa Smolic}, TITLE = {Study on the Perception of Sharpness Mismatch in Stereoscopic Video}, BOOKTITLE = {IEEE 11th International Conference on Quality of Multimedia Experience (QoMEX)}, YEAR = {2019}, MONTH = jun, PUBLISHER = {IEEE}, ADDRESS = {Berlin, Germany}, PDF = {http://elvera.nue.tu-berlin.de/files/1568Croci2019.pdf}, DOI = {10.1109/QoMEX.2019.8743217}, URL = {http://elvera.nue.tu-berlin.de/files/1568Croci2019.pdf}, ABSTRACT = {In this paper, we study an artifact of stereoscopic 3D (S3D) video called sharpness mismatch (SM), that occurs when one view is more blurred than the other. SM beyond a certain level can create visual discomfort, and consequently degrade the quality of experience. Therefore, it is important to measure the just noticeable sharpness mismatch (JNSM), i.e., the minimal level of SM that is perceived by the human visual system and creates discomfort. The knowledge of the JNSM can be used in the evaluation of the quality of S3D video, and more in general when processing S3D video, like in asymmetric compression. In this paper, we focus in particular on the detection of SM. For this goal, we organized a psychophysical experiment with 23 subjects and a crosstalk-free stereoscopic display in order to gather psychophysical data necessary for the development of a SM detection method. Based on the gathered experiment data, we propose a new SM detection method. The evaluation of this method shows that its performance is close but not better than that of the state-of-the-art methods. 
Therefore, our goal in the near future is to improve the proposed method.} } @INPROCEEDINGS{1576Simon2019, AUTHOR = {Maik Simon and Markus Küchhold and Tobias Senst and Erik Bochinski and Thomas Sikora}, TITLE = {Video-based Bottleneck Detection utilizing Lagrangian Dynamics in Crowded Scenes}, BOOKTITLE = {IEEE International Conference on Advanced Video and Signals-based Surveillance}, YEAR = {2019}, MONTH = sep, PDF = {http://elvera.nue.tu-berlin.de/files/1576Simon2019.pdf}, URL = {http://elvera.nue.tu-berlin.de/files/1576Simon2019.pdf} } @INPROCEEDINGS{1580Fearghail2019, AUTHOR = {Colm O Fearghail and Sebastian Knorr and Aljosa Smolic}, TITLE = {Analysis of Intended Viewing Area vs Estimated Saliency on Narrative Plot Structures in VR Video}, BOOKTITLE = {International Conference on 3D Immersion}, YEAR = {2019}, MONTH = dec, ADDRESS = {Brussels, Belgium}, ABSTRACT = {In cinematic virtual reality film one of the primary challenges from a storytelling perspective is that of leading the attention of the viewers to ensure that the narrative is understood as desired. Methods from traditional cinema have been applied to varying levels of success. This paper explores the use of a saliency convolutional neural network model and measures it against the intended viewing area as denoted by the creators and the ground truth as to where the viewers actually looked. This information could then be used to further increase the effectiveness of a director's ability to focus attention in cinematic VR.} }