% Conference paper by Kelm, Schmiedeke, Cluever and Sikora, NEM Summit 2011.
% Maintainer notes:
%  - EDITOR is an institution, so the value is double-braced; this keeps BibTeX
%    from splitting it into first/von/last name parts.
%  - ADDRESS holds the conference location, as in the original record; the
%    conference days are stored separately in EVENTDATE (ISO 8601 range).
%  - COPYRIGHT carries the proceedings' imprint text verbatim. It used to sit
%    in NOTE, which standard styles print in full inside the reference.
%  - PDF, ABSTRACT, COPYRIGHT, ISBN and EVENTDATE are not fields of the classic
%    standard styles, which ignore unknown fields.
@INPROCEEDINGS{1324Kelm2011,
  AUTHOR       = {Kelm, Pascal and Schmiedeke, Sebastian and Cl{\"u}ver, Kai and Sikora, Thomas},
  TITLE        = {Automatic Geo-referencing of {Flickr} Videos},
  BOOKTITLE    = {NEM Summit 2011},
  YEAR         = {2011},
  MONTH        = sep,
  EVENTDATE    = {2011-09-27/2011-09-29},
  EDITOR       = {{Eurescom -- the European Institute for Research and Strategic Studies in Telecommunications -- GmbH}},
  PUBLISHER    = {Sigma Orionis},
  PAGES        = {76--80},
  ORGANIZATION = {NEM Summit},
  ADDRESS      = {Torino, Italy},
  ISBN         = {978-3-00-035465-6},
  COPYRIGHT    = {Copyright © 2011 – Eurescom GmbH – On behalf of NEM Initiative All rights on Proceedings of 2011 NEM Summit (Torino, Italy, September 27-29 2011) reserved. All rights on individual papers, published in the proceedings, remain unaffected. ISBN 978-3-00-035465-6 Publisher Eurescom – the European Institute for Research and Strategic Studies in Telecommunications – GmbH Wieblinger Weg 19/4 - 69123 Heidelberg - Germany Phone: +49 6221 989 0 - Fax: +49 6221 989 209 - http://www.eurescom.eu For publisher: Halid Hrasnica On behalf of NEM Initiative – http://www.nem-initiative.org eBook and USB produced by Sigma Orionis 1240, route des dolines - BP287 Valbonne - France Phone: +33 (0) 493 001 550 - Fax: +33 (0) 493 001 560 - http://www.sigma-orionis.com For producer: Florent Genoux On behalf of NEM Initiative – http://www.nem-initiative.org},
  PDF          = {http://elvera.nue.tu-berlin.de/files/1324Kelm2011.pdf},
  ABSTRACT     = {We present a hierarchical, multi-modal approach for geo-referencing Flickr videos. Our approach makes use of external resources to identify toponyms in the metadata and of visual features to identify similar content. We use a database of more than 3.6 million Flickr images to group them into geographical areas and to build a hierarchical model. First, the geographical boundaries extraction method identifies the country and its dimension. Then, a visual method is used to classify the videos’ location into plausible areas.
                  Next, the visually nearest neighbour method is used to find correspondences with the training images within the pre-classified areas. As the processed video sequences are represented using low-level feature vectors from multiple key frames, we also present techniques for video to image matchings. The Flickr videos are tagged with the geo-information of the visually most similar training item within the areas previously filtered in the pre-classification step. The results show that we are able to tag one third of our videos correctly within an error margin of 1 km.},
}