# User:DschwenBot/source
#
# From Wikimedia Commons, the free media repository
# Jump to navigation Jump to search
#!/usr/bin/python

import sys, os
print os.environ['HOME']
sys.path.append(os.environ['HOME'] + '/dschwen_bot/pywikipedia')

import wikipedia
import MySQLdb
import pyexiv2
import re
import math
import string
import unicodedata
import htmlentitydefs 
import marshal
import urllib
from urllib import FancyURLopener
from PHPUnserialize import *
from datetime import timedelta
from datetime import datetime

# Look at images of the last three days: timedelta's first positional
# argument is a number of days. `cut` is later formatted as YYYYmmddHHMMSS and
# used as the lower bound on img_timestamp in the database query.
# NOTE(review): datetime.now() is local time -- confirm it matches the time
# zone of the database timestamps.
dt = timedelta(3)
cut = datetime.now() - dt

class MyOpener(FancyURLopener):
        """FancyURLopener subclass that overrides the opener's version string.

        The value below is a Firefox 2 browser identification string.
        """
        version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

# Route urllib's module-level helpers through ONE shared MyOpener so that
# every request made via urllib.urlopen / urllib.urlretrieve uses the opener
# class defined above. (Previously `myopener` was created but never used and
# two additional throwaway instances were built.)
myopener = MyOpener()
urllib.urlopen = myopener.open
urllib.urlretrieve = myopener.retrieve

   
def unescape_charref(ref) :
        """Decode one numeric character reference.

        ref is the complete reference text, including the leading "&#" and
        the trailing ";" (for example "&#65;" or "&#x41;").

        Returns the corresponding one-character unicode string. If the digits
        cannot be parsed or the code point is out of range, ref is returned
        unchanged instead of raising, which mirrors how unknown named
        entities are left alone.
        """
        name = ref[2:-1]
        base = 10
        # HTML allows both "x" and "X" as the hexadecimal marker.
        if name[:1] in ("x", "X") :
                name = name[1:]
                base = 16

        # Python 2 spells the converter unichr; fall back to chr where that
        # name does not exist.
        try :
                to_char = unichr
        except NameError :
                to_char = chr

        try :
                return to_char(int(name, base))
        except (ValueError, OverflowError) :
                return ref
                                          
def replace_entities(match) :
        """re.sub callback: return the replacement text for one matched entity.

        Numeric references (second character "#") are handed to
        unescape_charref. Named entities are looked up in
        htmlentitydefs.name2codepoint; names that are not in the table are
        returned exactly as matched.
        """
        entity = match.group()

        if entity[1] == "#" :
                return unescape_charref(entity)

        # strip the leading "&" and the trailing ";" to get the entity name
        codepoint = htmlentitydefs.name2codepoint.get(entity[1:-1])
        if codepoint is None :
                return entity
        return unichr(codepoint)
                                                                                                    
def unescape(data) :
        """Return data with every "&name;" / "&#NNN;" style entity replaced.

        Each match of the entity pattern is passed to replace_entities, whose
        return value is substituted for it.
        """
        entity_pattern = re.compile(r"&#?[A-Za-z0-9]+?;")
        return entity_pattern.sub(replace_entities, data)


# Uploader names checked with `in` before an EXIF GPSTrack value is accepted
# as the heading.
gpstrackusers = set(['Ikiwaner'])

# {{Location dec|lat|lon...}}: capture the first two parameters.
# Both groups exclude "}" as well as "|": without that, a template with no
# third parameter ("{{Location dec|47.5|-122.0}}") let the greedy second
# group swallow the closing brace (or everything up to the next "|" on the
# page), and converting the capture with float() then raised ValueError.
loc1RE = re.compile(r'\{\{[Ll]ocation[ _]dec\|([^\|\}]+)\|([^\|\}]+)[\|\}]')
# Whole parameter list of {{Location dec|...}} / {{Location|...}}.
loc2RE = re.compile(r'\{\{[Ll]ocation[ _]dec\|([^\}\{]+)\}\}')
loc3RE = re.compile(r'\{\{[Ll]ocation\|([^\}\{]+)\}\}')

# [[Category:Location not applicable]], spaces or underscores accepted.
nolocRE = re.compile(r'\[\[[Cc]ategory:[Ll]ocation[ _]not[ _]applicable\]\]')

# {{GPS EXIF}} with an optional "Template:", "template:" or "10:" prefix,
# plus one directly following newline if there is one.
gpsRE = re.compile(r'\{\{(Template:|template:|10:|)[Gg]PS[_ ]EXIF\}\}(\n|)')

# Single-letter hemisphere references, either case: N/S for latitude and
# E/O/W for longitude ("O" is presumably west in some locales -- confirm).
latrefRE = re.compile(r'^[NnSs]$')
lonrefRE = re.compile(r'^[EeOoWw]$')

# Site object handed to every wikipedia.Page(...) the script constructs.
site = wikipedia.getSite()


def load_marshaled(path):
        """Return the object marshalled in the file at path, or {} on failure.

        A missing or unreadable file and truncated or corrupt marshal data all
        yield an empty dict. Any other exception propagates. The file handle
        is closed even when loading fails.
        """
        try:
                f = open( path, "rb" )
                try:
                        return marshal.load( f )
                finally:
                        f.close()
        except (IOError, EOFError, ValueError, TypeError):
                return {}


# badlist: names marked True once their EXIF data turned out to be unusable.
badlist = load_marshaled( "badlist.gps" )

# taglist: names mapped to True while they still await processing.
taglist = load_marshaled( "taglist.gps" )

try:
        connection = MySQLdb.connect(host="commonswiki-p.db.ts.wikimedia.org", user="xxx", passwd="xxx", db="commonswiki_p" )
        #connection = MySQLdb.connect(host="sql-s1.toolserver.org", user="xxx", passwd="xxx", db="commonswiki_p" )
        cursor = connection.cursor() 
        cursor.execute( "create temporary table u_dschwen.dump (tl_from int, score int)" )
        print "Looking for GPS EXIF data (images > %s)"  % cut.strftime( "%Y%m%d%H%M%S" )
        #cursor.execute( "insert into u_dschwen.dump SELECT page_id, 0 from image, page where img_name = page_title and img_name = 'Freaky_Age_4_Luc_Viatour.jpg'" )
        cursor.execute( "insert into u_dschwen.dump SELECT page_id, 0 from image, page where img_timestamp > '%s' and img_name = page_title and page_namespace = 6 and ( img_metadata like '%%\"GPSAltitude\"%%' or img_metadata like '%%\"GPSLatitudeRef\"%%' )" % cut.strftime( "%Y%m%d%H%M%S" ) )
        print "Looking for {{Location}}";
        cursor.execute( "insert into u_dschwen.dump select tl_from, 1 from templatelinks where tl_namespace = 10 and tl_title = 'Location'" )
        print "Looking for {{Location dec}}";
        cursor.execute( "insert into u_dschwen.dump select tl_from, 1 from templatelinks where tl_namespace = 10 and tl_title = 'Location_dec'" )
        print "Looking for {{GPS EXIF}}";
        cursor.execute( "insert into u_dschwen.dump select tl_from, 0 from templatelinks where tl_namespace = 10 and tl_title = 'GPS_EXIF'" )
        print "subtracting..."
        cursor.execute( "select page_title, tl_from, SUM( score ) as s from u_dschwen.dump, page where page_id = tl_from group by tl_from having s = 0" )
        print "fetching results..."

        data = cursor.fetchall() 
        fields = cursor.description
        cursor.close()
        connection.close()

        for row in range(len(data)):
                name = data[row][0]
                taglist[ name ] = True

        file = open( "taglist.gps", "wb" )
        marshal.dump( taglist, file )
        file.close()


except MySQLdb.OperationalError, message: 
        errorMessage = "Error %d:\n%s" % (message[ 0 ], message[ 1 ] ) 



#
# get potential images from taglist
#

for name in taglist.keys() :
        if taglist[ name ] and not ( name in badlist ):

                #exif = PHPUnserialize().unserialize(data[row][1])
                print name

                decomposed_string = unicodedata.normalize( 'NFD', name.decode('utf-8') )

                #page = wikipedia.Page(site, 'Image:' + decomposed_string.encode('utf-8') )
                page = wikipedia.Page(site, 'Image:' + name.decode('utf-8') )
                text = ""
                if page.exists() :
                        text = page.get()

                # remove {{GPS EXIF}}
                oldtext = text
                text = gpsRE.sub( '', text )

                # Location not applicable
                if nolocRE.search( text ) != None :
                        taglist[ name ] = False;
                        print "Location not applicable"
                        continue

                # already contains a Location
                if string.find(text, '{{Location' ) >= 0 :

                        # already contains a generated Location
                        if string.find(text, 'source:exif' ) >= 0 :
                                print "HMM, %s looks already processed" % name;

                                if oldtext != text :
                                        wikipedia.setAction("removed gps exif request template")
                                        page.put(text)
                                        print "removed a superfluous gps exif request template"

                                taglist[ name ] = False;
                                continue

                        # extract location to compare to exif
                        lat_in_dec = 0
                        lon_in_dec = 0

                        #{{Location dec|47.5059|-122.0343|type:forest_region:US}}
                        for match in loc1RE.findall(text) :
                                lat_in_dec = float( match[0] )
                                lon_in_dec = float( match[1] )


                # already contains a suggestion
                if string.find(text, '<!-- EXIF_BOT' ) >= 0 :
                        print "HMM, looks already processed";

                        if oldtext != text :
                                wikipedia.setAction("removed gps exif request template")
                                page.put(text)
                                print "removed a superfluous gps exif request template"

                        taglist[ name ] = False;
                        continue



                print "downloading http://commons.wikimedia.org/wiki/Special:Filepath/%s ..." % name
                try:
                        urllib.urlretrieve( ( "http://commons.wikimedia.org/wiki/Special:Filepath/%s" % urllib.quote(name) ), ".temp.jpg" )
                except Exception, e:
                        print "Exception while downloading:", e
                        continue

                print "analyzing GPS EXIF data ...";

                try :

                        image = pyexiv2.Image( ".temp.jpg" )
                        image.readMetadata()
                        #print image.exifKeys()
                        #print image['Exif.GPSInfo.GPSAltitudeRef']

                        temp = '{{Location'

                        lat_dec = 0.0;
                        for i in range(0, 3):
                                val = float(image['Exif.GPSInfo.GPSLatitude'][i].numerator) / float(image['Exif.GPSInfo.GPSLatitude'][i].denominator)
                                temp += ( "|%f" % ( val ) ).rstrip('0').rstrip('.')
                                lat_dec = lat_dec * 60.0 + val

                        ref = image['Exif.GPSInfo.GPSLatitudeRef']
                        if ref == 'S' :
                                lat_dec = -lat_dec
                        elif ref != 'N' :
                                print "Broken lattitude ref!"
                                temp = "<!-- GPS: Broken lattitude ref! --><br>" + temp
                                ref = 'N'
                        temp += '|' + ref


                        lon_dec = 0.0;
                        for i in range(0, 3):
                                val = float(image['Exif.GPSInfo.GPSLongitude'][i].numerator) / float(image['Exif.GPSInfo.GPSLongitude'][i].denominator)
                                temp += ( "|%f" % ( val ) ).rstrip('0').rstrip('.')
                                lon_dec = lon_dec * 60.0 + val

                        ref = image['Exif.GPSInfo.GPSLongitudeRef']
                        if ref == 'W' :
                                lon_dec = -lon_dec
                        elif ref != 'E' :
                                print "Broken longitude ref!"
                                temp = "<!-- GPS: Broken longitude ref! --><br>" + temp
                                ref = 'E'
                        temp +=  '|' + ref

                except :
                        print "Broken Tag"
                        taglist[ name ] = False;
                        badlist[ name ] = True;
                        continue

                #
                # Jump through several hoops to try to determine a heading and make [User:Ikiwaner] happy
                #

                heading = '?'
                try:
                        val = float(image['Exif.GPSInfo.GPSImgDirection'].numerator) / float(image['Exif.GPSInfo.GPSImgDirection'].denominator)
                        heading = ( "%f" % ( val ) ).rstrip('0').rstrip('.')
                        print "Heading found :-)"

                except:
                        print "No heading found :-("

                        try:
                                val = float(image['Exif.GPSInfo.GPSTrack'].numerator) / float(image['Exif.GPSInfo.GPSTrack'].denominator)

                                try:
                                        connection = MySQLdb.connect(host="commonswiki-p.db.ts.wikimedia.org", user="dschwen", passwd="6iemq2c!--TfJFAI", db="commonswiki_p" )
                                        cursor = connection.cursor() 
                                        print "lets see if the user is known..."
                                        cursor.execute( "select img_user_text from image where img_name = '%s'" % name )
                                        data = cursor.fetchall() 
                                        fields = cursor.description
                                        cursor.close()
                                        connection.close()
                                        uploader = data[0][0]

                                        print "Uploaded by: %s" % uploader
                                        if uploader in gpstrackusers :
                                                heading = ( "%f" % ( val ) ).rstrip('0').rstrip('.')
                                except:
                                        print "Database error"

                        except:
                                print "No dir of movement found either :-("

                if heading == 0 :
                        heading = '?'

                #
                # deal with missing altitude data
                #

                try:
                        alt = float(image['Exif.GPSInfo.GPSAltitude'].numerator) / float(image['Exif.GPSInfo.GPSAltitude'].denominator)

                        try:
                                if int( image['Exif.GPSInfo.GPSAltitudeRef'] ) == 1 :
                                        alt = -alt
                        except:
                                print "no AltitudeRef, assuming above sea level!"

                        temp += ( "|alt:%f" % alt ).rstrip('0').rstrip('.') + "_" 
                except:
                        print "No altitude data"
                        temp += "|"

                temp += ( 'source:exif_heading:%s}}' % ( heading ) )
                print temp

                if lon_dec == 0.0 and lat_dec == 0.0 :
                        print "apparently INVALID GPS data!"
                        taglist[ name ] = False;
                        badlist[ name ] = True;
                        continue

                lat_dec /= 3600.0
                lon_dec /= 3600.0

                #print "Old: %f,%f" % ( lat_in_dec, lon_in_dec )
                #print "New: %f,%f" % ( lat_dec, lon_dec )

                if string.find(text, '{{Location' ) < 0 :
                        print "YAY! tagging..."

                        wikipedia.setAction("creating {{Location}} from EXIF data, please visit [[Commons:Geocoding]] for further information")

                        infopos = string.find(text, '{{Information' )
                        if infopos < 0 :
                                text2 = temp + "\n" + text
                        else :
                                last = ''
                                infopos += 2
                                curl = 1
                                squr = 0
                                print infopos

                                while infopos < len(text) :
                                        c = text[infopos]

                                        if c == '[' and last == '[' :
                                                squr += 1
                                                #print "[ %d" % squr
                                                last = ''
                                        if c == ']' and last == ']' :
                                                squr -= 1
                                                #print "] %d" % squr
                                                last = ''
                                        if c == '{' and last == '{' :
                                                curl += 1
                                                #print "{ %d" % curl
                                                last = ''
                                        if c == '}' and last == '}' :
                                                curl -= 1
                                                #print "} %d" % curl
                                                last = ''

                                        last = c
                                        infopos += 1

                                        if curl == 0 and squr <= 0 :
                                                break

                                text2 = text[:infopos] + "\n" + temp + text[infopos:]

                        page.put(text2)
                        taglist[ name ] = False;
                else :
                        if string.find(text, '<!-- EXIF_BOT' ) < 0 and string.find(text, 'source:exif' ) < 0 :

                                if math.fabs( lat_in_dec - lat_dec ) < 0.0001 and math.fabs( lon_in_dec - lon_dec ) < 0.0001 :
                                        print "OK, existing geocoding seems reasonably accurate"
                                        taglist[ name ] = False;
                                        continue

                                print "OK, just inserting hidden suggestion"
                                wikipedia.setAction("adding suggested {{Location}} from EXIF data")
                                temp = ( "%f|%f check EWNS!\n" % ( lat_dec, lon_dec ) ) + temp;
                                text = '<!-- EXIF_BOT suggests: ' + temp + " -->\n" + text.replace( '{{GPS EXIF}}', '' )
                                page.put(text)
                                taglist[ name ] = False;
                        else :
                                print "HMM, looks already processed";

                                if oldtext != text :
                                        wikipedia.setAction("removed gps exif request template")
                                        page.put(text)
                                        print "removed a superfluous gps exif request template"

                                taglist[ name ] = False;


                file = open( "badlist.gps", "wb" )
                marshal.dump( badlist, file )
                file.close()

                file = open( "taglist.gps", "wb" )
                marshal.dump( taglist, file )
                file.close()


# Final save of both lists. Each handle is closed even if marshal.dump raises,
# and the builtin name `file` is no longer shadowed.
for list_path, list_data in ( ( "badlist.gps", badlist ), ( "taglist.gps", taglist ) ):
        out = open( list_path, "wb" )
        try:
                marshal.dump( list_data, out )
        finally:
                out.close()