User:Fæ/code/reportRedlinksLACMA.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
# reportRedlinksLACMA.py
#
# Quick fix for LACMA uploads with red links
# Grab matching files from catscan (JSON query), use API to check for red links and repost text.
#
# Date: August 2013
# Author: Fae http://j.mp/faewm
# Copyright: CC-BY-SA
'''
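# Environment note (inferred from the imports and print syntax below): this script
# targets Python 2 and the old pywikipedia "compat" framework (the wikipedia/catlib
# modules), with colorama for coloured console output. It takes no command-line
# arguments; edits are saved directly to Commons.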
import wikipedia, upload, sys, config, urllib2, urllib, re, string, time, catlib, pagegenerators, os.path, json
from time import sleep
from colorama import Fore, Back, Style
from colorama import init
init()
site = wikipedia.getSite('commons', 'commons')
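# urltry: fetch a URL with a spoofed User-Agent, retrying on failure with a
# back-off that grows with the error count (capped at 20 seconds).
# Returns the open urllib2 response object.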
def urltry(u):
    headers = { 'User-Agent' : 'Mozilla/5.0' } # Spoof header
    countErr=0
    x=''
    while x=='':
        try:
            req = urllib2.Request(u,None,headers)
            x = urllib2.urlopen(req)
            time.sleep(1)
        except:
            x=''
            countErr+=1
            if countErr>20: countErr=20
            print Fore.CYAN,'** ERROR',countErr,'\n ** Failed to read from '+u+'\n ** Pause for '+str(countErr*1)+' seconds and try again'+Fore.WHITE
            time.sleep(1*countErr)
    return x
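# htmltry: read the body of a response returned by urltry(), re-requesting the
# URL and pausing for an increasing interval whenever the read fails.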
def htmltry(x,u):
    countErr=0
    r=True
    while r:
        try:
            return x.read()
        except:
            x=urltry(u)
            countErr+=1
            if countErr>200:
                p=300
            else:
                p=countErr*2
            print Fore.CYAN,'** ERROR',countErr,'\n ** Failed to read xml'
            if countErr==1:
                print Fore.BLUE+'xml ='+str(x)
                print 'url ='+u+Fore.CYAN
            print ' ** Pause for '+str(p)+' seconds and try again'+Fore.WHITE
            time.sleep(p)
        else:
            r=False
    return
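# exists: query the Commons API (action=query&prop=info, XML format) for a title.
# A page that does not exist is reported with a missing="" attribute, so the
# absence of that attribute means the page exists.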
def exists(page):
    # Encode before quoting so non-ASCII titles do not break urllib.quote
    url="http://commons.wikimedia.org/w/api.php?action=query&prop=info&format=xml&titles="+urllib.quote(page.encode('utf-8'))
    xml=htmltry(urltry(url),url)
    if re.search('missing=""',xml):
        return False
    else:
        return True
'''
*** MAIN ***
'''
# Get the list of files with red links: a catscan2 quick_intersection query (JSON)
# for the intersection of "Files with broken file links" and
# "Images from LACMA uploaded by Fæ" in the File: namespace (max 1000 results).
url="http://tools.wmflabs.org/catscan2/quick_intersection.php?lang=commons&project=commons&cats=Files+with+broken+file+links%0D%0AImages+from+LACMA+uploaded+by+F%C3%A6&ns=6&depth=-1&max=1000&start=0&format=json"
uri=urltry(url)
data=json.loads(htmltry(uri,url)) # named "data" rather than "json" to avoid shadowing the json module
print Fore.GREEN+"Intersection of:\n"+Fore.YELLOW,data['cats']
print Fore.GREEN+"Total pages returned:", Fore.YELLOW,data['pagecount'],Fore.WHITE
pages=[p['page_title'] for p in data['pages']]
count=0
for p in pages:
    count+=1
    #if count>1:continue
    print Fore.CYAN+str(count),p,Fore.WHITE
    page=wikipedia.Page(site,"File:"+p)
    html=page.get()
    # Pull the link targets out of the other_versions= field of the file's information template
    links=[l.split("|")[0] for l in html.split("other_versions=")[1].split("}}")[0].split("[[")]
    links.pop(0) # first element is the text before the first [[, not a link
    for l in links:
        if not exists(l):
            # Red link: drop the whole "[[File:...|220px|left]]" line and save the trimmed page
            print Fore.YELLOW,l,Fore.WHITE
            sl="[["+l+"|220px|left]]\n"
            html=html.split(sl)[0]+html.split(sl)[1]
            wikipedia.setAction("Trim other_versions not uploaded")
            page.put(html)