# User:MDanielsBot/CheckFinna.py
#
# From Wikimedia Commons, the free media repository
# Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-

#Copyright 2020, Michael Daniels
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.


import requests
import time
import re
import json
from PIL import Image

# Shared HTTP session, used for the Wikimedia Commons API calls.
S = requests.Session()

# Regexes are written as raw strings so that backslash sequences such as
# "\/" and "\." reach the regex engine untouched instead of being treated as
# (invalid) Python string escapes. The compiled patterns are unchanged.
# Matches a full Finna record URL: https://www.finna.fi/Record/<id>
idURLString = r'https:\/\/www\.finna\.fi\/Record\/.{3}\..{10}:.{8}'
regexIDURL = re.compile(idURLString, re.IGNORECASE)
# Matches only the record id part (3 chars, ".", 10 chars, ":", 8 chars).
regexID = re.compile(r'.{3}\..{10}:.{8}', re.IGNORECASE)
# Matches a {{cc-by-4.0}} template (hyphens or spaces as separators).
regexCCBY40 = re.compile(r'{{cc[- ]by[- ]4\.0}}', re.IGNORECASE)

# Result logs, opened in append mode so earlier runs are preserved.
outFilePass = open("outFilePass.txt", 'a')
outFileFail = open("outFileFail.txt", 'a')

# User-Agent sent with every HTTP request made by this script.
myUserAgentA = "Finna License Checker, Wikimedia Commons. "
myUserAgentB = "Contact: en.wikipedia.org/wiki/User:Mdaniels5757 "
myUserAgentC = "& click 'email this user'"
headers = {
    'User-Agent': myUserAgentA + myUserAgentB + myUserAgentC
}

def GetCopyrightData(id):
    """Return the image copyright label Finna reports for a record.

    Queries the Finna record API for the ``imageRights`` field of the record
    identified by *id* and returns the ``copyright`` entry of the first
    record in the reply.

    Args:
        id: Finna record id string; it is inserted into the query URL as-is.

    Returns:
        The value stored under ``imageRights -> copyright``.

    Raises:
        KeyError, IndexError: if the reply lacks ``records``, a first
            record, ``imageRights`` or ``copyright``.
    """
    endpoint = "https://api.finna.fi/api/v1/record?id="
    query = "&field[]=imageRights&prettyPrint=false&lng=en-gb"
    reply = requests.get("".join((endpoint, id, query)), headers=headers)
    firstRecord = json.loads(reply.text)["records"][0]
    return firstRecord["imageRights"]["copyright"]
    
def downloadImages(line, id):
    """Save the Finna image and a same-width Commons thumbnail to disk.

    The Finna image is written to ``finnaFile.jpg`` and the Commons thumbnail
    to ``commonsFile.jpg`` (both relative paths), overwriting any previous
    files of those names.

    Args:
        line: Commons page title, passed as ``titles`` to the Commons API.
        id: Finna record id string; it is inserted into the URLs as-is.

    Raises:
        KeyError, IndexError: if either API reply lacks the expected fields.
    """
    # Get finna image: small if avail, master if not
    URLBase = "https://api.finna.fi/api/v1/record?id="
    URL = URLBase + id + "&field[]=imagesExtended&prettyPrint=false&lng=en-gb"
    raw = requests.get(URL, headers=headers)
    response = json.loads(raw.text)
    sizes = response["records"][0]["imagesExtended"][0]["urls"]
    finnaURLBase = "https://www.finna.fi/Cover/Show?id="
    size = "small" if "small" in sizes else "master"
    finnaURL = finnaURLBase + id + "&index=0&size=" + size
    # Download before opening the output file, and let "with" close the
    # handle, so a failed request cannot leave an open or truncated file.
    finnaImage = requests.get(finnaURL, headers=headers).content
    with open('finnaFile.jpg', 'wb') as finnaFile:
        finnaFile.write(finnaImage)
    with Image.open('finnaFile.jpg') as img:
        width = img.size[0]

    # Ask Commons for a thumbnail scaled to the Finna image's width so the
    # two files can be compared pixel by pixel.
    URL = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "imageinfo",
        "indexpageids": 1,
        "titles": line,
        "maxlag": "5",
        "iiprop": "url",
        "iiurlwidth": str(width)
    }
    raw = S.get(url=URL, params=params, headers=headers)
    response = raw.json()
    pageID = response["query"]["pageids"][0]
    commonsURL = response["query"]["pages"][pageID]["imageinfo"][0]["thumburl"]
    commonsImage = requests.get(commonsURL, headers=headers).content
    with open('commonsFile.jpg', 'wb') as commonsFile:
        commonsFile.write(commonsImage)
    
def CompareImages():
    """Return how much ``commonsFile.jpg`` and ``finnaFile.jpg`` differ.

    Both files are converted to RGB and compared channel by channel.

    Returns:
        The summed absolute channel difference as a percentage of the
        maximum possible difference (255 per channel), as a float.

    Raises:
        AssertionError: if the two images do not have the same pixel size.
    """
    # convert() returns an independent in-memory copy, so the file handles
    # can be released as soon as both conversions are done.
    with Image.open("commonsFile.jpg") as commonsOrig:
        i1 = commonsOrig.convert("RGB")
    with Image.open("finnaFile.jpg") as finnaOrig:
        i2 = finnaOrig.convert("RGB")

    # Explicit check rather than an assert statement: asserts are stripped
    # under "python -O", and zip() below would then silently truncate to the
    # smaller image. The exception type and message are kept as before.
    if i1.size != i2.size:
        raise AssertionError("Different sizes.")

    # Both images are RGB here, so every pixel is a 3-tuple of channels.
    dif = sum(
        abs(c1 - c2)
        for p1, p2 in zip(i1.getdata(), i2.getdata())
        for c1, c2 in zip(p1, p2)
    )

    ncomponents = i1.size[0] * i1.size[1] * 3
    # Return difference percentage
    return (dif / 255.0 * 100) / ncomponents

def perPage(line):
    """Check one Commons page against its Finna source and log the result.

    Fetches the parsed page text from the Commons API and looks for a Finna
    record URL in it. If none is found the page is skipped silently.
    Otherwise the page title is appended to the fail log when Finna does not
    report "CC BY 4.0" or when the Commons and Finna images differ by more
    than 5 percent, and to the pass log when they differ by 5 percent or
    less. A one-line summary is printed for every logged page.

    Args:
        line: Commons page title; surrounding whitespace (such as the
            trailing newline of a line read from a file) is ignored, and a
            blank line is skipped.
    """
    # Lines read from the input file still carry their newline. Strip it so
    # it is neither sent to the API nor embedded in the middle of the
    # printed messages, and add it back explicitly when writing the logs.
    title = line.strip()
    if not title:
        return

    # Set up API, find the ID
    URL = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "maxlag": "5",
        "format": "json"
    }
    raw = S.get(url=URL, params=params, headers=headers)
    response = raw.json()
    pageText = response["parse"]["text"]["*"]
    matchIDURL = regexIDURL.search(pageText)
    if not matchIDURL:
        return

    # Find the ID and get copyright info
    finnaIDstr = regexID.search(matchIDURL.group(0)).group(0)
    if GetCopyrightData(finnaIDstr) != "CC BY 4.0":
        outFileFail.write(title + "\n")
        print("Failed - not CC-BY-4.0: " + title)
        return

    downloadImages(title, finnaIDstr)

    diffPercent = CompareImages()
    diffText = " diffPercent" + "{:.2f}".format(diffPercent)
    if diffPercent <= 5:
        outFilePass.write(title + "\n")
        message = "PASS: " + title + diffText
    else:
        outFileFail.write(title + "\n")
        message = "Failed - different: " + title + diffText
    print(message)

# Check every page title listed in infile.txt, one title per line.
# try/finally makes sure both result logs are closed (and therefore
# flushed) even when a page raises part-way through the run.
try:
    with open("infile.txt", 'r') as infile:
        for line in infile:
            perPage(line)
            # Pause between pages; presumably to limit the request rate
            # against the Commons and Finna servers.
            time.sleep(2)
finally:
    outFilePass.close()
    outFileFail.close()