User:MDanielsBot/CheckFinna.py
Jump to navigation
Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-

#Copyright 2020, Michael Daniels

#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.
"""Check Wikimedia Commons files that link to a Finna record.

For every page title listed in ``infile.txt`` the script looks for a
``https://www.finna.fi/Record/...`` URL in the parsed page, asks the Finna
API for the record's image copyright, and, when that is "CC BY 4.0",
downloads the Finna image and a Commons thumbnail of the same width and
compares them pixel by pixel.  Titles are appended to ``outFilePass.txt``
or ``outFileFail.txt`` accordingly.
"""

import json
import re
import time

import requests
from PIL import Image, ImageChops, ImageStat

# One shared session so connections are reused across all HTTP requests.
S = requests.Session()

# Raw strings: the patterns contain backslash escapes that are not valid
# Python string escapes.
idURLString = r'https:\/\/www\.finna\.fi\/Record\/.{3}\..{10}:.{8}'
regexIDURL = re.compile(idURLString, re.IGNORECASE)
regexID = re.compile(r'.{3}\..{10}:.{8}', re.IGNORECASE)
regexCCBY40 = re.compile(r'{{cc[- ]by[- ]4\.0}}', re.IGNORECASE)

# Result files are opened in append mode so earlier runs are kept.
# UTF-8 is explicit because page titles may contain non-ASCII characters.
outFilePass = open("outFilePass.txt", 'a', encoding="utf-8")
outFileFail = open("outFileFail.txt", 'a', encoding="utf-8")

myUserAgentA = "Finna License Checker, Wikimedia Commons. "
myUserAgentB = "Contact: en.wikipedia.org/wiki/User:Mdaniels5757 "
myUserAgentC = "& click 'email this user'"
headers = {
    'User-Agent': myUserAgentA + myUserAgentB + myUserAgentC
}

FINNA_API_URL = "https://api.finna.fi/api/v1/record"
FINNA_COVER_URL = "https://www.finna.fi/Cover/Show"
COMMONS_API_URL = "https://commons.wikimedia.org/w/api.php"

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would hang the whole run.
REQUEST_TIMEOUT = 30


def _getJSON(url, params):
    """Send a GET request through the shared session and return decoded JSON.

    Raises requests.HTTPError on a non-2xx status.
    """
    raw = S.get(url=url, params=params, headers=headers,
                timeout=REQUEST_TIMEOUT)
    raw.raise_for_status()
    return raw.json()


def _download(url, path, params=None):
    """Download *url* (with optional query *params*) into the file *path*.

    Raises requests.HTTPError on a non-2xx status.
    """
    raw = S.get(url=url, params=params, headers=headers,
                timeout=REQUEST_TIMEOUT)
    raw.raise_for_status()
    with open(path, 'wb') as outFile:
        outFile.write(raw.content)


def GetCopyrightData(id):
    """Return the ``imageRights.copyright`` value of a Finna record.

    id -- the Finna record identifier.

    Raises KeyError or IndexError when the API response does not contain
    the expected record / imageRights structure.
    """
    response = _getJSON(FINNA_API_URL, {
        "id": id,
        "field[]": "imageRights",
        "prettyPrint": "false",
        "lng": "en-gb",
    })
    return response["records"][0]["imageRights"]["copyright"]


def downloadImages(line, id):
    """Fetch the two images to compare and store them on disk.

    line -- the Commons page title of the file.
    id   -- the Finna record identifier.

    Writes the Finna image to ``finnaFile.jpg`` and a Commons thumbnail
    scaled to the same width to ``commonsFile.jpg``.
    """
    # Get finna image: small if avail, master if not
    response = _getJSON(FINNA_API_URL, {
        "id": id,
        "field[]": "imagesExtended",
        "prettyPrint": "false",
        "lng": "en-gb",
    })
    sizes = response["records"][0]["imagesExtended"][0]["urls"]
    size = "small" if "small" in sizes else "master"
    _download(FINNA_COVER_URL, 'finnaFile.jpg',
              {"id": id, "index": "0", "size": size})

    # The Commons thumbnail is requested at the Finna image's width so the
    # two files can be compared pixel by pixel.
    with Image.open('finnaFile.jpg') as img:
        width = img.size[0]

    # Set up Commons API again to get image url
    response = _getJSON(COMMONS_API_URL, {
        "action": "query",
        "format": "json",
        "prop": "imageinfo",
        "indexpageids": 1,
        "titles": line,
        "maxlag": "5",
        "iiprop": "url",
        "iiurlwidth": str(width),
    })
    pageID = response["query"]["pageids"][0]
    commonsURL = response["query"]["pages"][pageID]["imageinfo"][0]["thumburl"]
    _download(commonsURL, 'commonsFile.jpg')


def CompareImages():
    """Return the percentage difference between the two downloaded images.

    Reads ``commonsFile.jpg`` and ``finnaFile.jpg``, converts both to RGB
    and returns the sum of absolute per-channel differences as a
    percentage of the maximum possible difference (0.0 means identical).

    Raises ValueError when the two images do not have the same size.
    """
    with Image.open("commonsFile.jpg") as i1orig:
        i1 = i1orig.convert("RGB")
    with Image.open("finnaFile.jpg") as i2orig:
        i2 = i2orig.convert("RGB")

    # Both images are RGB after the conversion above, so only the size can
    # differ.  An explicit raise is used instead of assert so the check
    # still runs under "python -O".
    if i1.size != i2.size:
        raise ValueError("Different sizes.")

    # ImageChops.difference yields abs(i1 - i2) per channel; Stat.sum is
    # the per-band total, so the overall sum is the total absolute
    # difference across all pixels and channels.
    dif = sum(ImageStat.Stat(ImageChops.difference(i1, i2)).sum)

    ncomponents = i1.size[0] * i1.size[1] * 3
    # Return difference percentage
    return (dif / 255.0 * 100) / ncomponents


def perPage(line):
    """Check one Commons page and record the outcome.

    line -- a Commons page title; surrounding whitespace (such as the
            newline left by reading it from a file) is ignored.

    Pages without a Finna record URL are skipped silently.  A title is
    appended to ``outFileFail`` when the Finna copyright is not
    "CC BY 4.0" or the images differ by more than 5 percent, and to
    ``outFilePass`` otherwise.  A one-line summary is printed.
    """
    title = line.strip()
    if not title:
        return

    # Set up API, find the ID
    response = _getJSON(COMMONS_API_URL, {
        "action": "parse",
        "page": title,
        "maxlag": "5",
        "format": "json",
    })
    wikicode = response["parse"]["text"]["*"]
    matchIDURL = regexIDURL.search(wikicode)
    if not matchIDURL:
        return

    # Find the ID and get copyright info
    finnaIDstr = regexID.search(matchIDURL.group(0)).group(0)
    licence = GetCopyrightData(finnaIDstr)

    if licence != "CC BY 4.0":
        outFileFail.write(title + "\n")
        print("Failed - not CC-BY-4.0: " + title)
        return

    downloadImages(title, finnaIDstr)
    diffPercent = CompareImages()
    diffText = " diffPercent" + "{:.2f}".format(diffPercent)

    if diffPercent <= 5:
        outFilePass.write(title + "\n")
        print("PASS: " + title + diffText)
    else:
        outFileFail.write(title + "\n")
        print("Failed - different: " + title + diffText)


def main():
    """Run perPage for every title in ``infile.txt``, pausing between pages."""
    try:
        with open("infile.txt", 'r', encoding="utf-8") as infile:
            for line in infile:
                try:
                    perPage(line)
                except (KeyError, IndexError, ValueError, OSError,
                        requests.RequestException) as err:
                    # One unreadable page (missing page, unexpected API
                    # response, network error, unreadable or mismatched
                    # image) must not abort the whole batch; report it and
                    # move on.
                    print("Error - skipped: " + line.strip()
                          + " (" + repr(err) + ")")
                time.sleep(2)
    finally:
        outFilePass.close()
        outFileFail.close()


if __name__ == "__main__":
    main()