#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Robot om webskakels op die Wayback Machine te argiveer.
Die volgende parameters word ondersteun:
¶ms;
-dry Moenie enige veranderinge maak nie, maar wys wat verander sou word.
"""
#
# Revision identifier of this script.
# NOTE(review): the '$Id: ... $' form looks like a version-control keyword
# expansion - confirm which tool (if any) still fills it in.
__version__ = '$Id: 9c315ea1c38f5f9f2d74a4e8929403ffa35b2987 $'
#
import re
import time
import json
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
import ssl
import pywikibot
from pywikibot import pagegenerators
from pywikibot import i18n
# Placeholder substitutions for the help text shown when this script is run
# with the -help parameter: the key is the placeholder that appears in the
# module docstring, the value is the text that takes its place.
# NOTE(review): the key looks like a mis-decoded '&params;'. It has to stay
# identical to the placeholder in the module docstring, so change both
# together or neither.
docuReplacements = {}
docuReplacements['¶ms;'] = pagegenerators.parameterHelp
class bcolors:
    """ANSI escape sequences used to colour and style terminal output."""

    # Foreground colours.
    HEADER, OKBLUE, OKGREEN = '\033[95m', '\033[94m', '\033[92m'
    WARNING, FAIL = '\033[93m', '\033[91m'

    # Text attributes; ENDC resets every colour and attribute.
    BOLD, UNDERLINE = '\033[1m', '\033[4m'
    ENDC = '\033[0m'
class BasicBot:
    """
    Bot that archives the external links of wiki pages on the Wayback Machine.

    For every page of the generator, each http(s) URL found in the wikitext
    is looked up through the archive.org availability API.  URLs without a
    snapshot are submitted for archiving.  URLs that do have a snapshot but
    now answer HTTP 404 are listed, together with a {{Wayback}} replacement,
    in a table on the talk page of the page.
    """

    def __init__(self, generator, dry):
        """
        Constructor.

        @param generator: The page generator that determines on which pages
            to work.
        @type generator: generator.
        @param dry: If True, doesn't do any real changes, but only shows
            what would have been changed.
        @type dry: boolean.
        """
        self.generator = generator
        self.dry = dry
        # NOTE(review): this switches off TLS certificate verification for
        # every HTTPS request urllib makes in this process.  Kept because
        # the script has always behaved this way, but confirm it is still
        # wanted.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # Set the edit summary message
        site = pywikibot.Site()
        self.summary = i18n.twtranslate(site, 'basic-changing')

    def run(self):
        """Process every page of the generator."""
        for page in self.generator:
            self.treat(page)

    def treat(self, page):
        """
        Check all external links of one page and report the dead ones.

        @param page: The page to work on.
        """
        text = self.load(page)
        if not text:
            return
        title = page.title(asLink=False)
        pywikibot.output('Verwerk bladsy %s%s%s...'
                         % (bcolors.BOLD, title, bcolors.ENDC))
        rows = []
        for wurl in self._extract_urls(text):
            # Links that already point at the archive need no archiving.
            if "archive.org/" in wurl:
                continue
            row = self._process_url(wurl)
            if row:
                rows.append(row)
            # Be polite to the remote servers: one URL per second.
            time.sleep(1)
        if rows:
            self._report_dead_links(page, ''.join(rows))

    @staticmethod
    def _extract_urls(text):
        """Return the distinct http(s) URLs of text, in order of appearance."""
        found = re.findall(r'(https?://[^\s|<>{}\[\]]+)', text)
        # dict.fromkeys removes duplicates but, unlike set(), keeps the
        # order stable between runs, so the generated table is reproducible.
        return list(dict.fromkeys(found))

    def _process_url(self, wurl):
        """
        Look one URL up on the Wayback Machine and act on the result.

        @param wurl: The URL exactly as it appears in the wikitext.
        @return: A wiki table row when the URL is dead but archived,
            otherwise an empty string.
        """
        url = urllib.parse.quote(wurl.encode('utf8'), ':/')
        lookup = 'http://archive.org/wayback/available?url=%s' % url
        try:
            with urlopen(lookup) as response:
                data = json.loads(response.read().decode('utf8'))
        except (IOError, ValueError) as e:
            # Network failure or an unparsable answer: skip this URL rather
            # than abort the whole run.
            pywikibot.output('%s... URL %s gee %s%s'
                             % (bcolors.FAIL, wurl, e, bcolors.ENDC))
            return ''
        # Expected answer shape:
        # {"archived_snapshots": {"closest": {"available": true,
        #   "url": "http://web.archive.org/web/<timestamp>/<original>",
        #   "timestamp": "<timestamp>", "status": "200"}}}
        closest = {}
        if isinstance(data, dict):
            snapshots = data.get("archived_snapshots") or {}
            if isinstance(snapshots, dict):
                closest = snapshots.get("closest") or {}
        aurl = closest.get("url")
        atime = closest.get("timestamp")
        if closest.get("available") and aurl and atime:
            return self._check_live_url(wurl, url, aurl, atime)
        self._request_archive(wurl, url)
        return ''

    def _check_live_url(self, wurl, url, aurl, atime):
        """
        Check whether an already archived URL still works.

        @param wurl: The URL exactly as it appears in the wikitext.
        @param url: The percent-encoded form of wurl.
        @param aurl: URL of the closest snapshot.
        @param atime: Timestamp of the closest snapshot.
        @return: A wiki table row when the URL answers HTTP 404, otherwise
            an empty string.
        """
        pywikibot.output('%s... URL %s is reeds geargiveer%s'
                         % (bcolors.OKBLUE, wurl, bcolors.ENDC))
        try:
            with urlopen(wurl) as response:
                rc = response.getcode()
        except (IOError, ValueError) as e:
            # urlopen reports HTTP error statuses by raising HTTPError (an
            # IOError that carries the status in .code), so a 404 arrives
            # here and not as a return code.
            rc = getattr(e, 'code', None)
            if rc != 404:
                pywikibot.output('%s... ... URL het aandag nodig %s%s'
                                 % (bcolors.FAIL, e, bcolors.ENDC))
                return ''
        if rc == 200:
            pywikibot.output('%s... ... URL is steeds OK%s'
                             % (bcolors.OKBLUE, bcolors.ENDC))
            return ''
        if rc == 404:
            pywikibot.output('%s... ... 404 verander na %s%s'
                             % (bcolors.FAIL, aurl, bcolors.ENDC))
            pywikibot.output('%s... ... {{Wayback|url=%s|date=%s}}%s'
                             % (bcolors.FAIL, url, atime, bcolors.ENDC))
            return "|-\n| %s || {{Wayback|url=%s|date=%s}}\n" % (wurl, url, atime)
        pywikibot.output('%s... ... HTTP %s-fout%s'
                         % (bcolors.OKBLUE, rc, bcolors.ENDC))
        return ''

    def _request_archive(self, wurl, url):
        """
        Ask the Wayback Machine to archive a URL and report the outcome.

        @param wurl: The URL exactly as it appears in the wikitext.
        @param url: The percent-encoded form of wurl.
        """
        try:
            with urlopen('https://web.archive.org/save/%s' % url) as response:
                ia = response.read().decode('utf8', errors='replace')
        except IOError as e:
            pywikibot.output('%s... URL %s gee %s%s'
                             % (bcolors.FAIL, url, e, bcolors.ENDC))
            return
        # The outcome is recognised from marker texts in the response body.
        if 'FILE ARCHIVED ON' in ia:
            pywikibot.output('%s... URL %s suksesvol opgelaai%s'
                             % (bcolors.OKGREEN, wurl, bcolors.ENDC))
        elif '403 Forbidden' in ia:
            pywikibot.output('... URL %s is geblokkeer op archive.org' % wurl)
        elif '404: Not Found' in ia:
            pywikibot.output('... URL %s is dood' % wurl)
        elif 'due to robots.txt' in ia:
            pywikibot.output('... URL %s is geblokkeer deur robots.txt' % wurl)
        elif 'URL has been excluded from the Wayback Machine' in ia:
            pywikibot.output('... URL %s is deur Wayback Machine utgesluit' % wurl)
        elif 'look like an valid URL' in ia:
            # Double quotes so the apostrophe of "'n" really is printed.
            pywikibot.output("%s... URL %s lys soos 'n ongeldige URL%s"
                             % (bcolors.WARNING, wurl, bcolors.ENDC))
        elif 'url is not available on the live web' in ia:
            pywikibot.output('... URL %s is nie beskikbaar nie' % wurl)
        else:
            pywikibot.output('... URL %s het gefaal: %s' % (wurl, ia))

    @staticmethod
    def _merge_talk_section(previous, section):
        """
        Return talk page text in which the archive section is replaced.

        @param previous: Current text of the talk page.
        @param section: The freshly generated archive section.
        @return: previous without any earlier archive section, followed by
            section.
        """
        # Non-greedy: stop at the end of the section's own table, so that
        # tables further down the talk page survive.
        remainder = re.sub(r'== Geargiveerde skakels ==.*?\|\}\n*', '',
                           previous, flags=re.DOTALL)
        # Strip trailing whitespace so repeated runs do not pile up blank
        # lines in front of the section.
        remainder = remainder.rstrip()
        if remainder:
            return remainder + "\n\n" + section
        return section

    def _report_dead_links(self, page, rows):
        """
        Write the table of dead, archived links to the talk page of page.

        Nothing is written in dry mode; the text that would have been
        stored is still shown.

        @param page: The page whose links were checked.
        @param rows: The wiki table rows, one per dead link.
        """
        section = ("== Geargiveerde skakels ==\n{| class=\"wikitable\"\n|-\n"
                   "! Dooie skakel !! Argief\n" + rows + "|}")
        talkpage = page.toggleTalkPage()
        if talkpage.exists():
            previous = talkpage.get()
            new_text = self._merge_talk_section(previous, section)
            if new_text == previous:
                pywikibot.output("Reeds gestoor - slaan oor...\n")
                return
            pywikibot.output('Nuwe blad: [' + new_text + ']')
        else:
            new_text = section
            pywikibot.output('Skep bespreking: [' + new_text + ']')
        if self.dry:
            return
        talkpage.put(new_text, 'Rapporteer dooie skakels wat geargiveer is')

    def load(self, page):
        """
        Load the text of the given page.

        @param page: The page to load.
        @return: The page text, or None when the page does not exist or is
            a redirect.
        """
        try:
            # Load the page
            text = page.get()
        except pywikibot.NoPage:
            pywikibot.output(u"Bladsy %s bestaan nie, slaan oor."
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Bladsy %s is 'n aanstuur; slaan oor."
                             % page.title(asLink=True))
        else:
            return text
        return None

    def save(self, text, page, comment=None, minorEdit=True,
             botflag=True):
        """
        Update the given page with new text, after user confirmation.

        @param text: The new text of the page.
        @param page: The page to update.
        @param comment: Edit summary; the bot's default summary is used
            when it is empty.
        @param minorEdit: Whether to mark the edit as minor.
        @param botflag: Whether to mark the edit as a bot edit.
        @return: True when the page was saved, otherwise False.
        """
        current = page.get()
        # only save if something was changed
        if text == current:
            return False
        comment = comment or self.summary
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        # show what was changed
        pywikibot.showDiff(current, text)
        pywikibot.output(u'Comment: %s' % comment)
        if self.dry:
            return False
        if not pywikibot.input_yn(
                u'Do you want to accept these changes?',
                default=False, automatic_quit=False):
            return False
        try:
            page.text = text
            # Save the page
            page.save(comment=comment, minor=minorEdit, botflag=botflag)
        except pywikibot.LockedPage:
            pywikibot.output(u"Bladsy %s is gesluit; slaan oor."
                             % page.title(asLink=True))
        except pywikibot.EditConflict:
            pywikibot.output(
                u"Slaan %s as gevolg van 'n wysigingskonflik oor"
                % (page.title()))
        except pywikibot.SpamfilterError as error:
            pywikibot.output(
                u'Kan nie %s wysig nie, swattlys-inskrywing %s'
                % (page.title(), error.url))
        else:
            return True
        return False
def main(*args):
    """
    Parse the command line and run the bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Global options are consumed first; they determine the desired site.
    options = pywikibot.handle_args(args)
    # The factory handles the options shared with other scripts that select
    # the pages to work on.
    factory = pagegenerators.GeneratorFactory()
    # With -dry no real changes are made; they are only shown.
    dry = False
    for option in options:
        if option.startswith("-dry"):
            dry = True
            continue
        factory.handleArg(option)
    pages = factory.getCombinedGenerator()
    if not pages:
        # No page selection was given: show the help text.
        pywikibot.showHelp()
        return
    # The preloading generator downloads several pages from the wiki at
    # once.
    preloaded = pagegenerators.PreloadingGenerator(pages)
    BasicBot(preloaded, dry).run()
# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()