Gebruiker:Naudefj/Linkbot

Python 3.x-weergawe wysig

#!/usr/bin/python3
# -*- coding: utf-8  -*-
"""
Robot om webskakels op die Wayback Machine te argiveer.

Die volgende parameters word ondersteun:

&params;

-dry              Moenie enige veranderinge maak nie, maar wys wat verander sou word.

"""
#
__version__ = '$Id: 9c315ea1c38f5f9f2d74a4e8929403ffa35b2987 $'
#

import re
import time
import json
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
import ssl
import pywikibot
from pywikibot import pagegenerators
from pywikibot import i18n

# Placeholder substitutions for this module's docstring: the '&params;'
# marker in the docstring above stands for the shared page-generator
# option help text.
# NOTE(review): presumably pywikibot applies this mapping when the script is
# run with -help; the replacement itself is not performed in this file.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

class bcolors:
    """Terminal escape sequences used to colour and style console output."""

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours, named after the kind of message they mark.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Appended after a styled fragment to switch the styling off again.
    ENDC = '\033[0m'

class BasicBot:
    def __init__(self, generator, dry):
        """
        Set up the bot.

        Parameters:
            @param generator: The page generator that yields the pages to
                              work on.
            @type generator: generator.
            @param dry: Flag requesting that no real changes are made; it is
                        stored on the instance as ``self.dry``.
            @type dry: boolean.
        """
        self.dry = dry
        self.generator = generator

        # Replace the ssl module's default HTTPS context factory with the
        # unverified one, when the running Python provides it.
        # NOTE(review): this switches off certificate verification for the
        # whole process - confirm that this is still wanted.
        try:
            ssl._create_default_https_context = ssl._create_unverified_context
        except AttributeError:
            pass

        # Default edit summary, translated for the configured site.
        self.summary = i18n.twtranslate(pywikibot.Site(), 'basic-changing')

    def run(self):
        """ Verwerk elke bladsy van die generator. """
        for page in self.generator:
            self.treat(page)

    def treat(self, page):
        """
        Archive the external links of one page and report dead ones.

        Every distinct http(s) URL found in the page text is looked up on the
        Wayback Machine (links that already point at archive.org are
        skipped).  URLs without a snapshot are submitted for archiving; URLs
        with a snapshot are fetched to see whether they are still alive.
        Links that answer 404 but do have a snapshot are collected in a
        wikitable that is written to the talk page, replacing an earlier
        report if there is one.  With ``self.dry`` set the talk page is not
        written.

        @param page: the page to process; must provide title(), get() and
                     toggleTalkPage().
        """
        text = self.load(page)
        if not text:
            return

        title = page.title(asLink=False)
        pywikibot.output('Verwerk bladsy %s%s%s...' % (bcolors.BOLD, title, bcolors.ENDC))

        # dict.fromkeys() drops duplicates but, unlike set(), keeps the links
        # in the order in which they appear on the page.
        links = dict.fromkeys(re.findall(r'(https?://[^\s|<>{}\[\]]+)', text))
        rows = []
        for wurl in links:
            if "archive.org/" in wurl:
                continue
            row = self._treat_url(wurl)
            if row:
                rows.append(row)
            # Pause between URLs (presumably to avoid hammering the remote
            # servers).
            time.sleep(1)

        if rows:
            self._report_dead_links(page, ''.join(rows))

    def _treat_url(self, wurl):
        """
        Check one URL against the Wayback Machine.

        @param wurl: the URL exactly as found in the page text.
        @return: a wikitable row when the URL answers 404 and a snapshot
                 exists, otherwise an empty string.
        """
        from urllib.error import HTTPError

        url = urllib.parse.quote(wurl.encode('utf8'), ':/')
        try:
            with urlopen('http://archive.org/wayback/available?url=%s' % url) as reply:
                data = json.loads(reply.read().decode('utf8'))
        except (IOError, ValueError) as e:
            # A failed lookup must not abort the whole run; skip this URL.
            pywikibot.output('%s... URL %s: navraag by archive.org het misluk: %s%s'
                             % (bcolors.FAIL, wurl, e, bcolors.ENDC))
            return ''

        # Expected reply shape:
        # {"archived_snapshots": {"closest": {"available": true,
        #     "url": "http://web.archive.org/web/<timestamp>/<original url>",
        #     "timestamp": "20150223120334", "status": "200"}}}
        closest = (data.get("archived_snapshots") or {}).get("closest") or {}
        if not closest.get("available"):
            self._archive_url(wurl, url)
            return ''

        # Already archived: check whether the original URL is still alive.
        aurl = closest["url"]
        atime = closest["timestamp"]
        pywikibot.output('%s... URL %s is reeds geargiveer%s' % (bcolors.OKBLUE, wurl, bcolors.ENDC))
        try:
            with urlopen(wurl) as live:
                rc = live.getcode()
        except HTTPError as e:
            # urlopen() raises for 4xx/5xx answers instead of returning them,
            # so a dead link (404) has to be recognised here.
            if e.code != 404:
                pywikibot.output('%s... ... URL het aandag nodig %s%s' % (bcolors.FAIL, e, bcolors.ENDC))
                return ''
            rc = 404
        except (IOError, ValueError) as e:
            # ValueError also covers the UnicodeError raised for non-ASCII URLs.
            pywikibot.output('%s... ... URL het aandag nodig %s%s' % (bcolors.FAIL, e, bcolors.ENDC))
            return ''

        if rc == 200:
            pywikibot.output('%s... ... URL is steeds OK%s' % (bcolors.OKBLUE, bcolors.ENDC))
        elif rc == 404:
            pywikibot.output('%s... ... 404 verander na %s%s' % (bcolors.FAIL, aurl, bcolors.ENDC))
            pywikibot.output('%s... ... {{Wayback|url=%s|date=%s}}%s' % (bcolors.FAIL, url, atime, bcolors.ENDC))
            return "|-\n| %s || {{Wayback|url=%s|date=%s}}\n" % (wurl, url, atime)
        else:
            pywikibot.output('%s... ... HTTP %s-fout%s' % (bcolors.OKBLUE, rc, bcolors.ENDC))
        return ''

    def _archive_url(self, wurl, url):
        """
        Ask the Wayback Machine to save a URL and report the outcome.

        @param wurl: the URL as found in the page text (used in messages).
        @param url: the percent-quoted form that is sent to archive.org.
        """
        try:
            with urlopen('https://web.archive.org/save/%s' % url) as reply:
                body = reply.read().decode('utf8', errors='replace')
        except IOError as e:
            pywikibot.output('%s... URL %s gee %s%s' % (bcolors.FAIL, url, e, bcolors.ENDC))
            return

        # Classify the answer by the marker text found in the returned page.
        if 'FILE ARCHIVED ON' in body:
            pywikibot.output('%s... URL %s suksesvol opgelaai%s' % (bcolors.OKGREEN, wurl, bcolors.ENDC))
        elif '403 Forbidden' in body:
            pywikibot.output('... URL %s is geblokkeer op archive.org' % wurl)
        elif '404: Not Found' in body:
            pywikibot.output('... URL %s is dood' % wurl)
        elif 'due to robots.txt' in body:
            pywikibot.output('... URL %s is geblokkeer deur robots.txt' % wurl)
        elif 'URL has been excluded from the Wayback Machine' in body:
            pywikibot.output('... URL %s is deur Wayback Machine utgesluit' % wurl)
        elif 'look like an valid URL' in body:
            pywikibot.output("%s... URL %s lys soos 'n ongeldige URL%s" % (bcolors.WARNING, wurl, bcolors.ENDC))
        elif 'url is not available on the live web' in body:
            pywikibot.output('... URL %s is nie beskikbaar nie' % wurl)
        else:
            pywikibot.output('... URL %s het gefaal: %s' % (wurl, body))

    def _report_dead_links(self, page, rows):
        """
        Write the dead-link table to the talk page of a page.

        @param page: the page whose talk page receives the report.
        @param rows: the concatenated wikitable rows to publish.
        """
        summary = 'Rapporteer dooie skakels wat geargiveer is'
        talktext = ("== Geargiveerde skakels ==\n{| class=\"wikitable\"\n|-\n"
                    "! Dooie skakel !! Argief\n" + rows + "|}")
        talkpage = page.toggleTalkPage()
        if talkpage.exists():
            previous = talkpage.get()
            if talktext == previous:
                pywikibot.output("Reeds gestoor - slaan oor...\n")
                return
            # Non-greedy match: remove only the old report up to the end of
            # its own table, not everything up to the last table on the page.
            previous = re.sub(r'== Geargiveerde skakels ==.*?\|\}', '',
                              previous, flags=re.DOTALL)
            if previous != '':
                previous += "\n\n"
            newtext = previous + talktext
            pywikibot.output('Nuwe blad: [' + newtext + ']')
        else:
            newtext = talktext
            pywikibot.output('Skep bespreking: [' + talktext + ']')

        if self.dry:
            # -dry promises that nothing is changed on the wiki.
            pywikibot.output('Proeflopie (-dry): besprekingsblad word nie gestoor nie.')
            return
        talkpage.put(newtext, summary)

    def load(self, page):
        """ Laai die teks van 'n gegewe bladsy. """
        try:
            # Load the page
            text = page.get()
        except pywikibot.NoPage:
            pywikibot.output(u"Bladsy %s bestaan nie, slaan oor."
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Bladsy %s is 'n aanstuur; slaan oor."
                             % page.title(asLink=True))
        else:
            return text
        return None

    def save(self, text, page, comment=None, minorEdit=True,
             botflag=True):
        """ Opdateer 'n gegewe bladsy met muwe teks. """
        # only save if something was changed
        if text != page.get():
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                             % page.title())
            # show what was changed
            pywikibot.showDiff(page.get(), text)
            pywikibot.output(u'Comment: %s' % comment)
            if not self.dry:
                if pywikibot.input_yn(
                        u'Do you want to accept these changes?',
                        default=False, automatic_quit=False):
                    try:
                        page.text = text
                        # Save the page
                        page.save(comment=comment or self.comment,
                                  minor=minorEdit, botflag=botflag)
                    except pywikibot.LockedPage:
                        pywikibot.output(u"Bladsy %s is gesluit; slaan oor."
                                         % page.title(asLink=True))
                    except pywikibot.EditConflict:
                        pywikibot.output(
                            u'Slaan %s as gevolg van ''n wysigingskonflik oor'
                            % (page.title()))
                    except pywikibot.SpamfilterError as error:
                        pywikibot.output(
                            u'Kan nie %s wysig nie, swattlys-inskrywing %s'
                            % (page.title(), error.url))
                    else:
                        return True
        return False


def main(*args):
    """
    Parse the command line and start the bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Global arguments (those that determine the desired site) are handled
    # first; what is left over is specific to this script.
    remaining = pywikibot.handle_args(args)

    # The factory handles the arguments shared with other scripts that
    # select the pages to work on.
    factory = pagegenerators.GeneratorFactory()

    # With -dry no real changes should be made; only show what would change.
    dry = False
    for option in remaining:
        if option.startswith("-dry"):
            dry = True
        else:
            factory.handleArg(option)

    pages = factory.getCombinedGenerator()
    if not pages:
        # No page selection was given: show the help text instead.
        pywikibot.showHelp()
        return

    # The preloading generator downloads several pages from the wiki at once.
    BasicBot(pagegenerators.PreloadingGenerator(pages), dry).run()

# Start the bot only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()