User:Hymie the SpelChek™ bot/editarticle.py

This is the main script behind the evil which is SpelChek™; editarticle.py is a drop-in replacement for use with the open-source pywikibot package.

Pywikibot is available from www.sourceforge.net, and Python itself can be had from www.python.org.

This script makes various kludgy attempts to clean up wiki text and remove broken/red links before dumping the whole mess into an external text editor for manual SpelChek™ correction. It looks for a plain-text file, addlinks.txt, which lists items that should each be wikilinked once if they're found in the text.
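
For example, if addlinks.txt contains the single line kitten, only the first whole-word occurrence in the article gets linked. A minimal sketch of that behaviour (the article text here is made up):

import re

text = u"A kitten is a baby cat. Another kitten naps."
# wikilink only the first whole-word occurrence, as the script does via re.subn(..., 1)
text, n = re.subn(ur'\bkitten\b', u'[[kitten]]', text, 1)
print text   # A [[kitten]] is a baby cat. Another kitten naps.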

It is 100.1% evil, so be warned:


#!/usr/bin/python
# -*- coding: utf-8 -*-

# Modified script: Edit a Wikipedia article with your favourite editor. Requires Python 2.3.
#
# (C) Gerrit Holl 2004, modified June 2005 by [[user:carlb]] for Uncyclopedia SpelChek[tm] use.
#
# This version has been modified to remove broken wikilinks and to add links based on addlinks.txt
# The text is then loaded into an external editor to permit correction, categorisation and SpelChek.
# This script is a kludge[tm] and to some degree is still under development.
#
# Note: if no manual changes are made in the editor, updates will not be written to the site (even if the wikification step auto-changed the text).
#
# Distribute under the terms of the PSF license.

# Example usage (SpelChek with 602text as external editor):
# editarticle.py -e \progra~1\software602\602pro~1\602text\exec\602text.exe -u "Hymie the SpelChek™ bot"

# Example usage (Undictionary index generation, based on templates):
# editarticle.py -e notepad -i Undictionary: -p Template:Dict -u "Hymie the SpelChek™ bot"
#
# Note: this is currently based on Special:Whatlinkshere and only finds the first 500 (or 999) definitions 
# which use the template

# Version 0.3cb
#
# Features:
#       - logging in
#
# TODO: - non-existent pages
#       - correct encoding
#       - use cookies to remember login
#       - edit conflicts
#       - difflib
#       - minor edits
#       - watch/unwatch
# And more SpelChek[tm] TODO:
#       - clean up attempts to add wikification inside existing section headers, HTML, wiki markup and template links
#       - fix conversions of %nn characters to ASCII in URLs (for wiki redlink/broken-link removal)
#       - fix Unicode->ASCII conversions before invoking the external editor for spelchek functions
#       - ...
#
# Removed features:
#       - editing anonymously

__metaclass__ = type
__version__ = "$Id: editarticle.py,v 1.18 2005/02/21 11:32:36 gerrit Exp $"
sig = u" ()"

import array
import sys
import os
import httplib
import urllib
import getpass
import difflib
import optparse
import tempfile
import re

import wikipedia
import login
import config

#
# change string from re pattern-search to all-lower or all-upper case 
#
def tolower( match ):
    value = match.group()
    return value.lower()

def toupper( match ):
    value = match.group()
    return value.upper()


#
# optimise links of form [[word|words]] to instead use [[word]]s as wikilink
#
def fixpipelink( match ):
    value = match.group()
    txt = re.split('([\[\|\]]+)', value)
    newvalue = value

    if txt[1] == '[[' and txt[3] == '|' and txt[5] == ']]':
        target, label = txt[2], txt[4]
        # if the label merely extends the target, e.g. [[word|words]],
        # move the extra characters outside the link: [[word]]s
        if label.startswith(target):
            newvalue = txt[0] + u'[[' + target + u']]' + label[len(target):] + txt[6]
            print "replacing piped|link with:", newvalue
    return newvalue
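
# e.g. a match for "[[banana|bananas]]" is rewritten as "[[banana]]s", while
# "[[colour|color]]" comes back unchanged since the label doesn't extend the target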

#
# check string from re pattern-search for nested wiki links created by addlinks.txt wikification
# note: this does not properly handle wikilinks mixed with external links, links split between lines,
#       or (in some cases) links nested more than two levels deep at once; it also does not treat
#       templates/tags, section headers and other non-body text as special, so spurious wikification
#       there is not removed; nor does it properly handle more than two contiguous ]] or [[ tokens
#       in a row - needs work
#
def fixwikilink( match ):
    value = match.group()
    newvalue = ''
    indent = 0
    markup = re.split('([\[\]]+)', value)

    for txt in markup:
        if txt == '[[':
            indent += 1
            if indent != 1:
                txt = ''
        if txt == ']]':
            indent -= 1
            if indent > 0:
                txt = ''
        if txt == '[[[[':
            indent += 2
            if indent != 2:
                txt = ''
            else:
                txt = '[['
        if txt == ']]]]':
            indent -= 2
            if indent != 0:
                txt = ''
            else:
                txt = ']]'
        newvalue += txt

    return newvalue
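
# e.g. "[[foo [[bar]] baz]]" is flattened to "[[foo bar baz]]" - the outermost
# brackets survive and the nested pair is dropped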

class EditArticle:
    import string
    joinchars = string.letters + '[]' + string.digits # join lines if a line starts with one of these

    def __init__(self, args):
        """Takes one argument, usually this is sys.argv[1:]"""
        self.all_args = args
        self.set_options()

    def initialise_data(self):
        """Login, set editor, page and pagelink attributes"""
        self.login()#anonymous=self.options.anonymous)
        self.editor = self.options.editor or wikipedia.input(u"Editor to use: ", encode=True)
        self.setpage()

    def login(self):#, anonymous):
        """Initialises site and username data"""#, or anonymous"""
        if False:#anonymous:
            self.site = wikipedia.getSite(user=None)
        else:
            self.username = self.options.username or wikipedia.input(u"Username: ", encode=True)
            self.site = wikipedia.getSite(user=self.username)
            self.site._fill() # load cookies
            if not self.site._loggedin:
                password = getpass.getpass("Password: ")
                cookie = login.login(self.site, self.username, password)
                if not cookie:
                    sys.exit("Login failed")
                login.storecookiedata(cookie, self.site, self.username)
                wikipedia.output(u"Login succesful")

    def set_options(self):
        """Parse commandline and set options attribute"""
        my_args = []
        for arg in self.all_args:
            arg = wikipedia.argHandler(arg)
            if arg:
                my_args.append(arg)
        parser = optparse.OptionParser()
##        parser.add_option("-a", "--anonymous", action="store_true", default=False, help="Login anonymously")
        parser.add_option("-r", "--edit_redirect", action="store_true", default=False, help="Ignore/edit redirects")
        parser.add_option("-u", "--username", help="Username to login with") #(ignored with -a)
        parser.add_option("-p", "--page", help="Page to edit")
        parser.add_option("-i", "--index", help="Compile dictionary-style index of pages linking here")
        parser.add_option("-e", "--editor", help="Editor to use")
        parser.add_option("-j", "--join_lines", action="store_true", default=False, help="Join consecutive lines if possible")
        parser.add_option("-w", "--watch", action="store_true", default=False, help="Watch article after edit")
        parser.add_option("-n", "--new_data", default="", help="Automatically generated content")
        self.options = parser.parse_args(args=my_args)[0]

    def setpage(self):
        """Sets page and pagelink"""
        self.page = self.options.page or wikipedia.input(u"Page to edit: ", encode=True)
        self.pagelink = wikipedia.PageLink(self.site, self.page)
        if not self.options.edit_redirect and self.pagelink.isRedirectPage():
            self.pagelink = wikipedia.PageLink(self.site, self.pagelink.getRedirectTo())

    def repair(self, content):
        """Removes single newlines and prepare encoding for local wiki"""
        if self.options.join_lines:
            lines = content.splitlines()
            result = []
            for i, line in enumerate(lines):
                try:
                    nextline = lines[i+1]
                except IndexError:
                    nextline = "last"
                result.append(line)
                if line.strip() == "" or line[0] not in self.joinchars or \
                   nextline.strip() == "" or nextline[0] not in self.joinchars:
                    result.append('\n')
                else:
                    result.append(" ")
            s = "".join(result)
        else:
            s = content
        return wikipedia.unicode2html(s, self.site.encoding())
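
#
#   e.g. with --join_lines, the two lines
#       The quick brown fox
#       jumps over the lazy dog.
#   come back as one space-joined line; blank lines, and lines starting with
#   characters outside joinchars, keep their line breaks
#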

    def edit(self):
        """Edit the page using the editor.
      
        It returns two strings: the old version and the new version."""

#
#       to process a dictionary index (editarticle.py -i Undictionary: -p Template:Dict -e doesntmatter), find what links to the
#       template and use that list to recreate the index pages A - Z as {{dictionary}} {{def|...}} templates
#
        if self.options.index:
             linkshere = wikipedia.getReferences(self.pagelink)
             section = ''
             sectionindex = ''

#            for each link returned (linkshere is already in alphabetical order), get appropriate section (A-Z)

             for lnk in linkshere:
                oldsection = section
                section = ''

                thissect = 'A'
                for nextsect in ['B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[']:
                    if lnk >= self.options.index + thissect and lnk < self.options.index + nextsect:
                        section = self.options.index + thissect
                    thissect = nextsect
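#               e.g. lnk = "Undictionary:Axolotl" sorts between "Undictionary:A"
#               and "Undictionary:B", so it lands in section "Undictionary:A"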

                if section != oldsection:
                    if oldsection != '':
                          print '((',oldsection,')): ',sectionindex
                          new = self.repair(sectionindex)
                          comment = "generate index"
                          self.page = oldsection
                          self.pagelink = wikipedia.PageLink(self.site, self.page)
                          try:
                             self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
                          except wikipedia.EditConflict:
                             self.handle_edit_conflict(new)
                    sectionindex = '{{dictionary}}\n\n== ' + section.replace(self.options.index,u'',1)
                    sectionindex += ' ==\n\n'

                sectionindex += '{{def|' + lnk.replace(self.options.index,u'',1) + '}}\n'
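
#
#               a finished section page comes out looking like this (a sketch, with made-up entries):
#
#                   {{dictionary}}
#
#                   == A ==
#
#                   {{def|Apple}}
#                   {{def|Axolotl}}
#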

#
# post this last section index to site
# TO DO: see if last section (== Z ==) has already been written, if yes then avoid overwriting it???
#

             if section != '':
                 print '(((',section,'))): ',sectionindex
                 new = self.repair(sectionindex)
                 comment = "generate index"
                 self.page = section
                 self.pagelink = wikipedia.PageLink(self.site, self.page)
                 try:
                     self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
                 except wikipedia.EditConflict:
                     self.handle_edit_conflict(new)

             return linkshere, linkshere
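
#            returning the same list twice makes run() see old == new, so the normal
#            single-page put() is skipped - the index pages were already written above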

#
#       to handle semi-manual editing of all other articles, retrieve article wiki and HTML text
#       cleanup wiki [[pipe|links]], broken links, missing links to articles listed on addlinks.txt
#       return oldcontent, newcontent as post of modified content is done by calling routine
#

#
#       create temporary file to store wiki article for editing
#
        (fd, ofn) = tempfile.mkstemp()
        ofp = os.fdopen(fd, 'w')
        try:
#
#           retrieve existing wiki page
#
            oldcontent = self.pagelink.get()

        except wikipedia.NoPage:
            oldcontent = ""
        except wikipedia.IsRedirectPage:
            if self.options.edit_redirect:
                oldcontent = self.pagelink.get(force=True, get_redirect=True)
            else:
                raise

#
#       remove broken wikilinks from page
#
        oldcontent = re.sub(ur'\[\[\w+\|\w+\]\]',fixpipelink,oldcontent)
        oldredlinks = self.pagelink.redlinks()
        for l in oldredlinks:
            l = l.replace(u'%21', '!')
            l = l.replace(u'%22', '\"')
            l = l.replace(u'%24', '$')
            l = l.replace(u'%26', '&')
            l = l.replace(u'%27', '\'')
            l = l.replace(u'%28', '(')
            l = l.replace(u'%29', ')')
            l = l.replace(u'%2C', ',')
            l = l.replace(u'%3F', '?')
            oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
            l = l.replace(u'_', u' ')
            l = re.sub(ur'^[A-Z]', tolower, l)
            lu = re.sub(ur'^[a-z]', toupper, l)
            oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
            oldcontent = oldcontent.replace(u'[[' + lu + u']]', lu)
            if l != self.page and lu != self.page and l != "talk:" + self.page:
                print lu, "not found in", self.page

#
#       add possibly-missing links to page based on addlinks.txt
#       TO DO: allow config_encoding to be overridden, but without auto-defaulting to DOS console codes
#
        config_encoding = 'latin-1'
        if config_encoding != config.console_encoding:
           print "Using", config_encoding, "instead of default", config.console_encoding
        addlinks = open("addlinks.txt").read().decode(config_encoding)
        addlinks = addlinks.split('\n')
        for l in addlinks:
            if l != '':
#
#                lq is l with the re metacharacters . ^ $ * + ? { [ ] \ | ( ) escaped,
#                so re.sub can match the keyword literally without treating these as special chrs
#
                lq = re.escape(l)
#
#                use .replace to remove existing links,
#
                ll = re.sub(ur'^[A-Z]', tolower, l)
                lu = re.sub(ur'^[a-z]', toupper, l)
                oldcontent = oldcontent.replace(u'[[' + lu + u']]', lu)
                oldcontent = oldcontent.replace(u'[[' + ll + u']]', ll)
#
#                add new links (1st char may be uppercase or as-is)
#
                lql = re.sub(ur'^[A-Z]', tolower, lq)
                lqu = re.sub(ur'^[a-z]', toupper, lq)
                oldcontent, n = re.subn(ur'\b' + lql + ur'\b', u'[[' + ll + u']]', oldcontent, 1)
                if n < 1:
                    oldcontent = re.sub(ur'\b' + lqu + ur'\b', u'[[' + lu + u']]', oldcontent, 1)
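
#
#                e.g. re.escape turns the keyword 3.14 into the pattern 3\.14, so the
#                dot is matched literally rather than as an any-character wildcard
#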

        del addlinks
#
#       unlink any self-referential internal link(s)
        l = re.sub(ur'^[A-Z]', tolower, self.page)
        lu = re.sub(ur'^[a-z]', toupper, self.page)
        oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
        oldcontent = oldcontent.replace(u'[[' + lu + u']]', lu)

#	oldcontent = oldcontent.replace(ur', and',ur' and')
#
#       TO DO: clean up nested [[wiki[[links]]]] and wikilinks in URLs, template refs or header text
        oldcontent = re.sub(ur'\[\[.*\]\]', fixwikilink, oldcontent)
#
#       substitute ascii/iso-latin1 plain-text or html entities for a few problem Unicode characters
#       TO DO: in the general case, an HTML &#nnnn; entity holding the decimal Unicode value should always work
#       TO DO: apply the reverse of these substitutions when uploading the edited article?
#
        oldcontent=oldcontent.replace(u'\u00b0',u'&deg;')
        oldcontent=oldcontent.replace(u'\u0103',u'&#259;')
        oldcontent=oldcontent.replace(u'\u0132',u'&#306;')
        oldcontent=oldcontent.replace(u'\u0152',u'&#338;')
        oldcontent=oldcontent.replace(u'\u0162',u'&#354;')
        oldcontent=oldcontent.replace(u'\u0163',u'&#355;')
        oldcontent=oldcontent.replace(u'\u0402',u'&#1026;')
        oldcontent=oldcontent.replace(u'\u0409',u'&#1033;')
        oldcontent=oldcontent.replace(u'\u040a',u'&#1034;')
        oldcontent=oldcontent.replace(u'\u040b',u'&#1035;')
        oldcontent=oldcontent.replace(u'\u0411',u'&#1041;')
        oldcontent=oldcontent.replace(u'\u041b',u'&#1051;')
        oldcontent=oldcontent.replace(u'\u042b',u'&#1067;')
        oldcontent=oldcontent.replace(u'\u042d',u'&#1069;')
        oldcontent=oldcontent.replace(u'\u042e',u'&#1070;')
        oldcontent=oldcontent.replace(u'\u05bc',u'&#1468;')
        oldcontent=oldcontent.replace(u'\u05db',u'&#1499;')	# middle-dot
        oldcontent=oldcontent.replace(u'\u05dc',u'&#1500;')	# sideways-U
        oldcontent=oldcontent.replace(u'\u0633',u'&#1587;')
        oldcontent=oldcontent.replace(u'\u0634',u'&#1588;')
        oldcontent=oldcontent.replace(u'\u0635',u'&#1589;')
        oldcontent=oldcontent.replace(u'\u0636',u'&#1590;')
        oldcontent=oldcontent.replace(u'\u0637',u'&#1591;')
        oldcontent=oldcontent.replace(u'\u0638',u'&#1592;')
        oldcontent=oldcontent.replace(u'\u0639',u'&#1593;')
        oldcontent=oldcontent.replace(u'\u2013',u' - ')
        oldcontent=oldcontent.replace(u'\u2014',u' - ')
        oldcontent=oldcontent.replace(u'\u2018',u'\'')
        oldcontent=oldcontent.replace(u'\u2019',u'\'')
        oldcontent=oldcontent.replace(u'\u201c',u'"')
        oldcontent=oldcontent.replace(u'\u201d',u'"')
        oldcontent=oldcontent.replace(u'\u2026',u'&#8230;')
        oldcontent=oldcontent.replace(u'\u2030',u'&#8240;')
        oldcontent=oldcontent.replace(u'\u203c',u'&#8252;')
        oldcontent=oldcontent.replace(u'\u20a7',u'&#8359;')
        oldcontent=oldcontent.replace(u'\u20ac',u'&euro;')	# euro
        oldcontent=oldcontent.replace(u'\u2116',u'&#8470;')
        oldcontent=oldcontent.replace(u'\u2122',u'&trade;')	# TM
        oldcontent=oldcontent.replace(u'\u2229',u'&#8745;')
        oldcontent=oldcontent.replace(u'\u2646',u'&#9798;')	# trident - Neptune's tuning-fork
        oldcontent=oldcontent.replace(u'\ufffc',u'')
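
#
#       the same &#nnnn; scheme extends to any other troublesome character, e.g. (a sketch):
#       oldcontent=oldcontent.replace(u'\u2620',u'&#9760;')	# skull and crossbones
#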

#
#       finished robot-cleanup 'kludges', now allow user to count the damages and check spelnig
#	write wikified oldcontent to file for manual editing in text processor (or food processor)
#

        try:
            if self.options.new_data == '':
                ofp.write(oldcontent.encode(config_encoding)) # FIXME: encoding of wiki
            else:
                ofp.write(oldcontent.encode(config_encoding)+'\n===========\n'+self.options.new_data) # FIXME: encoding of wiki
        except:
#
#	    fallback to writing raw Unicode file - (presently disabled, uh, differently-abled)
#           perhaps best avoided as &# nnnn ; HTML construct is a usable plain-ASCII alternative which
#           avoids problems of saving Unicode from text editors and reading it back into this script
#
#           note: writing raw Unicode does not work if file is open for text ('w') write
#           Unicode output files must be open in binary ('wb') mode *only* or output text is unusable
#
	    print "Unable to convert Unicode content to ASCII plaintext, saving as Unicode array:"
            uni = array.array('u',u'\ufeff' + oldcontent)
            uni.tofile(ofp)
            ofp.close()
            raise

        ofp.close()
        os.system("%s %s" % (self.options.editor, ofn))
        newcontent = open(ofn).read().decode(config_encoding)
        os.unlink(ofn)
        return oldcontent, newcontent

    def getcomment(self):
        comment = wikipedia.input(u"What did you change? ") + sig
        return wikipedia.unicode2html(comment, self.site.encoding())

    def handle_edit_conflict(self, new):
        fn = os.path.join(tempfile.gettempdir(), self.page)
        fp = open(fn, 'w')
        fp.write(new)
        fp.close()
        wikipedia.output(u"An edit conflict has arisen. Your edit has been saved to %s. Please try again." % fn)
    
    def showdiff(self, old, new):
        diff = difflib.context_diff(old.splitlines(), new.splitlines())
        wikipedia.output(u"\n".join(diff))

    def run(self):
        self.initialise_data()
        try:
            old, new = self.edit()
        except wikipedia.LockedPage:
            sys.exit("You do not have permission to edit %s" % self.pagelink.hashfreeLinkname())

        if old != new:
            new = self.repair(new)
            self.showdiff(old, new)

#           comment = self.getcomment()
            comment = "spelchek"

            try:
                self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
            except wikipedia.EditConflict:
                self.handle_edit_conflict(new)
        else:
            if self.options.index:
              wikipedia.output(u"Done")
            else:
              wikipedia.output(u"Nothing changed")

def main():
    app = EditArticle(sys.argv[1:])
    app.run()

if __name__ == "__main__":
    try:
        main()
    except:
        wikipedia.stopme()
        raise
    wikipedia.stopme()

See also User:Hymie the SpelChek™ bot/wikipedia.py