User:Hymie the SpelChek™ bot/editarticle.py
This is the main script behind the evil which is SpelChek™; editarticle.py is a drop-in replacement for the editarticle.py shipped with the open-source pywikibot package.
Pywikibot is available from www.sourceforge.net, and Python itself can be had from www.python.org.
This script makes various kludgy attempts to clean up wiki text and remove broken/red links before dumping the whole mess into an external text editor for manual SpelChek™ correction. It also looks for a plain-text file, addlinks.txt, which lists items that should be wikilinked once (on first occurrence) if they're found in the text.
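As a rough illustration of the once-per-page wikification, here is a minimal sketch in the same Python 2 dialect as the script below, with made-up sample words (the real logic lives in the edit() method):

# -*- coding: utf-8 -*-
# minimal sketch of the addlinks.txt idea: each listed title is wikilinked
# at most once if it appears in the article text (hypothetical sample data)
import re

text = u"A kitten is a young cat. Every kitten grows into a cat."
addlinks = [u"kitten", u"cat"]   # stand-in for the contents of addlinks.txt

for word in addlinks:
    # the final argument (count=1) links only the first occurrence
    text = re.sub(ur'\b' + re.escape(word) + ur'\b',
                  u'[[' + word + u']]', text, 1)

print text
# prints: A [[kitten]] is a young [[cat]]. Every kitten grows into a cat.

addlinks.txt itself is plain text, one title per line; the count argument of 1 is what limits Hymie to one wikilink per word.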
It is 100.1% evil, so be warned:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Modified script: Edit a Wikipedia article with your favourite editor. Requires Python 2.3.
#
# (C) Gerrit Holl 2004, modified June 2005 by [[user:carlb]] for Uncyclopedia SpelChek[tm] use.
#
# This version has been modified to remove broken wikilinks and to add links based on addlinks.txt
# The text is then loaded into an external editor to permit correction, categorisation and SpelChek.
# This script is a kludge[tm] and to some degree is still under development.
#
# Note: if no manual changes are made (even if wikification auto-changed), updates will not be written to the site.
#
# Distribute under the terms of the PSF license.
#
# Example usage (SpelChek with 602text as external editor):
#   editarticle.py -e \progra~1\software602\602pro~1\602text\exec\602text.exe -u "Hymie the SpelChek™ bot"
# Example usage (Undictionary index generation, based on templates):
#   editarticle.py -e notepad -i Undictionary: -p Template:Dict -u "Hymie the SpelChek™ bot"
#
# Note: index generation is currently based on Special:Whatlinkshere and only finds the first
# 500 (or 999) definitions which use the template.
#
# Version 0.3cb
#
# Features:
#  - logging in
#
# TODO: - non-existing pages
#       - correct encoding
#       - use cookies to remember login
#       - edit conflicts
#       - difflib
#       - minor edits
#       - watch/unwatch
#
# And more SpelChek[tm] TODO:
#  - clean up attempts to add wikification in existing section headers, HTML, wiki and template links
#  - fix conversions of %nn characters to ASCII in URLs (for wiki redlink/broken-link removal)
#  - fix Unicode->ASCII conversions before invoking external editor for spelchek functions
#  - ...
#
# Removed features:
#  - editing anonymously

__metaclass__ = type
__version__ = "$Id: editarticle.py,v 1.18 2005/02/21 11:32:36 gerrit Exp $"

sig = u" ()"

import array
import sys
import os
import httplib
import urllib
import getpass
import difflib
import optparse
import tempfile
import re

import wikipedia
import login
import config


#
# change string from re pattern-search to all-lower or all-upper case
#
def tolower(match):
    value = match.group()
    return value.lower()


def toupper(match):
    value = match.group()
    return value.upper()


#
# optimise links of the form [[word|words]] to instead use [[word]]s as the wikilink
#
def fixpipelink(match):
    value = match.group()
    txt = re.split('([\[\|\]]+)', value)
    newvalue = value
    if txt[1] == '[[' and txt[3] == '|' and txt[5] == ']]':
        txt2 = array.array('u', txt[2])
        txt4 = array.array('u', txt[4])
        txt2addr, txt2len = txt2.buffer_info()
        txt4addr, txt4len = txt4.buffer_info()
        match = 0
        while match < txt2len and match < txt4len and txt2[match] == txt4[match]:
            match += 1
        if match == txt2len:
            # link target is a prefix of the display text: rewrite as [[word]]s
            newvalue = u'[[' + txt2.tounicode() + u']]'
            while match < txt4len:
                newvalue += txt4[match]
                match += 1
            newvalue = txt[0] + newvalue + txt[6]
            print "replacing piped|link with:", newvalue
    return newvalue


#
# check string from re pattern-search for nested wikilinks created by addlinks.txt wikification
# note: this does not properly handle wikilinks mixed with external links, links split between
# lines or (in some cases) links nested more than two levels deep at once; nor does it target
# template/tags, section headers and other non-body text to remove spurious wikification;
# it also doesn't properly handle more than two contiguous ]] or [[ tokens in a row - needs work
#
def fixwikilink(match):
    value = match.group()
    newvalue = ''
    indent = 0
    markup = re.split('([\[\]]+)', value)
    for txt in markup:
        if txt == '[[':
            indent += 1
            if indent != 1:
                txt = ''
        if txt == ']]':
            indent -= 1
            if indent > 0:
                txt = ''
        if txt == '[[[[':
            indent += 2
            if indent != 2:
                txt = ''
            else:
                txt = '[['
        if txt == ']]]]':
            indent -= 2
            if indent != 0:
                txt = ''
            else:
                txt = ']]'
        newvalue += txt
    return newvalue
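
#
# illustration (not part of the original script): what the two helpers above
# are expected to do, shown on hypothetical inputs
#
#   re.sub(ur'\[\[\w+\|\w+\]\]', fixpipelink, u'[[dog|dogs]] bark')
#     -> u'[[dog]]s bark'         (piped link collapsed to a suffixed link)
#   re.sub(ur'\[\[.*\]\]', fixwikilink, u'[[outer [[inner]] text]]')
#     -> u'[[outer inner text]]'  (spurious nested link stripped)
#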
class EditArticle:
    import string
    joinchars = string.letters + '[]' + string.digits  # join lines only if a line starts with one of these

    def __init__(self, args):
        """Takes one argument, usually this is sys.argv[1:]"""
        self.all_args = args
        self.set_options()

    def initialise_data(self):
        """Login, set editor, page and pagelink attributes"""
        self.login()  # anonymous=self.options.anonymous)
        self.editor = self.options.editor or wikipedia.input(u"Editor to use: ", encode=True)
        self.setpage()

    def login(self):  # , anonymous):
        """Initialises site and username data"""  # , or anonymous
        if False:  # anonymous:
            self.site = wikipedia.getSite(user=None)
        else:
            self.username = self.options.username or wikipedia.input(u"Username: ", encode=True)
            self.site = wikipedia.getSite(user=self.username)
            self.site._fill()  # load cookies
            if not self.site._loggedin:
                password = getpass.getpass("Password: ")
                cookie = login.login(self.site, self.username, password)
                if not cookie:
                    sys.exit("Login failed")
                login.storecookiedata(cookie, self.site, self.username)
                wikipedia.output(u"Login successful")

    def set_options(self):
        """Parse commandline and set options attribute"""
        my_args = []
        for arg in self.all_args:
            arg = wikipedia.argHandler(arg)
            if arg:
                my_args.append(arg)
        parser = optparse.OptionParser()
        ## parser.add_option("-a", "--anonymous", action="store_true", default=False, help="Login anonymously")
        parser.add_option("-r", "--edit_redirect", action="store_true", default=False, help="Ignore/edit redirects")
        parser.add_option("-u", "--username", help="Username to login with")  # (ignored with -a)
        parser.add_option("-p", "--page", help="Page to edit")
        parser.add_option("-i", "--index", help="Compile dictionary-style index of pages linking here")
        parser.add_option("-e", "--editor", help="Editor to use")
        parser.add_option("-j", "--join_lines", action="store_true", default=False, help="Join consecutive lines if possible")
        parser.add_option("-w", "--watch", action="store_true", default=False, help="Watch article after edit")
        parser.add_option("-n", "--new_data", default="", help="Automatically generated content")
        self.options = parser.parse_args(args=my_args)[0]

    def setpage(self):
        """Sets page and pagelink"""
        self.page = self.options.page or wikipedia.input(u"Page to edit: ", encode=True)
        self.pagelink = wikipedia.PageLink(self.site, self.page)
        if not self.options.edit_redirect and self.pagelink.isRedirectPage():
            self.pagelink = wikipedia.PageLink(self.site, self.pagelink.getRedirectTo())

    def repair(self, content):
        """Removes single newlines and prepares encoding for the local wiki"""
        if self.options.join_lines:
            lines = content.splitlines()
            result = []
            for i, line in enumerate(lines):
                try:
                    nextline = lines[i + 1]
                except IndexError:
                    nextline = "last"
                result.append(line)
                if line.strip() == "" or line[0] not in self.joinchars or \
                   nextline.strip() == "" or nextline[0] not in self.joinchars:
                    result.append('\n')
                else:
                    result.append(" ")
            s = "".join(result)
        else:
            s = content
        return wikipedia.unicode2html(s, self.site.encoding())
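
    #
    # illustration (not part of the original script): with --join_lines,
    # repair() glues hard-wrapped lines back together when both the line and
    # the next line start with a character from joinchars, so (hypothetically)
    #
    #   u"A kitten is a\nyoung cat."
    #
    # comes back as roughly u"A kitten is a young cat."; blank lines
    # (paragraph breaks) are kept as real newlines
    #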
    def edit(self):
        """Edit the page using the editor.

        It returns two strings: the old version and the new version."""
        #
        # to process a dictionary index (editarticle.py -i Undictionary: -p Template:Dict -e doesntmatter),
        # find what links to the template and use that list to recreate the index pages A - Z as
        # {{dictionary}} {{def|...}} templates
        #
        if self.options.index:
            linkshere = wikipedia.getReferences(self.pagelink)
            section = ''
            sectionindex = ''
            # for each link returned (linkshere is already in alphabetical order), get the appropriate section (A-Z)
            for lnk in linkshere:
                oldsection = section
                section = ''
                thissect = 'A'
                for nextsect in ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
                                 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[']:
                    if lnk > self.options.index + thissect and lnk < self.options.index + nextsect:
                        section = self.options.index + thissect
                    thissect = nextsect
                if section != oldsection:
                    if oldsection != '':
                        print '((', oldsection, ')): ', sectionindex
                        new = self.repair(sectionindex)
                        comment = "generate index"
                        self.page = oldsection
                        self.pagelink = wikipedia.PageLink(self.site, self.page)
                        try:
                            self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)  # , anon=self.options.anonymous)
                        except wikipedia.EditConflict:
                            self.handle_edit_conflict(new)
                    sectionindex = '{{dictionary}}\n\n== ' + section.replace(self.options.index, u'', 1)
                    sectionindex += ' ==\n\n'
                sectionindex += '{{def|' + lnk.replace(self.options.index, u'', 1) + '}}\n'
            #
            # post this last section index to the site
            # TO DO: see if the last section (== Z ==) has already been written; if yes then avoid overwriting it???
            #
            if oldsection != '':
                print '(((', oldsection, '))): ', sectionindex
                new = self.repair(sectionindex)
                comment = "generate index"
                self.page = oldsection
                self.pagelink = wikipedia.PageLink(self.site, self.page)
                try:
                    self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)  # , anon=self.options.anonymous)
                except wikipedia.EditConflict:
                    self.handle_edit_conflict(new)
            return linkshere, linkshere

        #
        # to handle semi-manual editing of all other articles, retrieve the article wiki and HTML text,
        # clean up wiki [[pipe|links]], broken links and missing links to articles listed in addlinks.txt;
        # return oldcontent, newcontent as the post of modified content is done by the calling routine
        #

        #
        # create a temporary file to store the wiki article for editing
        #
        (fd, ofn) = tempfile.mkstemp()
        ofp = os.fdopen(fd, 'w')
        try:
            #
            # retrieve the existing wiki page
            #
            oldcontent = self.pagelink.get()
        except wikipedia.NoPage:
            oldcontent = ""
        except wikipedia.IsRedirectPage:
            if self.options.edit_redirect:
                oldcontent = self.pagelink.get(force=True, get_redirect=True)
            else:
                raise

        #
        # remove broken wikilinks from the page
        #
        oldcontent = re.sub(ur'\[\[\w+\|\w+\]\]', fixpipelink, oldcontent)
        oldredlinks = self.pagelink.redlinks()
        for l in oldredlinks:
            l = l.replace(u'%21', '!')
            l = l.replace(u'%22', '\"')
            l = l.replace(u'%24', '$')
            l = l.replace(u'%26', '&')
            l = l.replace(u'%27', '\'')
            l = l.replace(u'%28', '(')
            l = l.replace(u'%29', ')')
            l = l.replace(u'%2C', ',')
            l = l.replace(u'%3F', '?')
            oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
            l = l.replace(u'_', u' ')
            l = re.sub(ur'^[A-Z]', tolower, l)
            lu = re.sub(ur'^[a-z]', toupper, l)
            oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
            oldcontent = oldcontent.replace(u'[[' + lu + u']]', lu)
            if l != self.page and lu != self.page and l != "talk:" + self.page:
                print lu, "not found in", self.page
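
        #
        # illustration (not part of the original script): the %nn unescaping
        # above turns URL-encoded redlink titles back into plain text before
        # the [[...]] wrapper is stripped, e.g. for a hypothetical redlink
        #
        #   u"Don%27t panic"  ->  u"Don't panic"
        #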
        #
        # add possibly-missing links to the page based on addlinks.txt
        # TO DO: allow config_encoding to be overridden, but without auto-defaulting to DOS console codes
        #
        config_encoding = 'latin-1'
        if config_encoding != config.console_encoding:
            print "Using", config_encoding, "instead of default", config.console_encoding
        addlinks = open("addlinks.txt").read().decode(config_encoding)
        addlinks = addlinks.split('\n')
        for l in addlinks:
            if l != '':
                #
                # lq is l represented with escaped metacharacters: . ^ $ * + ? { [ ] \ | ( )
                # required so re.sub can replace keywords without treating these as special chars
                #
                lq = l.replace(ur'\\', ur'\\\\')
                lq = lq.replace(ur'.', ur'\.')
                lq = lq.replace(ur'^', ur'\^')
                lq = lq.replace(ur'$', ur'\$')
                lq = lq.replace(ur'*', ur'\*')
                lq = lq.replace(ur'+', ur'\+')
                lq = lq.replace(ur'?', ur'\?')
                lq = lq.replace(ur'{', ur'\{')
                lq = lq.replace(ur'[', ur'\[')
                lq = lq.replace(ur']', ur'\]')
                lq = lq.replace(ur'|', ur'\|')
                lq = lq.replace(ur'(', ur'\(')
                lq = lq.replace(ur')', ur'\)')
                lq = lq.replace(ur'-', ur'\-')
                #
                # use .replace to remove existing links
                #
                ll = re.sub(ur'^[A-Z]', tolower, l)
                lu = re.sub(ur'^[a-z]', toupper, l)
                oldcontent = oldcontent.replace(ur'[[' + lu + ur']]', lu)
                oldcontent = oldcontent.replace(ur'[[' + ll + ur']]', ll)
                #
                # add new links (1st char may be uppercase or as-is)
                #
                lql = re.sub(ur'^[A-Z]', tolower, lq)
                lqu = re.sub(ur'^[a-z]', toupper, lq)
                oldcontent, n = re.subn(ur'\b' + lql + ur'\b', u'[[' + ll + u']]', oldcontent, 1)
                if n < 1:
                    oldcontent = re.sub(ur'\b' + lqu + ur'\b', u'[[' + lu + u']]', oldcontent, 1)
        del addlinks

        #
        # unlink any self-referential internal link(s)
        #
        l = re.sub(ur'^[A-Z]', tolower, self.page)
        lu = re.sub(ur'^[a-z]', toupper, self.page)
        oldcontent = oldcontent.replace(u'[[' + l + u']]', l)
        oldcontent = oldcontent.replace(u'[[' + lu + u']]', lu)
        # oldcontent = oldcontent.replace(ur', and', ur' and')

        #
        # TO DO: clean up nested [[wiki[[links]]]] and wikilinks in URLs, template refs or header text
        #
        oldcontent = re.sub(ur'\[\[.*\]\]', fixwikilink, oldcontent)

        #
        # substitute ASCII/ISO-Latin-1 plain text or HTML entities for a few problem Unicode characters
        # TO DO: in the general case, HTML &# followed by a decimal Unicode value and ; should always work
        # TO DO: apply the reverse of these substitutions when uploading the edited article?
        #
        oldcontent = oldcontent.replace(u'\u00b0', u'&deg;ree;')
        oldcontent = oldcontent.replace(u'\u0103', u'&#259;')
        oldcontent = oldcontent.replace(u'\u0132', u'&#306;')
        oldcontent = oldcontent.replace(u'\u0152', u'&#338;')
        oldcontent = oldcontent.replace(u'\u0162', u'&#354;')
        oldcontent = oldcontent.replace(u'\u0163', u'&#355;')
        oldcontent = oldcontent.replace(u'\u0402', u'&#1026;')
        oldcontent = oldcontent.replace(u'\u0409', u'&#1033;')
        oldcontent = oldcontent.replace(u'\u040a', u'&#1034;')
        oldcontent = oldcontent.replace(u'\u040b', u'&#1035;')
        oldcontent = oldcontent.replace(u'\u0411', u'&#1041;')
        oldcontent = oldcontent.replace(u'\u041b', u'&#1051;')
        oldcontent = oldcontent.replace(u'\u042b', u'&#1067;')
        oldcontent = oldcontent.replace(u'\u042d', u'&#1069;')
        oldcontent = oldcontent.replace(u'\u042e', u'&#1070;')
        oldcontent = oldcontent.replace(u'\u05bc', u'&#1468;')
        oldcontent = oldcontent.replace(u'\u05db', u'&#1499;')  # middle-dot
        oldcontent = oldcontent.replace(u'\u05dc', u'&#1500;')  # sideways-U
        oldcontent = oldcontent.replace(u'\u0633', u'&#1587;')
        oldcontent = oldcontent.replace(u'\u0634', u'&#1588;')
        oldcontent = oldcontent.replace(u'\u0635', u'&#1589;')
        oldcontent = oldcontent.replace(u'\u0636', u'&#1590;')
        oldcontent = oldcontent.replace(u'\u0637', u'&#1591;')
        oldcontent = oldcontent.replace(u'\u0638', u'&#1592;')
        oldcontent = oldcontent.replace(u'\u0639', u'&#1593;')
        oldcontent = oldcontent.replace(u'\u2013', u' - ')
        oldcontent = oldcontent.replace(u'\u2014', u' - ')
        oldcontent = oldcontent.replace(u'\u2018', u'\'')
        oldcontent = oldcontent.replace(u'\u2019', u'\'')
        oldcontent = oldcontent.replace(u'\u201c', u'\'')
        oldcontent = oldcontent.replace(u'\u201d', u'\'')
        oldcontent = oldcontent.replace(u'\u2026', u'&#8230;')
        oldcontent = oldcontent.replace(u'\u2030', u'&#8240;')
        oldcontent = oldcontent.replace(u'\u203c', u'&#8252;')
        oldcontent = oldcontent.replace(u'\u20a7', u'&#8359;')
        oldcontent = oldcontent.replace(u'\u20ac', u'&#8364;')  # euro
        oldcontent = oldcontent.replace(u'\u2116', u'&#8470;')
        oldcontent = oldcontent.replace(u'\u2122', u'&#8476;')  # TM ( ™ )
        oldcontent = oldcontent.replace(u'\u2229', u'&#8745;')
        oldcontent = oldcontent.replace(u'\u2646', u'&#9798;')  # trident - Neptune's tuning-fork
        oldcontent = oldcontent.replace(u'\ufffc', u'')
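
        #
        # illustration (not part of the original script): the substitutions
        # above lean on MediaWiki rendering numeric character references, so a
        # replaced character survives the plain-ASCII round trip, e.g.
        #
        #   u'\u20ac' (the euro sign) is written out as the ASCII string
        #   u'&#8364;', which the wiki renders back as the same character
        #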
        #
        # finished robot-cleanup 'kludges'; now allow the user to count the damages and check spelnig:
        # write the wikified oldcontent to a file for manual editing in a text processor (or food processor)
        #
        try:
            if self.options.new_data == '':
                ofp.write(oldcontent.encode(config_encoding))  # FIXME: encoding of wiki
            else:
                ofp.write(oldcontent.encode(config_encoding) + '\n===========\n' + self.options.new_data)  # FIXME: encoding of wiki
        except:
            #
            # fallback to writing a raw Unicode file - (presently disabled, uh, differently-abled)
            # perhaps best avoided, as the &#nnnn; HTML construct is a usable plain-ASCII alternative which
            # avoids problems of saving Unicode from text editors and reading it back into this script
            #
            # note: writing raw Unicode does not work if the file is open for text ('w') write;
            # Unicode output files must be open in binary ('wb') mode *only* or the output text is unusable
            #
            print "Unable to convert Unicode content to ASCII plaintext, saving as Unicode array:"
            uni = array.array('u', u'\ufeff' + oldcontent)
            uni.tofile(ofp)
            ofp.close()
            raise
        ofp.close()
        os.system("%s %s" % (self.options.editor, ofn))
        newcontent = open(ofn).read().decode(config_encoding)
        os.unlink(ofn)
        return oldcontent, newcontent

    def getcomment(self):
        comment = wikipedia.input(u"What did you change? ") + sig
        return wikipedia.unicode2html(comment, self.site.encoding())

    def handle_edit_conflict(self, new):
        fn = os.path.join(tempfile.gettempdir(), self.page)
        fp = open(fn, 'w')
        fp.write(new)
        fp.close()
        wikipedia.output(u"An edit conflict has arisen. Your edit has been saved to %s. Please try again." % fn)
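
    #
    # illustration (not part of the original script): on an edit conflict the
    # new text is dumped into the system temp directory under the page title,
    # e.g. for a hypothetical page "Kitten Huffing" on a typical unix box:
    #
    #   os.path.join(tempfile.gettempdir(), u"Kitten Huffing")
    #     -> u'/tmp/Kitten Huffing'
    #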
    def showdiff(self, old, new):
        diff = difflib.context_diff(old.splitlines(), new.splitlines())
        wikipedia.output(u"\n".join(diff))

    def run(self):
        self.initialise_data()
        try:
            old, new = self.edit()
        except wikipedia.LockedPage:
            sys.exit("You do not have permission to edit %s" % self.pagelink.hashfreeLinkname())
        if old != new:
            new = self.repair(new)
            self.showdiff(old, new)
            # comment = self.getcomment()
            comment = "spelchek"
            try:
                self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)  # , anon=self.options.anonymous)
            except wikipedia.EditConflict:
                self.handle_edit_conflict(new)
        else:
            if self.options.index:
                wikipedia.output(u"Done")
            else:
                wikipedia.output(u"Nothing changed")


def main():
    app = EditArticle(sys.argv[1:])
    app.run()


if __name__ == "__main__":
    try:
        main()
    except:
        wikipedia.stopme()
        raise
    wikipedia.stopme()