User:Hymie the SpelChek™ bot/editarticle.py
This is the main script behind the evil which is SpelChek™; editarticle.py is a drop-in replacement for use with the open-source pywikibot package.
Pywikibot is available from www.sourceforge.net, and Python itself can be had from www.python.org.
This script makes various kludgy attempts to clean up wiki text and remove broken/red links before dumping the whole mess into an external text editor for manual SpelChek™ correction. It looks for a plain-text file, addlinks.txt, which lists terms that should each be wikilinked once if they're found in the text.
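For illustration (the terms below are made up), addlinks.txt simply lists one term per line; the script splits the file on newlines and wikilinks the first occurrence of each term it finds in the article body:

kitten
grue
Oscar Wilde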
It is 100.1% evil, so be warned:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Modified script: Edit a Wikipedia article with your favourite editor. Requires Python 2.3.
#
# (C) Gerrit Holl 2004, modified June 2005 by [[user:carlb]] for Uncyclopedia SpelChek[tm] use.
#
# This version has been modified to remove broken wikilinks and to add links based on addlinks.txt
# The text is then loaded into an external editor to permit correction, categorisation and SpelChek.
# This script is a kludge[tm] and to some degree is still under development.
#
# Note: if no manual changes are made in the editor (even if the automatic wikification changed the text), updates will not be written to the site.
#
# Distribute under the terms of the PSF license.
# Example usage (SpelChek with 602text as external editor):
# editarticle.py -e \progra~1\software602\602pro~1\602text\exec\602text.exe -u "Hymie the SpelChek™ bot"
# Example usage (Undictionary index generation, based on templates):
# editarticle.py -e notepad -i Undictionary: -p Template:Dict -u "Hymie the SpelChek™ bot"
#
# Note: this is currently based on Special:Whatlinkshere and only finds the first 500 (or 999) definitions
# which use the template
# Version 0.3cb
#
# Features:
# - logging in
#
# TODO: - non existing pages
# - correct encoding
# - use cookies to remember login
# - edit conflicts
# - difflib
# - minor edits
# - watch/unwatch
# And more SpelChek[tm] TODO:
# - cleanup attempts to add wikification in existing section headers, HTML, wiki, template links
# - fix conversions of %nn characters to ASCII in URL's (for wiki redlink/broken-link removal)
# - fix Unicode->ASCII conversions before invoking external editor for spelchek functions
# - ...
#
# Removed features:
# - editing anonymously
__metaclass__ = type
__version__ = "$Id: editarticle.py,v 1.18 2005/02/21 11:32:36 gerrit Exp $"
sig = u" ()"
import array
import sys
import os
import httplib
import urllib
import getpass
import difflib
import optparse
import tempfile
import re
import wikipedia
import login
import config
#
# change string from re pattern-search to all-lower or all-upper case
#
def tolower( match ):
value = match.group()
return value.lower()
def toupper( match ):
value = match.group()
return value.upper()
#
# optimise links of form [[word|words]] to instead use [[word]]s as wikilink
#
def fixpipelink( match ):
value = match.group()
txt = re.split('([\[\|\]]+)', value)
newvalue = value
if txt[1]=='[[' and txt[3] == '|' and txt[5] == ']]':
txt2 = array.array('u',txt[2])
txt4 = array.array('u',txt[4])
txt2addr, txt2len = txt2.buffer_info()
txt4addr, txt4len = txt4.buffer_info()
match = 0
while match < txt2len and match < txt4len and txt2[match] == txt4[match]:
match += 1
if match == txt2len:
newvalue = u'[[' + txt2.tounicode() + u']]'
while match < txt4len:
newvalue += txt4[match]
match += 1
newvalue = txt[0] + newvalue + txt[6]
print "replacing piped|link with:", newvalue
return newvalue
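#
# example (hypothetical input): a match of '[[cat|cats]]' is rewritten as '[[cat]]s' because
# 'cat' is a prefix of 'cats'; a match such as '[[dog|puppy]]' is returned unchanged
#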
#
# check string from re pattern-search for nested wiki links created by addlinks.txt wikification
# note: this does not properly handle wikilinks mixed with external links, links split between lines
# or (in some cases) links nested more than two levels deep at once; it also does not treat
# templates/tags, section headers and other non-body text specially, so spurious wikification there is not removed;
# it also doesn't properly handle more than two contiguous ]] or [[ tokens in a row - needs work
#
def fixwikilink( match ):
value = match.group()
newvalue = ''
indent = 0
markup = re.split('([\[\]]+)', value)
for txt in markup:
if txt == '[[':
indent+=1
if indent != 1:
txt = ''
if txt == ']]':
indent-=1
if indent > 0:
txt = ''
if txt == '[[[[':
indent+=2
if indent != 2:
txt = ''
else:
txt = '[['
if txt == ']]]]':
indent-=2
if indent != 0:
txt = ''
else:
txt = ']]'
newvalue += txt
return newvalue
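#
# example (hypothetical input): a match of '[[foo [[bar]] baz]]' is flattened to '[[foo bar baz]]',
# keeping only the outermost pair of brackets
#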
class EditArticle:
import string
joinchars = string.letters + '[]' + string.digits # join lines if a line starts with one of these characters
def __init__(self, args):
"""Takes one argument, usually this is sys.argv[1:]"""
self.all_args = args
self.set_options()
def initialise_data(self):
"""Login, set editor, page and pagelink attributes"""
self.login()#anonymous=self.options.anonymous)
self.editor = self.options.editor or wikipedia.input(u"Editor to use: ", encode=True)
self.setpage()
def login(self):#, anonymous):
"""Initialises site and username data"""#, or anonymous"""
if False:#anonymous:
self.site = wikipedia.getSite(user=None)
else:
self.username = self.options.username or wikipedia.input(u"Username: ", encode=True)
self.site = wikipedia.getSite(user=self.username)
self.site._fill() # load cookies
if not self.site._loggedin:
password = getpass.getpass("Password: ")
cookie = login.login(self.site, self.username, password)
if not cookie:
sys.exit("Login failed")
login.storecookiedata(cookie, self.site, self.username)
wikipedia.output(u"Login succesful")
def set_options(self):
"""Parse commandline and set options attribute"""
my_args = []
for arg in self.all_args:
arg = wikipedia.argHandler(arg)
if arg:
my_args.append(arg)
parser = optparse.OptionParser()
## parser.add_option("-a", "--anonymous", action="store_true", default=False, help="Login anonymously")
parser.add_option("-r", "--edit_redirect", action="store_true", default=False, help="Ignore/edit redirects")
parser.add_option("-u", "--username", help="Username to login with") #(ignored with -a)
parser.add_option("-p", "--page", help="Page to edit")
parser.add_option("-i", "--index", help="Compile dictionary-style index of pages linking here")
parser.add_option("-e", "--editor", help="Editor to use")
parser.add_option("-j", "--join_lines", action="store_true", default=False, help="Join consecutive lines if possible")
parser.add_option("-w", "--watch", action="store_true", default=False, help="Watch article after edit")
parser.add_option("-n", "--new_data", default="", help="Automatically generated content")
self.options = parser.parse_args(args=my_args)[0]
def setpage(self):
"""Sets page and pagelink"""
self.page = self.options.page or wikipedia.input(u"Page to edit: ", encode=True)
self.pagelink = wikipedia.PageLink(self.site, self.page)
if not self.options.edit_redirect and self.pagelink.isRedirectPage():
self.pagelink = wikipedia.PageLink(self.site, self.pagelink.getRedirectTo())
def repair(self, content):
"""Removes single newlines and prepare encoding for local wiki"""
if self.options.join_lines:
lines = content.splitlines()
result = []
for i, line in enumerate(lines):
try:
nextline = lines[i+1]
except IndexError:
nextline = "last"
result.append(line)
if line.strip() == "" or line[0] not in self.joinchars or \
nextline.strip() == "" or nextline[0] not in self.joinchars:
result.append('\n')
else:
result.append(" ")
s = "".join(result)
else:
s = content
return wikipedia.unicode2html(s, self.site.encoding())
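#
# example (hypothetical input): with --join_lines, 'first line\nsecond line' is joined as
# 'first line second line'; blank lines, and lines not starting with a joinchars character,
# keep their newline
#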
def edit(self):
"""Edit the page using the editor.
It returns two strings: the old version and the new version."""
#
# to process dictionary index (edit.py -i Undictionary: -p Template:Dict -e doesntmatter ), find what links to
# template and use that list to recreate the index pages A - Z as {{dictionary}} {{def|...}} templates
#
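# for illustration (entry names are made up), a generated index page such as Undictionary:A
# would end up containing:
#   {{dictionary}}
#
#   == A ==
#
#   {{def|Aardvark}}
#   {{def|Abacus}}
#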
if self.options.index:
linkshere = wikipedia.getReferences(self.pagelink)
section = ''
sectionindex = ''
# for each link returned (linkshere is already in alphabetical order), get appropriate section (A-Z)
for lnk in linkshere:
oldsection = section
section = ''
thissect = 'A'
for nextsect in ['B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[']:
if lnk > self.options.index + thissect and lnk < self.options.index + nextsect:
section = self.options.index + thissect
thissect = nextsect
if section != oldsection:
if oldsection != '':
print '((',oldsection,')): ',sectionindex
new = self.repair(sectionindex)
comment = "generate index"
self.page = oldsection
self.pagelink = wikipedia.PageLink(self.site, self.page)
try:
self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
except wikipedia.EditConflict:
self.handle_edit_conflict(new)
sectionindex = '{{dictionary}}\n\n== ' + section.replace(self.options.index,u'',1)
sectionindex += ' ==\n\n'
sectionindex += '{{def|' + lnk.replace(self.options.index,u'',1) + '}}\n'
#
# post this last section index to site
# TO DO: see if last section (== Z ==) has already been written, if yes then avoid overwriting it???
#
if oldsection != '':
print '(((',oldsection,'))): ',sectionindex
new = self.repair(sectionindex)
comment = "generate index"
self.page = oldsection
self.pagelink = wikipedia.PageLink(self.site, self.page)
try:
self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
except wikipedia.EditConflict:
self.handle_edit_conflict(new)
return linkshere, linkshere
#
# to handle semi-manual editing of all other articles, retrieve article wiki and HTML text
# cleanup wiki [[pipe|links]], broken links, missing links to articles listed on addlinks.txt
# return oldcontent, newcontent as post of modified content is done by calling routine
#
#
# create temporary file to store wiki article for editing
#
(fd, ofn) = tempfile.mkstemp()
ofp = os.fdopen(fd, 'w')
try:
#
# retrieve existing wiki page
#
oldcontent = self.pagelink.get()
except wikipedia.NoPage:
oldcontent = ""
except wikipedia.IsRedirectPage:
if self.options.edit_redirect:
oldcontent = self.pagelink.get(force=True, get_redirect=True)
else:
raise
#
# remove broken wikilinks from page
#
oldcontent = re.sub(ur'\[\[\w+\|\w+\]\]',fixpipelink,oldcontent)
oldredlinks = self.pagelink.redlinks()
for l in oldredlinks:
l=l.replace(u'%21','!')
l=l.replace(u'%22','\"')
l=l.replace(u'%24','$')
l=l.replace(u'%26','&')
l=l.replace(u'%27','\'')
l=l.replace(u'%28','(')
l=l.replace(u'%29',')')
l=l.replace(u'%2C',',')
l=l.replace(u'%3F','?')
oldcontent = oldcontent.replace(u'[['+l+u']]',l)
l = l.replace(u'_',u' ')
l=re.sub(ur'^[A-Z]',tolower,l)
lu=re.sub(ur'^[a-z]',toupper,l)
oldcontent = oldcontent.replace(u'[['+l+u']]',l)
oldcontent = oldcontent.replace(u'[['+lu+u']]',lu)
if l != self.page and lu != self.page and l != "talk:" + self.page:
print lu, "not found in", self.page
#
# add possibly-missing links to page based on addlinks.txt
# TO DO: allow config_encoding to be overridden, but without auto-defaulting to DOS console codes
#
config_encoding = 'latin-1'
if config_encoding != config.console_encoding:
print "Using", config_encoding, "instead of default", config.console_encoding
addlinks = open("addlinks.txt").read().decode(config_encoding)
addlinks = addlinks.split('\n')
for l in addlinks:
if l != '':
#
# lq is l represented with escaped metacharacters: . ^ $ * + ? { [ ] \ | ( )
# required to use re.sub to replace keywords without treating these as special chrs
#
lq = l.replace(u'\\',u'\\\\')
lq = lq.replace(ur'.',ur'\.')
lq = lq.replace(ur'^',ur'\^')
lq = lq.replace(ur'$',ur'\$')
lq = lq.replace(ur'*',ur'\*')
lq = lq.replace(ur'+',ur'\+')
lq = lq.replace(ur'?',ur'\?')
lq = lq.replace(ur'{',ur'\{')
lq = lq.replace(ur'[',ur'\[')
lq = lq.replace(ur']',ur'\]')
lq = lq.replace(ur'|',ur'\|')
lq = lq.replace(ur'(',ur'\(')
lq = lq.replace(ur')',ur'\)')
lq = lq.replace(ur'-',ur'\-')
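#
# (an equivalent shortcut, if preferred: lq = re.escape(l) backslash-escapes every
# non-alphanumeric character and yields a pattern that matches l literally)
#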
#
# use .sub to remove existing links,
#
ll=re.sub(ur'^[A-Z]',tolower,l)
lu=re.sub(ur'^[a-z]',toupper,l)
oldcontent = oldcontent.replace(ur'[['+lu+ur']]',lu)
oldcontent = oldcontent.replace(ur'[['+ll+ur']]',ll)
#
# add new links (1st char may be uppercase or as-is)
#
lql=re.sub(ur'^[A-Z]',tolower,lq)
lqu=re.sub(ur'^[a-z]',toupper,lq)
oldcontent,n = re.subn(ur'\b'+lql+ur'\b',u'[['+ll+u']]',oldcontent,1)
if n < 1:
oldcontent = re.sub(ur'\b'+lqu+ur'\b',u'[['+lu+u']]',oldcontent,1)
del addlinks
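#
# example (hypothetical): with 'kitten' listed in addlinks.txt, any existing [[kitten]] or
# [[Kitten]] link was unwrapped above and the first bare occurrence of 'kitten' (or 'Kitten')
# was re-wrapped as a wikilink exactly once; later occurrences stay as plain text
#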
#
# unlink any self-referential internal link(s)
l=re.sub(ur'^[A-Z]',tolower,self.page)
lu=re.sub(ur'^[a-z]',toupper,self.page)
oldcontent = oldcontent.replace(u'[['+l+u']]',l)
oldcontent = oldcontent.replace(u'[['+lu+u']]',lu)
# oldcontent = oldcontent.replace(ur', and',ur' and')
#
# TO DO: clean up nested [[wiki[[links]]]] and wikilinks in URL's, template refs or header text
oldcontent = re.sub(ur'\[\[.*\]\]', fixwikilink, oldcontent)
#
# substitute ascii/iso-latin1 plain-text or html for a few problem Unicode characters
# TO DO: in general case, HTML &# followed by a decimal Unicode value ; should always work
# TO DO: apply the reverse of these substitutions when uploading edited article?
#
oldcontent=oldcontent.replace(u'\u00b0',u'&deg;')
oldcontent=oldcontent.replace(u'\u0103',u'&#259;')
oldcontent=oldcontent.replace(u'\u0132',u'&#306;')
oldcontent=oldcontent.replace(u'\u0152',u'&#338;')
oldcontent=oldcontent.replace(u'\u0162',u'&#354;')
oldcontent=oldcontent.replace(u'\u0163',u'&#355;')
oldcontent=oldcontent.replace(u'\u0402',u'&#1026;')
oldcontent=oldcontent.replace(u'\u0409',u'&#1033;')
oldcontent=oldcontent.replace(u'\u040a',u'&#1034;')
oldcontent=oldcontent.replace(u'\u040b',u'&#1035;')
oldcontent=oldcontent.replace(u'\u0411',u'&#1041;')
oldcontent=oldcontent.replace(u'\u041b',u'&#1051;')
oldcontent=oldcontent.replace(u'\u042b',u'&#1067;')
oldcontent=oldcontent.replace(u'\u042d',u'&#1069;')
oldcontent=oldcontent.replace(u'\u042e',u'&#1070;')
oldcontent=oldcontent.replace(u'\u05bc',u'&#1468;')
oldcontent=oldcontent.replace(u'\u05db',u'&#1499;') # middle-dot
oldcontent=oldcontent.replace(u'\u05dc',u'&#1500;') # sideways-U
oldcontent=oldcontent.replace(u'\u0633',u'&#1587;')
oldcontent=oldcontent.replace(u'\u0634',u'&#1588;')
oldcontent=oldcontent.replace(u'\u0635',u'&#1589;')
oldcontent=oldcontent.replace(u'\u0636',u'&#1590;')
oldcontent=oldcontent.replace(u'\u0637',u'&#1591;')
oldcontent=oldcontent.replace(u'\u0638',u'&#1592;')
oldcontent=oldcontent.replace(u'\u0639',u'&#1593;')
oldcontent=oldcontent.replace(u'\u2013',u' - ')
oldcontent=oldcontent.replace(u'\u2014',u' - ')
oldcontent=oldcontent.replace(u'\u2018',u'\'')
oldcontent=oldcontent.replace(u'\u2019',u'\'')
oldcontent=oldcontent.replace(u'\u201c',u'\"')
oldcontent=oldcontent.replace(u'\u201d',u'\"')
oldcontent=oldcontent.replace(u'\u2026',u'&#8230;')
oldcontent=oldcontent.replace(u'\u2030',u'&#8240;')
oldcontent=oldcontent.replace(u'\u203c',u'&#8252;')
oldcontent=oldcontent.replace(u'\u20a7',u'&#8359;')
oldcontent=oldcontent.replace(u'\u20ac',u'&#8364;') # euro
oldcontent=oldcontent.replace(u'\u2116',u'&#8470;')
oldcontent=oldcontent.replace(u'\u2122',u'&#8482;') # TM
oldcontent=oldcontent.replace(u'\u2229',u'&#8745;')
oldcontent=oldcontent.replace(u'\u2646',u'&#9798;') # trident - Neptune's tuning-fork
oldcontent=oldcontent.replace(u'\ufffc',u'')
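#
# a sketch of the general fallback noted in the TO DO above (not enabled here): replace every
# character outside Latin-1 with its decimal HTML reference, for example
#   oldcontent = re.sub(u'[^\x00-\xff]', lambda m: u'&#%d;' % ord(m.group()), oldcontent)
#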
#
# finished robot-cleanup 'kludges', now allow user to count the damages and check spelnig
# write wikified oldcontent to file for manual editing in text processor (or food processor)
#
try:
if self.options.new_data == '':
ofp.write(oldcontent.encode(config_encoding)) # FIXME: encoding of wiki
else:
ofp.write(oldcontent.encode(config_encoding)+'\n===========\n'+self.options.new_data) # FIXME: encoding of wiki
except:
#
# fallback to writing raw Unicode file - (presently disabled, uh, differently-abled)
# perhaps best avoided as &# nnnn ; HTML construct is a usable plain-ASCII alternative which
# avoids problems of saving Unicode from text editors and reading it back into this script
#
# note: writing raw Unicode does not work if file is open for text ('w') write
# Unicode output files must be open in binary ('wb') mode *only* or output text is unusable
#
print "Unable to convert Unicode content to ASCII plaintext, saving as Unicode array:"
uni = array.array('u',u'\ufeff' + oldcontent)
uni.tofile(ofp)
ofp.close()
raise
ofp.close()
os.system("%s %s" % (self.options.editor, ofn))
newcontent = open(ofn).read().decode(config_encoding)
os.unlink(ofn)
return oldcontent, newcontent
def getcomment(self):
comment = wikipedia.input(u"What did you change? ") + sig
return wikipedia.unicode2html(comment, self.site.encoding())
def handle_edit_conflict(self, new):
fn = os.path.join(tempfile.gettempdir(), self.page)
fp = open(fn, 'w')
fp.write(new)
fp.close()
wikipedia.output(u"An edit conflict has arisen. Your edit has been saved to %s. Please try again." % fn)
def showdiff(self,old, new):
diff = difflib.context_diff(old.splitlines(), new.splitlines())
wikipedia.output(u"\n".join(diff))
def run(self):
self.initialise_data()
try:
old, new = self.edit()
except wikipedia.LockedPage:
sys.exit("You do not have permission to edit %s" % self.pagelink.hashfreeLinkname())
if old != new:
new = self.repair(new)
self.showdiff(old, new)
# comment = self.getcomment()
comment = "spelchek"
try:
self.pagelink.put(new, comment=comment, minorEdit=True, watchArticle=self.options.watch)#, anon=self.options.anonymous)
except wikipedia.EditConflict:
self.handle_edit_conflict(new)
else:
if self.options.index:
wikipedia.output(u"Done")
else:
wikipedia.output(u"Nothing changed")
def main():
app = EditArticle(sys.argv[1:])
app.run()
if __name__ == "__main__":
try:
main()
except:
wikipedia.stopme()
raise
wikipedia.stopme()