| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """Wikipedia Extractor: |
| | Extracts and cleans text from a Wikipedia database dump and stores output in a |
| | number of files of similar size in a given directory. |
| | Each file will contain several documents in the format: |
| | |
| | <doc id="" revid="" url="" title=""> |
| | ... |
| | </doc> |
| | |
If the program is invoked with the --json flag, then each file will
contain several documents formatted as JSON objects, one per line, with
the following structure
| | |
| | {"id": "", "revid": "", "url":"", "title": "", "text": "..."} |
| | |
Template expansion requires preprocessing the whole dump first and
collecting template definitions.
| | |
| | """ |
| |
|
| | from __future__ import unicode_literals, division |
| |
|
| | import sys |
| | import argparse |
| | import bz2 |
| | import codecs |
| | import cgi |
| | import fileinput |
| | import logging |
| | import os.path |
| | import re |
| | import time |
| | import json |
| | from io import StringIO |
| | from multiprocessing import Queue, Process, Value, cpu_count |
| | from timeit import default_timer |
| |
|
| |
|
# True when running under Python 2; the module supports both major versions.
PY2 = sys.version_info[0] == 2

if PY2:
    # Map Python 3 names onto their Python 2 equivalents so the rest of the
    # module can be written in Python 3 style.
    from urllib import quote
    from htmlentitydefs import name2codepoint
    from itertools import izip as zip, izip_longest as zip_longest
    range = xrange  # noqa - xrange only exists on Py2
    chr = unichr    # noqa - unichr only exists on Py2
    text_type = unicode  # noqa - unicode only exists on Py2

    class SimpleNamespace(object):
        """Minimal backport of Python 3's types.SimpleNamespace."""

        def __init__ (self, **kwargs):
            self.__dict__.update(kwargs)

        def __repr__ (self):
            keys = sorted(self.__dict__)
            items = ("{}={!r}".format(k, self.__dict__[k]) for k in keys)
            return "{}({})".format(type(self).__name__, ", ".join(items))

        def __eq__ (self, other):
            return self.__dict__ == other.__dict__
else:
    # Python 3: all of these exist in the standard library.
    from urllib.parse import quote
    from html.entities import name2codepoint
    from itertools import zip_longest
    from types import SimpleNamespace
    text_type = str
| |
|
| |
|
| | |
| |
|
| | |
# Program version.
version = '2.75'

# Shared configuration; defaults below may be overridden by the CLI
# entry point before extraction starts.
options = SimpleNamespace(

    # Namespaces known for this wiki, mapped to their numeric id
    # ('Template' -> 10); extended from the dump's <siteinfo> section.
    knownNamespaces = {'Template': 10},

    # The namespace used for template definitions (the name associated
    # with namespace key=10 in the siteinfo header) and its page prefix.
    templateNamespace = '',
    templatePrefix = '',

    # The namespace used for module definitions (the name associated
    # with namespace key=828 in the siteinfo header).
    moduleNamespace = '',

    # Recognize only these namespaces in links:
    #   w: internal links to the Wikipedia
    #   wiktionary, wikt: the Wiki dictionary and its shortcut
    acceptedNamespaces = ['w', 'wiktionary', 'wikt'],

    # Base URL of the wiki, obtained from <siteinfo>; used by get_url().
    urlbase = '',

    # Whether to drop disambiguation pages (see keepPage()).
    filter_disambig_pages = False,

    # Whether to keep tables in the output (see Extractor.clean()).
    keep_tables = False,

    # Whether to preserve links in the output.
    keepLinks = False,

    # Whether to preserve section titles.
    keepSections = True,

    # Whether to preserve lists.
    keepLists = False,

    # Whether to produce HTML output instead of plain text.
    toHTML = False,

    # Whether to write JSON (one object per line) instead of <doc> blocks.
    write_json = False,

    # Whether to expand templates (slower, requires a preprocessing pass).
    expand_templates = True,

    # Whether to escape the extracted text — TODO confirm: not consumed in
    # this portion of the file.
    escape_doc = False,

    # Whether to include the revision id in the output.
    print_revision = False,

    # Minimum expanded text length required to print a document.
    min_text_length = 0,

    # Shared mappings of template title -> body, and redirect -> target.
    templates = {},
    redirects = {},
    # Cache of already-parsed templates (title -> Template).
    templateCache = {},

    # Tags registered via ignoreTag(): list of (open, close) patterns.
    ignored_tag_patterns = [],
    # Category filters applied by keepPage().
    filter_category_include = set(),
    filter_category_exclude = set(),

    log_file = None,

    # Elements dropped together with their whole content.
    discardElements = [
        'gallery', 'timeline', 'noinclude', 'pre',
        'table', 'tr', 'td', 'th', 'caption', 'div',
        'form', 'input', 'select', 'option', 'textarea',
        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
        'ref', 'references', 'img', 'imagemap', 'source', 'small',
        'sub', 'sup', 'indicator'
    ],
)
| |
|
| | |
| | |
# Numeric namespace keys whose pages hold template ('10') and module
# ('828') definitions.
templateKeys = set(['10', '828'])

# Matches markers of disambiguation pages; used by keepPage() when
# options.filter_disambig_pages is set.
filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}|__DISAMBIG__")

# Global page counters, updated by keepPage():
g_page_total = 0              # every page seen
g_page_articl_total=0         # pages in the main (article) namespace
g_page_articl_used_total=0    # article pages that passed all filters
| | |
def keepPage(ns, catSet, page):
    """Decide whether a page should be extracted.

    :param ns: namespace id of the page (articles are '0').
    :param catSet: set of categories the page belongs to.
    :param page: the page body, as a list of lines.
    :return: True when the page passes all configured filters.
    """
    global g_page_articl_total, g_page_total, g_page_articl_used_total
    g_page_total += 1
    # Only articles (main namespace) are extracted.
    if ns != '0':
        return False

    g_page_articl_total += 1
    if options.filter_disambig_pages and any(
            filter_disambig_page_pattern.match(line) for line in page):
        return False
    if options.filter_category_include and not (options.filter_category_include & catSet):
        logging.debug("***No include " + str(catSet))
        return False
    if options.filter_category_exclude and (options.filter_category_exclude & catSet):
        logging.debug("***Exclude " + str(catSet))
        return False
    g_page_articl_used_total += 1
    return True
| |
|
| |
|
def get_url(uid):
    """Return the permanent URL of the page with id *uid*."""
    return "{}?curid={}".format(options.urlbase, uid)
| |
|
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
# Tags that may appear self-closed (e.g. <br/>) in wiki markup; matched by
# selfClosing_tag_patterns below so they can be removed wholesale.
selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')

# Elements whose content cannot be rendered as plain text; each occurrence
# is replaced by a numbered placeholder word such as "formula_1".
placeholder_tags = {'math': 'formula', 'code': 'codice'}
| |
|
| |
|
def normalizeTitle(title):
    """Normalize a page title to its canonical wiki form."""
    # Strip surrounding whitespace/underscores and collapse inner runs of
    # them into single spaces.
    title = re.sub(r'[\s_]+', ' ', title.strip(' _'))

    m = re.match(r'([^:]*):(\s*)(\S(?:.*))', title)
    if not m:
        # No namespace prefix: just capitalise the first letter.
        return ucfirst(title)

    prefix = m.group(1)
    gap = m.group(2)
    rest = m.group(3)

    ns = normalizeNamespace(prefix)
    if ns in options.knownNamespaces:
        # Known namespace: use its canonical form and drop any whitespace
        # that followed the colon.
        return ns + ":" + ucfirst(rest)

    # Unknown prefix: the colon is part of the title proper; keep a single
    # space after it when the original had any whitespace there.
    return ucfirst(prefix) + ":" + (' ' if gap else '') + ucfirst(rest)
| |
|
| |
|
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text: The HTML (or XML) source text.
    :return: The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # numeric character reference
                if text[2] == "x":  # hexadecimal, e.g. &#x41;
                    return chr(int(code[1:], 16))
                else:               # decimal, e.g. &#65;
                    return chr(int(code))
            else:  # named entity, e.g. &amp;
                return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name, malformed number, or code point out of
            # range: leave the reference untouched.  (The original used a
            # bare `except:`, which also swallowed KeyboardInterrupt.)
            return text

    # Raw string: `\w` in a plain literal is an invalid escape sequence.
    return re.sub(r"&#?(\w+);", fixup, text)
| |
|
| |
|
| | |
| | |
# Matches HTML comments, possibly spanning multiple lines.
comment = re.compile(r'<!--.*?-->', re.DOTALL)

# Matches <nowiki>...</nowiki> regions, whose content must be passed
# through verbatim (see Extractor.transform()).
nowiki = re.compile(r'<nowiki>.*?</nowiki>')
| |
|
| |
|
def ignoreTag(tag):
    """Register *tag* so that its opening and closing markers are stripped
    from the text (their content is kept)."""
    opening = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL)
    closing = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
    options.ignored_tag_patterns.append((opening, closing))
| |
|
| | |
# Compiled matchers for self-closed forms of the tags above, e.g. <br/>.
selfClosing_tag_patterns = [
    re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
]

# (pattern, placeholder) pairs for elements replaced by numbered
# placeholder words (see placeholder_tags).
placeholder_tag_patterns = [
    (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE),
     repl) for tag, repl in placeholder_tags.items()
]

# A line beginning with a space is preformatted text in wiki markup.
preformatted = re.compile(r'^ .*?$')

# External links, with and without anchor text.
externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]')
externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]')

# Quote/emphasis markup (bold, italics and their combinations).
bold_italic = re.compile(r"'''''(.*?)'''''")
bold = re.compile(r"'''(.*?)'''")
italic_quote = re.compile(r"''\"([^\"]*?)\"''")
italic = re.compile(r"''(.*?)''")
quote_quote = re.compile(r'""([^"]*?)""')

# Runs of two or more spaces, collapsed to one.
spaces = re.compile(r' {2,}')

# Runs of four or more dots, normalised to an ellipsis.
dots = re.compile(r'\.{4,}')
| |
|
| |
|
| | |
| |
|
| |
|
class Template(list):
    """
    A Template is a list of TemplateText or TemplateArgs
    """

    @classmethod
    def parse(cls, body):
        """Parse *body* into a Template, splitting out {{{...}}} arguments."""
        tpl = Template()
        start = 0
        # Top-level triple-brace groups are template arguments; the text
        # between them is fixed.
        for s, e in findMatchingBraces(body, 3):
            tpl.append(TemplateText(body[start:s]))
            # Strip the surrounding '{{{' and '}}}'.
            tpl.append(TemplateArg(body[s + 3:e - 3]))
            start = e
        # Trailing fixed text.
        tpl.append(TemplateText(body[start:]))
        return tpl

    def subst(self, params, extractor, depth=0):
        """Substitute *params* into this template's arguments.

        :param params: dict of parameter name -> value.
        :param extractor: the Extractor used to evaluate expressions.
        :param depth: current parameter-recursion depth.
        """
        # Guard against runaway parameter recursion.
        if depth > extractor.maxParameterRecursionLevels:
            extractor.recursion_exceeded_3_errs += 1
            return ''

        return ''.join([tpl.subst(params, extractor, depth) for tpl in self])

    def __str__(self):
        return ''.join([text_type(x) for x in self])
| |
|
| |
|
class TemplateText(text_type):
    """Fixed text of template: substitution leaves it unchanged."""

    def subst(self, params, extractor, depth):
        # Fixed text takes no parameters; return it as-is.
        return self
| |
|
| |
|
class TemplateArg(object):
    """
    parameter to a template.
    Has a name and a default value, both of which are Templates.
    """

    def __init__(self, parameter):
        """
        :param parameter: the parts of a tplarg, i.e. the content between
            '{{{' and '}}}': a name, optionally followed by '|' and a
            default value.
        """
        # The parameter name itself may contain templates, e.g.:
        #   appointe{{#if:{{{appointer14|}}}|r|d}}14|
        parts = splitParts(parameter)
        self.name = Template.parse(parts[0])
        if len(parts) > 1:
            # This parameter has a default value.
            self.default = Template.parse(parts[1])
        else:
            self.default = None

    def __str__(self):
        if self.default:
            return '{{{%s|%s}}}' % (self.name, self.default)
        else:
            return '{{{%s}}}' % self.name

    def subst(self, params, extractor, depth):
        """
        Substitute value for this argument from dict :param params:
        Use :param extractor: to evaluate expressions for name and default.
        Limit substitution to the maximum :param depth:.
        """
        # The name itself may need expansion before lookup.
        paramName = self.name.subst(params, extractor, depth + 1)
        paramName = extractor.transform(paramName)
        res = ''
        if paramName in params:
            # Use the value supplied in the template call.
            res = params[paramName]
        elif self.default:
            # Fall back to the declared default value.
            defaultValue = self.default.subst(params, extractor, depth + 1)
            res = extractor.transform(defaultValue)
        return res
| |
|
| |
|
class Frame(object):
    """A node in the template-expansion call stack.

    Each frame records the title of the template being expanded and the
    parameters it was called with; *prev* links to the calling frame.
    """

    def __init__(self, title='', args=None, prev=None):
        """
        :param title: title of the template being expanded.
        :param args: parameters of the call (defaults to a fresh empty list).
        :param prev: the parent Frame, or None for the root frame.
        """
        self.title = title
        # Avoid the shared-mutable-default pitfall of the original
        # `args=[]`: every default call now gets its own list.
        self.args = [] if args is None else args
        self.prev = prev
        self.depth = prev.depth + 1 if prev else 0

    def push(self, title, args):
        """Return a new Frame for a nested expansion, linked to this one."""
        return Frame(title, args, self)

    def pop(self):
        """Return the parent frame."""
        return self.prev

    def __str__(self):
        res = ''
        prev = self.prev
        while prev:
            if res: res += ', '
            res += '(%s, %s)' % (prev.title, prev.args)
            prev = prev.prev
        return '<Frame [' + res + ']>'
| |
|
| | |
| |
|
# Magic prefixes that force (safe) substitution of a template.
substWords = 'subst:|safesubst:'
| |
|
class Extractor(object):
    """
    An extraction task on a article.
    """
    def __init__(self, id, revid, title, lines):
        """
        :param id: id of page.
        :param revid: revision id of page.
        :param title: title of page.
        :param lines: a list of lines.
        """
        self.id = id
        self.revid = revid
        self.title = title
        self.text = ''.join(lines)
        self.magicWords = MagicWords()
        self.frame = Frame()
        # Error counters, reported at the end of extract():
        self.recursion_exceeded_1_errs = 0  # template recursion in expand()
        self.recursion_exceeded_2_errs = 0  # template recursion in expandTemplate()
        self.recursion_exceeded_3_errs = 0  # parameter recursion in Template.subst()
        self.template_title_errs = 0        # unresolvable template titles

    def write_output(self, out, text):
        """
        Write the page either as a JSON line or as a <doc> block, depending
        on options.write_json.
        :param out: a memory file
        :param text: the text of the page
        """
        url = get_url(self.id)
        if options.write_json:
            json_data = {
                'id': self.id,
                'url': url,
                'title': self.title,
                'text': "\n".join(text)
            }
            if options.print_revision:
                json_data['revid'] = self.revid
            # Keep non-ASCII characters readable in the output.
            out_str = json.dumps(json_data, ensure_ascii=False)
            if out == sys.stdout:  # writing straight to stdout needs bytes on Py2
                out_str = out_str.encode('utf-8')
            out.write(out_str)
            out.write('\n')
        else:
            if options.print_revision:
                header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
            else:
                header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
            footer = "\n</doc>\n"
            if out == sys.stdout:
                header = header.encode('utf-8')
            out.write(header)
            for line in text:
                if out == sys.stdout:
                    line = line.encode('utf-8')
                out.write(line)
                out.write('\n')
            out.write(footer)

    def extract(self, out):
        """
        Expand, clean and write the article.
        :param out: a memory file.
        """
        logging.info('%s\t%s', self.id, self.title)

        # The title becomes the first line of the document.
        if options.toHTML:
            title_str = '<h1>' + self.title + '</h1>'
        else:
            title_str = self.title + '\n'
        # Separate the namespace prefix (if any) from the page name proper.
        colon = self.title.find(':')
        if colon != -1:
            ns = self.title[:colon]
            pagename = self.title[colon+1:]
        else:
            ns = ''
            pagename = self.title
        # Populate magic words used during template expansion.
        self.magicWords['NAMESPACE'] = ns
        self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0')
        self.magicWords['PAGENAME'] = pagename
        self.magicWords['FULLPAGENAME'] = self.title
        slash = pagename.rfind('/')
        if slash != -1:
            self.magicWords['BASEPAGENAME'] = pagename[:slash]
            self.magicWords['SUBPAGENAME'] = pagename[slash+1:]
        else:
            self.magicWords['BASEPAGENAME'] = pagename
            self.magicWords['SUBPAGENAME'] = ''
        slash = pagename.find('/')
        if slash != -1:
            self.magicWords['ROOTPAGENAME'] = pagename[:slash]
        else:
            self.magicWords['ROOTPAGENAME'] = pagename
        self.magicWords['CURRENTYEAR'] = time.strftime('%Y')
        self.magicWords['CURRENTMONTH'] = time.strftime('%m')
        self.magicWords['CURRENTDAY'] = time.strftime('%d')
        self.magicWords['CURRENTHOUR'] = time.strftime('%H')
        self.magicWords['CURRENTTIME'] = time.strftime('%H:%M:%S')
        text = self.text
        self.text = ''  # save memory
        # Pipeline: expand templates, convert residual markup, then clean
        # and compact into a list of lines.
        text = self.transform(text)
        text = self.wiki2text(text)
        text = compact(self.clean(text))
        text = [title_str] + text

        if sum(len(line) for line in text) < options.min_text_length:
            return

        self.write_output(out, text)

        errs = (self.template_title_errs,
                self.recursion_exceeded_1_errs,
                self.recursion_exceeded_2_errs,
                self.recursion_exceeded_3_errs)
        if any(errs):
            # NOTE(review): logging.warn is deprecated in favor of
            # logging.warning.
            logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
                         self.title, self.id, *errs)

    def transform(self, wikitext):
        """
        Transforms wiki markup.
        @see https://www.mediawiki.org/wiki/Help:Formatting
        """
        # Expand everything outside <nowiki>...</nowiki> regions, which are
        # copied through verbatim.
        res = ''
        cur = 0
        for m in nowiki.finditer(wikitext, cur):
            res += self.transform1(wikitext[cur:m.start()]) + wikitext[m.start():m.end()]
            cur = m.end()
        # leftover after the last <nowiki> region
        res += self.transform1(wikitext[cur:])
        return res

    def transform1(self, text):
        """Transform text not containing <nowiki>"""
        if options.expand_templates:
            # expand templates
            return self.expand(text)
        else:
            # Drop transclusions (template, parser functions).
            return dropNested(text, r'{{', r'}}')

    def wiki2text(self, text):
        """Convert residual wiki markup (tables, emphasis, links, magic
        words, syntax highlighting) into plain text or HTML."""
        # Drop tables (and any remaining templates) unless asked to keep them.
        if not options.keep_tables:
            text = dropNested(text, r'{{', r'}}')
            text = dropNested(text, r'{\|', r'\|}')

        # Handle bold/italic/quote markup.
        if options.toHTML:
            text = bold_italic.sub(r'<b>\1</b>', text)
            text = bold.sub(r'<b>\1</b>', text)
            text = italic.sub(r'<i>\1</i>', text)
        else:
            text = bold_italic.sub(r'\1', text)
            text = bold.sub(r'\1', text)
            text = italic_quote.sub(r'"\1"', text)
            text = italic.sub(r'"\1"', text)
            text = quote_quote.sub(r'"\1"', text)
        # Residuals of unbalanced quotes.
        text = text.replace("'''", '').replace("''", '"')

        # Replace internal links.
        text = replaceInternalLinks(text)

        # Replace external links.
        text = replaceExternalLinks(text)

        # Drop MagicWords behavioral switches.
        text = magicWordsRE.sub('', text)

        # Unescape entities, except inside <syntaxhighlight> blocks, whose
        # captured content (group 1) is kept verbatim.
        res = ''
        cur = 0
        for m in syntaxhighlight.finditer(text):
            res += unescape(text[cur:m.start()]) + m.group(1)
            cur = m.end()
        text = res + unescape(text[cur:])
        return text

    def clean(self, text):
        """
        Removes irrelevant parts from :param: text.
        """
        # Collect spans of text to be removed.
        spans = []
        # Drop HTML comments.
        for m in comment.finditer(text):
            spans.append((m.start(), m.end()))

        # Drop self-closing tags.
        for pattern in selfClosing_tag_patterns:
            for m in pattern.finditer(text):
                spans.append((m.start(), m.end()))

        # Drop ignored tags (only the markers; their content is kept).
        for left, right in options.ignored_tag_patterns:
            for m in left.finditer(text):
                spans.append((m.start(), m.end()))
            for m in right.finditer(text):
                spans.append((m.start(), m.end()))

        # Bulk remove all collected spans.
        text = dropSpans(spans, text)

        # Drop discarded elements together with their whole content.
        for tag in options.discardElements:
            text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

        if not options.toHTML:
            # Turn what is left (e.g. &amp;nbsp;) into readable text.
            text = unescape(text)

        # Expand placeholders, numbering each occurrence of the same tag.
        for pattern, placeholder in placeholder_tag_patterns:
            index = 1
            for match in pattern.finditer(text):
                text = text.replace(match.group(), '%s_%d' % (placeholder, index))
                index += 1

        text = text.replace('<<', '«').replace('>>', '»')

        # Cleanup whitespace and punctuation spacing.
        text = text.replace('\t', ' ')
        text = spaces.sub(' ', text)
        text = dots.sub('...', text)
        text = re.sub(' (,:\.\)\]»)', r'\1', text)
        text = re.sub('(\[\(«) ', r'\1', text)
        # Drop lines containing only punctuation.
        text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)
        text = text.replace(',,', ',').replace(',.', '.')
        if options.keep_tables:
            # Remove the wikiML characters around table structures while
            # keeping the content: first style attributes, then the
            # remaining '|-' and '|' markers.
            text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text)
            text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
            text = text.replace('|-', '')
            text = text.replace('|', '')
        if options.toHTML:
            # NOTE(review): `html` does not appear to be imported at the
            # top of this file (only `cgi` is) — confirm, or this line
            # raises NameError when toHTML is set.
            text = html.escape(text)
        return text

    # ------------------------------------------------------------------
    # Template expansion

    # Limits guarding against runaway recursion (see expand() docstring).
    maxTemplateRecursionLevels = 30
    maxParameterRecursionLevels = 10

    # Matches an isolated template opening '{{' (not part of '{{{').
    reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)

    def expand(self, wikitext):
        """
        :param wikitext: the text to be expanded.

        Templates are frequently nested. Occasionally, parsing mistakes may
        cause template insertion to enter an infinite loop, for instance when
        trying to instantiate Template:Country

        {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}

        which is repeatedly trying to insert template 'country_', which is
        again resolved to Template:Country. The straightforward solution of
        keeping track of templates that were already inserted for the current
        article would not work, because the same template may legally be used
        more than once, with different parameters in different parts of the
        article. Therefore, we limit the number of iterations of nested
        template inclusion.

        """
        res = ''
        if self.frame.depth >= self.maxTemplateRecursionLevels:
            self.recursion_exceeded_1_errs += 1
            return res

        cur = 0
        # Look for matching top-level {{...}} groups.
        for s, e in findMatchingBraces(wikitext, 2):
            res += wikitext[cur:s] + self.expandTemplate(wikitext[s + 2:e - 2])
            cur = e
        # leftover after the last template
        res += wikitext[cur:]
        return res

    def templateParams(self, parameters):
        """
        Build a dictionary with positional or name key to expanded parameters.
        :param parameters: the parts[1:] of a template, i.e. all except the title.
        """
        templateParams = {}

        if not parameters:
            return templateParams

        # Unnamed parameters get consecutive numeric names starting at '1'.
        unnamedParameterCounter = 0

        for param in parameters:
            # Check if param is a named parameter of the form "name=value".
            m = re.match(' *([^=]*?) *?=(.*)', param, re.DOTALL)
            if m:
                # Named parameter; this also covers explicit positional
                # assignments like "2=xxx".
                parameterName = m.group(1).strip()
                parameterValue = m.group(2)

                # Trim whitespace only when the value contains no link.
                if ']]' not in parameterValue:
                    parameterValue = parameterValue.strip()
                templateParams[parameterName] = parameterValue
            else:
                # This is an unnamed parameter.
                unnamedParameterCounter += 1

                if ']]' not in param:  # no link inside: trim whitespace
                    param = param.strip()
                templateParams[str(unnamedParameterCounter)] = param
        return templateParams

    def expandTemplate(self, body):
        """Expands template invocation.
        :param body: the parts of a template.

        :see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation
        of the process.

        See in particular: Expansion of names and values
        http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values

        For most parser functions all names and values are expanded,
        regardless of what is relevant for the result. The branching functions
        (#if, #ifeq, #iferror, #ifexist, #ifexpr, #switch) are exceptions.

        All names in a template call are expanded, and the titles of the
        tplargs in the template body, after which it is determined which
        values must be expanded, and for which tplargs in the template body
        the first part (default) [sic in the original doc page].

        In the case of a tplarg, any parts beyond the first are never
        expanded. The possible name and the value of the first part is
        expanded if the title does not match a name in the template call.

        :see code for braceSubstitution at
        https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397:

        """
        # Guard against runaway template recursion.
        if self.frame.depth >= self.maxTemplateRecursionLevels:
            self.recursion_exceeded_2_errs += 1
            return ''

        logging.debug('%*sEXPAND %s', self.frame.depth, '', body)
        parts = splitParts(body)
        # The title is the portion before the first '|'.
        title = parts[0].strip()
        title = self.expand(title)

        # Handle the subst:/safesubst: prefixes.
        subst = False
        if re.match(substWords, title, re.IGNORECASE):
            title = re.sub(substWords, '', title, 1, re.IGNORECASE)
            subst = True

        # Magic words are replaced by their current value.
        if title in self.magicWords.values:
            ret = self.magicWords[title]
            logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', title, ret)
            return ret

        # Parser functions: a title of the form "#if:..." etc.
        colon = title.find(':')
        if colon > 1:
            funct = title[:colon]
            parts[0] = title[colon + 1:].strip()  # side-effect on parts!
            # arguments after the first are not evaluated here
            ret = callParserFunction(funct, parts, self)
            logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', funct, ret)
            return ret

        title = fullyQualifiedTemplateTitle(title)
        if not title:
            self.template_title_errs += 1
            return ''

        # Follow redirects of the template title.
        redirected = options.redirects.get(title)
        if redirected:
            title = redirected

        # Get the template body, parsing and caching it on first use.
        if title in options.templateCache:
            template = options.templateCache[title]
        elif title in options.templates:
            template = Template.parse(options.templates[title])
            # add it to cache; the raw text is no longer needed
            options.templateCache[title] = template
            del options.templates[title]
        else:
            # The page being included could not be identified.
            logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', title, '')
            return ''

        logging.debug('%*sTEMPLATE %s: %s', self.frame.depth, '', title, template)

        params = parts[1:]

        if not subst:
            # Evaluate parameter values before substitution.
            params = [self.transform(p) for p in params]

        # Build a dict of name -> value for the parameters.
        params = self.templateParams(params)

        # Perform parameter substitution inside a new frame.
        self.frame = self.frame.push(title, params)
        instantiated = template.subst(params, self)
        value = self.transform(instantiated)
        self.frame = self.frame.pop()
        logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', title, value)
        return value
| |
|
| |
|
| | |
| | |
| |
|
| |
|
def splitParts(paramsList):
    """
    :param paramsList: the parts of a template or tplarg.

    Split template parameters at the separator "|".
    separator "=".

    Template parameters often contain URLs, internal links, text or even
    template expressions, since we evaluate templates outside in.
    This is required for cases like:
      {{#if: {{{1}}} | {{lc:{{{1}}} | "parameter missing"}}
    Parameters are separated by "|" symbols. However, we
    cannot simply split the string on "|" symbols, since these
    also appear inside templates and internal links, e.g.

     {{if:|
      |{{#if:the president|
           |{{#if:|
               [[Category:Hatnote templates|A{{PAGENAME}}]]
            }}
       }}
     }}

    We split parts at the "|" symbols that are not inside any pair
    {{{...}}}, {{...}}, [[...]], {|...|}.
    """
    # A regex cannot handle the nesting; instead, scan for top-level
    # brace/bracket groups and split on '|' only between them.
    sep = '|'
    parameters = []
    cur = 0

    for s, e in findMatchingBraces(paramsList):
        par = paramsList[cur:s].split(sep)
        if par:
            if parameters:
                # portion before | belongs to the previous parameter
                parameters[-1] += par[0]
                if len(par) > 1:
                    # the rest are new parameters
                    parameters.extend(par[1:])
            else:
                parameters = par
        elif not parameters:
            parameters = ['']  # create the first parameter
        # The whole braced span belongs to the last parameter.
        parameters[-1] += paramsList[s:e]
        cur = e
    # leftover after the last braced group
    par = paramsList[cur:].split(sep)
    if par:
        if parameters:
            # portion before | belongs to the previous parameter
            parameters[-1] += par[0]
            if len(par) > 1:
                # the rest are new parameters
                parameters.extend(par[1:])
        else:
            parameters = par

    return parameters
| |
|
| |
|
def findMatchingBraces(text, ldelim=0):
    """
    :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}.
    :return: an iterator of (start, end) spans of balanced top-level groups.

    Longer-than-needed runs of braces are resolved the way MediaWiki does:
    the innermost (rightmost) opening braces are matched first and any
    leftover single brace is treated as plain text.
    """
    if ldelim:  # match only brace groups of at least ldelim braces
        reOpen = re.compile('[{]{%d,}' % ldelim)
        reNext = re.compile('[{]{2,}|}{2,}')  # at least 2 braces, open or close
    else:  # match both square-bracket and brace groups
        reOpen = re.compile('{{2,}|\[{2,}')
        reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}')

    cur = 0
    while True:
        m1 = reOpen.search(text, cur)
        if not m1:
            return
        lmatch = m1.end() - m1.start()  # length of the opening run
        # The stack holds lengths of pending opening runs; negative values
        # denote '[' runs, positive values '{' runs.
        if m1.group()[0] == '{':
            stack = [lmatch]
        else:
            stack = [-lmatch]
        end = m1.end()
        while True:
            m2 = reNext.search(text, end)
            if not m2:
                return  # unbalanced: give up on this group
            end = m2.end()
            brac = m2.group()[0]
            lmatch = m2.end() - m2.start()

            if brac == '{':
                stack.append(lmatch)
            elif brac == '}':
                # Consume closing braces against pending opening runs.
                while stack:
                    openCount = stack.pop()  # pending opening run
                    if openCount == 0:  # illegal unmatched marker
                        continue
                    if lmatch >= openCount:
                        lmatch -= openCount
                        if lmatch <= 1:  # either fully closed or one stray }
                            break
                    else:
                        # put back the unmatched portion of the opening run
                        stack.append(openCount - lmatch)
                        break
                if not stack:
                    # Group fully closed; any stray brace stays outside it.
                    yield m1.start(), end - lmatch
                    cur = end
                    break
                elif len(stack) == 1 and 0 < stack[0] < ldelim:
                    # ambiguous leftover such as {{{{{ }}} }}: abandon it
                    cur = end
                    break
            elif brac == '[':
                stack.append(-lmatch)
            else:  # ']'
                # Consume closing brackets against pending '[' runs.
                while stack and stack[-1] < 0:
                    openCount = -stack.pop()
                    if lmatch >= openCount:
                        lmatch -= openCount
                        if lmatch <= 1:  # either fully closed or one stray ]
                            break
                    else:
                        # put back the unmatched portion (negative means '[')
                        stack.append(lmatch - openCount)
                        break
                if not stack:
                    yield m1.start(), end - lmatch
                    cur = end
                    break
                # unmatched ]] are discarded
                cur = end
| |
|
| |
|
def findBalanced(text, openDelim=['[['], closeDelim=[']]']):
    """
    Assuming that text contains a properly balanced expression using
    :param openDelim: as opening delimiters and
    :param closeDelim: as closing delimiters.
    :return: an iterator producing pairs (start, end) of start and end
    positions in text containing a balanced expression.

    NOTE(review): the mutable default arguments are shared across calls;
    safe only as long as no caller mutates them.
    """
    openPat = '|'.join([re.escape(x) for x in openDelim])
    # Pattern to look for after each opening delimiter: either another
    # opening delimiter or the matching closing one.
    afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)}
    stack = []      # currently open delimiters
    start = 0       # start of the current balanced expression
    cur = 0         # scan position
    # Whether `start` has been set for the current expression.
    startSet = False
    startPat = re.compile(openPat)
    nextPat = startPat
    while True:
        next = nextPat.search(text, cur)
        if not next:
            return
        if not startSet:
            start = next.start()
            startSet = True
        delim = next.group(0)
        if delim in openDelim:
            stack.append(delim)
            nextPat = afterPat[delim]
        else:
            # A closing delimiter: pop the matching opening one.
            opening = stack.pop()
            if stack:
                nextPat = afterPat[stack[-1]]
            else:
                # Stack emptied: a complete balanced expression.
                yield start, next.end()
                nextPat = startPat
                start = next.end()
                startSet = False
        cur = next.end()
| |
|
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
def if_empty(*rest):
    """
    Implements Template:If_empty (English Wikipedia "Module:If empty"):
    return the first argument that is neither empty nor None, or '' when
    every argument is empty or missing.
    """
    return next((arg for arg in rest if arg), '')
| |
|
| |
|
| | |
| | |
| | |
| |
|
def functionParams(args, vars):
    """Map each name in *vars* to its value taken from *args*.

    A value is looked up first by name, then positionally (keys '1', '2',
    ...); a positional slot is consumed only when it exists. Parameters
    found in neither place map to ''.
    """
    params = {}
    position = 1  # next positional key to try
    for name in vars:
        value = args.get(name)
        if value is None:
            positional = args.get(str(position))
            if positional is not None:
                position += 1
                value = positional
            else:
                value = ''
        params[name] = value
    return params
| |
|
| |
|
def string_sub(args):
    """Implements #invoke:String.sub — substring between 1-based inclusive
    indices *i* and *j* (negative values count from the end)."""
    params = functionParams(args, ('s', 'i', 'j'))
    s = params.get('s', '')
    start = int(params.get('i', 1) or 1)
    end = int(params.get('j', -1) or -1)
    # Convert Lua-style indices into Python slice bounds.
    if start > 0: start -= 1
    if end < 0: end += 1
    if end == 0: end = len(s)
    return s[start:end]
| |
|
| |
|
def string_sublength(args):
    """Implements #invoke:String.sublength — the substring of *len*
    characters starting at 1-based index *i*."""
    params = functionParams(args, ('s', 'i', 'len'))
    s = params.get('s', '')
    i = int(params.get('i', 1) or 1) - 1  # convert to 0-based
    # Renamed from `len`: the original shadowed the builtin len().
    length = int(params.get('len', 1) or 1)
    return s[i:i+length]
| |
|
| |
|
def string_len(args):
    """Implements #invoke:String.len — the length of the string *s*."""
    # ('s',): the original passed the bare string 's' instead of a tuple,
    # which only worked by accident because iterating 's' yields its single
    # character.
    params = functionParams(args, ('s',))
    s = params.get('s', '')
    return len(s)
| |
|
| |
|
def string_find(args):
    """Implements #invoke:String.find — the 1-based position of *target*
    inside *source*, or 0 when it does not occur."""
    params = functionParams(args, ('source', 'target', 'start', 'plain'))
    source = params.get('source', '')
    pattern = params.get('target', '')
    # Missing parameters arrive as '' from functionParams; restore the
    # intended defaults (start=1, plain=1).  The original int('0'+...)
    # turned a missing start into -1 (searching only the last character)
    # and a missing plain into 0.
    start = int(params.get('start', 1) or 1) - 1
    plain = int(params.get('plain', 1) or 1)
    if source == '' or pattern == '':
        return 0
    if plain:
        return source.find(pattern, start) + 1
    else:
        # The original added 1 to the Match object itself, raising
        # TypeError on every successful regex match.
        m = re.compile(pattern).search(source, start)
        return m.start() + 1 if m else 0
| |
|
| |
|
def string_pos(args):
    """
    Emulate Module:String pos: the single character of target at the
    1-based position pos (negative values count from the end).
    Returns '' when pos is out of range (the original raised IndexError).
    """
    params = functionParams(args, ('target', 'pos'))
    target = params.get('target', '')
    pos = int(params.get('pos', 1) or 1)
    if pos > 0:
        pos -= 1  # convert to 0-based
    try:
        return target[pos]
    except IndexError:
        return ''
| |
|
| |
|
def string_replace(args):
    """
    Emulate Module:String replace: substitute pattern with replace in
    source, at most count times (0 = all); plain (the default) selects
    literal replacement, otherwise pattern is a regex.
    """
    params = functionParams(args, ('source', 'pattern', 'replace', 'count', 'plain'))
    source = params.get('source', '')
    pattern = params.get('pattern', '')
    replacement = params.get('replace', '')
    limit = int(params.get('count', 0) or 0)
    literal = int(params.get('plain', 1) or 1)
    if not literal:
        # regex mode: re.sub treats count 0 as "replace all", matching Lua
        return re.compile(pattern).sub(replacement, source, limit)
    if limit:
        return source.replace(pattern, replacement, limit)
    return source.replace(pattern, replacement)
| |
|
| |
|
def string_rep(args):
    """
    Emulate Module:String rep: s repeated count times.
    The original built params only for ('s') but then read the keys
    'source' and 'count', so it always returned ''; declare and read
    the actual parameters instead.
    """
    params = functionParams(args, ('s', 'count'))
    s = params.get('s', '')
    count = int(params.get('count') or 1)
    return s * count
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
def roman_main(args):
    """Convert the first arg to a Roman numeral when 0 <= num < 5000;
    otherwise :return: the second arg (default 'N/A')."""
    num = int(float(args.get('1')))

    # only 0..4999 can be written with the standard numerals
    if num < 0 or num >= 5000:
        return args.get('2', 'N/A')

    numeral_map = (
        (1000, "M"),
        (900, "CM"), (500, "D"), (400, "CD"), (100, "C"),
        (90, "XC"), (50, "L"), (40, "XL"), (10, "X"),
        (9, "IX"), (5, "V"), (4, "IV"), (1, "I")
    )
    # greedy decomposition: take the largest value that still fits
    pieces = []
    remainder = num
    for value, numeral in numeral_map:
        while remainder >= value:
            pieces.append(numeral)
            remainder -= value
    return ''.join(pieces)
| |
|
| | |
| |
|
# Emulated Scribunto modules: maps a module name to a dict of
# {function name -> Python implementation}, used to resolve {{#invoke:...}}.
modules = {
    'convert': {
        # {{convert}}: simply juxtapose amount and unit
        'convert': lambda x, u, *rest: x + ' ' + u,
    },

    'If empty': {
        'main': if_empty
    },

    # subset of mw Module:String
    'String': {
        'len': string_len,
        'sub': string_sub,
        'sublength': string_sublength,
        'pos': string_pos,
        'find': string_find,
        'replace': string_replace,
        'rep': string_rep,
    },

    'Roman': {
        'main': roman_main
    },

    # Italian-wiki alias of Roman
    'Numero romano': {
        'main': roman_main
    }
}
| |
|
| | |
| | |
| |
|
| |
|
class MagicWords(object):
    """
    Store of MediaWiki magic-word values (one copy in each Extractor).
    Only the values explicitly set via __setitem__ are defined; any other
    name looks up as None.

    @see https://doc.wikimedia.org/mediawiki-core/master/php/MagicWord_8php_source.html
    """
    # Recognized magic-word names (variables such as {{PAGENAME}}).
    names = [
        '!',
        'currentmonth',
        'currentmonth1',
        'currentmonthname',
        'currentmonthnamegen',
        'currentmonthabbrev',
        'currentday',
        'currentday2',
        'currentdayname',
        'currentyear',
        'currenttime',
        'currenthour',
        'localmonth',
        'localmonth1',
        'localmonthname',
        'localmonthnamegen',
        'localmonthabbrev',
        'localday',
        'localday2',
        'localdayname',
        'localyear',
        'localtime',
        'localhour',
        'numberofarticles',
        'numberoffiles',
        'numberofedits',
        'articlepath',
        'pageid',
        'sitename',
        'server',
        'servername',
        'scriptpath',
        'stylepath',
        'pagename',
        'pagenamee',
        'fullpagename',
        'fullpagenamee',
        'namespace',
        'namespacee',
        'namespacenumber',
        'currentweek',
        'currentdow',
        'localweek',
        'localdow',
        'revisionid',
        'revisionday',
        'revisionday2',
        'revisionmonth',
        'revisionmonth1',
        'revisionyear',
        'revisiontimestamp',
        'revisionuser',
        'revisionsize',
        'subpagename',
        'subpagenamee',
        'talkspace',
        'talkspacee',
        'subjectspace',
        'subjectspacee',
        'talkpagename',
        'talkpagenamee',
        'subjectpagename',
        'subjectpagenamee',
        'numberofusers',
        'numberofactiveusers',
        'numberofpages',
        'currentversion',
        'rootpagename',
        'rootpagenamee',
        'basepagename',
        'basepagenamee',
        'currenttimestamp',
        'localtimestamp',
        'directionmark',
        'contentlanguage',
        'numberofadmins',
        'cascadingsources',
    ]

    def __init__(self):
        # '!' expands to a literal pipe; everything else starts undefined
        self.values = {'!': '|'}

    def __getitem__(self, name):
        # undefined magic words look up as None rather than raising
        return self.values.get(name)

    def __setitem__(self, name, value):
        self.values[name] = value

    # Behavior switches, stripped from the page text.
    # (The original listed '__TOC__' twice; the duplicate is removed.)
    switches = (
        '__NOTOC__',
        '__FORCETOC__',
        '__TOC__',
        '__NEWSECTIONLINK__',
        '__NONEWSECTIONLINK__',
        '__NOGALLERY__',
        '__HIDDENCAT__',
        '__NOCONTENTCONVERT__',
        '__NOCC__',
        '__NOTITLECONVERT__',
        '__NOTC__',
        '__START__',
        '__END__',
        '__INDEX__',
        '__NOINDEX__',
        '__STATICREDIRECT__',
        '__DISAMBIG__'
    )
| |
|
| |
|
| | magicWordsRE = re.compile('|'.join(MagicWords.switches)) |
| |
|
| |
|
| | |
| | |
| |
|
| |
|
def ucfirst(string):
    """:return: *string* with only its first character uppercased.
    (str.title() is unsuitable here: it converts every word.)
    """
    return string[0].upper() + string[1:] if string else ''
| |
|
| |
|
def lcfirst(string):
    """:return: a string with its first character lowercase.

    The original special-cased len(string) == 1, but slicing makes that
    branch redundant: string[1:] is simply '' for a 1-character string.
    """
    if string:
        return string[0].lower() + string[1:]
    return ''
| |
|
| |
|
def fullyQualifiedTemplateTitle(templateTitle):
    """
    Determine the namespace of the page being included through the
    template mechanism and return the fully qualified title.
    """
    if templateTitle.startswith(':'):
        # leading colon: explicit reference to the Main namespace
        return ucfirst(templateTitle[1:])
    match = re.match('([^:]*)(:.*)', templateTitle)
    if match:
        # explicit prefix: honour it when it names a known namespace
        namespace = normalizeNamespace(match.group(1))
        if namespace in options.knownNamespaces:
            return namespace + ucfirst(match.group(2))
    # otherwise the title is implicitly in the Template namespace
    if not templateTitle:
        return ''
    return options.templatePrefix + ucfirst(templateTitle)
| |
|
| |
|
def normalizeNamespace(ns):
    """Canonicalize a namespace name by uppercasing its first letter
    (namespace names are first-letter case-insensitive in MediaWiki)."""
    return ucfirst(ns)
| |
|
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
class Infix:
    """Infix operators.
    The calling sequence for the infix is:
        x |op| y
    (also usable as x <<op>> y, or as a plain call op(x, y)).
    """

    def __init__(self, function):
        # the two-argument callable the operator applies
        self.function = function

    def __ror__(self, other):
        # 'x |op': bind the left operand, producing a one-argument Infix
        return Infix(lambda right, fn=self.function, left=other: fn(left, right))

    def __or__(self, other):
        # '... | y': apply the partially bound function to the right operand
        return self.function(other)

    def __rlshift__(self, other):
        # same trick for the 'x <<op>> y' spelling
        return Infix(lambda right, fn=self.function, left=other: fn(left, right))

    def __rshift__(self, other):
        return self.function(other)

    def __call__(self, value1, value2):
        # plain call syntax
        return self.function(value1, value2)
| |
|
| |
|
| | ROUND = Infix(lambda x, y: round(x, y)) |
| |
|
| |
|
| | from math import floor, ceil, pi, e, trunc, exp, log as ln, sin, cos, tan, asin, acos, atan |
| |
|
| |
|
def sharp_expr(extr, expr):
    """Try converting a {{#expr:...}} expression into Python and evaluate it.
    :return: the result as a string, or an error <span> on any failure."""
    try:
        expr = extr.expand(expr)
        # '=' -> '==' (leaving !=, <=, >= alone)
        expr = re.sub('(?<![!<>])=', '==', expr)
        # translate wiki operators; the original used '\b' inside non-raw
        # strings (a literal backspace), so 'div' and 'round' were never
        # rewritten, and 'mod' was replaced even inside longer words
        expr = re.sub(r'\bmod\b', '%', expr)
        expr = re.sub(r'\bdiv\b', '/', expr)
        expr = re.sub(r'\bround\b', '|ROUND|', expr)
        # NOTE(security): eval() of dump-controlled text; any error below
        # (including from malicious input) falls through to the error span
        return text_type(eval(expr))
    except Exception:
        return '<span class="error">%s</span>' % expr
| |
|
| |
|
def sharp_if(extr, testValue, valueIfTrue, valueIfFalse=None, *args):
    """{{#if:test|then|else}}: a non-blank test selects the then-branch."""
    # MediaWiki treats whitespace-only test values as false
    if testValue.strip():
        expanded = extr.expand(valueIfTrue.strip())
        if expanded:
            return expanded
    elif valueIfFalse:
        return extr.expand(valueIfFalse.strip())
    return ""
| |
|
| |
|
def sharp_ifeq(extr, lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args):
    """{{#ifeq:a|b|then|else}}: compare the two values after trimming."""
    rvalue = rvalue.strip()
    if not rvalue:
        # an empty comparison value always expands to nothing
        return ""
    if lvalue.strip() == rvalue:
        return extr.expand(valueIfTrue.strip()) if valueIfTrue else ""
    return extr.expand(valueIfFalse.strip()) if valueIfFalse else ""
| |
|
| |
|
def sharp_iferror(extr, test, then='', Else=None, *args):
    """{{#iferror:test|then|else}}: detect the class="error" markup that
    MediaWiki emits for failed parser functions/expressions."""
    errorPattern = '<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"'
    if re.match(errorPattern, test):
        return extr.expand(then.strip())
    if Else is None:
        # no else-branch: pass the (trimmed) test value through
        return test.strip()
    return extr.expand(Else.strip())
| |
|
| |
|
| | def sharp_switch(extr, primary, *params): |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | primary = primary.strip() |
| | found = False |
| | default = None |
| | rvalue = None |
| | lvalue = '' |
| | for param in params: |
| | |
| | |
| | pair = param.split('=', 1) |
| | lvalue = extr.expand(pair[0].strip()) |
| | rvalue = None |
| | if len(pair) > 1: |
| | |
| | rvalue = extr.expand(pair[1].strip()) |
| | |
| | if found or primary in [v.strip() for v in lvalue.split('|')]: |
| | |
| | return rvalue |
| | elif lvalue == '#default': |
| | default = rvalue |
| | rvalue = None |
| | elif lvalue == primary: |
| | |
| | found = True |
| | |
| | |
| | if rvalue is not None: |
| | return lvalue |
| | elif default is not None: |
| | return default |
| | return '' |
| |
|
| |
|
| | |
def sharp_invoke(module, function, args):
    """{{#invoke:module|function|...}}: dispatch to an emulated Scribunto
    module; unknown modules or functions expand to ''."""
    implementations = modules.get(module)
    if not implementations:
        return ''
    funct = implementations.get(function)
    if not funct:
        return ''
    return text_type(funct(args))
| |
|
| |
|
# Dispatch table for MediaWiki ParserFunctions: maps the function name
# (the text before the first ':') to an implementation that receives
# (extractor, *args). Unsupported functions expand to the empty string.
parserFunctions = {

    '#expr': sharp_expr,

    '#if': sharp_if,

    '#ifeq': sharp_ifeq,

    '#iferror': sharp_iferror,

    # not implemented: expands to ''
    '#ifexpr': lambda *args: '',

    # assume the target page does not exist: expand the if-not-exists branch
    '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex),

    '#rel2abs': lambda *args: '',

    '#switch': sharp_switch,

    '#language': lambda *args: '',

    '#time': lambda *args: '',

    '#timel': lambda *args: '',

    '#titleparts': lambda *args: '',

    # This function is used in some pages to construct links
    # http://meta.wikimedia.org/wiki/Help:URL
    'urlencode': lambda extr, string, *rest: quote(string.encode('utf-8')),

    'lc': lambda extr, string, *rest: string.lower() if string else '',

    'lcfirst': lambda extr, string, *rest: lcfirst(string),

    'uc': lambda extr, string, *rest: string.upper() if string else '',

    'ucfirst': lambda extr, string, *rest: ucfirst(string),

    'int': lambda extr, string, *rest: text_type(int(string)),

}
| |
|
| |
|
def callParserFunction(functionName, args, extractor):
    """
    Parser functions have similar syntax as templates, except that
    the first argument is everything after the first colon.
    :param functionName: nome of the parser function (e.g. '#if').
    :param args: the arguments, not yet expanded (see branching functions).
    :param extractor: the Extractor performing the expansion.
    :return: the result of the invocation, '' in case of failure.

    https://www.mediawiki.org/wiki/Help:Extension:ParserFunctions
    """
    try:
        functionName = functionName.lower()
        if functionName == '#invoke':
            module, fun = args[0].strip(), args[1].strip()
            logging.debug('%*s#invoke %s %s %s', extractor.frame.depth, '', module, fun, args[2:])
            if len(args) == 2:
                # find parameters in frame whose title is the one of the original
                # template invocation
                templateTitle = fullyQualifiedTemplateTitle(module)
                if not templateTitle:
                    # logging.warn() is deprecated
                    logging.warning("Template with empty title")
                params = None
                frame = extractor.frame
                while frame:
                    if frame.title == templateTitle:
                        params = frame.args
                        break
                    frame = frame.prev
            else:
                params = [extractor.transform(p) for p in args[2:]]
                params = extractor.templateParams(params)
            ret = sharp_invoke(module, fun, params)
            logging.debug('%*s<#invoke %s %s %s', extractor.frame.depth, '', module, fun, ret)
            return ret
        if functionName in parserFunctions:
            return parserFunctions[functionName](extractor, *args)
    except Exception:
        # deliberately best-effort: any error in a parser function expands
        # to '' (a bare 'except:' would also have swallowed KeyboardInterrupt)
        return ""
    return ""
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
# <noinclude>...</noinclude> spans: dropped when a template is transcluded.
reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
# <includeonly> markers: only the markers are stripped, their content kept.
reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
| |
|
def define_template(title, page):
    """
    Adds a template defined in the :param page:.
    @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude
    :param title: the (namespace-qualified) template title.
    :param page: list of text lines of the template page.
    """
    # skip empty templates
    if not page: return

    # a redirect page is recorded as an alias of its target
    # (raw string: the original pattern relied on invalid '\[' escapes)
    m = re.match(r'#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
    if m:
        options.redirects[title] = m.group(1)
        return

    text = unescape(''.join(page))

    # We're storing template text for future inclusion, therefore,
    # remove all <noinclude> text and keep all <includeonly> text
    # (but eliminate <includeonly> tags per se).

    # eliminate comments
    text = comment.sub('', text)

    # eliminate <noinclude> fragments, including unterminated/self-closing ones
    text = reNoinclude.sub('', text)
    text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL)
    text = re.sub(r'<noinclude/>', '', text)

    # if <onlyinclude> is present, only that content is transcluded
    onlyincludeAccumulator = ''
    for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL):
        onlyincludeAccumulator += m.group(1)
    if onlyincludeAccumulator:
        text = onlyincludeAccumulator
    else:
        text = reIncludeonly.sub('', text)

    if text:
        if title in options.templates:
            # logging.warn() is deprecated
            logging.warning('Redefining: %s', title)
        options.templates[title] = text
| |
|
| |
|
| | |
| |
|
def dropNested(text, openDelim, closeDelim):
    """
    A matching function for nested expressions, e.g. namespaces and tables.
    Removes every top-level region delimited by :param openDelim: /
    :param closeDelim: (regexes), including any nested occurrences inside it.
    """
    openRE = re.compile(openDelim, re.IGNORECASE)
    closeRE = re.compile(closeDelim, re.IGNORECASE)
    # partition text in separate blocks { } { }
    spans = []                  # pairs (s, e) for each partition
    nest = 0                    # nesting level
    start = openRE.search(text, 0)
    if not start:
        return text
    end = closeRE.search(text, start.end())
    next = start
    while end:
        next = openRE.search(text, next.end())
        if not next:            # no more openings: termination
            while nest:         # close all pending nested levels
                nest -= 1
                end0 = closeRE.search(text, end.end())
                if end0:
                    end = end0
                else:
                    break
            spans.append((start.start(), end.end()))
            break
        while end.end() < next.start():
            # a closing occurs before the next opening: { } {
            if nest:
                nest -= 1
                # try closing more nested levels
                last = end.end()
                end = closeRE.search(text, end.end())
                if not end:     # unbalanced: collapse everything seen so far
                    if spans:
                        span = (spans[0][0], last)
                    else:
                        span = (start.start(), last)
                    spans = [span]
                    break
            else:
                spans.append((start.start(), end.end()))
                # advance start, find next close
                start = next
                end = closeRE.search(text, next.end())
                break           # { }
        if next != start:
            # the new opening nests inside the current one: { { }
            nest += 1
    return dropSpans(spans, text)
| |
|
| |
|
def dropSpans(spans, text):
    """
    Drop from text the blocks identified in :param spans: (start, end)
    pairs; spans nested inside an earlier span are skipped.
    """
    spans.sort()
    kept = []
    cursor = 0
    for begin, finish in spans:
        if cursor <= begin:
            # keep the gap before this span; a span starting before the
            # cursor lies inside an already-dropped region
            kept.append(text[cursor:begin])
            cursor = finish
    kept.append(text[cursor:])
    return ''.join(kept)
| |
|
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| |
|
def replaceInternalLinks(text):
    """
    Replaces internal links of the form:
    [[title |...|label]]trail

    with title concatenated with trail, when present, e.g. 's' for plural.

    See https://www.mediawiki.org/wiki/Help:Links#Internal_links
    """
    # call this after removal of external links, so we need not worry about
    # triple closing ]]].
    cur = 0
    res = ''
    for s, e in findBalanced(text):
        # trail: word characters glued after the ]] (e.g. plural 's')
        m = tailRE.match(text, e)
        if m:
            trail = m.group(0)
            end = m.end()
        else:
            trail = ''
            end = e
        inner = text[s + 2:e - 2]
        # find first |
        pipe = inner.find('|')
        if pipe < 0:
            title = inner
            label = title
        else:
            title = inner[:pipe].rstrip()
            # find last | not nested inside balanced brackets
            curp = pipe + 1
            for s1, e1 in findBalanced(inner):
                last = inner.rfind('|', curp, s1)
                if last >= 0:
                    pipe = last  # advance past nested groups
                curp = e1
            label = inner[pipe + 1:].strip()
        res += text[cur:s] + makeInternalLink(title, label) + trail
        cur = end
    return res + text[cur:]
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
def makeInternalLink(title, label):
    """Render an internal wiki link: an <a> element when links are kept,
    otherwise just the label; '' for links into unaccepted namespaces."""
    firstColon = title.find(':')
    # drop links whose namespace prefix is not accepted
    if firstColon > 0 and title[:firstColon] not in options.acceptedNamespaces:
        return ''
    if firstColon == 0:
        # leading ':' — check the namespace that follows it
        secondColon = title.find(':', firstColon + 1)
        if secondColon > 1 and title[firstColon + 1:secondColon] not in options.acceptedNamespaces:
            return ''
    if options.keepLinks:
        return '<a href="%s">%s</a>' % (quote(title.encode('utf-8')), label)
    return label
| |
|
| |
|
| | |
| | |
| |
|
| | |
| |
|
# URL protocols recognized at the start of an external link (MediaWiki's
# $wgUrlProtocols); '//' covers protocol-relative links.
wgUrlProtocols = [
    'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://',
    'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:',
    'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://',
    'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//'
]

# From: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html
# Characters allowed in the URL part / anchor part of an external link.
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
ANCHOR_CLASS = r'[^][\x00-\x08\x0a-\x1F]'
# Bracketed external link: [URL anchor text].
# The original embedded '(?i)' mid-pattern, which Python 3.11+ rejects
# ("global flags not at the start of the expression"). Before 3.11 the flag
# applied to the whole pattern anyway, so compiling with re.I is equivalent:
# the character classes above contain no letters affected by IGNORECASE.
ExtLinkBracketedRegex = re.compile(
    r'\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)' +
    r'\s*((?:' + ANCHOR_CLASS + r'|\[\[' + ANCHOR_CLASS + r'+\]\])' + r'*?)\]',
    re.S | re.U | re.I)

# Direct image URL used as an anchor (gif/png/jpg/jpeg); the mid-pattern
# '(?i)' is likewise replaced by the re.I flag ([A-Za-z0-9...] already
# contains both cases, so the match set is unchanged).
EXT_IMAGE_REGEX = re.compile(
    r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""",
    re.X | re.S | re.U | re.I)
| |
|
| |
|
def replaceExternalLinks(text):
    """
    Replace bracketed external links ([URL anchor text]) with their anchor
    text (or with an <a> element when links are kept).
    https://www.mediawiki.org/wiki/Help:Links#External_links
    """
    pieces = []
    pos = 0
    for m in ExtLinkBracketedRegex.finditer(text):
        # keep the text between the previous link and this one
        pieces.append(text[pos:m.start()])
        pos = m.end()

        url = m.group(1)
        label = m.group(3)

        # an anchor that is itself an image URL becomes an inline image
        if EXT_IMAGE_REGEX.match(label):
            label = makeExternalImage(label)

        pieces.append(makeExternalLink(url, label))

    pieces.append(text[pos:])
    return ''.join(pieces)
| |
|
| |
|
def makeExternalLink(url, anchor):
    """Render an external link: an <a> element when links are kept,
    otherwise just the anchor text."""
    if not options.keepLinks:
        return anchor
    return '<a href="%s">%s</a>' % (quote(url.encode('utf-8')), anchor)
| |
|
| |
|
def makeExternalImage(url, alt=''):
    """Render an external image: an <img> element when links are kept,
    otherwise just the alt text."""
    return '<img src="%s" alt="%s">' % (url, alt) if options.keepLinks else alt
| |
|
| |
|
| | |
| |
|
| | |
# Matches the link trail, e.g. the 's' in [[copper]]s
# (raw string: '\w' in a plain string is an invalid escape).
tailRE = re.compile(r'\w+')

syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)

# Section headers: ==Title==, ===Title===, ... (group 1: level, group 2: title)
section = re.compile(r'(==+)\s*(.*?)\s*\1')

# HTML markup emitted for wiki lists when options.toHTML is set.
listOpen = {'*': '<ul>', '#': '<ol>', ';': '<dl>', ':': '<dl>'}
listClose = {'*': '</ul>', '#': '</ol>', ';': '</dl>', ':': '</dl>'}
# the '#' entry previously emitted the malformed closing tag '</<li>'
listItem = {'*': '<li>%s</li>', '#': '<li>%s</li>', ';': '<dt>%s</dt>',
            ':': '<dd>%s</dd>'}
| |
|
| |
|
def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables.
    :param text: convert to HTML.
    :return: a list of output lines.
    """

    page = []                   # list of paragraph
    headers = {}                # Headers for unfilled sections
    emptySection = False        # empty sections are discarded
    listLevel = []              # nesting of lists
    listCount = []              # count of each list level
    for line in text.split('\n'):
        if not line:            # collapse empty lines
            # if there is an opened list, close it if we see an empty line
            if len(listLevel):
                page.append(line)
                if options.toHTML:
                    for c in reversed(listLevel):
                        page.append(listClose[c])
                listLevel = []
                listCount = []
                emptySection = False
            elif page and page[-1]:
                page.append('')
            continue
        # handle section titles
        m = section.match(line)
        if m:
            title = m.group(2)
            lev = len(m.group(1))   # header level
            if options.toHTML:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
                title += '.'        # terminate sentence.
            headers[lev] = title
            # drop previous headers of deeper levels
            for i in list(headers.keys()):
                if i > lev:
                    del headers[i]
            emptySection = True
            listLevel = []
            listCount = []
            continue
        # handle page title
        elif line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
                    title += '.'
                page.append(title)
        # handle indents
        elif line[0] == ':':
            # page.append(line.lstrip(':*#;'))
            continue
        # handle lists
        elif line[0] in '*#;:':
            i = 0
            # c: current level char
            # n: next level char
            for c, n in zip_longest(listLevel, line, fillvalue=''):
                if not n or n not in '*#;:':  # shorter or different
                    if c:
                        if options.toHTML:
                            page.append(listClose[c])
                        listLevel = listLevel[:-1]
                        listCount = listCount[:-1]
                        continue
                    else:
                        break
                # n matches the text, c to the level
                if c != n and (not c or (c not in ';:' and n not in ';:')):
                    if c:
                        # close level
                        if options.toHTML:
                            page.append(listClose[c])
                        listLevel = listLevel[:-1]
                        listCount = listCount[:-1]
                    listLevel += n
                    listCount.append(0)
                    if options.toHTML:
                        page.append(listOpen[n])
                i += 1
            n = line[i - 1]  # last list char
            line = line[i:].strip()
            if line:  # FIXME: n is '"'
                if options.keepLists:
                    if options.keepSections:
                        # emit open sections
                        items = sorted(headers.items())
                        for _, v in items:
                            page.append("Section::::" + v)
                        headers.clear()
                    # use item count for #-lines
                    listCount[i - 1] += 1
                    bullet = 'BULLET::::%d. ' % listCount[i - 1] if n == '#' else 'BULLET::::- '
                    page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
                elif options.toHTML:
                    if n not in listItem:
                        n = '*'
                    page.append(listItem[n] % line)
        elif len(listLevel):
            if options.toHTML:
                for c in reversed(listLevel):
                    page.append(listClose[c])
            listLevel = []
            listCount = []
            page.append(line)

        # drop residuals of tables
        elif line[0] in '{|' or line[-1] == '}':
            continue
        # drop irrelevant lines
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif len(headers):
            if options.keepSections:
                items = sorted(headers.items())
                for i, v in items:
                    page.append("Section::::" + v)
            headers.clear()
            page.append(line)  # first line
            emptySection = False
        elif not emptySection:
            # Drop preformatted (lines starting with a space)
            if line[0] != ' ':  # dangerous
                page.append(line)
    return page
| |
|
| |
|
def handle_unicode(entity):
    """Decode a numeric character reference like '&#931;' to its character;
    code points beyond the BMP (>= 0x10000) are dropped, yielding ''."""
    codepoint = int(entity[2:-1])
    return '' if codepoint >= 0x10000 else chr(codepoint)
| |
|
| |
|
| | |
| | |
| |
|
| |
|
class NextFile(object):
    """
    Synchronous generation of next available file name.
    Output files are grouped filesPerDir per directory; directories are
    named with two letters: AA, AB, ... AZ, BA, ...
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def __next__(self):
        # advance the file counter, rolling over into a new directory
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    next = __next__  # Python 2 iterator protocol

    def _dirname(self):
        # two base-26 digits of dir_index, least significant last
        low = self.dir_index % 26
        high = self.dir_index // 26 % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + high, ord('A') + low))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
| |
|
| |
|
class OutputSplitter(object):
    """
    File-like object that splits output across multiple files, each at
    most max_file_size bytes.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(next(self.nextFile))

    def reserve(self, size):
        # roll over to a fresh file if this write would exceed the limit
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(next(self.nextFile))

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        # bz2-compressed or raw binary output, per self.compress
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        return open(filename, 'wb')
| |
|
| |
|
| | |
| | |
| |
|
# Matches an XML-ish tag and captures: (1) text before it, (2) the tag name
# with an optional leading '/', (3) the text following it, (4) a possible
# closing tag on the same line.
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*?>(?:([^<]*)(<.*?>)?)?')
# The key attribute of a <namespace> element, e.g. key="10".
keyRE = re.compile(r'key="(\d*)"')
# A category link, capturing the category name (raw string: the original
# non-raw '\[\[...' relied on invalid string escapes).
catRE = re.compile(r'\[\[Category:([^\|]+).*\]\].*')
| |
|
def load_templates(file, output_file=None):
    """
    Load templates from :param file:.
    :param output_file: file where to save templates and modules.
    Side effects: fills options.templates / options.redirects (via
    define_template) and may adjust the template/module namespace options.
    """
    options.templatePrefix = options.templateNamespace + ':'
    options.modulePrefix = options.moduleNamespace + ':'

    if output_file:
        output = codecs.open(output_file, 'wb', 'utf-8')
    for page_count, page_data in enumerate(pages_from(file)):
        id, revid, title, ns,catSet, page = page_data
        if not output_file and (not options.templateNamespace or
                                not options.moduleNamespace):
            # try to guess the namespace names from the first pages seen
            if ns in templateKeys:
                colon = title.find(':')
                if colon > 1:
                    if ns == '10':
                        options.templateNamespace = title[:colon]
                        options.templatePrefix = title[:colon + 1]
                    elif ns == '828':
                        options.moduleNamespace = title[:colon]
                        options.modulePrefix = title[:colon + 1]
        if ns in templateKeys:
            text = ''.join(page)
            define_template(title, text)
            # save templates and modules to the output file, as <page> elements
            if output_file:
                output.write('<page>\n')
                output.write(' <title>%s</title>\n' % title)
                output.write(' <ns>%s</ns>\n' % ns)
                output.write(' <id>%s</id>\n' % id)
                output.write(' <text>')
                for line in page:
                    output.write(line)
                output.write(' </text>\n')
                output.write('</page>\n')
        if page_count and page_count % 100000 == 0:
            logging.info("Preprocessed %d pages", page_count)
    if output_file:
        output.close()
        logging.info("Saved %d templates to '%s'", len(options.templates), output_file)
| |
|
| |
|
def pages_from(input):
    """
    Scans input extracting pages.
    :return: generator of (id, revid, title, namespace key, catSet, page),
        where page is a list of text lines and catSet the set of category
        names found in the page.
    """
    # streaming-parser state
    page = []
    id = None
    ns = '0'
    last_id = None
    revid = None
    inText = False
    redirect = False
    title = None
    # robustness: the original left catSet undefined until the first <page>
    # tag, raising NameError on malformed input
    catSet = set()
    for line in input:
        if not isinstance(line, text_type): line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
            if inText:
                page.append(line)
                # extract categories
                if line.lstrip().startswith('[[Category:'):
                    mCat = catRE.search(line)
                    if mCat:
                        catSet.add(mCat.group(1))
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            catSet = set()
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and not revid:
            # the second <id> in a page is the revision id
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'ns':
            ns = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            if m.lastindex == 3 and line[m.start(3)-2] == '/':
                # self-closing <text/>: an empty page
                continue
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4:  # open-close pair on the same line
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            # skip redirects and duplicate ids
            if id != last_id and not redirect:
                yield (id, revid, title, ns, catSet, page)
                last_id = id
                ns = '0'
            id = None
            revid = None
            title = None
            page = []
| |
|
| |
|
| | def process_dump(input_file, template_file, out_file, file_size, file_compress, |
| | process_count): |
| | """ |
| | :param input_file: name of the wikipedia dump file; '-' to read from stdin |
| | :param template_file: optional file with template definitions. |
| | :param out_file: directory where to store extracted data, or '-' for stdout |
| | :param file_size: max size of each extracted file, or None for no max (one file) |
| | :param file_compress: whether to compress files with bzip. |
| | :param process_count: number of extraction processes to spawn. |
| | """ |
| |
|
| | if input_file == '-': |
| | input = sys.stdin |
| | else: |
| | input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) |
| |
|
| | |
| | for line in input: |
| | |
| | if not isinstance(line, text_type): line = line.decode('utf-8') |
| | m = tagRE.search(line) |
| | if not m: |
| | continue |
| | tag = m.group(2) |
| | if tag == 'base': |
| | |
| | |
| | base = m.group(3) |
| | options.urlbase = base[:base.rfind("/")] |
| | elif tag == 'namespace': |
| | mk = keyRE.search(line) |
| | if mk: |
| | nsid = ''.join(mk.groups()) |
| | else: |
| | nsid = '' |
| | options.knownNamespaces[m.group(3)] = nsid |
| | if re.search('key="10"', line): |
| | options.templateNamespace = m.group(3) |
| | options.templatePrefix = options.templateNamespace + ':' |
| | elif re.search('key="828"', line): |
| | options.moduleNamespace = m.group(3) |
| | options.modulePrefix = options.moduleNamespace + ':' |
| | elif tag == '/siteinfo': |
| | break |
| |
|
| | if options.expand_templates: |
| | |
| | template_load_start = default_timer() |
| | if template_file: |
| | if os.path.exists(template_file): |
| | logging.info("Loading template definitions from: %s", template_file) |
| | |
| | file = fileinput.FileInput(template_file, |
| | openhook=fileinput.hook_compressed) |
| | load_templates(file) |
| | file.close() |
| | else: |
| | if input_file == '-': |
| | |
| | raise ValueError("to use templates with stdin dump, must supply explicit template-file") |
| | logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) |
| | load_templates(input, template_file) |
| | input.close() |
| | input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) |
| | template_load_elapsed = default_timer() - template_load_start |
| | logging.info("Loaded %d templates in %.1fs", len(options.templates), template_load_elapsed) |
| |
|
| | |
| | logging.info("Starting page extraction from %s.", input_file) |
| | extract_start = default_timer() |
| |
|
| | |
| | |
| | |
| |
|
| | process_count = max(1, process_count) |
| | maxsize = 10 * process_count |
| | |
| | output_queue = Queue(maxsize=maxsize) |
| |
|
| | if out_file == '-': |
| | out_file = None |
| |
|
| | worker_count = process_count |
| |
|
| | |
| | max_spool_length = 10000 |
| | spool_length = Value('i', 0, lock=False) |
| |
|
| | |
| | reduce = Process(target=reduce_process, |
| | args=(options, output_queue, spool_length, |
| | out_file, file_size, file_compress)) |
| | reduce.start() |
| |
|
| | |
| | jobs_queue = Queue(maxsize=maxsize) |
| |
|
| | |
| | logging.info("Using %d extract processes.", worker_count) |
| | workers = [] |
| | for i in range(worker_count): |
| | extractor = Process(target=extract_process, |
| | args=(options, i, jobs_queue, output_queue)) |
| | extractor.daemon = True |
| | extractor.start() |
| | workers.append(extractor) |
| |
|
| | |
| | page_num = 0 |
| | for page_data in pages_from(input): |
| | id, revid, title, ns, catSet, page = page_data |
| | if keepPage(ns, catSet, page): |
| | |
| | delay = 0 |
| | if spool_length.value > max_spool_length: |
| | |
| | while spool_length.value > max_spool_length/10: |
| | time.sleep(10) |
| | delay += 10 |
| | if delay: |
| | logging.info('Delay %ds', delay) |
| | job = (id, revid, title, page, page_num) |
| | jobs_queue.put(job) |
| | page_num += 1 |
| | page = None |
| |
|
| | input.close() |
| |
|
| | |
| | for _ in workers: |
| | jobs_queue.put(None) |
| | |
| | for w in workers: |
| | w.join() |
| |
|
| | |
| | output_queue.put(None) |
| | |
| | reduce.join() |
| |
|
| | extract_duration = default_timer() - extract_start |
| | extract_rate = page_num / extract_duration |
| | logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", |
| | process_count, page_num, extract_duration, extract_rate) |
| | logging.info("total of page: %d, total of articl page: %d; total of used articl page: %d" % (g_page_total, g_page_articl_total,g_page_articl_used_total)) |
| |
|
| |
|
| | |
| | |
| |
|
| |
|
def extract_process(opts, i, jobs_queue, output_queue):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
    :param opts: shared global parameters (copied into this worker process).
    :param i: process id.
    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    """

    global options
    options = opts

    createLogger(options.quiet, options.debug, options.log_file)

    # One reusable in-memory buffer per worker; truncated after each page.
    out = StringIO()

    while True:
        job = jobs_queue.get()  # job = (id, revid, title, page, page_num)
        if job:
            id, revid, title, page, page_num = job
            try:
                e = Extractor(*job[:4])  # (id, revid, title, page)
                page = None              # free memory
                e.extract(out)
                text = out.getvalue()
            # NOTE: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made workers unkillable.
            except Exception:
                # Emit empty text so the reducer's sequence numbering
                # stays contiguous even when a page fails.
                text = ''
                logging.exception('Processing page: %s %s', id, title)

            output_queue.put((page_num, text))
            out.truncate(0)
            out.seek(0)
        else:
            # A None job is the sentinel telling this worker to quit.
            logging.debug('Quit extractor')
            break
    out.close()
| |
|
| |
|
# Emit a progress log line every this many reduced articles.
report_period = 10000
def reduce_process(opts, output_queue, spool_length,
                   out_file=None, file_size=0, file_compress=True):
    """Pull finished article text, write series of files (or stdout)
    :param opts: global parameters.
    :param output_queue: text to be output.
    :param spool_length: shared counter of spooled (reordering) pages.
    :param out_file: filename where to print, or None for stdout.
    :param file_size: max file size.
    :param file_compress: whether to compress output.
    """

    global options
    options = opts

    createLogger(options.quiet, options.debug, options.log_file)

    if out_file:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)
    else:
        # On Python 3 write bytes through the underlying binary buffer.
        output = sys.stdout if PY2 else sys.stdout.buffer
        if file_compress:
            # logging.warn is deprecated; warning() is the supported name.
            logging.warning("writing to stdout, so no output compression (use an external tool)")

    interval_start = default_timer()
    # Workers finish out of order; spool maps page_num -> text until the
    # next sequential page is available.
    spool = {}
    next_page = 0  # sequence number of the next page to write
    while True:
        if next_page in spool:
            output.write(spool.pop(next_page).encode('utf-8'))
            next_page += 1
            # tell the mapper our load:
            spool_length.value = len(spool)
            # progress report
            if next_page % report_period == 0:
                interval_rate = report_period / (default_timer() - interval_start)
                logging.info("Extracted %d articles (%.1f art/s)",
                             next_page, interval_rate)
                interval_start = default_timer()
        else:
            # mapper puts None to signal finish
            pair = output_queue.get()
            if not pair:
                break
            page_num, text = pair
            spool[page_num] = text
            # tell the mapper our load:
            spool_length.value = len(spool)
            # FIXME: if an extractor dies, the missing page_num never
            # arrives and the spool grows without bound.
            if len(spool) > 200:
                logging.debug('Collected %d, waiting: %d, %d', len(spool),
                              next_page, next_page == page_num)
    # Close only files we opened ourselves.  The old test
    # `output != sys.stdout` was wrong on Python 3, where the stdout branch
    # binds sys.stdout.buffer: it closed the interpreter's own stdout
    # buffer, breaking any subsequent writes to standard output.
    if out_file:
        output.close()
| |
|
| |
|
| | |
| |
|
| | |
# Smallest accepted value (bytes) for the --bytes output-file size option.
minFileSize = 200 * 1024
| |
|
def main():
    """Command-line entry point.

    Parses arguments, transfers them onto the global ``options`` namespace,
    configures logging, then either extracts a single article (``--article``)
    or runs the full multiprocess dump extraction via ``process_dump``.
    """

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument("--json", action="store_true",
                        help="write output in json format instead of the default one")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-s", "--sections", action="store_true",
                        help="preserve sections")
    groupP.add_argument("--lists", action="store_true",
                        help="preserve lists")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces in links")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no_templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
                        help="Include the document revision id (default=%(default)s)")
    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
                        help="Minimum expanded text length required to write document (default=%(default)s)")
    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
                        help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
    groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
                        help="comma separated list of tags that will be dropped, keeping their content")
    groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
                        help="comma separated list of elements that will be removed from the article text")
    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
                        help="Preserve tables in the output article text (default=%(default)s)")
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("--log_file",
                        help="path to save the log info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")
    groupP.add_argument("--filter_category",
                        help="specify the file that listing the Categories you want to include or exclude. One line for"
                             " one category. starting with: 1) '#' comment, ignored; 2) '^' exclude; Note: excluding has higher priority than including")
    args = parser.parse_args()

    # Transfer CLI flags onto the global options namespace.
    options.keepLinks = args.links
    options.keepSections = args.sections
    options.keepLists = args.lists
    options.toHTML = args.html
    options.write_json = args.json
    options.print_revision = args.revision
    options.min_text_length = args.min_text_length
    if args.html:
        options.keepLinks = True

    options.expand_templates = args.no_templates
    options.filter_disambig_pages = args.filter_disambig_pages
    options.keep_tables = args.keep_tables

    # Parse --bytes: an optional K/M/G suffix scales by powers of 1024; a
    # plain number is taken as bytes.  (The previous code applied
    # int(args.bytes[:-1]) unconditionally, so a suffix-less value such as
    # "500000" silently lost its final digit.)
    try:
        suffix = args.bytes[-1].lower()
        if suffix in 'kmg':
            file_size = int(args.bytes[:-1]) * 1024 ** ('kmg'.index(suffix) + 1)
        else:
            file_size = int(args.bytes)
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        options.acceptedNamespaces = set(args.namespaces.split(','))

    # Tags whose markup is dropped but whose content is kept.
    if args.ignored_tags:
        ignoredTags = set(args.ignored_tags.split(','))
    else:
        ignoredTags = [
            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
            'p', 'plaintext', 's', 'span', 'strike', 'strong',
            'tt', 'u', 'var'
        ]

    for tag in ignoredTags:
        ignoreTag(tag)

    if args.discard_elements:
        options.discardElements = set(args.discard_elements.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    options.quiet = args.quiet
    options.debug = args.debug
    options.log_file = args.log_file
    createLogger(options.quiet, options.debug, options.log_file)

    input_file = args.input

    if not options.keepLinks:
        ignoreTag('a')

    # Single-article debug mode: extract one page to stdout and exit.
    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        for page_data in pages_from(file):
            id, revid, title, ns, catSet, page = page_data
            Extractor(id, revid, title, page).extract(sys.stdout)
        file.close()
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        # Was a bare `except:`, which also hid KeyboardInterrupt/SystemExit;
        # makedirs failures raise OSError.
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    # Load the category include/exclude lists, one category per line.
    filter_category = args.filter_category
    if filter_category:
        with open(filter_category) as f:
            error_cnt = 0
            for line in f:
                try:
                    line = str(line.strip())
                    if line.startswith('#') or len(line) == 0:
                        continue
                    elif line.startswith('^'):
                        options.filter_category_exclude.add(line.lstrip('^'))
                    else:
                        options.filter_category_include.add(line)
                except Exception as e:
                    error_cnt += 1
                    print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt, e))
                    print(line)
        logging.info("Excluding categories:")
        logging.info(str(options.filter_category_exclude))
        logging.info("Including categories:")
        logging.info(str(len(options.filter_category_include)))

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)
| |
|
def createLogger(quiet, debug, log_file):
    """Configure the root logger.

    Raises the level to INFO unless *quiet*; raises it further to DEBUG when
    *debug* (debug wins over quiet); and, when *log_file* is given, mirrors
    all records to that file as well.
    """
    root = logging.getLogger()
    if not quiet:
        root.setLevel(logging.INFO)
    if debug:
        root.setLevel(logging.DEBUG)
    if log_file:
        root.addHandler(logging.FileHandler(log_file))
| |
|
# Script entry point: run the extractor only when executed directly.
if __name__ == '__main__':
    main()
| |
|