diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py new file mode 100644 index 0000000000000000000000000000000000000000..777793ef172702430c729b46702d7c8a25d0d155 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py @@ -0,0 +1,1500 @@ +# Natural Language Toolkit: Regexp Chunk Parser Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +A graphical tool for exploring the regular expression based chunk +parser ``nltk.chunk.RegexpChunkParser``. +""" + +# Todo: Add a way to select the development set from the menubar. This +# might just need to be a selection box (conll vs treebank etc) plus +# configuration parameters to select what's being chunked (eg VP vs NP) +# and what part of the data is being used as the development set. + +import random +import re +import textwrap +import time +from tkinter import ( + Button, + Canvas, + Checkbutton, + Frame, + IntVar, + Label, + Menu, + Scrollbar, + Text, + Tk, +) +from tkinter.filedialog import askopenfilename, asksaveasfilename +from tkinter.font import Font + +from nltk.chunk import ChunkScore, RegexpChunkParser +from nltk.chunk.regexp import RegexpChunkRule +from nltk.corpus import conll2000, treebank_chunk +from nltk.draw.util import ShowText +from nltk.tree import Tree +from nltk.util import in_idle + + +class RegexpChunkApp: + """ + A graphical tool for exploring the regular expression based chunk + parser ``nltk.chunk.RegexpChunkParser``. + + See ``HELP`` for instructional text. + """ + + ##///////////////////////////////////////////////////////////////// + ## Help Text + ##///////////////////////////////////////////////////////////////// + + #: A dictionary mapping from part of speech tags to descriptions, + #: which is used in the help text. (This should probably live with + #: the conll and/or treebank corpus instead.) 
+ TAGSET = { + "CC": "Coordinating conjunction", + "PRP$": "Possessive pronoun", + "CD": "Cardinal number", + "RB": "Adverb", + "DT": "Determiner", + "RBR": "Adverb, comparative", + "EX": "Existential there", + "RBS": "Adverb, superlative", + "FW": "Foreign word", + "RP": "Particle", + "JJ": "Adjective", + "TO": "to", + "JJR": "Adjective, comparative", + "UH": "Interjection", + "JJS": "Adjective, superlative", + "VB": "Verb, base form", + "LS": "List item marker", + "VBD": "Verb, past tense", + "MD": "Modal", + "NNS": "Noun, plural", + "NN": "Noun, singular or masps", + "VBN": "Verb, past participle", + "VBZ": "Verb,3rd ps. sing. present", + "NNP": "Proper noun, singular", + "NNPS": "Proper noun plural", + "WDT": "wh-determiner", + "PDT": "Predeterminer", + "WP": "wh-pronoun", + "POS": "Possessive ending", + "WP$": "Possessive wh-pronoun", + "PRP": "Personal pronoun", + "WRB": "wh-adverb", + "(": "open parenthesis", + ")": "close parenthesis", + "``": "open quote", + ",": "comma", + "''": "close quote", + ".": "period", + "#": "pound sign (currency marker)", + "$": "dollar sign (currency marker)", + "IN": "Preposition/subord. conjunction", + "SYM": "Symbol (mathematical or scientific)", + "VBG": "Verb, gerund/present participle", + "VBP": "Verb, non-3rd ps. sing. present", + ":": "colon", + } + + #: Contents for the help box. This is a list of tuples, one for + #: each help page, where each tuple has four elements: + #: - A title (displayed as a tab) + #: - A string description of tabstops (see Tkinter.Text for details) + #: - The text contents for the help page. You can use expressions + #: like ... to colorize the text; see ``HELP_AUTOTAG`` + #: for a list of tags you can use for colorizing. + HELP = [ + ( + "Help", + "20", + "Welcome to the regular expression chunk-parser grammar editor. " + "You can use this editor to develop and test chunk parser grammars " + "based on NLTK's RegexpChunkParser class.\n\n" + # Help box. 
+ "Use this box ('Help') to learn more about the editor; click on the " + "tabs for help on specific topics:" + "\n" + "Rules: grammar rule types\n" + "Regexps: regular expression syntax\n" + "Tags: part of speech tags\n\n" + # Grammar. + "Use the upper-left box ('Grammar') to edit your grammar. " + "Each line of your grammar specifies a single 'rule', " + "which performs an action such as creating a chunk or merging " + "two chunks.\n\n" + # Dev set. + "The lower-left box ('Development Set') runs your grammar on the " + "development set, and displays the results. " + "Your grammar's chunks are highlighted, and " + "the correct (gold standard) chunks are " + "underlined. If they " + "match, they are displayed in green; otherwise, " + "they are displayed in red. The box displays a single " + "sentence from the development set at a time; use the scrollbar or " + "the next/previous buttons view additional sentences.\n\n" + # Performance + "The lower-right box ('Evaluation') tracks the performance of " + "your grammar on the development set. The 'precision' axis " + "indicates how many of your grammar's chunks are correct; and " + "the 'recall' axis indicates how many of the gold standard " + "chunks your system generated. Typically, you should try to " + "design a grammar that scores high on both metrics. The " + "exact precision and recall of the current grammar, as well " + "as their harmonic mean (the 'f-score'), are displayed in " + "the status bar at the bottom of the window.", + ), + ( + "Rules", + "10", + "

{...regexp...}

" + "\nChunk rule: creates new chunks from words matching " + "regexp.\n\n" + "

}...regexp...{

" + "\nStrip rule: removes words matching regexp from existing " + "chunks.\n\n" + "

...regexp1...}{...regexp2...

" + "\nSplit rule: splits chunks that match regexp1 followed by " + "regexp2 in two.\n\n" + "

...regexp...{}...regexp...

" + "\nMerge rule: joins consecutive chunks that match regexp1 " + "and regexp2\n", + ), + ( + "Regexps", + "10 60", + # "Regular Expression Syntax Summary:\n\n" + "

Pattern\t\tMatches...

\n" + "" + "\t<T>\ta word with tag T " + "(where T may be a regexp).\n" + "\tx?\tan optional x\n" + "\tx+\ta sequence of 1 or more x's\n" + "\tx*\ta sequence of 0 or more x's\n" + "\tx|y\tx or y\n" + "\t.\tmatches any character\n" + "\t(x)\tTreats x as a group\n" + "\t# x...\tTreats x... " + "(to the end of the line) as a comment\n" + "\t\\C\tmatches character C " + "(useful when C is a special character " + "like + or #)\n" + "" + "\n

Examples:

\n" + "" + "\t\n" + '\t\tMatches "cow/NN"\n' + '\t\tMatches "green/NN"\n' + "\t\n" + '\t\tMatches "eating/VBG"\n' + '\t\tMatches "ate/VBD"\n' + "\t
\n" + '\t\tMatches "on/IN the/DT car/NN"\n' + "\t?\n" + '\t\tMatches "ran/VBD"\n' + '\t\tMatches "slowly/RB ate/VBD"\n' + r"\t<\#> # This is a comment...\n" + '\t\tMatches "#/# 100/CD"\n' + "", + ), + ( + "Tags", + "10 60", + "

Part of Speech Tags:

\n" + + "" + + "<>" + + "\n", # this gets auto-substituted w/ self.TAGSET + ), + ] + + HELP_AUTOTAG = [ + ("red", dict(foreground="#a00")), + ("green", dict(foreground="#080")), + ("highlight", dict(background="#ddd")), + ("underline", dict(underline=True)), + ("h1", dict(underline=True)), + ("indent", dict(lmargin1=20, lmargin2=20)), + ("hangindent", dict(lmargin1=0, lmargin2=60)), + ("var", dict(foreground="#88f")), + ("regexp", dict(foreground="#ba7")), + ("match", dict(foreground="#6a6")), + ] + + ##///////////////////////////////////////////////////////////////// + ## Config Parameters + ##///////////////////////////////////////////////////////////////// + + _EVAL_DELAY = 1 + """If the user has not pressed any key for this amount of time (in + seconds), and the current grammar has not been evaluated, then + the eval demon will evaluate it.""" + + _EVAL_CHUNK = 15 + """The number of sentences that should be evaluated by the eval + demon each time it runs.""" + _EVAL_FREQ = 0.2 + """The frequency (in seconds) at which the eval demon is run""" + _EVAL_DEMON_MIN = 0.02 + """The minimum amount of time that the eval demon should take each time + it runs -- if it takes less than this time, _EVAL_CHUNK will be + modified upwards.""" + _EVAL_DEMON_MAX = 0.04 + """The maximum amount of time that the eval demon should take each time + it runs -- if it takes more than this time, _EVAL_CHUNK will be + modified downwards.""" + + _GRAMMARBOX_PARAMS = dict( + width=40, + height=12, + background="#efe", + highlightbackground="#efe", + highlightthickness=1, + relief="groove", + border=2, + wrap="word", + ) + _HELPBOX_PARAMS = dict( + width=15, + height=15, + background="#efe", + highlightbackground="#efe", + foreground="#555", + highlightthickness=1, + relief="groove", + border=2, + wrap="word", + ) + _DEVSETBOX_PARAMS = dict( + width=70, + height=10, + background="#eef", + highlightbackground="#eef", + highlightthickness=1, + relief="groove", + border=2, + wrap="word", + 
tabs=(30,), + ) + _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2) + _FONT_PARAMS = dict(family="helvetica", size=-20) + _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3) + _EVALBOX_PARAMS = dict( + background="#eef", + highlightbackground="#eef", + highlightthickness=1, + relief="groove", + border=2, + width=300, + height=280, + ) + _BUTTON_PARAMS = dict( + background="#777", activebackground="#777", highlightbackground="#777" + ) + _HELPTAB_BG_COLOR = "#aba" + _HELPTAB_FG_COLOR = "#efe" + + _HELPTAB_FG_PARAMS = dict(background="#efe") + _HELPTAB_BG_PARAMS = dict(background="#aba") + _HELPTAB_SPACER = 6 + + def normalize_grammar(self, grammar): + # Strip comments + grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar) + # Normalize whitespace + grammar = re.sub(" +", " ", grammar) + grammar = re.sub(r"\n\s+", r"\n", grammar) + grammar = grammar.strip() + # [xx] Hack: automatically backslash $! + grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar) + return grammar + + def __init__( + self, + devset_name="conll2000", + devset=None, + grammar="", + chunk_label="NP", + tagset=None, + ): + """ + :param devset_name: The name of the development set; used for + display & for save files. If either the name 'treebank' + or the name 'conll2000' is used, and devset is None, then + devset will be set automatically. + :param devset: A list of chunked sentences + :param grammar: The initial grammar to display. + :param tagset: Dictionary from tags to string descriptions, used + for the help page. Defaults to ``self.TAGSET``. 
+ """ + self._chunk_label = chunk_label + + if tagset is None: + tagset = self.TAGSET + self.tagset = tagset + + # Named development sets: + if devset is None: + if devset_name == "conll2000": + devset = conll2000.chunked_sents("train.txt") # [:100] + elif devset == "treebank": + devset = treebank_chunk.chunked_sents() # [:100] + else: + raise ValueError("Unknown development set %s" % devset_name) + + self.chunker = None + """The chunker built from the grammar string""" + + self.grammar = grammar + """The unparsed grammar string""" + + self.normalized_grammar = None + """A normalized version of ``self.grammar``.""" + + self.grammar_changed = 0 + """The last time() that the grammar was changed.""" + + self.devset = devset + """The development set -- a list of chunked sentences.""" + + self.devset_name = devset_name + """The name of the development set (for save files).""" + + self.devset_index = -1 + """The index into the development set of the first instance + that's currently being viewed.""" + + self._last_keypress = 0 + """The time() when a key was most recently pressed""" + + self._history = [] + """A list of (grammar, precision, recall, fscore) tuples for + grammars that the user has already tried.""" + + self._history_index = 0 + """When the user is scrolling through previous grammars, this + is used to keep track of which grammar they're looking at.""" + + self._eval_grammar = None + """The grammar that is being currently evaluated by the eval + demon.""" + + self._eval_normalized_grammar = None + """A normalized copy of ``_eval_grammar``.""" + + self._eval_index = 0 + """The index of the next sentence in the development set that + should be looked at by the eval demon.""" + + self._eval_score = ChunkScore(chunk_label=chunk_label) + """The ``ChunkScore`` object that's used to keep track of the score + of the current grammar on the development set.""" + + # Set up the main window. 
+ top = self.top = Tk() + top.geometry("+50+50") + top.title("Regexp Chunk Parser App") + top.bind("", self.destroy) + + # Variable that restricts how much of the devset we look at. + self._devset_size = IntVar(top) + self._devset_size.set(100) + + # Set up all the tkinter widgets + self._init_fonts(top) + self._init_widgets(top) + self._init_bindings(top) + self._init_menubar(top) + self.grammarbox.focus() + + # If a grammar was given, then display it. + if grammar: + self.grammarbox.insert("end", grammar + "\n") + self.grammarbox.mark_set("insert", "1.0") + + # Display the first item in the development set + self.show_devset(0) + self.update() + + def _init_bindings(self, top): + top.bind("", self._devset_next) + top.bind("", self._devset_prev) + top.bind("", self.toggle_show_trace) + top.bind("", self.update) + top.bind("", lambda e: self.save_grammar()) + top.bind("", lambda e: self.load_grammar()) + self.grammarbox.bind("", self.toggle_show_trace) + self.grammarbox.bind("", self._devset_next) + self.grammarbox.bind("", self._devset_prev) + + # Redraw the eval graph when the window size changes + self.evalbox.bind("", self._eval_plot) + + def _init_fonts(self, top): + # TWhat's our font size (default=same as sysfont) + self._size = IntVar(top) + self._size.set(20) + self._font = Font(family="helvetica", size=-self._size.get()) + self._smallfont = Font( + family="helvetica", size=-(int(self._size.get() * 14 // 20)) + ) + + def _init_menubar(self, parent): + menubar = Menu(parent) + + filemenu = Menu(menubar, tearoff=0) + filemenu.add_command(label="Reset Application", underline=0, command=self.reset) + filemenu.add_command( + label="Save Current Grammar", + underline=0, + accelerator="Ctrl-s", + command=self.save_grammar, + ) + filemenu.add_command( + label="Load Grammar", + underline=0, + accelerator="Ctrl-o", + command=self.load_grammar, + ) + + filemenu.add_command( + label="Save Grammar History", underline=13, command=self.save_history + ) + + 
filemenu.add_command( + label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" + ) + menubar.add_cascade(label="File", underline=0, menu=filemenu) + + viewmenu = Menu(menubar, tearoff=0) + viewmenu.add_radiobutton( + label="Tiny", + variable=self._size, + underline=0, + value=10, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Small", + variable=self._size, + underline=0, + value=16, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Medium", + variable=self._size, + underline=0, + value=20, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Large", + variable=self._size, + underline=0, + value=24, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Huge", + variable=self._size, + underline=0, + value=34, + command=self.resize, + ) + menubar.add_cascade(label="View", underline=0, menu=viewmenu) + + devsetmenu = Menu(menubar, tearoff=0) + devsetmenu.add_radiobutton( + label="50 sentences", + variable=self._devset_size, + value=50, + command=self.set_devset_size, + ) + devsetmenu.add_radiobutton( + label="100 sentences", + variable=self._devset_size, + value=100, + command=self.set_devset_size, + ) + devsetmenu.add_radiobutton( + label="200 sentences", + variable=self._devset_size, + value=200, + command=self.set_devset_size, + ) + devsetmenu.add_radiobutton( + label="500 sentences", + variable=self._devset_size, + value=500, + command=self.set_devset_size, + ) + menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu) + + helpmenu = Menu(menubar, tearoff=0) + helpmenu.add_command(label="About", underline=0, command=self.about) + menubar.add_cascade(label="Help", underline=0, menu=helpmenu) + + parent.config(menu=menubar) + + def toggle_show_trace(self, *e): + if self._showing_trace: + self.show_devset() + else: + self.show_trace() + return "break" + + _SCALE_N = 5 # center on the last 5 examples. 
    def _eval_plot(self, *e, **config):
        """
        Redraw the precision/recall plot on ``self.evalbox`` from the
        entries in ``self._history``, and show the scores of the entry
        at ``self._history_index`` in the status bar.

        :param e: Ignored; accepted so the method can be used directly
            as an event/command callback.
        :param config: Optional ``width`` and ``height`` overrides; when
            absent the canvas's current size is used.
        """
        width = config.get("width", self.evalbox.winfo_width())
        height = config.get("height", self.evalbox.winfo_height())

        # Clear the canvas
        self.evalbox.delete("all")

        # Draw the precision & recall labels.  Their bounding boxes fix
        # the plot area: `left` starts just past the "Precision" label and
        # `bot` ends just above the "Recall" label.
        tag = self.evalbox.create_text(
            10, height // 2 - 10, justify="left", anchor="w", text="Precision"
        )
        left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
        tag = self.evalbox.create_text(
            left + (width - left) // 2,
            height - 10,
            anchor="s",
            text="Recall",
            justify="center",
        )
        top, bot = 10, self.evalbox.bbox(tag)[1] - 10

        # Draw masks for clipping the plot: background-coloured rectangles
        # covering the left and bottom margins.
        # NOTE(review): presumably the items that are lower()ed further down
        # end up beneath these masks, which is what hides out-of-range
        # points -- confirm against Tk canvas stacking rules.
        bg = self._EVALBOX_PARAMS["background"]
        self.evalbox.lower(
            self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
        )
        self.evalbox.lower(
            self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
        )

        # Calculate the plot's scale.  With "Zoom" on and at least two
        # history entries, fit the axes to the most recent entries (plus a
        # 0.01 margin, clamped to [0, 1]); otherwise use the full 0..1 range.
        if self._autoscale.get() and len(self._history) > 1:
            max_precision = max_recall = 0
            min_precision = min_recall = 1
            # NOTE(review): this range visits min(len, _SCALE_N + 1) - 1
            # entries, i.e. one fewer than available whenever the history
            # holds _SCALE_N or fewer items -- looks like an off-by-one;
            # confirm the intended window before changing it.
            for i in range(1, min(len(self._history), self._SCALE_N + 1)):
                grammar, precision, recall, fmeasure = self._history[-i]
                min_precision = min(precision, min_precision)
                min_recall = min(recall, min_recall)
                max_precision = max(precision, max_precision)
                max_recall = max(recall, max_recall)
            #             if max_precision-min_precision > max_recall-min_recall:
            #                 min_recall -= (max_precision-min_precision)/2
            #                 max_recall += (max_precision-min_precision)/2
            #             else:
            #                 min_precision -= (max_recall-min_recall)/2
            #                 max_precision += (max_recall-min_recall)/2
            #             if min_recall < 0:
            #                 max_recall -= min_recall
            #                 min_recall = 0
            #             if min_precision < 0:
            #                 max_precision -= min_precision
            #                 min_precision = 0
            min_precision = max(min_precision - 0.01, 0)
            min_recall = max(min_recall - 0.01, 0)
            max_precision = min(max_precision + 0.01, 1)
            max_recall = min(max_recall + 0.01, 1)
        else:
            min_precision = min_recall = 0
            max_precision = max_recall = 1

        # Draw the axis lines & grid lines (one candidate grid line per
        # 10% step; only those strictly inside the plot area are drawn).
        for i in range(11):
            x = left + (right - left) * (
                (i / 10.0 - min_recall) / (max_recall - min_recall)
            )
            y = bot - (bot - top) * (
                (i / 10.0 - min_precision) / (max_precision - min_precision)
            )
            if left < x < right:
                self.evalbox.create_line(x, top, x, bot, fill="#888")
            if top < y < bot:
                self.evalbox.create_line(left, y, right, y, fill="#888")
        self.evalbox.create_line(left, top, left, bot)
        self.evalbox.create_line(left, bot, right, bot)

        # Display the plot's scale (min/max percentages at the axis ends).
        self.evalbox.create_text(
            left - 3,
            bot,
            justify="right",
            anchor="se",
            text="%d%%" % (100 * min_precision),
        )
        self.evalbox.create_text(
            left - 3,
            top,
            justify="right",
            anchor="ne",
            text="%d%%" % (100 * max_precision),
        )
        self.evalbox.create_text(
            left,
            bot + 3,
            justify="center",
            anchor="nw",
            text="%d%%" % (100 * min_recall),
        )
        self.evalbox.create_text(
            right,
            bot + 3,
            justify="center",
            anchor="ne",
            text="%d%%" % (100 * max_recall),
        )

        # Display the scores: one dot per history entry (recall on x,
        # precision on y).  The selected entry is drawn on top in bright
        # green and also reported in the status bar; the others are paler
        # and lowered in the stacking order.
        prev_x = prev_y = None
        for i, (_, precision, recall, fscore) in enumerate(self._history):
            x = left + (right - left) * (
                (recall - min_recall) / (max_recall - min_recall)
            )
            y = bot - (bot - top) * (
                (precision - min_precision) / (max_precision - min_precision)
            )
            if i == self._history_index:
                self.evalbox.create_oval(
                    x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
                )
                self.status["text"] = (
                    "Precision: %.2f%%\t" % (precision * 100)
                    + "Recall: %.2f%%\t" % (recall * 100)
                    + "F-score: %.2f%%" % (fscore * 100)
                )
            else:
                self.evalbox.lower(
                    self.evalbox.create_oval(
                        x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
                    )
                )
            # With "Lines" on, connect consecutive history entries.
            if prev_x is not None and self._eval_lines.get():
                self.evalbox.lower(
                    self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
                )
            prev_x, prev_y = x, y
+ for (g, p, r, f) in self._history: + if self.normalized_grammar == self.normalize_grammar(g): + self._history.append((g, p, r, f)) + self._history_index = len(self._history) - 1 + self._eval_plot() + self._eval_demon_running = False + self._eval_normalized_grammar = None + return + self._eval_index = 0 + self._eval_score = ChunkScore(chunk_label=self._chunk_label) + self._eval_grammar = self.grammar + self._eval_normalized_grammar = self.normalized_grammar + + # If the grammar is empty, the don't bother evaluating it, or + # recording it in history -- the score will just be 0. + if self.normalized_grammar.strip() == "": + # self._eval_index = self._devset_size.get() + self._eval_demon_running = False + return + + # Score the next set of examples + for gold in self.devset[ + self._eval_index : min( + self._eval_index + self._EVAL_CHUNK, self._devset_size.get() + ) + ]: + guess = self._chunkparse(gold.leaves()) + self._eval_score.score(gold, guess) + + # update our index in the devset. + self._eval_index += self._EVAL_CHUNK + + # Check if we're done + if self._eval_index >= self._devset_size.get(): + self._history.append( + ( + self._eval_grammar, + self._eval_score.precision(), + self._eval_score.recall(), + self._eval_score.f_measure(), + ) + ) + self._history_index = len(self._history) - 1 + self._eval_plot() + self._eval_demon_running = False + self._eval_normalized_grammar = None + else: + progress = 100 * self._eval_index / self._devset_size.get() + self.status["text"] = "Evaluating on Development Set (%d%%)" % progress + self._eval_demon_running = True + self._adaptively_modify_eval_chunk(time.time() - t0) + self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) + + def _adaptively_modify_eval_chunk(self, t): + """ + Modify _EVAL_CHUNK to try to keep the amount of time that the + eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. + + :param t: The amount of time that the eval demon took. 
+ """ + if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: + self._EVAL_CHUNK = min( + self._EVAL_CHUNK - 1, + max( + int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)), + self._EVAL_CHUNK - 10, + ), + ) + elif t < self._EVAL_DEMON_MIN: + self._EVAL_CHUNK = max( + self._EVAL_CHUNK + 1, + min( + int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)), + self._EVAL_CHUNK + 10, + ), + ) + + def _init_widgets(self, top): + frame0 = Frame(top, **self._FRAME_PARAMS) + frame0.grid_columnconfigure(0, weight=4) + frame0.grid_columnconfigure(3, weight=2) + frame0.grid_rowconfigure(1, weight=1) + frame0.grid_rowconfigure(5, weight=1) + + # The grammar + self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) + self.grammarlabel = Label( + frame0, + font=self._font, + text="Grammar:", + highlightcolor="black", + background=self._GRAMMARBOX_PARAMS["background"], + ) + self.grammarlabel.grid(column=0, row=0, sticky="SW") + self.grammarbox.grid(column=0, row=1, sticky="NEWS") + + # Scroll bar for grammar + grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) + grammar_scrollbar.grid(column=1, row=1, sticky="NWS") + self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) + + # grammar buttons + bg = self._FRAME_PARAMS["background"] + frame3 = Frame(frame0, background=bg) + frame3.grid(column=0, row=2, sticky="EW") + Button( + frame3, + text="Prev Grammar", + command=self._history_prev, + **self._BUTTON_PARAMS, + ).pack(side="left") + Button( + frame3, + text="Next Grammar", + command=self._history_next, + **self._BUTTON_PARAMS, + ).pack(side="left") + + # Help box + self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) + self.helpbox.grid(column=3, row=1, sticky="NEWS") + self.helptabs = {} + bg = self._FRAME_PARAMS["background"] + helptab_frame = Frame(frame0, background=bg) + helptab_frame.grid(column=3, row=0, sticky="SW") + for i, (tab, tabstops, text) in enumerate(self.HELP): + label = Label(helptab_frame, text=tab, 
font=self._smallfont) + label.grid(column=i * 2, row=0, sticky="S") + # help_frame.grid_columnconfigure(i, weight=1) + # label.pack(side='left') + label.bind("", lambda e, tab=tab: self.show_help(tab)) + self.helptabs[tab] = label + Frame( + helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg + ).grid(column=i * 2 + 1, row=0) + self.helptabs[self.HELP[0][0]].configure(font=self._font) + self.helpbox.tag_config("elide", elide=True) + for (tag, params) in self.HELP_AUTOTAG: + self.helpbox.tag_config("tag-%s" % tag, **params) + self.show_help(self.HELP[0][0]) + + # Scroll bar for helpbox + help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) + self.helpbox.config(yscrollcommand=help_scrollbar.set) + help_scrollbar.grid(column=4, row=1, sticky="NWS") + + # The dev set + frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"]) + self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) + self.devsetbox.pack(expand=True, fill="both") + self.devsetlabel = Label( + frame0, + font=self._font, + text="Development Set:", + justify="right", + background=self._DEVSETBOX_PARAMS["background"], + ) + self.devsetlabel.grid(column=0, row=4, sticky="SW") + frame4.grid(column=0, row=5, sticky="NEWS") + + # dev set scrollbars + self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) + self.devset_scroll.grid(column=1, row=5, sticky="NWS") + self.devset_xscroll = Scrollbar( + frame4, command=self.devsetbox.xview, orient="horiz" + ) + self.devsetbox["xscrollcommand"] = self.devset_xscroll.set + self.devset_xscroll.pack(side="bottom", fill="x") + + # dev set buttons + bg = self._FRAME_PARAMS["background"] + frame1 = Frame(frame0, background=bg) + frame1.grid(column=0, row=7, sticky="EW") + Button( + frame1, + text="Prev Example (Ctrl-p)", + command=self._devset_prev, + **self._BUTTON_PARAMS, + ).pack(side="left") + Button( + frame1, + text="Next Example (Ctrl-n)", + command=self._devset_next, + **self._BUTTON_PARAMS, + 
).pack(side="left") + self.devset_button = Button( + frame1, + text="Show example", + command=self.show_devset, + state="disabled", + **self._BUTTON_PARAMS, + ) + self.devset_button.pack(side="right") + self.trace_button = Button( + frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS + ) + self.trace_button.pack(side="right") + + # evaluation box + self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) + label = Label( + frame0, + font=self._font, + text="Evaluation:", + justify="right", + background=self._EVALBOX_PARAMS["background"], + ) + label.grid(column=3, row=4, sticky="SW") + self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2) + + # evaluation box buttons + bg = self._FRAME_PARAMS["background"] + frame2 = Frame(frame0, background=bg) + frame2.grid(column=3, row=7, sticky="EW") + self._autoscale = IntVar(self.top) + self._autoscale.set(False) + Checkbutton( + frame2, + variable=self._autoscale, + command=self._eval_plot, + text="Zoom", + **self._BUTTON_PARAMS, + ).pack(side="left") + self._eval_lines = IntVar(self.top) + self._eval_lines.set(False) + Checkbutton( + frame2, + variable=self._eval_lines, + command=self._eval_plot, + text="Lines", + **self._BUTTON_PARAMS, + ).pack(side="left") + Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right") + + # The status label + self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) + self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5) + + # Help box & devset box can't be edited. + self.helpbox["state"] = "disabled" + self.devsetbox["state"] = "disabled" + + # Spacers + bg = self._FRAME_PARAMS["background"] + Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) + Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) + Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) + + # pack the frame. 
+ frame0.pack(fill="both", expand=True) + + # Set up colors for the devset box + self.devsetbox.tag_config("true-pos", background="#afa", underline="True") + self.devsetbox.tag_config("false-neg", underline="True", foreground="#800") + self.devsetbox.tag_config("false-pos", background="#faa") + self.devsetbox.tag_config("trace", foreground="#666", wrap="none") + self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none") + self.devsetbox.tag_config("error", foreground="#800") + + # And for the grammarbox + self.grammarbox.tag_config("error", background="#fec") + self.grammarbox.tag_config("comment", foreground="#840") + self.grammarbox.tag_config("angle", foreground="#00f") + self.grammarbox.tag_config("brace", foreground="#0a0") + self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40) + + _showing_trace = False + + def show_trace(self, *e): + self._showing_trace = True + self.trace_button["state"] = "disabled" + self.devset_button["state"] = "normal" + + self.devsetbox["state"] = "normal" + # self.devsetbox['wrap'] = 'none' + self.devsetbox.delete("1.0", "end") + self.devsetlabel["text"] = "Development Set (%d/%d)" % ( + (self.devset_index + 1, self._devset_size.get()) + ) + + if self.chunker is None: + self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.") + self.devsetbox.tag_add("error", "1.0", "end") + return # can't do anything more + + gold_tree = self.devset[self.devset_index] + rules = self.chunker.rules() + + # Calculate the tag sequence + tagseq = "\t" + charnum = [1] + for wordnum, (word, pos) in enumerate(gold_tree.leaves()): + tagseq += "%s " % pos + charnum.append(len(tagseq)) + self.charnum = { + (i, j): charnum[j] + for i in range(len(rules) + 1) + for j in range(len(charnum)) + } + self.linenum = {i: i * 2 + 2 for i in range(len(rules) + 1)} + + for i in range(len(rules) + 1): + if i == 0: + self.devsetbox.insert("end", "Start:\n") + self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") + else: + 
self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1]) + self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") + # Display the tag sequence. + self.devsetbox.insert("end", tagseq + "\n") + self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c") + # Run a partial parser, and extract gold & test chunks + chunker = RegexpChunkParser(rules[:i]) + test_tree = self._chunkparse(gold_tree.leaves()) + gold_chunks = self._chunks(gold_tree) + test_chunks = self._chunks(test_tree) + # Compare them. + for chunk in gold_chunks.intersection(test_chunks): + self._color_chunk(i, chunk, "true-pos") + for chunk in gold_chunks - test_chunks: + self._color_chunk(i, chunk, "false-neg") + for chunk in test_chunks - gold_chunks: + self._color_chunk(i, chunk, "false-pos") + self.devsetbox.insert("end", "Finished.\n") + self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") + + # This is a hack, because the x-scrollbar isn't updating its + # position right -- I'm not sure what the underlying cause is + # though. 
(This is on OS X w/ python 2.5) + self.top.after(100, self.devset_xscroll.set, 0, 0.3) + + def show_help(self, tab): + self.helpbox["state"] = "normal" + self.helpbox.delete("1.0", "end") + for (name, tabstops, text) in self.HELP: + if name == tab: + text = text.replace( + "<>", + "\n".join( + "\t%s\t%s" % item + for item in sorted( + list(self.tagset.items()), + key=lambda t_w: re.match(r"\w+", t_w[0]) + and (0, t_w[0]) + or (1, t_w[0]), + ) + ), + ) + + self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) + self.helpbox.config(tabs=tabstops) + self.helpbox.insert("1.0", text + "\n" * 20) + C = "1.0 + %d chars" + for (tag, params) in self.HELP_AUTOTAG: + pattern = f"(?s)(<{tag}>)(.*?)()" + for m in re.finditer(pattern, text): + self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1)) + self.helpbox.tag_add( + "tag-%s" % tag, C % m.start(2), C % m.end(2) + ) + self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3)) + else: + self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) + self.helpbox["state"] = "disabled" + + def _history_prev(self, *e): + self._view_history(self._history_index - 1) + return "break" + + def _history_next(self, *e): + self._view_history(self._history_index + 1) + return "break" + + def _view_history(self, index): + # Bounds & sanity checking: + index = max(0, min(len(self._history) - 1, index)) + if not self._history: + return + # Already viewing the requested history item? + if index == self._history_index: + return + # Show the requested grammar. It will get added to _history + # only if they edit it (causing self.update() to get run.) + self.grammarbox["state"] = "normal" + self.grammarbox.delete("1.0", "end") + self.grammarbox.insert("end", self._history[index][0]) + self.grammarbox.mark_set("insert", "1.0") + self._history_index = index + self._syntax_highlight_grammar(self._history[index][0]) + # Record the normalized grammar & regenerate the chunker. 
def _devset_scroll(self, command, *args):
    """
    Scrollbar callback for the development-set box.

    :param command: ``"scroll"`` or ``"moveto"``.
    :param args: For ``"scroll"``: an amount and a unit name starting
        with ``"unit"`` or ``"page"``.  For ``"moveto"``: a fraction of
        the development set.
    """
    sentences_per_page = 1  # a page is a single sentence
    # Remember the trace state up front, before the view is switched.
    trace_was_showing = self._showing_trace

    target = None
    if command == "moveto":
        target = int(float(args[0]) * self._devset_size.get())
    elif command == "scroll" and args[1].startswith("unit"):
        target = self.devset_index + int(args[0])
    elif command == "scroll" and args[1].startswith("page"):
        target = self.devset_index + sentences_per_page * int(args[0])
    else:
        assert 0, f"bad scroll command {command} {args}"

    if target is not None:
        self.show_devset(target)
    if trace_was_showing:
        self.show_trace()
def _chunks(self, tree):
    """
    Collect the word spans of the chunks in ``tree``.

    Only the direct children of ``tree`` are examined.  A child that is
    a ``Tree`` labelled ``self._chunk_label`` contributes a
    ``(start, end)`` pair of word offsets; any other ``Tree`` child only
    advances the offset by its length, and a non-tree child counts as a
    single word.

    :return: A set of ``(start, end)`` tuples.
    """
    spans = set()
    offset = 0
    for node in tree:
        if not isinstance(node, Tree):
            offset += 1
            continue
        stop = offset + len(node)
        if node.label() == self._chunk_label:
            spans.add((offset, stop))
        offset = stop
    return spans
def _grammarcheck(self, grammar):
    """
    Flag every grammar line that does not parse as a chunk rule.

    Does nothing once the window has been destroyed.  Otherwise the
    ``"error"`` tag is cleared, each line has its comment removed and is
    stripped, and every non-empty remainder that
    ``RegexpChunkRule.fromstring`` rejects with ``ValueError`` gets the
    ``"error"`` tag over its whole line.  The status text is cleared
    last.

    :param grammar: The un-normalized grammar text.
    """
    if self.top is None:
        return
    self.grammarbox.tag_remove("error", "1.0", "end")
    self._grammarcheck_errs = []
    # Text-widget rows are 1-based, hence start=1.
    for row, raw_line in enumerate(grammar.split("\n"), start=1):
        rule_text = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", raw_line).strip()
        if not rule_text:
            continue
        try:
            RegexpChunkRule.fromstring(rule_text)
        except ValueError:
            self.grammarbox.tag_add("error", f"{row}.0", f"{row}.0 lineend")
    self.status["text"] = ""
def _highlight_devset(self, sample=None):
    """
    Colour the chunks of the displayed sentences by correctness.

    Each gold-standard tree in ``sample`` is re-chunked with
    ``self._chunkparse``.  Chunks found in both the gold and the test
    tree are tagged ``"true-pos"``, gold-only chunks ``"false-neg"``
    and test-only chunks ``"false-pos"``.

    :param sample: Gold-standard trees to highlight; defaults to the
        single sentence at ``self.devset_index``.
    """
    if sample is None:
        sample = self.devset[self.devset_index : self.devset_index + 1]

    # Clear the colouring left over from the previous grammar.
    for stale_tag in ("true-pos", "false-neg", "false-pos"):
        self.devsetbox.tag_remove(stale_tag, "1.0", "end")

    for sentnum, gold_tree in enumerate(sample):
        guess_tree = self._chunkparse(gold_tree.leaves())
        gold = self._chunks(gold_tree)
        guess = self._chunks(guess_tree)
        verdicts = (
            ("true-pos", gold.intersection(guess)),
            ("false-neg", gold - guess),
            ("false-pos", guess - gold),
        )
        for tag, spans in verdicts:
            for span in spans:
                self._color_chunk(sentnum, span, tag)
def save_grammar(self, filename=None):
    """
    Write the current grammar, with a header of scores, to a file.

    The header reports precision, recall and F-score only when the
    current grammar is the most recently evaluated history entry;
    otherwise it says that the grammar is malformed or not evaluated
    yet.

    :param filename: Destination path.  When empty, a save dialog is
        shown; cancelling the dialog saves nothing.
    """
    if not filename:
        ftypes = [("Chunk Grammar", ".chunk"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
        if not filename:
            return

    # History entries are (grammar, precision, recall, fscore) tuples.
    if self._history and self.normalized_grammar == self.normalize_grammar(
        self._history[-1][0]
    ):
        precision, recall, fscore = (
            "%.2f%%" % (100 * v) for v in self._history[-1][1:]
        )
    elif self.chunker is None:
        precision = recall = fscore = "Grammar not well formed"
    else:
        precision = recall = fscore = "Not finished evaluation yet"

    with open(filename, "w") as outfile:
        outfile.write(
            self.SAVE_GRAMMAR_TEMPLATE
            % dict(
                date=time.ctime(),
                devset=self.devset_name,
                precision=precision,
                recall=recall,
                fscore=fscore,
                grammar=self.grammar.strip(),
            )
        )
def save_history(self, filename=None):
    """
    Write every evaluated grammar, and the current one, to a text file.

    Each history entry is written as a header line giving its scores,
    followed by its rules, one per line.  The current grammar is added
    at the end unless it is the most recent history entry.

    :param filename: Destination path.  When empty, a save dialog is
        shown; cancelling the dialog saves nothing.
    """

    def indented_rules(grammar_text):
        # Split on line boundaries, not on every run of whitespace:
        # a rule may contain spaces and a trailing "# comment", and
        # must stay on a single output line.  Blank lines are dropped.
        return "".join(
            " %s\n" % line.strip()
            for line in grammar_text.strip().splitlines()
            if line.strip()
        )

    if not filename:
        ftypes = [("Chunk Grammar History", ".txt"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
        if not filename:
            return

    with open(filename, "w") as outfile:
        outfile.write("# Regexp Chunk Parsing Grammar History\n")
        outfile.write("# Saved %s\n" % time.ctime())
        outfile.write("# Development set: %s\n" % self.devset_name)
        # History entries are (grammar, precision, recall, fscore).
        for i, (g, p, r, f) in enumerate(self._history):
            hdr = (
                "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
                "fscore=%.2f%%)"
                % (i + 1, len(self._history), p * 100, r * 100, f * 100)
            )
            outfile.write("\n%s\n" % hdr)
            outfile.write(indented_rules(g))

        # Append the grammar being edited when it is not already the
        # last history entry.
        if not (
            self._history
            and self.normalized_grammar
            == self.normalize_grammar(self._history[-1][0])
        ):
            if self.chunker is None:
                outfile.write("\nCurrent Grammar (not well-formed)\n")
            else:
                outfile.write("\nCurrent Grammar (not evaluated)\n")
            outfile.write(indented_rules(self.grammar))
def resize(self, size=None):
    """
    Apply a font size to the application's two fonts.

    :param size: New size to store in ``self._size``.  When ``None``,
        the size already stored there is re-applied.
    """
    if size is not None:
        self._size.set(size)
    # Both fonts are configured with negative sizes.
    negative_size = -abs(self._size.get())
    self._font.configure(size=negative_size)
    # The small font is 14/20 of the main size (floor division), capped
    # so that its configured value is never above -10.
    scaled = negative_size * 14 // 20
    self._smallfont.configure(size=min(-10, scaled))
def _init_corpus_select(self, parent):
    """
    Build the "Corpus:" label and the corpus drop-down menu.

    The menu is bound to ``self.var`` and starts on the model's default
    corpus; the remaining entries come from
    ``self.model.non_default_corpora()``.  Choosing an entry calls
    ``self.corpus_selected``.

    :param parent: Widget that the selector row is packed into.
    """
    innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
    self.var = StringVar(innerframe)
    self.var.set(self.model.DEFAULT_CORPUS)
    Label(
        innerframe,
        justify=LEFT,
        text=" Corpus: ",
        background=self._BACKGROUND_COLOUR,
        padx=2,
        pady=1,
        border=0,
    ).pack(side="left")

    # The former ``other_corpora = list(...).remove(...)`` assignment
    # was dropped: ``list.remove`` returns None and the name was never
    # read.  The menu entries come from the model instead.
    om = OptionMenu(
        innerframe,
        self.var,
        self.model.DEFAULT_CORPUS,
        *self.model.non_default_corpora(),
        command=self.corpus_selected,
    )
    om["borderwidth"] = 0
    om["highlightthickness"] = 1
    om.pack(side="left")
    innerframe.pack(side="top", fill="x", anchor="n")
def _init_paging(self, parent):
    """
    Build the "Previous" / "Next" paging buttons.

    Both buttons start disabled and are stored on ``self.prev`` and
    ``self.next``.  The current page is reset once they are packed.

    :param parent: Widget that the paging row is packed into.
    """
    paging_row = Frame(parent, background=self._BACKGROUND_COLOUR)
    # Options shared by both buttons.
    button_options = dict(
        width="10",
        borderwidth=1,
        highlightthickness=1,
        state="disabled",
    )

    self.prev = Button(
        paging_row, text="Previous", command=self.previous, **button_options
    )
    self.prev.pack(side="left", anchor="center")

    self.next = Button(
        paging_row, text="Next", command=self.__next__, **button_options
    )
    self.next.pack(side="right", anchor="center")

    paging_row.pack(side="top", fill="y")
    self.reset_current_page()
def handle_corpus_loaded(self, event):
    """
    Show the first page of collocations once a corpus has loaded.

    Updates the status line, refreshes the paging buttons, clears the
    results box, resets the page counter, then fetches and writes the
    next page from the model and advances the counter.

    :param event: The event that triggered the handler (not used).
    """
    corpus_name = self.var.get()
    self.status["text"] = corpus_name + " is loaded"
    self.unfreeze_editable()
    self.clear_results_box()
    self.reset_current_page()
    upcoming_page = self.current_page + 1
    self.write_results(self.model.next(upcoming_page))
    self.current_page += 1
def non_default_corpora(self):
    """
    Return the sorted corpus names, leaving out the default corpus.

    :return: A new sorted list; ``self.CORPORA`` is not modified.
    :raises ValueError: If the default corpus is not in ``self.CORPORA``.
    """
    names = list(self.CORPORA.keys())
    names.remove(self.DEFAULT_CORPUS)
    names.sort()
    return names
def next(self, page):
    """
    Return result page ``page``, building any pages not cached yet.

    Pages are consecutive slices of ``self.collocations``, each
    ``self.result_count`` items long.  They are cached in
    ``self.result_pages``, and ``self.results_returned`` records how
    far the slicing has advanced.

    :param page: Zero-based page number.
    :return: The list of collocations on that page.
    """
    cached = self.result_pages
    # Each pass appends exactly one page, so this stops at page + 1.
    while len(cached) <= page:
        first = self.results_returned
        last = first + self.result_count
        cached.append(self.collocations[first:last])
        self.results_returned = last
    return cached[page]
license information, see LICENSE.TXT + +import queue as q +import re +import threading +from tkinter import ( + END, + LEFT, + SUNKEN, + Button, + Entry, + Frame, + IntVar, + Label, + Menu, + OptionMenu, + Scrollbar, + StringVar, + Text, + Tk, +) +from tkinter.font import Font + +from nltk.corpus import ( + alpino, + brown, + cess_cat, + cess_esp, + floresta, + indian, + mac_morpho, + nps_chat, + sinica_treebank, + treebank, +) +from nltk.draw.util import ShowText +from nltk.util import in_idle + +WORD_OR_TAG = "[^/ ]+" +BOUNDARY = r"\b" + +CORPUS_LOADED_EVENT = "<>" +SEARCH_TERMINATED_EVENT = "<>" +SEARCH_ERROR_EVENT = "<>" +ERROR_LOADING_CORPUS_EVENT = "<>" + +POLL_INTERVAL = 50 + +# NB All corpora must be specified in a lambda expression so as not to be +# loaded when the module is imported. + +_DEFAULT = "English: Brown Corpus (Humor, simplified)" +_CORPORA = { + "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents( + tagset="universal" + ), + "English: Brown Corpus": lambda: brown.tagged_sents(), + "English: Brown Corpus (simplified)": lambda: brown.tagged_sents( + tagset="universal" + ), + "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( + categories=["news", "editorial", "reviews"], tagset="universal" + ), + "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents( + categories="religion", tagset="universal" + ), + "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents( + categories="learned", tagset="universal" + ), + "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( + categories="science_fiction", tagset="universal" + ), + "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents( + categories="romance", tagset="universal" + ), + "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents( + categories="humor", tagset="universal" + ), + "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), + "English: NPS Chat 
Corpus (simplified)": lambda: nps_chat.tagged_posts( + tagset="universal" + ), + "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), + "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( + tagset="universal" + ), + "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), + "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( + tagset="universal" + ), + "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), + "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( + tagset="universal" + ), + "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), + "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( + files="hindi.pos", tagset="universal" + ), + "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), + "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( + tagset="universal" + ), + "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), + "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( + tagset="universal" + ), + "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( + tagset="universal" + ), +} + + +class ConcordanceSearchView: + _BACKGROUND_COLOUR = "#FFF" # white + + # Colour of highlighted results + _HIGHLIGHT_WORD_COLOUR = "#F00" # red + _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" + + _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey + _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG" + + # Percentage of text left of the scrollbar position + _FRACTION_LEFT_TEXT = 0.30 + + def __init__(self): + self.queue = q.Queue() + self.model = ConcordanceSearchModel(self.queue) + self.top = Tk() + self._init_top(self.top) + self._init_menubar() + self._init_widgets(self.top) + self.load_corpus(self.model.DEFAULT_CORPUS) + self.after = self.top.after(POLL_INTERVAL, self._poll) + + def _init_top(self, top): + 
def _init_widgets(self, parent):
    """
    Create the main frame and build each section of the window in it.

    The sections are built in order: corpus selector, query box,
    results box, paging buttons, status line.

    :param parent: Widget that the main frame is created in.
    """
    frame_options = dict(
        background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1
    )
    self.main_frame = Frame(parent, frame_options)
    section_builders = (
        self._init_corpus_select,
        self._init_query_box,
        self._init_results_box,
        self._init_paging,
        self._init_status,
    )
    for build_section in section_builders:
        build_section(self.main_frame)
    self.main_frame.pack(fill="both", expand=True)
def _init_corpus_select(self, parent):
    """
    Build the "Corpus:" label and the corpus drop-down menu.

    The menu is bound to ``self.var`` and starts on the model's default
    corpus; the remaining entries come from
    ``self.model.non_default_corpora()``.  Choosing an entry calls
    ``self.corpus_selected``.

    :param parent: Widget that the selector row is packed into.
    """
    innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
    self.var = StringVar(innerframe)
    self.var.set(self.model.DEFAULT_CORPUS)
    Label(
        innerframe,
        justify=LEFT,
        text=" Corpus: ",
        background=self._BACKGROUND_COLOUR,
        padx=2,
        pady=1,
        border=0,
    ).pack(side="left")

    # The former ``other_corpora = list(...).remove(...)`` assignment
    # was dropped: ``list.remove`` returns None and the name was never
    # read.  The menu entries come from the model instead.
    om = OptionMenu(
        innerframe,
        self.var,
        self.model.DEFAULT_CORPUS,
        *self.model.non_default_corpora(),
        command=self.corpus_selected,
    )
    om["borderwidth"] = 0
    om["highlightthickness"] = 1
    om.pack(side="left")
    innerframe.pack(side="top", fill="x", anchor="n")
relief=SUNKEN, + background=self._BACKGROUND_COLOUR, + border=0, + padx=1, + pady=0, + ) + self.status.pack(side="top", anchor="sw") + + def _init_query_box(self, parent): + innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) + another = Frame(innerframe, background=self._BACKGROUND_COLOUR) + self.query_box = Entry(another, width=60) + self.query_box.pack(side="left", fill="x", pady=25, anchor="center") + self.search_button = Button( + another, + text="Search", + command=self.search, + borderwidth=1, + highlightthickness=1, + ) + self.search_button.pack(side="left", fill="x", pady=25, anchor="center") + self.query_box.bind("", self.search_enter_keypress_handler) + another.pack() + innerframe.pack(side="top", fill="x", anchor="n") + + def search_enter_keypress_handler(self, *event): + self.search() + + def _init_results_box(self, parent): + innerframe = Frame(parent) + i1 = Frame(innerframe) + i2 = Frame(innerframe) + vscrollbar = Scrollbar(i1, borderwidth=1) + hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz") + self.results_box = Text( + i1, + font=Font(family="courier", size="16"), + state="disabled", + borderwidth=1, + yscrollcommand=vscrollbar.set, + xscrollcommand=hscrollbar.set, + wrap="none", + width="40", + height="20", + exportselection=1, + ) + self.results_box.pack(side="left", fill="both", expand=True) + self.results_box.tag_config( + self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR + ) + self.results_box.tag_config( + self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR + ) + vscrollbar.pack(side="left", fill="y", anchor="e") + vscrollbar.config(command=self.results_box.yview) + hscrollbar.pack(side="left", fill="x", expand=True, anchor="w") + hscrollbar.config(command=self.results_box.xview) + # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
+ Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack( + side="left", anchor="e" + ) + i1.pack(side="top", fill="both", expand=True, anchor="n") + i2.pack(side="bottom", fill="x", anchor="s") + innerframe.pack(side="top", fill="both", expand=True) + + def _init_paging(self, parent): + innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) + self.prev = prev = Button( + innerframe, + text="Previous", + command=self.previous, + width="10", + borderwidth=1, + highlightthickness=1, + state="disabled", + ) + prev.pack(side="left", anchor="center") + self.next = next = Button( + innerframe, + text="Next", + command=self.__next__, + width="10", + borderwidth=1, + highlightthickness=1, + state="disabled", + ) + next.pack(side="right", anchor="center") + innerframe.pack(side="top", fill="y") + self.current_page = 0 + + def previous(self): + self.clear_results_box() + self.freeze_editable() + self.model.prev(self.current_page - 1) + + def __next__(self): + self.clear_results_box() + self.freeze_editable() + self.model.next(self.current_page + 1) + + def about(self, *e): + ABOUT = "NLTK Concordance Search Demo\n" + TITLE = "About: NLTK Concordance Search Demo" + try: + from tkinter.messagebox import Message + + Message(message=ABOUT, title=TITLE, parent=self.main_frame).show() + except: + ShowText(self.top, TITLE, ABOUT) + + def _bind_event_handlers(self): + self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded) + self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated) + self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error) + self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus) + + def _poll(self): + try: + event = self.queue.get(block=False) + except q.Empty: + pass + else: + if event == CORPUS_LOADED_EVENT: + self.handle_corpus_loaded(event) + elif event == SEARCH_TERMINATED_EVENT: + self.handle_search_terminated(event) + elif event == SEARCH_ERROR_EVENT: + self.handle_search_error(event) + elif event == 
ERROR_LOADING_CORPUS_EVENT: + self.handle_error_loading_corpus(event) + self.after = self.top.after(POLL_INTERVAL, self._poll) + + def handle_error_loading_corpus(self, event): + self.status["text"] = "Error in loading " + self.var.get() + self.unfreeze_editable() + self.clear_all() + self.freeze_editable() + + def handle_corpus_loaded(self, event): + self.status["text"] = self.var.get() + " is loaded" + self.unfreeze_editable() + self.clear_all() + self.query_box.focus_set() + + def handle_search_terminated(self, event): + # todo: refactor the model such that it is less state sensitive + results = self.model.get_results() + self.write_results(results) + self.status["text"] = "" + if len(results) == 0: + self.status["text"] = "No results found for " + self.model.query + else: + self.current_page = self.model.last_requested_page + self.unfreeze_editable() + self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT) + + def handle_search_error(self, event): + self.status["text"] = "Error in query " + self.model.query + self.unfreeze_editable() + + def corpus_selected(self, *args): + new_selection = self.var.get() + self.load_corpus(new_selection) + + def load_corpus(self, selection): + if self.model.selected_corpus != selection: + self.status["text"] = "Loading " + selection + "..." 
+ self.freeze_editable() + self.model.load_corpus(selection) + + def search(self): + self.current_page = 0 + self.clear_results_box() + self.model.reset_results() + query = self.query_box.get() + if len(query.strip()) == 0: + return + self.status["text"] = "Searching for " + query + self.freeze_editable() + self.model.search(query, self.current_page + 1) + + def write_results(self, results): + self.results_box["state"] = "normal" + row = 1 + for each in results: + sent, pos1, pos2 = each[0].strip(), each[1], each[2] + if len(sent) != 0: + if pos1 < self._char_before: + sent, pos1, pos2 = self.pad(sent, pos1, pos2) + sentence = sent[pos1 - self._char_before : pos1 + self._char_after] + if not row == len(results): + sentence += "\n" + self.results_box.insert(str(row) + ".0", sentence) + word_markers, label_markers = self.words_and_labels(sent, pos1, pos2) + for marker in word_markers: + self.results_box.tag_add( + self._HIGHLIGHT_WORD_TAG, + str(row) + "." + str(marker[0]), + str(row) + "." + str(marker[1]), + ) + for marker in label_markers: + self.results_box.tag_add( + self._HIGHLIGHT_LABEL_TAG, + str(row) + "." + str(marker[0]), + str(row) + "." 
+ str(marker[1]), + ) + row += 1 + self.results_box["state"] = "disabled" + + def words_and_labels(self, sentence, pos1, pos2): + search_exp = sentence[pos1:pos2] + words, labels = [], [] + labeled_words = search_exp.split(" ") + index = 0 + for each in labeled_words: + if each == "": + index += 1 + else: + word, label = each.split("/") + words.append( + (self._char_before + index, self._char_before + index + len(word)) + ) + index += len(word) + 1 + labels.append( + (self._char_before + index, self._char_before + index + len(label)) + ) + index += len(label) + index += 1 + return words, labels + + def pad(self, sent, hstart, hend): + if hstart >= self._char_before: + return sent, hstart, hend + d = self._char_before - hstart + sent = "".join([" "] * d) + sent + return sent, hstart + d, hend + d + + def destroy(self, *e): + if self.top is None: + return + self.top.after_cancel(self.after) + self.top.destroy() + self.top = None + + def clear_all(self): + self.query_box.delete(0, END) + self.model.reset_query() + self.clear_results_box() + + def clear_results_box(self): + self.results_box["state"] = "normal" + self.results_box.delete("1.0", END) + self.results_box["state"] = "disabled" + + def freeze_editable(self): + self.query_box["state"] = "disabled" + self.search_button["state"] = "disabled" + self.prev["state"] = "disabled" + self.next["state"] = "disabled" + + def unfreeze_editable(self): + self.query_box["state"] = "normal" + self.search_button["state"] = "normal" + self.set_paging_button_states() + + def set_paging_button_states(self): + if self.current_page == 0 or self.current_page == 1: + self.prev["state"] = "disabled" + else: + self.prev["state"] = "normal" + if self.model.has_more_pages(self.current_page): + self.next["state"] = "normal" + else: + self.next["state"] = "disabled" + + def fire_event(self, event): + # Firing an event so that rendering of widgets happen in the mainloop thread + self.top.event_generate(event, when="tail") + + def 
mainloop(self, *args, **kwargs): + if in_idle(): + return + self.top.mainloop(*args, **kwargs) + + +class ConcordanceSearchModel: + def __init__(self, queue): + self.queue = queue + self.CORPORA = _CORPORA + self.DEFAULT_CORPUS = _DEFAULT + self.selected_corpus = None + self.reset_query() + self.reset_results() + self.result_count = None + self.last_sent_searched = 0 + + def non_default_corpora(self): + copy = [] + copy.extend(list(self.CORPORA.keys())) + copy.remove(self.DEFAULT_CORPUS) + copy.sort() + return copy + + def load_corpus(self, name): + self.selected_corpus = name + self.tagged_sents = [] + runner_thread = self.LoadCorpus(name, self) + runner_thread.start() + + def search(self, query, page): + self.query = query + self.last_requested_page = page + self.SearchCorpus(self, page, self.result_count).start() + + def next(self, page): + self.last_requested_page = page + if len(self.results) < page: + self.search(self.query, page) + else: + self.queue.put(SEARCH_TERMINATED_EVENT) + + def prev(self, page): + self.last_requested_page = page + self.queue.put(SEARCH_TERMINATED_EVENT) + + def reset_results(self): + self.last_sent_searched = 0 + self.results = [] + self.last_page = None + + def reset_query(self): + self.query = None + + def set_results(self, page, resultset): + self.results.insert(page - 1, resultset) + + def get_results(self): + return self.results[self.last_requested_page - 1] + + def has_more_pages(self, page): + if self.results == [] or self.results[0] == []: + return False + if self.last_page is None: + return True + return page < self.last_page + + class LoadCorpus(threading.Thread): + def __init__(self, name, model): + threading.Thread.__init__(self) + self.model, self.name = model, name + + def run(self): + try: + ts = self.model.CORPORA[self.name]() + self.model.tagged_sents = [ + " ".join(w + "/" + t for (w, t) in sent) for sent in ts + ] + self.model.queue.put(CORPUS_LOADED_EVENT) + except Exception as e: + print(e) + 
self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) + + class SearchCorpus(threading.Thread): + def __init__(self, model, page, count): + self.model, self.count, self.page = model, count, page + threading.Thread.__init__(self) + + def run(self): + q = self.processed_query() + sent_pos, i, sent_count = [], 0, 0 + for sent in self.model.tagged_sents[self.model.last_sent_searched :]: + try: + m = re.search(q, sent) + except re.error: + self.model.reset_results() + self.model.queue.put(SEARCH_ERROR_EVENT) + return + if m: + sent_pos.append((sent, m.start(), m.end())) + i += 1 + if i > self.count: + self.model.last_sent_searched += sent_count - 1 + break + sent_count += 1 + if self.count >= len(sent_pos): + self.model.last_sent_searched += sent_count - 1 + self.model.last_page = self.page + self.model.set_results(self.page, sent_pos) + else: + self.model.set_results(self.page, sent_pos[:-1]) + self.model.queue.put(SEARCH_TERMINATED_EVENT) + + def processed_query(self): + new = [] + for term in self.model.query.split(): + term = re.sub(r"\.", r"[^/ ]", term) + if re.match("[A-Z]+$", term): + new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY) + elif "/" in term: + new.append(BOUNDARY + term + BOUNDARY) + else: + new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY) + return " ".join(new) + + +def app(): + d = ConcordanceSearchView() + d.mainloop() + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py new file mode 100644 index 0000000000000000000000000000000000000000..df0ceb1be59e40bb48289f4f1411653789ca7a17 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py @@ -0,0 +1,163 @@ +# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06 +# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783 + +""" +Finding (and Replacing) Nemo + +Instant Regular Expressions +Created by Aristide Grange +""" +import itertools 
+import re +from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk + +windowTitle = "Finding (and Replacing) Nemo" +initialFind = r"n(.*?)e(.*?)m(.*?)o" +initialRepl = r"M\1A\2K\3I" +initialText = """\ +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. +Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +""" +images = { + "FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcY
JT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=", + "find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7", + "REPL": 
"R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZx
c406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7", + "repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=", +} +colors = ["#FF7B39", "#80F121"] +emphColors = ["#DAFC33", "#F42548"] +fieldParams = { + "height": 3, + "width": 70, + "font": ("monaco", 14), + "highlightthickness": 0, + "borderwidth": 0, + "background": "white", +} +textParams = { + "bg": "#F7E0D4", + "fg": "#2321F1", + "highlightthickness": 0, + "width": 1, + "height": 10, + "font": ("verdana", 16), + "wrap": "word", +} + + +class Zone: + def __init__(self, image, initialField, initialText): + frm = 
Frame(root) + frm.config(background="white") + self.image = PhotoImage(format="gif", data=images[image.upper()]) + self.imageDimmed = PhotoImage(format="gif", data=images[image]) + self.img = Label(frm) + self.img.config(borderwidth=0) + self.img.pack(side="left") + self.fld = Text(frm, **fieldParams) + self.initScrollText(frm, self.fld, initialField) + frm = Frame(root) + self.txt = Text(frm, **textParams) + self.initScrollText(frm, self.txt, initialText) + for i in range(2): + self.txt.tag_config(colors[i], background=colors[i]) + self.txt.tag_config("emph" + colors[i], foreground=emphColors[i]) + + def initScrollText(self, frm, txt, contents): + scl = Scrollbar(frm) + scl.config(command=txt.yview) + scl.pack(side="right", fill="y") + txt.pack(side="left", expand=True, fill="x") + txt.config(yscrollcommand=scl.set) + txt.insert("1.0", contents) + frm.pack(fill="x") + Frame(height=2, bd=1, relief="ridge").pack(fill="x") + + def refresh(self): + self.colorCycle = itertools.cycle(colors) + try: + self.substitute() + self.img.config(image=self.image) + except re.error: + self.img.config(image=self.imageDimmed) + + +class FindZone(Zone): + def addTags(self, m): + color = next(self.colorCycle) + self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end()) + try: + self.txt.tag_add( + "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph") + ) + except: + pass + + def substitute(self, *args): + for color in colors: + self.txt.tag_remove(color, "1.0", "end") + self.txt.tag_remove("emph" + color, "1.0", "end") + self.rex = re.compile("") # default value in case of malformed regexp + self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE) + try: + re.compile("(?P%s)" % self.fld.get(SEL_FIRST, SEL_LAST)) + self.rexSel = re.compile( + "%s(?P%s)%s" + % ( + self.fld.get("1.0", SEL_FIRST), + self.fld.get(SEL_FIRST, SEL_LAST), + self.fld.get(SEL_LAST, "end")[:-1], + ), + re.MULTILINE, + ) + except: + self.rexSel = self.rex + 
self.rexSel.sub(self.addTags, self.txt.get("1.0", "end")) + + +class ReplaceZone(Zone): + def addTags(self, m): + s = sz.rex.sub(self.repl, m.group()) + self.txt.delete( + "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff) + ) + self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle)) + self.diff += len(s) - (m.end() - m.start()) + + def substitute(self): + self.txt.delete("1.0", "end") + self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1]) + self.diff = 0 + self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1]) + sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1]) + + +def launchRefresh(_): + sz.fld.after_idle(sz.refresh) + rz.fld.after_idle(rz.refresh) + + +def app(): + global root, sz, rz, rex0 + root = Tk() + root.resizable(height=False, width=True) + root.title(windowTitle) + root.minsize(width=250, height=0) + sz = FindZone("find", initialFind, initialText) + sz.fld.bind("", launchRefresh) + sz.fld.bind("", launchRefresh) + sz.fld.bind("", launchRefresh) + sz.rexSel = re.compile("") + rz = ReplaceZone("repl", initialRepl, "") + rex0 = re.compile(r"(?", launchRefresh) + launchRefresh(None) + root.mainloop() + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b7578bd22bb5b97a263a4d8ff30b9cd39aa28a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py @@ -0,0 +1,1052 @@ +# Natural Language Toolkit: Recursive Descent Parser Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +A graphical tool for exploring the recursive descent parser. + +The recursive descent parser maintains a tree, which records the +structure of the portion of the text that has been parsed. 
It uses +CFG productions to expand the fringe of the tree, and matches its +leaves against the text. Initially, the tree contains the start +symbol ("S"). It is shown in the main canvas, to the right of the +list of available expansions. + +The parser builds up a tree structure for the text using three +operations: + + - "expand" uses a CFG production to add children to a node on the + fringe of the tree. + - "match" compares a leaf in the tree to a text token. + - "backtrack" returns the tree to its state before the most recent + expand or match operation. + +The parser maintains a list of tree locations called a "frontier" to +remember which nodes have not yet been expanded and which leaves have +not yet been matched against the text. The leftmost frontier node is +shown in green, and the other frontier nodes are shown in blue. The +parser always performs expand and match operations on the leftmost +element of the frontier. + +You can control the parser's operation by using the "expand," "match," +and "backtrack" buttons; or you can use the "step" button to let the +parser automatically decide which operation to apply. The parser uses +the following rules to decide which operation to apply: + + - If the leftmost frontier element is a token, try matching it. + - If the leftmost frontier element is a node, try expanding it with + the first untried expansion. + - Otherwise, backtrack. + +The "expand" button applies the untried expansion whose CFG production +is listed earliest in the grammar. To manually choose which expansion +to apply, click on a CFG production from the list of available +expansions, on the left side of the main window. + +The "autostep" button will let the parser continue applying +applications to the tree until it reaches a complete parse. You can +cancel an autostep in progress at any time by clicking on the +"autostep" button again. 
+ +Keyboard Shortcuts:: + [Space]\t Perform the next expand, match, or backtrack operation + [a]\t Step through operations until the next complete parse + [e]\t Perform an expand operation + [m]\t Perform a match operation + [b]\t Perform a backtrack operation + [Delete]\t Reset the parser + [g]\t Show/hide available expansions list + [h]\t Help + [Ctrl-p]\t Print + [q]\t Quit +""" + +from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk +from tkinter.font import Font + +from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment +from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget +from nltk.parse import SteppingRecursiveDescentParser +from nltk.tree import Tree +from nltk.util import in_idle + + +class RecursiveDescentApp: + """ + A graphical tool for exploring the recursive descent parser. The tool + displays the parser's tree and the remaining text, and allows the + user to control the parser's operation. In particular, the user + can expand subtrees on the frontier, match tokens on the frontier + against the text, and backtrack. A "step" button simply steps + through the parsing process, performing the operations that + ``RecursiveDescentParser`` would use. + """ + + def __init__(self, grammar, sent, trace=0): + self._sent = sent + self._parser = SteppingRecursiveDescentParser(grammar, trace) + + # Set up the main window. + self._top = Tk() + self._top.title("Recursive Descent Parser Application") + + # Set up key bindings. + self._init_bindings() + + # Initialize the fonts. + self._init_fonts(self._top) + + # Animations. animating_lock is a lock to prevent the demo + # from performing new operations while it's animating. + self._animation_frames = IntVar(self._top) + self._animation_frames.set(5) + self._animating_lock = 0 + self._autostep = 0 + + # The user can hide the grammar. + self._show_grammar = IntVar(self._top) + self._show_grammar.set(1) + + # Create the basic frames. 
+ self._init_menubar(self._top) + self._init_buttons(self._top) + self._init_feedback(self._top) + self._init_grammar(self._top) + self._init_canvas(self._top) + + # Initialize the parser. + self._parser.initialize(self._sent) + + # Resize callback + self._canvas.bind("", self._configure) + + ######################################### + ## Initialization Helpers + ######################################### + + def _init_fonts(self, root): + # See: + self._sysfont = Font(font=Button()["font"]) + root.option_add("*Font", self._sysfont) + + # TWhat's our font size (default=same as sysfont) + self._size = IntVar(root) + self._size.set(self._sysfont.cget("size")) + + self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) + self._font = Font(family="helvetica", size=self._size.get()) + if self._size.get() < 0: + big = self._size.get() - 2 + else: + big = self._size.get() + 2 + self._bigfont = Font(family="helvetica", weight="bold", size=big) + + def _init_grammar(self, parent): + # Grammar view. + self._prodframe = listframe = Frame(parent) + self._prodframe.pack(fill="both", side="left", padx=2) + self._prodlist_label = Label( + self._prodframe, font=self._boldfont, text="Available Expansions" + ) + self._prodlist_label.pack() + self._prodlist = Listbox( + self._prodframe, + selectmode="single", + relief="groove", + background="white", + foreground="#909090", + font=self._font, + selectforeground="#004040", + selectbackground="#c0f0c0", + ) + + self._prodlist.pack(side="right", fill="both", expand=1) + + self._productions = list(self._parser.grammar().productions()) + for production in self._productions: + self._prodlist.insert("end", (" %s" % production)) + self._prodlist.config(height=min(len(self._productions), 25)) + + # Add a scrollbar if there are more than 25 productions. 
+ if len(self._productions) > 25: + listscroll = Scrollbar(self._prodframe, orient="vertical") + self._prodlist.config(yscrollcommand=listscroll.set) + listscroll.config(command=self._prodlist.yview) + listscroll.pack(side="left", fill="y") + + # If they select a production, apply it. + self._prodlist.bind("<>", self._prodlist_select) + + def _init_bindings(self): + # Key bindings are a good thing. + self._top.bind("", self.destroy) + self._top.bind("", self.destroy) + self._top.bind("", self.destroy) + self._top.bind("e", self.expand) + # self._top.bind('', self.expand) + # self._top.bind('', self.expand) + self._top.bind("m", self.match) + self._top.bind("", self.match) + self._top.bind("", self.match) + self._top.bind("b", self.backtrack) + self._top.bind("", self.backtrack) + self._top.bind("", self.backtrack) + self._top.bind("", self.backtrack) + self._top.bind("", self.backtrack) + self._top.bind("a", self.autostep) + # self._top.bind('', self.autostep) + self._top.bind("", self.autostep) + self._top.bind("", self.cancel_autostep) + self._top.bind("", self.step) + self._top.bind("", self.reset) + self._top.bind("", self.postscript) + # self._top.bind('', self.help) + # self._top.bind('', self.help) + self._top.bind("", self.help) + self._top.bind("", self.help) + # self._top.bind('', self.toggle_grammar) + # self._top.bind('', self.toggle_grammar) + # self._top.bind('', self.toggle_grammar) + self._top.bind("", self.edit_grammar) + self._top.bind("", self.edit_sentence) + + def _init_buttons(self, parent): + # Set up the frames. 
+ self._buttonframe = buttonframe = Frame(parent) + buttonframe.pack(fill="none", side="bottom", padx=3, pady=2) + Button( + buttonframe, + text="Step", + background="#90c0d0", + foreground="black", + command=self.step, + ).pack(side="left") + Button( + buttonframe, + text="Autostep", + background="#90c0d0", + foreground="black", + command=self.autostep, + ).pack(side="left") + Button( + buttonframe, + text="Expand", + underline=0, + background="#90f090", + foreground="black", + command=self.expand, + ).pack(side="left") + Button( + buttonframe, + text="Match", + underline=0, + background="#90f090", + foreground="black", + command=self.match, + ).pack(side="left") + Button( + buttonframe, + text="Backtrack", + underline=0, + background="#f0a0a0", + foreground="black", + command=self.backtrack, + ).pack(side="left") + # Replace autostep... + + # self._autostep_button = Button(buttonframe, text='Autostep', + # underline=0, command=self.autostep) + # self._autostep_button.pack(side='left') + + def _configure(self, event): + self._autostep = 0 + (x1, y1, x2, y2) = self._cframe.scrollregion() + y2 = event.height - 6 + self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2) + self._redraw() + + def _init_feedback(self, parent): + self._feedbackframe = feedbackframe = Frame(parent) + feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3) + self._lastoper_label = Label( + feedbackframe, text="Last Operation:", font=self._font + ) + self._lastoper_label.pack(side="left") + lastoperframe = Frame(feedbackframe, relief="sunken", border=1) + lastoperframe.pack(fill="x", side="right", expand=1, padx=5) + self._lastoper1 = Label( + lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font + ) + self._lastoper2 = Label( + lastoperframe, + anchor="w", + width=30, + foreground="#004040", + background="#f0f0f0", + font=self._font, + ) + self._lastoper1.pack(side="left") + self._lastoper2.pack(side="left", fill="x", expand=1) + + def _init_canvas(self, 
parent): + self._cframe = CanvasFrame( + parent, + background="white", + # width=525, height=250, + closeenough=10, + border=2, + relief="sunken", + ) + self._cframe.pack(expand=1, fill="both", side="top", pady=2) + canvas = self._canvas = self._cframe.canvas() + + # Initially, there's no tree or text + self._tree = None + self._textwidgets = [] + self._textline = None + + def _init_menubar(self, parent): + menubar = Menu(parent) + + filemenu = Menu(menubar, tearoff=0) + filemenu.add_command( + label="Reset Parser", underline=0, command=self.reset, accelerator="Del" + ) + filemenu.add_command( + label="Print to Postscript", + underline=0, + command=self.postscript, + accelerator="Ctrl-p", + ) + filemenu.add_command( + label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" + ) + menubar.add_cascade(label="File", underline=0, menu=filemenu) + + editmenu = Menu(menubar, tearoff=0) + editmenu.add_command( + label="Edit Grammar", + underline=5, + command=self.edit_grammar, + accelerator="Ctrl-g", + ) + editmenu.add_command( + label="Edit Text", + underline=5, + command=self.edit_sentence, + accelerator="Ctrl-t", + ) + menubar.add_cascade(label="Edit", underline=0, menu=editmenu) + + rulemenu = Menu(menubar, tearoff=0) + rulemenu.add_command( + label="Step", underline=1, command=self.step, accelerator="Space" + ) + rulemenu.add_separator() + rulemenu.add_command( + label="Match", underline=0, command=self.match, accelerator="Ctrl-m" + ) + rulemenu.add_command( + label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e" + ) + rulemenu.add_separator() + rulemenu.add_command( + label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b" + ) + menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) + + viewmenu = Menu(menubar, tearoff=0) + viewmenu.add_checkbutton( + label="Show Grammar", + underline=0, + variable=self._show_grammar, + command=self._toggle_grammar, + ) + viewmenu.add_separator() + viewmenu.add_radiobutton( 
+ label="Tiny", + variable=self._size, + underline=0, + value=10, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Small", + variable=self._size, + underline=0, + value=12, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Medium", + variable=self._size, + underline=0, + value=14, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Large", + variable=self._size, + underline=0, + value=18, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Huge", + variable=self._size, + underline=0, + value=24, + command=self.resize, + ) + menubar.add_cascade(label="View", underline=0, menu=viewmenu) + + animatemenu = Menu(menubar, tearoff=0) + animatemenu.add_radiobutton( + label="No Animation", underline=0, variable=self._animation_frames, value=0 + ) + animatemenu.add_radiobutton( + label="Slow Animation", + underline=0, + variable=self._animation_frames, + value=10, + accelerator="-", + ) + animatemenu.add_radiobutton( + label="Normal Animation", + underline=0, + variable=self._animation_frames, + value=5, + accelerator="=", + ) + animatemenu.add_radiobutton( + label="Fast Animation", + underline=0, + variable=self._animation_frames, + value=2, + accelerator="+", + ) + menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) + + helpmenu = Menu(menubar, tearoff=0) + helpmenu.add_command(label="About", underline=0, command=self.about) + helpmenu.add_command( + label="Instructions", underline=0, command=self.help, accelerator="F1" + ) + menubar.add_cascade(label="Help", underline=0, menu=helpmenu) + + parent.config(menu=menubar) + + ######################################### + ## Helper + ######################################### + + def _get(self, widget, treeloc): + for i in treeloc: + widget = widget.subtrees()[i] + if isinstance(widget, TreeSegmentWidget): + widget = widget.label() + return widget + + ######################################### + ## Main draw procedure + ######################################### + 
+ def _redraw(self): + canvas = self._canvas + + # Delete the old tree, widgets, etc. + if self._tree is not None: + self._cframe.destroy_widget(self._tree) + for twidget in self._textwidgets: + self._cframe.destroy_widget(twidget) + if self._textline is not None: + self._canvas.delete(self._textline) + + # Draw the tree. + helv = ("helvetica", -self._size.get()) + bold = ("helvetica", -self._size.get(), "bold") + attribs = { + "tree_color": "#000000", + "tree_width": 2, + "node_font": bold, + "leaf_font": helv, + } + tree = self._parser.tree() + self._tree = tree_to_treesegment(canvas, tree, **attribs) + self._cframe.add_widget(self._tree, 30, 5) + + # Draw the text. + helv = ("helvetica", -self._size.get()) + bottom = y = self._cframe.scrollregion()[3] + self._textwidgets = [ + TextWidget(canvas, word, font=self._font) for word in self._sent + ] + for twidget in self._textwidgets: + self._cframe.add_widget(twidget, 0, 0) + twidget.move(0, bottom - twidget.bbox()[3] - 5) + y = min(y, twidget.bbox()[1]) + + # Draw a line over the text, to separate it from the tree. + self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".") + + # Highlight appropriate nodes. + self._highlight_nodes() + self._highlight_prodlist() + + # Make sure the text lines up. + self._position_text() + + def _redraw_quick(self): + # This should be more-or-less sufficient after an animation. + self._highlight_nodes() + self._highlight_prodlist() + self._position_text() + + def _highlight_nodes(self): + # Highlight the list of nodes to be checked. + bold = ("helvetica", -self._size.get(), "bold") + for treeloc in self._parser.frontier()[:1]: + self._get(self._tree, treeloc)["color"] = "#20a050" + self._get(self._tree, treeloc)["font"] = bold + for treeloc in self._parser.frontier()[1:]: + self._get(self._tree, treeloc)["color"] = "#008080" + + def _highlight_prodlist(self): + # Highlight the productions that can be expanded. 
+ # Boy, too bad tkinter doesn't implement Listbox.itemconfig; + # that would be pretty useful here. + self._prodlist.delete(0, "end") + expandable = self._parser.expandable_productions() + untried = self._parser.untried_expandable_productions() + productions = self._productions + for index in range(len(productions)): + if productions[index] in expandable: + if productions[index] in untried: + self._prodlist.insert(index, " %s" % productions[index]) + else: + self._prodlist.insert(index, " %s (TRIED)" % productions[index]) + self._prodlist.selection_set(index) + else: + self._prodlist.insert(index, " %s" % productions[index]) + + def _position_text(self): + # Line up the text widgets that are matched against the tree + numwords = len(self._sent) + num_matched = numwords - len(self._parser.remaining_text()) + leaves = self._tree_leaves()[:num_matched] + xmax = self._tree.bbox()[0] + for i in range(0, len(leaves)): + widget = self._textwidgets[i] + leaf = leaves[i] + widget["color"] = "#006040" + leaf["color"] = "#006040" + widget.move(leaf.bbox()[0] - widget.bbox()[0], 0) + xmax = widget.bbox()[2] + 10 + + # Line up the text widgets that are not matched against the tree. + for i in range(len(leaves), numwords): + widget = self._textwidgets[i] + widget["color"] = "#a0a0a0" + widget.move(xmax - widget.bbox()[0], 0) + xmax = widget.bbox()[2] + 10 + + # If we have a complete parse, make everything green :) + if self._parser.currently_complete(): + for twidget in self._textwidgets: + twidget["color"] = "#00a000" + + # Move the matched leaves down to the text. 
+ for i in range(0, len(leaves)): + widget = self._textwidgets[i] + leaf = leaves[i] + dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0 + dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10) + leaf.move(0, dy) + + def _tree_leaves(self, tree=None): + if tree is None: + tree = self._tree + if isinstance(tree, TreeSegmentWidget): + leaves = [] + for child in tree.subtrees(): + leaves += self._tree_leaves(child) + return leaves + else: + return [tree] + + ######################################### + ## Button Callbacks + ######################################### + + def destroy(self, *e): + self._autostep = 0 + if self._top is None: + return + self._top.destroy() + self._top = None + + def reset(self, *e): + self._autostep = 0 + self._parser.initialize(self._sent) + self._lastoper1["text"] = "Reset Application" + self._lastoper2["text"] = "" + self._redraw() + + def autostep(self, *e): + if self._animation_frames.get() == 0: + self._animation_frames.set(2) + if self._autostep: + self._autostep = 0 + else: + self._autostep = 1 + self._step() + + def cancel_autostep(self, *e): + # self._autostep_button['text'] = 'Autostep' + self._autostep = 0 + + # Make sure to stop auto-stepping if we get any user input. + def step(self, *e): + self._autostep = 0 + self._step() + + def match(self, *e): + self._autostep = 0 + self._match() + + def expand(self, *e): + self._autostep = 0 + self._expand() + + def backtrack(self, *e): + self._autostep = 0 + self._backtrack() + + def _step(self): + if self._animating_lock: + return + + # Try expanding, matching, and backtracking (in that order) + if self._expand(): + pass + elif self._parser.untried_match() and self._match(): + pass + elif self._backtrack(): + pass + else: + self._lastoper1["text"] = "Finished" + self._lastoper2["text"] = "" + self._autostep = 0 + + # Check if we just completed a parse. 
+ if self._parser.currently_complete(): + self._autostep = 0 + self._lastoper2["text"] += " [COMPLETE PARSE]" + + def _expand(self, *e): + if self._animating_lock: + return + old_frontier = self._parser.frontier() + rv = self._parser.expand() + if rv is not None: + self._lastoper1["text"] = "Expand:" + self._lastoper2["text"] = rv + self._prodlist.selection_clear(0, "end") + index = self._productions.index(rv) + self._prodlist.selection_set(index) + self._animate_expand(old_frontier[0]) + return True + else: + self._lastoper1["text"] = "Expand:" + self._lastoper2["text"] = "(all expansions tried)" + return False + + def _match(self, *e): + if self._animating_lock: + return + old_frontier = self._parser.frontier() + rv = self._parser.match() + if rv is not None: + self._lastoper1["text"] = "Match:" + self._lastoper2["text"] = rv + self._animate_match(old_frontier[0]) + return True + else: + self._lastoper1["text"] = "Match:" + self._lastoper2["text"] = "(failed)" + return False + + def _backtrack(self, *e): + if self._animating_lock: + return + if self._parser.backtrack(): + elt = self._parser.tree() + for i in self._parser.frontier()[0]: + elt = elt[i] + self._lastoper1["text"] = "Backtrack" + self._lastoper2["text"] = "" + if isinstance(elt, Tree): + self._animate_backtrack(self._parser.frontier()[0]) + else: + self._animate_match_backtrack(self._parser.frontier()[0]) + return True + else: + self._autostep = 0 + self._lastoper1["text"] = "Finished" + self._lastoper2["text"] = "" + return False + + def about(self, *e): + ABOUT = ( + "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper" + ) + TITLE = "About: Recursive Descent Parser Application" + try: + from tkinter.messagebox import Message + + Message(message=ABOUT, title=TITLE).show() + except: + ShowText(self._top, TITLE, ABOUT) + + def help(self, *e): + self._autostep = 0 + # The default font's not very legible; try using 'fixed' instead. 
+ try: + ShowText( + self._top, + "Help: Recursive Descent Parser Application", + (__doc__ or "").strip(), + width=75, + font="fixed", + ) + except: + ShowText( + self._top, + "Help: Recursive Descent Parser Application", + (__doc__ or "").strip(), + width=75, + ) + + def postscript(self, *e): + self._autostep = 0 + self._cframe.print_to_file() + + def mainloop(self, *args, **kwargs): + """ + Enter the Tkinter mainloop. This function must be called if + this demo is created from a non-interactive program (e.g. + from a secript); otherwise, the demo will close as soon as + the script completes. + """ + if in_idle(): + return + self._top.mainloop(*args, **kwargs) + + def resize(self, size=None): + if size is not None: + self._size.set(size) + size = self._size.get() + self._font.configure(size=-(abs(size))) + self._boldfont.configure(size=-(abs(size))) + self._sysfont.configure(size=-(abs(size))) + self._bigfont.configure(size=-(abs(size + 2))) + self._redraw() + + ######################################### + ## Expand Production Selection + ######################################### + + def _toggle_grammar(self, *e): + if self._show_grammar.get(): + self._prodframe.pack( + fill="both", side="left", padx=2, after=self._feedbackframe + ) + self._lastoper1["text"] = "Show Grammar" + else: + self._prodframe.pack_forget() + self._lastoper1["text"] = "Hide Grammar" + self._lastoper2["text"] = "" + + # def toggle_grammar(self, *e): + # self._show_grammar = not self._show_grammar + # if self._show_grammar: + # self._prodframe.pack(fill='both', expand='y', side='left', + # after=self._feedbackframe) + # self._lastoper1['text'] = 'Show Grammar' + # else: + # self._prodframe.pack_forget() + # self._lastoper1['text'] = 'Hide Grammar' + # self._lastoper2['text'] = '' + + def _prodlist_select(self, event): + selection = self._prodlist.curselection() + if len(selection) != 1: + return + index = int(selection[0]) + old_frontier = self._parser.frontier() + production = 
self._parser.expand(self._productions[index]) + + if production: + self._lastoper1["text"] = "Expand:" + self._lastoper2["text"] = production + self._prodlist.selection_clear(0, "end") + self._prodlist.selection_set(index) + self._animate_expand(old_frontier[0]) + else: + # Reset the production selections. + self._prodlist.selection_clear(0, "end") + for prod in self._parser.expandable_productions(): + index = self._productions.index(prod) + self._prodlist.selection_set(index) + + ######################################### + ## Animation + ######################################### + + def _animate_expand(self, treeloc): + oldwidget = self._get(self._tree, treeloc) + oldtree = oldwidget.parent() + top = not isinstance(oldtree.parent(), TreeSegmentWidget) + + tree = self._parser.tree() + for i in treeloc: + tree = tree[i] + + widget = tree_to_treesegment( + self._canvas, + tree, + node_font=self._boldfont, + leaf_color="white", + tree_width=2, + tree_color="white", + node_color="white", + leaf_font=self._font, + ) + widget.label()["color"] = "#20a050" + + (oldx, oldy) = oldtree.label().bbox()[:2] + (newx, newy) = widget.label().bbox()[:2] + widget.move(oldx - newx, oldy - newy) + + if top: + self._cframe.add_widget(widget, 0, 5) + widget.move(30 - widget.label().bbox()[0], 0) + self._tree = widget + else: + oldtree.parent().replace_child(oldtree, widget) + + # Move the children over so they don't overlap. + # Line the children up in a strange way. + if widget.subtrees(): + dx = ( + oldx + + widget.label().width() / 2 + - widget.subtrees()[0].bbox()[0] / 2 + - widget.subtrees()[0].bbox()[2] / 2 + ) + for subtree in widget.subtrees(): + subtree.move(dx, 0) + + self._makeroom(widget) + + if top: + self._cframe.destroy_widget(oldtree) + else: + oldtree.destroy() + + colors = [ + "gray%d" % (10 * int(10 * x / self._animation_frames.get())) + for x in range(self._animation_frames.get(), 0, -1) + ] + + # Move the text string down, if necessary. 
+ dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1] + if dy > 0: + for twidget in self._textwidgets: + twidget.move(0, dy) + self._canvas.move(self._textline, 0, dy) + + self._animate_expand_frame(widget, colors) + + def _makeroom(self, treeseg): + """ + Make sure that no sibling tree bbox's overlap. + """ + parent = treeseg.parent() + if not isinstance(parent, TreeSegmentWidget): + return + + index = parent.subtrees().index(treeseg) + + # Handle siblings to the right + rsiblings = parent.subtrees()[index + 1 :] + if rsiblings: + dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10 + for sibling in rsiblings: + sibling.move(dx, 0) + + # Handle siblings to the left + if index > 0: + lsibling = parent.subtrees()[index - 1] + dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10) + treeseg.move(dx, 0) + + # Keep working up the tree. + self._makeroom(parent) + + def _animate_expand_frame(self, widget, colors): + if len(colors) > 0: + self._animating_lock = 1 + widget["color"] = colors[0] + for subtree in widget.subtrees(): + if isinstance(subtree, TreeSegmentWidget): + subtree.label()["color"] = colors[0] + else: + subtree["color"] = colors[0] + self._top.after(50, self._animate_expand_frame, widget, colors[1:]) + else: + widget["color"] = "black" + for subtree in widget.subtrees(): + if isinstance(subtree, TreeSegmentWidget): + subtree.label()["color"] = "black" + else: + subtree["color"] = "black" + self._redraw_quick() + widget.label()["color"] = "black" + self._animating_lock = 0 + if self._autostep: + self._step() + + def _animate_backtrack(self, treeloc): + # Flash red first, if we're animating. 
+ if self._animation_frames.get() == 0: + colors = [] + else: + colors = ["#a00000", "#000000", "#a00000"] + colors += [ + "gray%d" % (10 * int(10 * x / (self._animation_frames.get()))) + for x in range(1, self._animation_frames.get() + 1) + ] + + widgets = [self._get(self._tree, treeloc).parent()] + for subtree in widgets[0].subtrees(): + if isinstance(subtree, TreeSegmentWidget): + widgets.append(subtree.label()) + else: + widgets.append(subtree) + + self._animate_backtrack_frame(widgets, colors) + + def _animate_backtrack_frame(self, widgets, colors): + if len(colors) > 0: + self._animating_lock = 1 + for widget in widgets: + widget["color"] = colors[0] + self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:]) + else: + for widget in widgets[0].subtrees(): + widgets[0].remove_child(widget) + widget.destroy() + self._redraw_quick() + self._animating_lock = 0 + if self._autostep: + self._step() + + def _animate_match_backtrack(self, treeloc): + widget = self._get(self._tree, treeloc) + node = widget.parent().label() + dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max( + 1, self._animation_frames.get() + ) + self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy) + + def _animate_match(self, treeloc): + widget = self._get(self._tree, treeloc) + + dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max( + 1, self._animation_frames.get() + ) + self._animate_match_frame(self._animation_frames.get(), widget, dy) + + def _animate_match_frame(self, frame, widget, dy): + if frame > 0: + self._animating_lock = 1 + widget.move(0, dy) + self._top.after(10, self._animate_match_frame, frame - 1, widget, dy) + else: + widget["color"] = "#006040" + self._redraw_quick() + self._animating_lock = 0 + if self._autostep: + self._step() + + def _animate_match_backtrack_frame(self, frame, widget, dy): + if frame > 0: + self._animating_lock = 1 + widget.move(0, dy) + self._top.after( + 10, self._animate_match_backtrack_frame, frame - 
1, widget, dy + ) + else: + widget.parent().remove_child(widget) + widget.destroy() + self._animating_lock = 0 + if self._autostep: + self._step() + + def edit_grammar(self, *e): + CFGEditor(self._top, self._parser.grammar(), self.set_grammar) + + def set_grammar(self, grammar): + self._parser.set_grammar(grammar) + self._productions = list(grammar.productions()) + self._prodlist.delete(0, "end") + for production in self._productions: + self._prodlist.insert("end", (" %s" % production)) + + def edit_sentence(self, *e): + sentence = " ".join(self._sent) + title = "Edit Text" + instr = "Enter a new sentence to parse." + EntryDialog(self._top, sentence, instr, self.set_sentence, title) + + def set_sentence(self, sentence): + self._sent = sentence.split() # [XX] use tagged? + self.reset() + + +def app(): + """ + Create a recursive descent parser demo, using a simple grammar and + text. + """ + from nltk.grammar import CFG + + grammar = CFG.fromstring( + """ + # Grammatical productions. + S -> NP VP + NP -> Det N PP | Det N + VP -> V NP PP | V NP | V + PP -> P NP + # Lexical productions. + NP -> 'I' + Det -> 'the' | 'a' + N -> 'man' | 'park' | 'dog' | 'telescope' + V -> 'ate' | 'saw' + P -> 'in' | 'under' | 'with' + """ + ) + + sent = "the dog saw a man in the park".split() + + RecursiveDescentApp(grammar, sent).mainloop() + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py new file mode 100644 index 0000000000000000000000000000000000000000..caff43865955b485152388dc9df4f2b374a3bf0c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py @@ -0,0 +1,937 @@ +# Natural Language Toolkit: Shift-Reduce Parser Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +A graphical tool for exploring the shift-reduce parser. 
+ +The shift-reduce parser maintains a stack, which records the structure +of the portion of the text that has been parsed. The stack is +initially empty. Its contents are shown on the left side of the main +canvas. + +On the right side of the main canvas is the remaining text. This is +the portion of the text which has not yet been considered by the +parser. + +The parser builds up a tree structure for the text using two +operations: + + - "shift" moves the first token from the remaining text to the top + of the stack. In the demo, the top of the stack is its right-hand + side. + - "reduce" uses a grammar production to combine the rightmost stack + elements into a single tree token. + +You can control the parser's operation by using the "shift" and +"reduce" buttons; or you can use the "step" button to let the parser +automatically decide which operation to apply. The parser uses the +following rules to decide which operation to apply: + + - Only shift if no reductions are available. + - If multiple reductions are available, then apply the reduction + whose CFG production is listed earliest in the grammar. + +The "reduce" button applies the reduction whose CFG production is +listed earliest in the grammar. There are two ways to manually choose +which reduction to apply: + + - Click on a CFG production from the list of available reductions, + on the left side of the main window. The reduction based on that + production will be applied to the top of the stack. + - Click on one of the stack elements. A popup window will appear, + containing all available reductions. Select one, and it will be + applied to the top of the stack. + +Note that reductions can only be applied to the top of the stack. 
+ +Keyboard Shortcuts:: + [Space]\t Perform the next shift or reduce operation + [s]\t Perform a shift operation + [r]\t Perform a reduction operation + [Ctrl-z]\t Undo most recent operation + [Delete]\t Reset the parser + [g]\t Show/hide available production list + [Ctrl-a]\t Toggle animations + [h]\t Help + [Ctrl-p]\t Print + [q]\t Quit + +""" + +from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk +from tkinter.font import Font + +from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment +from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget +from nltk.parse import SteppingShiftReduceParser +from nltk.tree import Tree +from nltk.util import in_idle + +""" +Possible future improvements: + - button/window to change and/or select text. Just pop up a window + with an entry, and let them modify the text; and then retokenize + it? Maybe give a warning if it contains tokens whose types are + not in the grammar. + - button/window to change and/or select grammar. Select from + several alternative grammars? Or actually change the grammar? If + the later, then I'd want to define nltk.draw.cfg, which would be + responsible for that. +""" + + +class ShiftReduceApp: + """ + A graphical tool for exploring the shift-reduce parser. The tool + displays the parser's stack and the remaining text, and allows the + user to control the parser's operation. In particular, the user + can shift tokens onto the stack, and can perform reductions on the + top elements of the stack. A "step" button simply steps through + the parsing process, performing the operations that + ``nltk.parse.ShiftReduceParser`` would use. + """ + + def __init__(self, grammar, sent, trace=0): + self._sent = sent + self._parser = SteppingShiftReduceParser(grammar, trace) + + # Set up the main window. + self._top = Tk() + self._top.title("Shift Reduce Parser Application") + + # Animations. 
animating_lock is a lock to prevent the demo + # from performing new operations while it's animating. + self._animating_lock = 0 + self._animate = IntVar(self._top) + self._animate.set(10) # = medium + + # The user can hide the grammar. + self._show_grammar = IntVar(self._top) + self._show_grammar.set(1) + + # Initialize fonts. + self._init_fonts(self._top) + + # Set up key bindings. + self._init_bindings() + + # Create the basic frames. + self._init_menubar(self._top) + self._init_buttons(self._top) + self._init_feedback(self._top) + self._init_grammar(self._top) + self._init_canvas(self._top) + + # A popup menu for reducing. + self._reduce_menu = Menu(self._canvas, tearoff=0) + + # Reset the demo, and set the feedback frame to empty. + self.reset() + self._lastoper1["text"] = "" + + ######################################### + ## Initialization Helpers + ######################################### + + def _init_fonts(self, root): + # See: + self._sysfont = Font(font=Button()["font"]) + root.option_add("*Font", self._sysfont) + + # TWhat's our font size (default=same as sysfont) + self._size = IntVar(root) + self._size.set(self._sysfont.cget("size")) + + self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) + self._font = Font(family="helvetica", size=self._size.get()) + + def _init_grammar(self, parent): + # Grammar view. 
+ self._prodframe = listframe = Frame(parent) + self._prodframe.pack(fill="both", side="left", padx=2) + self._prodlist_label = Label( + self._prodframe, font=self._boldfont, text="Available Reductions" + ) + self._prodlist_label.pack() + self._prodlist = Listbox( + self._prodframe, + selectmode="single", + relief="groove", + background="white", + foreground="#909090", + font=self._font, + selectforeground="#004040", + selectbackground="#c0f0c0", + ) + + self._prodlist.pack(side="right", fill="both", expand=1) + + self._productions = list(self._parser.grammar().productions()) + for production in self._productions: + self._prodlist.insert("end", (" %s" % production)) + self._prodlist.config(height=min(len(self._productions), 25)) + + # Add a scrollbar if there are more than 25 productions. + if 1: # len(self._productions) > 25: + listscroll = Scrollbar(self._prodframe, orient="vertical") + self._prodlist.config(yscrollcommand=listscroll.set) + listscroll.config(command=self._prodlist.yview) + listscroll.pack(side="left", fill="y") + + # If they select a production, apply it. + self._prodlist.bind("<>", self._prodlist_select) + + # When they hover over a production, highlight it. 
+ self._hover = -1 + self._prodlist.bind("", self._highlight_hover) + self._prodlist.bind("", self._clear_hover) + + def _init_bindings(self): + # Quit + self._top.bind("", self.destroy) + self._top.bind("", self.destroy) + self._top.bind("", self.destroy) + self._top.bind("", self.destroy) + + # Ops (step, shift, reduce, undo) + self._top.bind("", self.step) + self._top.bind("", self.shift) + self._top.bind("", self.shift) + self._top.bind("", self.shift) + self._top.bind("", self.reduce) + self._top.bind("", self.reduce) + self._top.bind("", self.reduce) + self._top.bind("", self.reset) + self._top.bind("", self.undo) + self._top.bind("", self.undo) + self._top.bind("", self.undo) + self._top.bind("", self.undo) + self._top.bind("", self.undo) + + # Misc + self._top.bind("", self.postscript) + self._top.bind("", self.help) + self._top.bind("", self.help) + self._top.bind("", self.edit_grammar) + self._top.bind("", self.edit_sentence) + + # Animation speed control + self._top.bind("-", lambda e, a=self._animate: a.set(20)) + self._top.bind("=", lambda e, a=self._animate: a.set(10)) + self._top.bind("+", lambda e, a=self._animate: a.set(4)) + + def _init_buttons(self, parent): + # Set up the frames. 
+ self._buttonframe = buttonframe = Frame(parent) + buttonframe.pack(fill="none", side="bottom") + Button( + buttonframe, + text="Step", + background="#90c0d0", + foreground="black", + command=self.step, + ).pack(side="left") + Button( + buttonframe, + text="Shift", + underline=0, + background="#90f090", + foreground="black", + command=self.shift, + ).pack(side="left") + Button( + buttonframe, + text="Reduce", + underline=0, + background="#90f090", + foreground="black", + command=self.reduce, + ).pack(side="left") + Button( + buttonframe, + text="Undo", + underline=0, + background="#f0a0a0", + foreground="black", + command=self.undo, + ).pack(side="left") + + def _init_menubar(self, parent): + menubar = Menu(parent) + + filemenu = Menu(menubar, tearoff=0) + filemenu.add_command( + label="Reset Parser", underline=0, command=self.reset, accelerator="Del" + ) + filemenu.add_command( + label="Print to Postscript", + underline=0, + command=self.postscript, + accelerator="Ctrl-p", + ) + filemenu.add_command( + label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" + ) + menubar.add_cascade(label="File", underline=0, menu=filemenu) + + editmenu = Menu(menubar, tearoff=0) + editmenu.add_command( + label="Edit Grammar", + underline=5, + command=self.edit_grammar, + accelerator="Ctrl-g", + ) + editmenu.add_command( + label="Edit Text", + underline=5, + command=self.edit_sentence, + accelerator="Ctrl-t", + ) + menubar.add_cascade(label="Edit", underline=0, menu=editmenu) + + rulemenu = Menu(menubar, tearoff=0) + rulemenu.add_command( + label="Step", underline=1, command=self.step, accelerator="Space" + ) + rulemenu.add_separator() + rulemenu.add_command( + label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s" + ) + rulemenu.add_command( + label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r" + ) + rulemenu.add_separator() + rulemenu.add_command( + label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u" + ) + 
menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) + + viewmenu = Menu(menubar, tearoff=0) + viewmenu.add_checkbutton( + label="Show Grammar", + underline=0, + variable=self._show_grammar, + command=self._toggle_grammar, + ) + viewmenu.add_separator() + viewmenu.add_radiobutton( + label="Tiny", + variable=self._size, + underline=0, + value=10, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Small", + variable=self._size, + underline=0, + value=12, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Medium", + variable=self._size, + underline=0, + value=14, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Large", + variable=self._size, + underline=0, + value=18, + command=self.resize, + ) + viewmenu.add_radiobutton( + label="Huge", + variable=self._size, + underline=0, + value=24, + command=self.resize, + ) + menubar.add_cascade(label="View", underline=0, menu=viewmenu) + + animatemenu = Menu(menubar, tearoff=0) + animatemenu.add_radiobutton( + label="No Animation", underline=0, variable=self._animate, value=0 + ) + animatemenu.add_radiobutton( + label="Slow Animation", + underline=0, + variable=self._animate, + value=20, + accelerator="-", + ) + animatemenu.add_radiobutton( + label="Normal Animation", + underline=0, + variable=self._animate, + value=10, + accelerator="=", + ) + animatemenu.add_radiobutton( + label="Fast Animation", + underline=0, + variable=self._animate, + value=4, + accelerator="+", + ) + menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) + + helpmenu = Menu(menubar, tearoff=0) + helpmenu.add_command(label="About", underline=0, command=self.about) + helpmenu.add_command( + label="Instructions", underline=0, command=self.help, accelerator="F1" + ) + menubar.add_cascade(label="Help", underline=0, menu=helpmenu) + + parent.config(menu=menubar) + + def _init_feedback(self, parent): + self._feedbackframe = feedbackframe = Frame(parent) + feedbackframe.pack(fill="x", side="bottom", 
padx=3, pady=3) + self._lastoper_label = Label( + feedbackframe, text="Last Operation:", font=self._font + ) + self._lastoper_label.pack(side="left") + lastoperframe = Frame(feedbackframe, relief="sunken", border=1) + lastoperframe.pack(fill="x", side="right", expand=1, padx=5) + self._lastoper1 = Label( + lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font + ) + self._lastoper2 = Label( + lastoperframe, + anchor="w", + width=30, + foreground="#004040", + background="#f0f0f0", + font=self._font, + ) + self._lastoper1.pack(side="left") + self._lastoper2.pack(side="left", fill="x", expand=1) + + def _init_canvas(self, parent): + self._cframe = CanvasFrame( + parent, + background="white", + width=525, + closeenough=10, + border=2, + relief="sunken", + ) + self._cframe.pack(expand=1, fill="both", side="top", pady=2) + canvas = self._canvas = self._cframe.canvas() + + self._stackwidgets = [] + self._rtextwidgets = [] + self._titlebar = canvas.create_rectangle( + 0, 0, 0, 0, fill="#c0f0f0", outline="black" + ) + self._exprline = canvas.create_line(0, 0, 0, 0, dash=".") + self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080") + size = self._size.get() + 4 + self._stacklabel = TextWidget( + canvas, "Stack", color="#004040", font=self._boldfont + ) + self._rtextlabel = TextWidget( + canvas, "Remaining Text", color="#004040", font=self._boldfont + ) + self._cframe.add_widget(self._stacklabel) + self._cframe.add_widget(self._rtextlabel) + + ######################################### + ## Main draw procedure + ######################################### + + def _redraw(self): + scrollregion = self._canvas["scrollregion"].split() + (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion) + + # Delete the old stack & rtext widgets. 
+ for stackwidget in self._stackwidgets: + self._cframe.destroy_widget(stackwidget) + self._stackwidgets = [] + for rtextwidget in self._rtextwidgets: + self._cframe.destroy_widget(rtextwidget) + self._rtextwidgets = [] + + # Position the titlebar & exprline + (x1, y1, x2, y2) = self._stacklabel.bbox() + y = y2 - y1 + 10 + self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4) + self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10) + + # Position the titlebar labels.. + (x1, y1, x2, y2) = self._stacklabel.bbox() + self._stacklabel.move(5 - x1, 3 - y1) + (x1, y1, x2, y2) = self._rtextlabel.bbox() + self._rtextlabel.move(cx2 - x2 - 5, 3 - y1) + + # Draw the stack. + stackx = 5 + for tok in self._parser.stack(): + if isinstance(tok, Tree): + attribs = { + "tree_color": "#4080a0", + "tree_width": 2, + "node_font": self._boldfont, + "node_color": "#006060", + "leaf_color": "#006060", + "leaf_font": self._font, + } + widget = tree_to_treesegment(self._canvas, tok, **attribs) + widget.label()["color"] = "#000000" + else: + widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) + widget.bind_click(self._popup_reduce) + self._stackwidgets.append(widget) + self._cframe.add_widget(widget, stackx, y) + stackx = widget.bbox()[2] + 10 + + # Draw the remaining text. + rtextwidth = 0 + for tok in self._parser.remaining_text(): + widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) + self._rtextwidgets.append(widget) + self._cframe.add_widget(widget, rtextwidth, y) + rtextwidth = widget.bbox()[2] + 4 + + # Allow enough room to shift the next token (for animations) + if len(self._rtextwidgets) > 0: + stackx += self._rtextwidgets[0].width() + + # Move the remaining text to the correct location (keep it + # right-justified, when possible); and move the remaining text + # label, if necessary. 
+ stackx = max(stackx, self._stacklabel.width() + 25) + rlabelwidth = self._rtextlabel.width() + 10 + if stackx >= cx2 - max(rtextwidth, rlabelwidth): + cx2 = stackx + max(rtextwidth, rlabelwidth) + for rtextwidget in self._rtextwidgets: + rtextwidget.move(4 + cx2 - rtextwidth, 0) + self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0) + + midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2 + self._canvas.coords(self._stacktop, midx, 0, midx, 5000) + (x1, y1, x2, y2) = self._stacklabel.bbox() + + # Set up binding to allow them to shift a token by dragging it. + if len(self._rtextwidgets) > 0: + + def drag_shift(widget, midx=midx, self=self): + if widget.bbox()[0] < midx: + self.shift() + else: + self._redraw() + + self._rtextwidgets[0].bind_drag(drag_shift) + self._rtextwidgets[0].bind_click(self.shift) + + # Draw the stack top. + self._highlight_productions() + + def _draw_stack_top(self, widget): + # hack.. + midx = widget.bbox()[2] + 50 + self._canvas.coords(self._stacktop, midx, 0, midx, 5000) + + def _highlight_productions(self): + # Highlight the productions that can be reduced. 
+ self._prodlist.selection_clear(0, "end") + for prod in self._parser.reducible_productions(): + index = self._productions.index(prod) + self._prodlist.selection_set(index) + + ######################################### + ## Button Callbacks + ######################################### + + def destroy(self, *e): + if self._top is None: + return + self._top.destroy() + self._top = None + + def reset(self, *e): + self._parser.initialize(self._sent) + self._lastoper1["text"] = "Reset App" + self._lastoper2["text"] = "" + self._redraw() + + def step(self, *e): + if self.reduce(): + return True + elif self.shift(): + return True + else: + if list(self._parser.parses()): + self._lastoper1["text"] = "Finished:" + self._lastoper2["text"] = "Success" + else: + self._lastoper1["text"] = "Finished:" + self._lastoper2["text"] = "Failure" + + def shift(self, *e): + if self._animating_lock: + return + if self._parser.shift(): + tok = self._parser.stack()[-1] + self._lastoper1["text"] = "Shift:" + self._lastoper2["text"] = "%r" % tok + if self._animate.get(): + self._animate_shift() + else: + self._redraw() + return True + return False + + def reduce(self, *e): + if self._animating_lock: + return + production = self._parser.reduce() + if production: + self._lastoper1["text"] = "Reduce:" + self._lastoper2["text"] = "%s" % production + if self._animate.get(): + self._animate_reduce() + else: + self._redraw() + return production + + def undo(self, *e): + if self._animating_lock: + return + if self._parser.undo(): + self._redraw() + + def postscript(self, *e): + self._cframe.print_to_file() + + def mainloop(self, *args, **kwargs): + """ + Enter the Tkinter mainloop. This function must be called if + this demo is created from a non-interactive program (e.g. + from a secript); otherwise, the demo will close as soon as + the script completes. 
+ """ + if in_idle(): + return + self._top.mainloop(*args, **kwargs) + + ######################################### + ## Menubar callbacks + ######################################### + + def resize(self, size=None): + if size is not None: + self._size.set(size) + size = self._size.get() + self._font.configure(size=-(abs(size))) + self._boldfont.configure(size=-(abs(size))) + self._sysfont.configure(size=-(abs(size))) + + # self._stacklabel['font'] = ('helvetica', -size-4, 'bold') + # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold') + # self._lastoper_label['font'] = ('helvetica', -size) + # self._lastoper1['font'] = ('helvetica', -size) + # self._lastoper2['font'] = ('helvetica', -size) + # self._prodlist['font'] = ('helvetica', -size) + # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold') + self._redraw() + + def help(self, *e): + # The default font's not very legible; try using 'fixed' instead. + try: + ShowText( + self._top, + "Help: Shift-Reduce Parser Application", + (__doc__ or "").strip(), + width=75, + font="fixed", + ) + except: + ShowText( + self._top, + "Help: Shift-Reduce Parser Application", + (__doc__ or "").strip(), + width=75, + ) + + def about(self, *e): + ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper" + TITLE = "About: Shift-Reduce Parser Application" + try: + from tkinter.messagebox import Message + + Message(message=ABOUT, title=TITLE).show() + except: + ShowText(self._top, TITLE, ABOUT) + + def edit_grammar(self, *e): + CFGEditor(self._top, self._parser.grammar(), self.set_grammar) + + def set_grammar(self, grammar): + self._parser.set_grammar(grammar) + self._productions = list(grammar.productions()) + self._prodlist.delete(0, "end") + for production in self._productions: + self._prodlist.insert("end", (" %s" % production)) + + def edit_sentence(self, *e): + sentence = " ".join(self._sent) + title = "Edit Text" + instr = "Enter a new sentence to parse." 
+ EntryDialog(self._top, sentence, instr, self.set_sentence, title) + + def set_sentence(self, sent): + self._sent = sent.split() # [XX] use tagged? + self.reset() + + ######################################### + ## Reduce Production Selection + ######################################### + + def _toggle_grammar(self, *e): + if self._show_grammar.get(): + self._prodframe.pack( + fill="both", side="left", padx=2, after=self._feedbackframe + ) + self._lastoper1["text"] = "Show Grammar" + else: + self._prodframe.pack_forget() + self._lastoper1["text"] = "Hide Grammar" + self._lastoper2["text"] = "" + + def _prodlist_select(self, event): + selection = self._prodlist.curselection() + if len(selection) != 1: + return + index = int(selection[0]) + production = self._parser.reduce(self._productions[index]) + if production: + self._lastoper1["text"] = "Reduce:" + self._lastoper2["text"] = "%s" % production + if self._animate.get(): + self._animate_reduce() + else: + self._redraw() + else: + # Reset the production selections. + self._prodlist.selection_clear(0, "end") + for prod in self._parser.reducible_productions(): + index = self._productions.index(prod) + self._prodlist.selection_set(index) + + def _popup_reduce(self, widget): + # Remove old commands. + productions = self._parser.reducible_productions() + if len(productions) == 0: + return + + self._reduce_menu.delete(0, "end") + for production in productions: + self._reduce_menu.add_command(label=str(production), command=self.reduce) + self._reduce_menu.post( + self._canvas.winfo_pointerx(), self._canvas.winfo_pointery() + ) + + ######################################### + ## Animations + ######################################### + + def _animate_shift(self): + # What widget are we shifting? + widget = self._rtextwidgets[0] + + # Where are we shifting from & to? + right = widget.bbox()[0] + if len(self._stackwidgets) == 0: + left = 5 + else: + left = self._stackwidgets[-1].bbox()[2] + 10 + + # Start animating. 
+ dt = self._animate.get() + dx = (left - right) * 1.0 / dt + self._animate_shift_frame(dt, widget, dx) + + def _animate_shift_frame(self, frame, widget, dx): + if frame > 0: + self._animating_lock = 1 + widget.move(dx, 0) + self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx) + else: + # but: stacktop?? + + # Shift the widget to the stack. + del self._rtextwidgets[0] + self._stackwidgets.append(widget) + self._animating_lock = 0 + + # Display the available productions. + self._draw_stack_top(widget) + self._highlight_productions() + + def _animate_reduce(self): + # What widgets are we shifting? + numwidgets = len(self._parser.stack()[-1]) # number of children + widgets = self._stackwidgets[-numwidgets:] + + # How far are we moving? + if isinstance(widgets[0], TreeSegmentWidget): + ydist = 15 + widgets[0].label().height() + else: + ydist = 15 + widgets[0].height() + + # Start animating. + dt = self._animate.get() + dy = ydist * 2.0 / dt + self._animate_reduce_frame(dt / 2, widgets, dy) + + def _animate_reduce_frame(self, frame, widgets, dy): + if frame > 0: + self._animating_lock = 1 + for widget in widgets: + widget.move(0, dy) + self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy) + else: + del self._stackwidgets[-len(widgets) :] + for widget in widgets: + self._cframe.remove_widget(widget) + tok = self._parser.stack()[-1] + if not isinstance(tok, Tree): + raise ValueError() + label = TextWidget( + self._canvas, str(tok.label()), color="#006060", font=self._boldfont + ) + widget = TreeSegmentWidget(self._canvas, label, widgets, width=2) + (x1, y1, x2, y2) = self._stacklabel.bbox() + y = y2 - y1 + 10 + if not self._stackwidgets: + x = 5 + else: + x = self._stackwidgets[-1].bbox()[2] + 10 + self._cframe.add_widget(widget, x, y) + self._stackwidgets.append(widget) + + # Display the available productions. + self._draw_stack_top(widget) + self._highlight_productions() + + # # Delete the old widgets.. 
+ # del self._stackwidgets[-len(widgets):] + # for widget in widgets: + # self._cframe.destroy_widget(widget) + # + # # Make a new one. + # tok = self._parser.stack()[-1] + # if isinstance(tok, Tree): + # attribs = {'tree_color': '#4080a0', 'tree_width': 2, + # 'node_font': bold, 'node_color': '#006060', + # 'leaf_color': '#006060', 'leaf_font':self._font} + # widget = tree_to_treesegment(self._canvas, tok.type(), + # **attribs) + # widget.node()['color'] = '#000000' + # else: + # widget = TextWidget(self._canvas, tok.type(), + # color='#000000', font=self._font) + # widget.bind_click(self._popup_reduce) + # (x1, y1, x2, y2) = self._stacklabel.bbox() + # y = y2-y1+10 + # if not self._stackwidgets: x = 5 + # else: x = self._stackwidgets[-1].bbox()[2] + 10 + # self._cframe.add_widget(widget, x, y) + # self._stackwidgets.append(widget) + + # self._redraw() + self._animating_lock = 0 + + ######################################### + ## Hovering. + ######################################### + + def _highlight_hover(self, event): + # What production are we hovering over? + index = self._prodlist.nearest(event.y) + if self._hover == index: + return + + # Clear any previous hover highlighting. + self._clear_hover() + + # If the production corresponds to an available reduction, + # highlight the stack. + selection = [int(s) for s in self._prodlist.curselection()] + if index in selection: + rhslen = len(self._productions[index].rhs()) + for stackwidget in self._stackwidgets[-rhslen:]: + if isinstance(stackwidget, TreeSegmentWidget): + stackwidget.label()["color"] = "#00a000" + else: + stackwidget["color"] = "#00a000" + + # Remember what production we're hovering over. + self._hover = index + + def _clear_hover(self, *event): + # Clear any previous hover highlighting. 
+ if self._hover == -1: + return + self._hover = -1 + for stackwidget in self._stackwidgets: + if isinstance(stackwidget, TreeSegmentWidget): + stackwidget.label()["color"] = "black" + else: + stackwidget["color"] = "black" + + +def app(): + """ + Create a shift reduce parser app, using a simple grammar and + text. + """ + + from nltk.grammar import CFG, Nonterminal, Production + + nonterminals = "S VP NP PP P N Name V Det" + (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) + + productions = ( + # Syntactic Productions + Production(S, [NP, VP]), + Production(NP, [Det, N]), + Production(NP, [NP, PP]), + Production(VP, [VP, PP]), + Production(VP, [V, NP, PP]), + Production(VP, [V, NP]), + Production(PP, [P, NP]), + # Lexical Productions + Production(NP, ["I"]), + Production(Det, ["the"]), + Production(Det, ["a"]), + Production(N, ["man"]), + Production(V, ["saw"]), + Production(P, ["in"]), + Production(P, ["with"]), + Production(N, ["park"]), + Production(N, ["dog"]), + Production(N, ["statue"]), + Production(Det, ["my"]), + ) + + grammar = CFG(S, productions) + + # tokenize the sentence + sent = "my dog saw a man in the park with a statue".split() + + ShiftReduceApp(grammar, sent).mainloop() + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py new file mode 100644 index 0000000000000000000000000000000000000000..978d042a6f687e213f015b08429e5ad891acda4c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py @@ -0,0 +1,36 @@ +# Natural Language Toolkit: Wordfreq Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Sumukh Ghodke +# URL: +# For license information, see LICENSE.TXT + +from matplotlib import pylab + +from nltk.corpus import gutenberg +from nltk.text import Text + + +def plot_word_freq_dist(text): + fd = text.vocab() + + samples = [item for item, _ in fd.most_common(50)] 
+ values = [fd[sample] for sample in samples] + values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))] + pylab.title(text.name) + pylab.xlabel("Samples") + pylab.ylabel("Cumulative Percentage") + pylab.plot(values) + pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) + pylab.show() + + +def app(): + t1 = Text(gutenberg.words("melville-moby_dick.txt")) + plot_word_freq_dist(t1) + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py new file mode 100644 index 0000000000000000000000000000000000000000..3ec8e78f2d70a190425ff376df2acf85e8915250 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py @@ -0,0 +1,997 @@ +# Natural Language Toolkit: WordNet Browser Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Jussi Salmela +# Paul Bone +# URL: +# For license information, see LICENSE.TXT + +""" +A WordNet Browser application which launches the default browser +(if it is not already running) and opens a new tab with a connection +to http://localhost:port/ . It also starts an HTTP server on the +specified port and begins serving browser requests. The default +port is 8000. (For command-line help, run "python wordnet -h") +This application requires that the user's web browser supports +Javascript. + +BrowServer is a server for browsing the NLTK Wordnet database It first +launches a browser client to be used for browsing and then starts +serving the requests of that and maybe other clients + +Usage:: + + browserver.py -h + browserver.py [-s] [-p ] + +Options:: + + -h or --help + Display this help message. + + -l or --log-file + Logs messages to the given file, If this option is not specified + messages are silently dropped. + + -p or --port + Run the web server on this TCP port, defaults to 8000. 
+ + -s or --server-mode + Do not start a web browser, and do not allow a user to + shutdown the server through the web interface. +""" +# TODO: throughout this package variable names and docstrings need +# modifying to be compliant with NLTK's coding standards. Tests also +# need to be develop to ensure this continues to work in the face of +# changes to other NLTK packages. + +import base64 +import copy +import datetime +import getopt +import os +import pickle +import re +import sys +import threading +import time +import webbrowser +from collections import defaultdict +from http.server import BaseHTTPRequestHandler, HTTPServer + +# Allow this program to run inside the NLTK source tree. +from sys import argv, path +from urllib.parse import unquote_plus + +from nltk.corpus import wordnet as wn +from nltk.corpus.reader.wordnet import Lemma, Synset + +# now included in local file +# from util import html_header, html_trailer, \ +# get_static_index_page, get_static_page_by_path, \ +# page_from_word, page_from_href + +firstClient = True + +# True if we're not also running a web browser. The value f server_mode +# gets set by demo(). +server_mode = None + +# If set this is a file object for writing log messages. +logfile = None + + +class MyServerHandler(BaseHTTPRequestHandler): + def do_HEAD(self): + self.send_head() + + def do_GET(self): + global firstClient + sp = self.path[1:] + if unquote_plus(sp) == "SHUTDOWN THE SERVER": + if server_mode: + page = "Server must be killed with SIGTERM." + type = "text/plain" + else: + print("Server shutting down!") + os._exit(0) + + elif sp == "": # First request. 
+ type = "text/html" + if not server_mode and firstClient: + firstClient = False + page = get_static_index_page(True) + else: + page = get_static_index_page(False) + word = "green" + + elif sp.endswith(".html"): # Trying to fetch a HTML file TODO: + type = "text/html" + usp = unquote_plus(sp) + if usp == "NLTK Wordnet Browser Database Info.html": + word = "* Database Info *" + if os.path.isfile(usp): + with open(usp) as infile: + page = infile.read() + else: + page = ( + (html_header % word) + "

The database info file:" + "

" + + usp + + "" + + "

was not found. Run this:" + + "

python dbinfo_html.py" + + "

to produce it." + + html_trailer + ) + else: + # Handle files here. + word = sp + page = get_static_page_by_path(usp) + elif sp.startswith("search"): + # This doesn't seem to work with MWEs. + type = "text/html" + parts = (sp.split("?")[1]).split("&") + word = [ + p.split("=")[1].replace("+", " ") + for p in parts + if p.startswith("nextWord") + ][0] + page, word = page_from_word(word) + elif sp.startswith("lookup_"): + # TODO add a variation of this that takes a non ecoded word or MWE. + type = "text/html" + sp = sp[len("lookup_") :] + page, word = page_from_href(sp) + elif sp == "start_page": + # if this is the first request we should display help + # information, and possibly set a default word. + type = "text/html" + page, word = page_from_word("wordnet") + else: + type = "text/plain" + page = "Could not parse request: '%s'" % sp + + # Send result. + self.send_head(type) + self.wfile.write(page.encode("utf8")) + + def send_head(self, type=None): + self.send_response(200) + self.send_header("Content-type", type) + self.end_headers() + + def log_message(self, format, *args): + global logfile + + if logfile: + logfile.write( + "%s - - [%s] %s\n" + % (self.address_string(), self.log_date_time_string(), format % args) + ) + + +def get_unique_counter_from_url(sp): + """ + Extract the unique counter from the URL if it has one. Otherwise return + null. + """ + pos = sp.rfind("%23") + if pos != -1: + return int(sp[(pos + 3) :]) + else: + return None + + +def wnb(port=8000, runBrowser=True, logfilename=None): + """ + Run NLTK Wordnet Browser Server. + + :param port: The port number for the server to listen on, defaults to + 8000 + :type port: int + + :param runBrowser: True to start a web browser and point it at the web + server. + :type runBrowser: bool + """ + # The webbrowser module is unpredictable, typically it blocks if it uses + # a console web browser, and doesn't block if it uses a GUI webbrowser, + # so we need to force it to have a clear correct behaviour. 
+ # + # Normally the server should run for as long as the user wants. They + # should ideally be able to control this from the UI by closing the + # window or tab. Second best would be clicking a button to say + # 'Shutdown' that first shuts down the server and closes the window or + # tab, or exits the text-mode browser. Both of these are unfeasible. + # + # The next best alternative is to start the server, have it close when + # it receives SIGTERM (default), and run the browser as well. The user + # may have to shut down both programs. + # + # Since webbrowser may block, and the webserver will block, we must run + # them in separate threads. + # + global server_mode, logfile + server_mode = not runBrowser + + # Setup logging. + if logfilename: + try: + logfile = open(logfilename, "a", 1) # 1 means 'line buffering' + except OSError as e: + # write() accepts exactly one string, so format the message first. + sys.stderr.write("Couldn't open %s for writing: %s\n" % (logfilename, e)) + sys.exit(1) + else: + logfile = None + + # Compute URL and start web browser + url = "http://localhost:" + str(port) + + server_ready = None + browser_thread = None + + if runBrowser: + server_ready = threading.Event() + browser_thread = startBrowser(url, server_ready) + + # Start the server. + server = HTTPServer(("", port), MyServerHandler) + if logfile: + logfile.write("NLTK Wordnet browser server running serving: %s\n" % url) + if runBrowser: + server_ready.set() + + try: + server.serve_forever() + except KeyboardInterrupt: + pass + + if runBrowser: + browser_thread.join() + + if logfile: + logfile.close() + + +def startBrowser(url, server_ready): + def run(): + server_ready.wait() + time.sleep(1) # Wait a little bit more, there's still the chance of + # a race condition. 
+ webbrowser.open(url, new=2, autoraise=1) + + t = threading.Thread(target=run) + t.start() + return t + + +##################################################################### +# Utilities +##################################################################### + + +""" +WordNet Browser Utilities. + +This provides a backend to both wxbrowse and browserver.py. +""" + +################################################################################ +# +# Main logic for wordnet browser. +# + +# This is wrapped inside a function since wn is only available if the +# WordNet corpus is installed. +def _pos_tuples(): + return [ + (wn.NOUN, "N", "noun"), + (wn.VERB, "V", "verb"), + (wn.ADJ, "J", "adj"), + (wn.ADV, "R", "adv"), + ] + + +def _pos_match(pos_tuple): + """ + This function returns the complete pos tuple for the partial pos + tuple given to it. It attempts to match it against the first + non-null component of the given pos tuple. + """ + if pos_tuple[0] == "s": + pos_tuple = ("a", pos_tuple[1], pos_tuple[2]) + for n, x in enumerate(pos_tuple): + if x is not None: + break + for pt in _pos_tuples(): + if pt[n] == pos_tuple[n]: + return pt + return None + + +HYPONYM = 0 +HYPERNYM = 1 +CLASS_REGIONAL = 2 +PART_HOLONYM = 3 +PART_MERONYM = 4 +ATTRIBUTE = 5 +SUBSTANCE_HOLONYM = 6 +SUBSTANCE_MERONYM = 7 +MEMBER_HOLONYM = 8 +MEMBER_MERONYM = 9 +VERB_GROUP = 10 +INSTANCE_HYPONYM = 12 +INSTANCE_HYPERNYM = 13 +CAUSE = 14 +ALSO_SEE = 15 +SIMILAR = 16 +ENTAILMENT = 17 +ANTONYM = 18 +FRAMES = 19 +PERTAINYM = 20 + +CLASS_CATEGORY = 21 +CLASS_USAGE = 22 +CLASS_REGIONAL = 23 +CLASS_USAGE = 24 +CLASS_CATEGORY = 11 + +DERIVATIONALLY_RELATED_FORM = 25 + +INDIRECT_HYPERNYMS = 26 + + +def lemma_property(word, synset, func): + """ + Apply ``func`` to every lemma of ``synset`` whose name equals ``word`` + and concatenate the lists it returns into a single list. + """ + def flattern(l): + if l == []: + return [] + else: + return l[0] + flattern(l[1:]) + + # name is a method on lemmas; comparing the bound method itself to a + # string is always False and silently produced an empty result. + return flattern([func(l) for l in synset.lemmas() if l.name() == word]) + + +def rebuild_tree(orig_tree): + node = orig_tree[0] + children = orig_tree[1:] + return (node, 
[rebuild_tree(t) for t in children]) + + +def get_relations_data(word, synset): + """ + Get synset relations data for a synset. Note that this doesn't + yet support things such as full hyponym vs direct hyponym. + """ + if synset.pos() == wn.NOUN: + return ( + (HYPONYM, "Hyponyms", synset.hyponyms()), + (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()), + (HYPERNYM, "Direct hypernyms", synset.hypernyms()), + ( + INDIRECT_HYPERNYMS, + "Indirect hypernyms", + rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], + ), + # hypernyms', 'Sister terms', + (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()), + # (CLASS_REGIONAL, ['domain term region'], ), + (PART_HOLONYM, "Part holonyms", synset.part_holonyms()), + (PART_MERONYM, "Part meronyms", synset.part_meronyms()), + (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()), + (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()), + (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()), + (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()), + (ATTRIBUTE, "Attributes", synset.attributes()), + (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())), + ( + DERIVATIONALLY_RELATED_FORM, + "Derivationally related form", + lemma_property( + word, synset, lambda l: l.derivationally_related_forms() + ), + ), + ) + elif synset.pos() == wn.VERB: + return ( + (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), + (HYPONYM, "Hyponym", synset.hyponyms()), + (HYPERNYM, "Direct hypernyms", synset.hypernyms()), + ( + INDIRECT_HYPERNYMS, + "Indirect hypernyms", + rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], + ), + (ENTAILMENT, "Entailments", synset.entailments()), + (CAUSE, "Causes", synset.causes()), + (ALSO_SEE, "Also see", synset.also_sees()), + (VERB_GROUP, "Verb Groups", synset.verb_groups()), + ( + DERIVATIONALLY_RELATED_FORM, + "Derivationally related form", + lemma_property( + word, synset, 
lambda l: l.derivationally_related_forms() + ), + ), + ) + # pos must be called here too; the bare method never equals ADJ_SAT, + # which sent satellite adjectives to the TypeError branch below. + elif synset.pos() == wn.ADJ or synset.pos() == wn.ADJ_SAT: + return ( + (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), + (SIMILAR, "Similar to", synset.similar_tos()), + # Participle of verb - not supported by corpus + ( + PERTAINYM, + "Pertainyms", + lemma_property(word, synset, lambda l: l.pertainyms()), + ), + (ATTRIBUTE, "Attributes", synset.attributes()), + (ALSO_SEE, "Also see", synset.also_sees()), + ) + elif synset.pos() == wn.ADV: + # This is weird. adverbs such as 'quick' and 'fast' don't seem + # to have antonyms returned by the corpus.a + return ( + (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), + ) + # Derived from adjective - not supported by corpus + else: + raise TypeError("Unhandles synset POS type: " + str(synset.pos())) + + +html_header = """ + + + + + +NLTK Wordnet Browser display of: %s + +""" +html_trailer = """ + + +""" + +explanation = """ +

Search Help

+
  • The display below the line is an example of the output the browser +shows you when you enter a search word. The search word was green.
  • +
  • The search result shows for different parts of speech the synsets +i.e. different meanings for the word.
  • +
  • All underlined texts are hypertext links. There are two types of links: +word links and others. Clicking a word link carries out a search for the word +in the Wordnet database.
  • +
  • Clicking a link of the other type opens a display section of data attached +to that link. Clicking that link a second time closes the section again.
  • +
  • Clicking S: opens a section showing the relations for that synset. +
  • +
  • Clicking on a relation name opens a section that displays the associated +synsets.
  • +
  • Type a search word in the Word field and start the search by the +Enter/Return key or click the Search button.
  • +
+
+""" + +# HTML oriented functions + + +def _bold(txt): + return "%s" % txt + + +def _center(txt): + return "
%s
" % txt + + +def _hlev(n, txt): + return "%s" % (n, txt, n) + + +def _italic(txt): + return "%s" % txt + + +def _li(txt): + return "
  • %s
  • " % txt + + +def pg(word, body): + """ + Return a HTML page of NLTK Browser format constructed from the + word and body + + :param word: The word that the body corresponds to + :type word: str + :param body: The HTML body corresponding to the word + :type body: str + :return: a HTML page for the word-body combination + :rtype: str + """ + return (html_header % word) + body + html_trailer + + +def _ul(txt): + return "
      " + txt + "
    " + + +def _abbc(txt): + """ + abbc = asterisks, breaks, bold, center + """ + return _center(_bold("
    " * 10 + "*" * 10 + " " + txt + " " + "*" * 10)) + + +full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n" + + +def _get_synset(synset_key): + """ + The synset key is the unique name of the synset, this can be + retrieved via synset.name() + """ + return wn.synset(synset_key) + + +def _collect_one_synset(word, synset, synset_relations): + """ + Returns the HTML string for one synset or word + + :param word: the current word + :type word: str + :param synset: a synset + :type synset: synset + :param synset_relations: information about which synset relations + to display. + :type synset_relations: dict(synset_key, set(relation_id)) + :return: The HTML string built for this synset + :rtype: str + """ + if isinstance(synset, tuple): # It's a word + raise NotImplementedError("word not supported by _collect_one_synset") + + typ = "S" + pos_tuple = _pos_match((synset.pos(), None, None)) + assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos() + descr = pos_tuple[2] + ref = copy.deepcopy(Reference(word, synset_relations)) + ref.toggle_synset(synset) + synset_label = typ + ";" + if synset.name() in synset_relations: + synset_label = _bold(synset_label) + s = f"
  • {make_lookup_link(ref, synset_label)} ({descr}) " + + def format_lemma(w): + w = w.replace("_", " ") + if w.lower() == word: + return _bold(w) + else: + ref = Reference(w) + return make_lookup_link(ref, w) + + s += ", ".join(format_lemma(l.name()) for l in synset.lemmas()) + + gl = " ({}) {} ".format( + synset.definition(), + "; ".join('"%s"' % e for e in synset.examples()), + ) + return s + gl + _synset_relations(word, synset, synset_relations) + "
  • \n" + + +def _collect_all_synsets(word, pos, synset_relations=dict()): + """ + Return a HTML unordered list of synsets for the given word and + part of speech. + """ + return "
      %s\n
    \n" % "".join( + _collect_one_synset(word, synset, synset_relations) + for synset in wn.synsets(word, pos) + ) + + +def _synset_relations(word, synset, synset_relations): + """ + Builds the HTML string for the relations of a synset + + :param word: The current word + :type word: str + :param synset: The synset for which we're building the relations. + :type synset: Synset + :param synset_relations: synset keys and relation types for which to display relations. + :type synset_relations: dict(synset_key, set(relation_type)) + :return: The HTML for a synset's relations + :rtype: str + """ + + if not synset.name() in synset_relations: + return "" + ref = Reference(word, synset_relations) + + def relation_html(r): + if isinstance(r, Synset): + return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0]) + elif isinstance(r, Lemma): + return relation_html(r.synset()) + elif isinstance(r, tuple): + # It's probably a tuple containing a Synset and a list of + # similar tuples. This forms a tree of synsets. + return "{}\n
      {}
    \n".format( + relation_html(r[0]), + "".join("
  • %s
  • \n" % relation_html(sr) for sr in r[1]), + ) + else: + raise TypeError( + "r must be a synset, lemma or list, it was: type(r) = %s, r = %s" + % (type(r), r) + ) + + def make_synset_html(db_name, disp_name, rels): + synset_html = "%s\n" % make_lookup_link( + copy.deepcopy(ref).toggle_synset_relation(synset, db_name), + disp_name, + ) + + if db_name in ref.synset_relations[synset.name()]: + synset_html += "
      %s
    \n" % "".join( + "
  • %s
  • \n" % relation_html(r) for r in rels + ) + + return synset_html + + html = ( + "
      " + + "\n".join( + "
    • %s
    • " % make_synset_html(*rel_data) + for rel_data in get_relations_data(word, synset) + if rel_data[2] != [] + ) + + "
    " + ) + + return html + + +class Reference: + """ + A reference to a page that may be generated by page_word + """ + + def __init__(self, word, synset_relations=dict()): + """ + Build a reference to a new page. + + word is the word or words (separated by commas) for which to + search for synsets of + + synset_relations is a dictionary of synset keys to sets of + synset relation identifaiers to unfold a list of synset + relations for. + """ + self.word = word + self.synset_relations = synset_relations + + def encode(self): + """ + Encode this reference into a string to be used in a URL. + """ + # This uses a tuple rather than an object since the python + # pickle representation is much smaller and there is no need + # to represent the complete object. + string = pickle.dumps((self.word, self.synset_relations), -1) + return base64.urlsafe_b64encode(string).decode() + + @staticmethod + def decode(string): + """ + Decode a reference encoded with Reference.encode + """ + string = base64.urlsafe_b64decode(string.encode()) + word, synset_relations = pickle.loads(string) + return Reference(word, synset_relations) + + def toggle_synset_relation(self, synset, relation): + """ + Toggle the display of the relations for the given synset and + relation type. + + This function will throw a KeyError if the synset is currently + not being displayed. + """ + if relation in self.synset_relations[synset.name()]: + self.synset_relations[synset.name()].remove(relation) + else: + self.synset_relations[synset.name()].add(relation) + + return self + + def toggle_synset(self, synset): + """ + Toggle displaying of the relation types for the given synset + """ + if synset.name() in self.synset_relations: + del self.synset_relations[synset.name()] + else: + self.synset_relations[synset.name()] = set() + + return self + + +def make_lookup_link(ref, label): + return f'{label}' + + +def page_from_word(word): + """ + Return a HTML page for the given word. 
+ + :type word: str + :param word: The currently active word + :return: A tuple (page,word), where page is the new current HTML page + to be sent to the browser and + word is the new current word + :rtype: A tuple (str,str) + """ + return page_from_reference(Reference(word)) + + +def page_from_href(href): + """ + Returns a tuple of the HTML page built and the new current word + + :param href: The hypertext reference to be solved + :type href: str + :return: A tuple (page,word), where page is the new current HTML page + to be sent to the browser and + word is the new current word + :rtype: A tuple (str,str) + """ + return page_from_reference(Reference.decode(href)) + + +def page_from_reference(href): + """ + Returns a tuple of the HTML page built and the new current word + + :param href: The hypertext reference to be solved + :type href: str + :return: A tuple (page,word), where page is the new current HTML page + to be sent to the browser and + word is the new current word + :rtype: A tuple (str,str) + """ + word = href.word + pos_forms = defaultdict(list) + words = word.split(",") + words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""] + if len(words) == 0: + # No words were found. + return "", "Please specify a word to search for." + + # This looks up multiple words at once. This is probably not + # necessary and may lead to problems. + for w in words: + for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]: + form = wn.morphy(w, pos) + if form and form not in pos_forms[pos]: + pos_forms[pos].append(form) + body = "" + for pos, pos_str, name in _pos_tuples(): + if pos in pos_forms: + body += _hlev(3, name) + "\n" + for w in pos_forms[pos]: + # Not all words of exc files are in the database, skip + # to the next word if a KeyError is raised. + try: + body += _collect_all_synsets(w, pos, href.synset_relations) + except KeyError: + pass + if not body: + body = "The word or words '%s' where not found in the dictionary." 
% word + return body, word + + +##################################################################### +# Static pages +##################################################################### + + +def get_static_page_by_path(path): + """ + Return a static HTML page from the path given. + """ + if path == "index_2.html": + return get_static_index_page(False) + elif path == "index.html": + return get_static_index_page(True) + elif path == "NLTK Wordnet Browser Database Info.html": + return "Display of Wordnet Database Statistics is not supported" + elif path == "upper_2.html": + return get_static_upper_page(False) + elif path == "upper.html": + return get_static_upper_page(True) + elif path == "web_help.html": + return get_static_web_help_page() + elif path == "wx_help.html": + return get_static_wx_help_page() + else: + return "Internal error: Path for static page '%s' is unknown" % path + + +def get_static_web_help_page(): + """ + Return the static web help page. + """ + return """ + + + + + + NLTK Wordnet Browser display of: * Help * + + +

    NLTK Wordnet Browser Help

    +

    The NLTK Wordnet Browser is a tool to use in browsing the Wordnet database. It tries to behave like the Wordnet project's web browser but the difference is that the NLTK Wordnet Browser uses a local Wordnet database. +

    You are using the Javascript client part of the NLTK Wordnet BrowseServer. We assume your browser is in tab sheets enabled mode.

    +

    For background information on Wordnet, see the Wordnet project home page: https://wordnet.princeton.edu/. For more information on the NLTK project, see the project home: +https://www.nltk.org/. To get an idea of what the Wordnet version used by this browser includes choose Show Database Info from the View submenu.

    +

    Word search

    +

    The word to be searched is typed into the New Word field and the search started with Enter or by clicking the Search button. There is no uppercase/lowercase distinction: the search word is transformed to lowercase before the search.

    +

    In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by making certain morphological substitutions. Typing fLIeS as an obscure example gives one this. Click the previous link to see what this kind of search looks like and then come back to this page by using the Alt+LeftArrow key combination.

    +

    The result of a search is a display of one or more +synsets for every part of speech in which a form of the +search word was found to occur. A synset is a set of words +having the same sense or meaning. Each word in a synset that is +underlined is a hyperlink which can be clicked to trigger an +automatic search for that word.

    +

    Every synset has a hyperlink S: at the start of its +display line. Clicking that symbol shows you the name of every +relation that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.

    +

    It is also possible to give two or more words or collocations to be searched at the same time separating them with a comma like this cheer up,clear up, for example. Click the previous link to see what this kind of search looks like and then come back to this page by using the Alt+LeftArrow key combination. As you could see the search result includes the synsets found in the same order than the forms were given in the search field.

    +

    +There are also word level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink W: at their beginning. Clicking this link shows more info on the word in question.

    +

    The Buttons

    +

    The Search and Help buttons need no more explanation.

    +

    The Show Database Info button shows a collection of Wordnet database statistics.

    +

    The Shutdown the Server button is shown for the first client of the BrowServer program i.e. for the client that is automatically launched when the BrowServer is started but not for the succeeding clients in order to protect the server from accidental shutdowns. +

    + +""" + + +def get_static_welcome_message(): + """ + Get the static welcome page. + """ + return """ +

    Search Help

    +
    • The display below the line is an example of the output the browser +shows you when you enter a search word. The search word was green.
    • +
    • The search result shows for different parts of speech the synsets +i.e. different meanings for the word.
    • +
    • All underlined texts are hypertext links. There are two types of links: +word links and others. Clicking a word link carries out a search for the word +in the Wordnet database.
    • +
    • Clicking a link of the other type opens a display section of data attached +to that link. Clicking that link a second time closes the section again.
    • +
    • Clicking S: opens a section showing the relations for that synset.
    • +
    • Clicking on a relation name opens a section that displays the associated +synsets.
    • +
    • Type a search word in the Next Word field and start the search by the +Enter/Return key or click the Search button.
    • +
    +""" + + +def get_static_index_page(with_shutdown): + """ + Get the static index page. + """ + template = """ + + + + + NLTK Wordnet Browser + + + + + + + +""" + if with_shutdown: + upper_link = "upper.html" + else: + upper_link = "upper_2.html" + + return template % upper_link + + +def get_static_upper_page(with_shutdown): + """ + Return the upper frame page, + + If with_shutdown is True then a 'shutdown' button is also provided + to shutdown the server. + """ + template = """ + + + + + + Untitled Document + + +
    + Current Word:  + Next Word:  + +
    + Help + %s + + + +""" + if with_shutdown: + shutdown_link = 'Shutdown' + else: + shutdown_link = "" + + return template % shutdown_link + + +def usage(): + """ + Display the command line help message. + """ + print(__doc__) + + +def app(): + # Parse and interpret options. + (opts, _) = getopt.getopt( + argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"] + ) + port = 8000 + server_mode = False + help_mode = False + logfilename = None + for (opt, value) in opts: + if (opt == "-l") or (opt == "--logfile"): + logfilename = str(value) + elif (opt == "-p") or (opt == "--port"): + port = int(value) + elif (opt == "-s") or (opt == "--server-mode"): + server_mode = True + elif (opt == "-h") or (opt == "--help"): + help_mode = True + + if help_mode: + usage() + else: + wnb(port, not server_mode, logfilename) + + +if __name__ == "__main__": + app() + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d8ceabba55aeb13f47c8dc722983ed1ecad0d394 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py @@ -0,0 +1,34 @@ +# Natural Language Toolkit: Combinatory Categorial Grammar +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Graeme Gange +# URL: +# For license information, see LICENSE.TXT + +""" +Combinatory Categorial Grammar. 
+ +For more information see nltk/doc/contrib/ccg/ccg.pdf +""" + +from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge +from nltk.ccg.combinator import ( + BackwardApplication, + BackwardBx, + BackwardCombinator, + BackwardComposition, + BackwardSx, + BackwardT, + DirectedBinaryCombinator, + ForwardApplication, + ForwardCombinator, + ForwardComposition, + ForwardSubstitution, + ForwardT, + UndirectedBinaryCombinator, + UndirectedComposition, + UndirectedFunctionApplication, + UndirectedSubstitution, + UndirectedTypeRaise, +) +from nltk.ccg.lexicon import CCGLexicon diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py new file mode 100644 index 0000000000000000000000000000000000000000..cda3bf0ba188139e3d2fce22e65daf751cc5615d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py @@ -0,0 +1,358 @@ +# Natural Language Toolkit: CCG Categories +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Graeme Gange +# URL: +# For license information, see LICENSE.TXT + +from abc import ABCMeta, abstractmethod +from functools import total_ordering + +from nltk.internals import raise_unorderable_types + + +@total_ordering +class AbstractCCGCategory(metaclass=ABCMeta): + """ + Interface for categories in combinatory grammars. + """ + + @abstractmethod + def is_primitive(self): + """ + Returns true if the category is primitive. + """ + + @abstractmethod + def is_function(self): + """ + Returns true if the category is a function application. + """ + + @abstractmethod + def is_var(self): + """ + Returns true if the category is a variable. + """ + + @abstractmethod + def substitute(self, substitutions): + """ + Takes a set of (var, category) substitutions, and replaces every + occurrence of the variable with the corresponding category. + """ + + @abstractmethod + def can_unify(self, other): + """ + Determines whether two categories can be unified. 
+ - Returns None if they cannot be unified + - Returns a list of necessary substitutions if they can. + """ + + # Utility functions: comparison, strings and hashing. + @abstractmethod + def __str__(self): + pass + + def __eq__(self, other): + return ( + self.__class__ is other.__class__ + and self._comparison_key == other._comparison_key + ) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, AbstractCCGCategory): + raise_unorderable_types("<", self, other) + if self.__class__ is other.__class__: + return self._comparison_key < other._comparison_key + else: + return self.__class__.__name__ < other.__class__.__name__ + + def __hash__(self): + try: + return self._hash + except AttributeError: + self._hash = hash(self._comparison_key) + return self._hash + + +class CCGVar(AbstractCCGCategory): + """ + Class representing a variable CCG category. + Used for conjunctions (and possibly type-raising, if implemented as a + unary rule). + """ + + _maxID = 0 + + def __init__(self, prim_only=False): + """Initialize a variable (selects a new identifier) + + :param prim_only: a boolean that determines whether the variable is + restricted to primitives + :type prim_only: bool + """ + self._id = self.new_id() + self._prim_only = prim_only + self._comparison_key = self._id + + @classmethod + def new_id(cls): + """ + A class method allowing generation of unique variable identifiers. + """ + cls._maxID = cls._maxID + 1 + return cls._maxID - 1 + + @classmethod + def reset_id(cls): + cls._maxID = 0 + + def is_primitive(self): + return False + + def is_function(self): + return False + + def is_var(self): + return True + + def substitute(self, substitutions): + """If there is a substitution corresponding to this variable, + return the substituted category. 
+ """ + for (var, cat) in substitutions: + if var == self: + return cat + return self + + def can_unify(self, other): + """If the variable can be replaced with other + a substitution is returned. + """ + if other.is_primitive() or not self._prim_only: + return [(self, other)] + return None + + def id(self): + return self._id + + def __str__(self): + return "_var" + str(self._id) + + +@total_ordering +class Direction: + """ + Class representing the direction of a function application. + Also contains maintains information as to which combinators + may be used with the category. + """ + + def __init__(self, dir, restrictions): + self._dir = dir + self._restrs = restrictions + self._comparison_key = (dir, tuple(restrictions)) + + # Testing the application direction + def is_forward(self): + return self._dir == "/" + + def is_backward(self): + return self._dir == "\\" + + def dir(self): + return self._dir + + def restrs(self): + """A list of restrictions on the combinators. + '.' denotes that permuting operations are disallowed + ',' denotes that function composition is disallowed + '_' denotes that the direction has variable restrictions. + (This is redundant in the current implementation of type-raising) + """ + return self._restrs + + def is_variable(self): + return self._restrs == "_" + + # Unification and substitution of variable directions. + # Used only if type-raising is implemented as a unary rule, as it + # must inherit restrictions from the argument category. 
+ def can_unify(self, other): + if other.is_variable(): + return [("_", self.restrs())] + elif self.is_variable(): + return [("_", other.restrs())] + else: + if self.restrs() == other.restrs(): + return [] + return None + + def substitute(self, subs): + if not self.is_variable(): + return self + + for (var, restrs) in subs: + if var == "_": + return Direction(self._dir, restrs) + return self + + # Testing permitted combinators + def can_compose(self): + return "," not in self._restrs + + def can_cross(self): + return "." not in self._restrs + + def __eq__(self, other): + return ( + self.__class__ is other.__class__ + and self._comparison_key == other._comparison_key + ) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, Direction): + raise_unorderable_types("<", self, other) + if self.__class__ is other.__class__: + return self._comparison_key < other._comparison_key + else: + return self.__class__.__name__ < other.__class__.__name__ + + def __hash__(self): + try: + return self._hash + except AttributeError: + self._hash = hash(self._comparison_key) + return self._hash + + def __str__(self): + r_str = "" + for r in self._restrs: + r_str = r_str + "%s" % r + return f"{self._dir}{r_str}" + + # The negation operator reverses the direction of the application + def __neg__(self): + if self._dir == "/": + return Direction("\\", self._restrs) + else: + return Direction("/", self._restrs) + + +class PrimitiveCategory(AbstractCCGCategory): + """ + Class representing primitive categories. + Takes a string representation of the category, and a + list of strings specifying the morphological subcategories. 
+ """ + + def __init__(self, categ, restrictions=[]): + self._categ = categ + self._restrs = restrictions + self._comparison_key = (categ, tuple(restrictions)) + + def is_primitive(self): + return True + + def is_function(self): + return False + + def is_var(self): + return False + + def restrs(self): + return self._restrs + + def categ(self): + return self._categ + + # Substitution does nothing to a primitive category + def substitute(self, subs): + return self + + # A primitive can be unified with a class of the same + # base category, given that the other category shares all + # of its subclasses, or with a variable. + def can_unify(self, other): + if not other.is_primitive(): + return None + if other.is_var(): + return [(other, self)] + if other.categ() == self.categ(): + for restr in self._restrs: + if restr not in other.restrs(): + return None + return [] + return None + + def __str__(self): + if self._restrs == []: + return "%s" % self._categ + restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs) + return f"{self._categ}{restrictions}" + + +class FunctionalCategory(AbstractCCGCategory): + """ + Class that represents a function application category. + Consists of argument and result categories, together with + an application direction. + """ + + def __init__(self, res, arg, dir): + self._res = res + self._arg = arg + self._dir = dir + self._comparison_key = (arg, dir, res) + + def is_primitive(self): + return False + + def is_function(self): + return True + + def is_var(self): + return False + + # Substitution returns the category consisting of the + # substitution applied to each of its constituents. + def substitute(self, subs): + sub_res = self._res.substitute(subs) + sub_dir = self._dir.substitute(subs) + sub_arg = self._arg.substitute(subs) + return FunctionalCategory(sub_res, sub_arg, self._dir) + + # A function can unify with another function, so long as its + # constituents can unify, or with an unrestricted variable. 
+ def can_unify(self, other): + if other.is_var(): + return [(other, self)] + if other.is_function(): + sa = self._res.can_unify(other.res()) + sd = self._dir.can_unify(other.dir()) + if sa is not None and sd is not None: + sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa)) + if sb is not None: + return sa + sb + return None + + # Constituent accessors + def arg(self): + return self._arg + + def res(self): + return self._res + + def dir(self): + return self._dir + + def __str__(self): + return f"({self._res}{self._dir}{self._arg})" diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py new file mode 100644 index 0000000000000000000000000000000000000000..539d3bcc7ccf5e75c977b97df6dd3e14ddc50584 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py @@ -0,0 +1,480 @@ +# Natural Language Toolkit: Combinatory Categorial Grammar +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Graeme Gange +# URL: +# For license information, see LICENSE.TXT + +""" +The lexicon is constructed by calling +``lexicon.fromstring()``. + +In order to construct a parser, you also need a rule set. +The standard English rules are provided in chart as +``chart.DefaultRuleSet``. + +The parser can then be constructed by calling, for example: +``parser = chart.CCGChartParser(, )`` + +Parsing is then performed by running +``parser.parse(.split())``. + +While this returns a list of trees, the default representation +of the produced trees is not very enlightening, particularly +given that it uses the same tree class as the CFG parsers. +It is probably better to call: +``chart.printCCGDerivation()`` +which should print a nice representation of the derivation. 
+ +This entire process is shown far more clearly in the demonstration: +python chart.py +""" + +import itertools + +from nltk.ccg.combinator import * +from nltk.ccg.combinator import ( + BackwardApplication, + BackwardBx, + BackwardComposition, + BackwardSx, + BackwardT, + ForwardApplication, + ForwardComposition, + ForwardSubstitution, + ForwardT, +) +from nltk.ccg.lexicon import Token, fromstring +from nltk.ccg.logic import * +from nltk.parse import ParserI +from nltk.parse.chart import AbstractChartRule, Chart, EdgeI +from nltk.sem.logic import * +from nltk.tree import Tree + + +# Based on the EdgeI class from NLTK. +# A number of the properties of the EdgeI interface don't +# transfer well to CCGs, however. +class CCGEdge(EdgeI): + def __init__(self, span, categ, rule): + self._span = span + self._categ = categ + self._rule = rule + self._comparison_key = (span, categ, rule) + + # Accessors + def lhs(self): + return self._categ + + def span(self): + return self._span + + def start(self): + return self._span[0] + + def end(self): + return self._span[1] + + def length(self): + return self._span[1] - self.span[0] + + def rhs(self): + return () + + def dot(self): + return 0 + + def is_complete(self): + return True + + def is_incomplete(self): + return False + + def nextsym(self): + return None + + def categ(self): + return self._categ + + def rule(self): + return self._rule + + +class CCGLeafEdge(EdgeI): + """ + Class representing leaf edges in a CCG derivation. 
+ """ + + def __init__(self, pos, token, leaf): + self._pos = pos + self._token = token + self._leaf = leaf + self._comparison_key = (pos, token.categ(), leaf) + + # Accessors + def lhs(self): + return self._token.categ() + + def span(self): + return (self._pos, self._pos + 1) + + def start(self): + return self._pos + + def end(self): + return self._pos + 1 + + def length(self): + return 1 + + def rhs(self): + return self._leaf + + def dot(self): + return 0 + + def is_complete(self): + return True + + def is_incomplete(self): + return False + + def nextsym(self): + return None + + def token(self): + return self._token + + def categ(self): + return self._token.categ() + + def leaf(self): + return self._leaf + + +class BinaryCombinatorRule(AbstractChartRule): + """ + Class implementing application of a binary combinator to a chart. + Takes the directed combinator to apply. + """ + + NUMEDGES = 2 + + def __init__(self, combinator): + self._combinator = combinator + + # Apply a combinator + def apply(self, chart, grammar, left_edge, right_edge): + # The left & right edges must be touching. + if not (left_edge.end() == right_edge.start()): + return + + # Check if the two edges are permitted to combine. + # If so, generate the corresponding edge. + if self._combinator.can_combine(left_edge.categ(), right_edge.categ()): + for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): + new_edge = CCGEdge( + span=(left_edge.start(), right_edge.end()), + categ=res, + rule=self._combinator, + ) + if chart.insert(new_edge, (left_edge, right_edge)): + yield new_edge + + # The representation of the combinator (for printing derivations) + def __str__(self): + return "%s" % self._combinator + + +# Type-raising must be handled slightly differently to the other rules, as the +# resulting rules only span a single edge, rather than both edges. 
+ + +class ForwardTypeRaiseRule(AbstractChartRule): + """ + Class for applying forward type raising + """ + + NUMEDGES = 2 + + def __init__(self): + self._combinator = ForwardT + + def apply(self, chart, grammar, left_edge, right_edge): + if not (left_edge.end() == right_edge.start()): + return + + for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): + new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator) + if chart.insert(new_edge, (left_edge,)): + yield new_edge + + def __str__(self): + return "%s" % self._combinator + + +class BackwardTypeRaiseRule(AbstractChartRule): + """ + Class for applying backward type raising. + """ + + NUMEDGES = 2 + + def __init__(self): + self._combinator = BackwardT + + def apply(self, chart, grammar, left_edge, right_edge): + if not (left_edge.end() == right_edge.start()): + return + + for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): + new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator) + if chart.insert(new_edge, (right_edge,)): + yield new_edge + + def __str__(self): + return "%s" % self._combinator + + +# Common sets of combinators used for English derivations. +ApplicationRuleSet = [ + BinaryCombinatorRule(ForwardApplication), + BinaryCombinatorRule(BackwardApplication), +] +CompositionRuleSet = [ + BinaryCombinatorRule(ForwardComposition), + BinaryCombinatorRule(BackwardComposition), + BinaryCombinatorRule(BackwardBx), +] +SubstitutionRuleSet = [ + BinaryCombinatorRule(ForwardSubstitution), + BinaryCombinatorRule(BackwardSx), +] +TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()] + +# The standard English rule set. +DefaultRuleSet = ( + ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet +) + + +class CCGChartParser(ParserI): + """ + Chart parser for CCGs. + Based largely on the ChartParser class from NLTK. 
+ """ + + def __init__(self, lexicon, rules, trace=0): + self._lexicon = lexicon + self._rules = rules + self._trace = trace + + def lexicon(self): + return self._lexicon + + # Implements the CYK algorithm + def parse(self, tokens): + tokens = list(tokens) + chart = CCGChart(list(tokens)) + lex = self._lexicon + + # Initialize leaf edges. + for index in range(chart.num_leaves()): + for token in lex.categories(chart.leaf(index)): + new_edge = CCGLeafEdge(index, token, chart.leaf(index)) + chart.insert(new_edge, ()) + + # Select a span for the new edges + for span in range(2, chart.num_leaves() + 1): + for start in range(0, chart.num_leaves() - span + 1): + # Try all possible pairs of edges that could generate + # an edge for that span + for part in range(1, span): + lstart = start + mid = start + part + rend = start + span + + for left in chart.select(span=(lstart, mid)): + for right in chart.select(span=(mid, rend)): + # Generate all possible combinations of the two edges + for rule in self._rules: + edges_added_by_rule = 0 + for newedge in rule.apply(chart, lex, left, right): + edges_added_by_rule += 1 + + # Output the resulting parses + return chart.parses(lex.start()) + + +class CCGChart(Chart): + def __init__(self, tokens): + Chart.__init__(self, tokens) + + # Constructs the trees for a given parse. 
Unfortnunately, the parse trees need to be + # constructed slightly differently to those in the default Chart class, so it has to + # be reimplemented + def _trees(self, edge, complete, memo, tree_class): + assert complete, "CCGChart cannot build incomplete trees" + + if edge in memo: + return memo[edge] + + if isinstance(edge, CCGLeafEdge): + word = tree_class(edge.token(), [self._tokens[edge.start()]]) + leaf = tree_class((edge.token(), "Leaf"), [word]) + memo[edge] = [leaf] + return [leaf] + + memo[edge] = [] + trees = [] + + for cpl in self.child_pointer_lists(edge): + child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] + for children in itertools.product(*child_choices): + lhs = ( + Token( + self._tokens[edge.start() : edge.end()], + edge.lhs(), + compute_semantics(children, edge), + ), + str(edge.rule()), + ) + trees.append(tree_class(lhs, children)) + + memo[edge] = trees + return trees + + +def compute_semantics(children, edge): + if children[0].label()[0].semantics() is None: + return None + + if len(children) == 2: + if isinstance(edge.rule(), BackwardCombinator): + children = [children[1], children[0]] + + combinator = edge.rule()._combinator + function = children[0].label()[0].semantics() + argument = children[1].label()[0].semantics() + + if isinstance(combinator, UndirectedFunctionApplication): + return compute_function_semantics(function, argument) + elif isinstance(combinator, UndirectedComposition): + return compute_composition_semantics(function, argument) + elif isinstance(combinator, UndirectedSubstitution): + return compute_substitution_semantics(function, argument) + else: + raise AssertionError("Unsupported combinator '" + combinator + "'") + else: + return compute_type_raised_semantics(children[0].label()[0].semantics()) + + +# -------- +# Displaying derivations +# -------- +def printCCGDerivation(tree): + # Get the leaves and initial categories + leafcats = tree.pos() + leafstr = "" + catstr = "" + + # Construct a 
def printCCGTree(lwidth, tree):
    """
    Print the derivation steps below ``tree`` and return its right edge.

    :param lwidth: column at which this subtree starts.
    :param tree: a ``Tree`` node, or a plain word at the bottom.
    :return: the column just past the widest thing laid out so far.
    """
    # A bare word: nothing to print, it just occupies its own width
    # plus two columns of padding.
    if not isinstance(tree, Tree):
        return lwidth + len(tree) + 2

    # Lay the children out left to right; each one starts where the
    # running right edge currently is.
    right_edge = lwidth
    for subtree in tree:
        right_edge = max(right_edge, printCCGTree(right_edge, subtree))

    label = tree.label()

    # Word-level node (label is not a (token, rule) pair): silent, but
    # wide enough for both the category text and the word.
    if not isinstance(label, tuple):
        label_end = lwidth + 2 + len(str(label))
        word_end = lwidth + 2 + len(tree[0])
        return max(right_edge, label_end, word_end)

    token, op = label

    if op != "Leaf":
        # A rule of '-' spanning the children, followed by the rule name.
        print(" " * lwidth + "-" * (right_edge - lwidth) + "%s" % op)

        # The resulting category (and semantics, when present) centred
        # underneath the rule.
        result_text = "%s" % (token.categ())
        if token.semantics() is not None:
            result_text += " {" + str(token.semantics()) + "}"
        indent = lwidth + (right_edge - lwidth - len(result_text)) // 2
        print(" " * indent + result_text)

    return right_edge
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract interface for a combinator that works on a (left, right)
    pair of categories.

    Implementations wrap an undirected binary combinator: they decide
    which of the two categories acts as the function and which as the
    argument, and whether the pair can be combined at all.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Tell whether ``left`` and ``right`` can be combined (subclass hook)."""

    @abstractmethod
    def combine(self, left, right):
        """Produce the result(s) of combining ``left`` and ``right`` (subclass hook)."""
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Class representing function application.
    Implements rules of the form:
    X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        """
        Return True when ``function`` is a functional category whose
        argument unifies with ``argument``.
        """
        if not function.is_function():
            return False

        # can_unify() signals failure with None; any other value
        # (including an empty substitution) counts as success.
        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        """
        Yield the result category of applying ``function`` to
        ``argument``; yield nothing when they cannot be combined.
        """
        if not function.is_function():
            return

        subs = function.arg().can_unify(argument)
        if subs is None:
            return

        yield function.res().substitute(subs)

    def __str__(self):
        # Application has no symbol of its own: the directed wrappers in
        # this module prepend '>' or '<' to this (empty) string.
        return ""
def backwardBxConstraint(left, right):
    """
    Predicate restricting backward crossed composition.

    :param left: the left-hand category.
    :param right: the right-hand category.
    :return: True only if the directions are crossed inwards, *both*
        slashes permit crossing, and ``left``'s argument is primitive.
    """
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed.  The parentheses matter:
    # without them `not a and b` is parsed as `(not a) and b`, which
    # lets through every pair whose right-hand slash forbids crossing.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
def can_combine(self, function, arg):
    """
    Tell whether ``function`` can be type-raised over ``arg``.

    The conditions are the same ones ``combine`` checks before it
    yields a result, so the two methods always agree.

    :param function: the category to be raised; must be primitive.
    :param arg: the category it is raised over; must be a function
        whose result is itself a function.
    :return: True if ``function`` unifies with the argument of the
        innermost function of ``arg``, else False.
    """
    # The argument must be a function.
    # The restriction that arg.res() must be a function
    # merely reduces redundant type-raising; if arg.res() is
    # primitive, raising X over Y\X and then applying gives the same
    # result as plain backward application of Y\X to X.
    if not (
        function.is_primitive() and arg.is_function() and arg.res().is_function()
    ):
        return False

    # Type-raising matches only the innermost application.
    arg = innermostFunction(arg)

    # can_unify() returns None on failure; an empty substitution is
    # still a success.
    return function.can_unify(arg.arg()) is not None
def forwardTConstraint(left, right):
    """
    Predicate for forward type-raising.

    Looks only at ``right``: its innermost function must take its
    argument from the left (backward slash) and must have a primitive
    result.  ``left`` is accepted for signature compatibility and is
    not inspected.
    """
    innermost = innermostFunction(right)
    points_backward = innermost.dir().is_backward()
    return points_backward and innermost.res().is_primitive()
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the category given at construction time."""
        return self._categ

    def semantics(self):
        """Return the semantics given at construction time (may be None)."""
        return self._semantics

    def __str__(self):
        # "<categ>" alone, or "<categ> {<semantics>}" when semantics exist.
        if self._semantics is None:
            return str(self._categ)
        return str(self._categ) + " {" + str(self._semantics) + "}"

    def __cmp__(self, other):
        """
        Three-way comparison on ``(categ, semantics)``.

        Python 3 never calls this method implicitly and has no ``cmp``
        builtin, so the result is computed from ordinary tuple
        comparisons instead.

        :return: -1, 0 or 1; always -1 when ``other`` is not a Token.
        """
        if not isinstance(other, Token):
            return -1
        mine = (self._categ, self._semantics)
        theirs = (other.categ(), other.semantics())
        return (mine > theirs) - (mine < theirs)
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    The first character of ``string`` is taken to be the opening bracket
    (it is not inspected).  The input is scanned once, left to right,
    keeping a stack of the positions of brackets that are still open, so
    deeply nested input neither recurses nor re-copies the string.

    :param string: text beginning with an opening bracket.
    :return: ``(bracketed, rest)`` where ``bracketed`` runs up to and
        including the matching ")" and ``rest`` is what follows it.
    :raise AssertionError: if a bracket is never closed; the message
        quotes the text starting at the innermost unclosed bracket.
    """
    open_positions = [0]

    for pos in range(1, len(string)):
        char = string[pos]
        if char == "(":
            open_positions.append(pos)
        elif char == ")":
            open_positions.pop()
            if not open_positions:
                # The bracket opened at position 0 has just been closed.
                return ("(" + string[1 : pos + 1], string[pos + 1 :])

    raise AssertionError(
        "Unmatched bracket in string '" + string[open_positions[-1] :] + "'"
    )
def augParseCategory(line, primitives, families, var=None):
    """
    Parse the textual category ``line`` and return ``(category, var)``.

    ``var`` is threaded through every sub-parse, and whatever the last
    sub-parse handed back is returned alongside the finished category.
    """

    def parse_operand(text, current_var):
        # A bracketed operand is a complete category in its own right
        # (parsed recursively without its outer brackets); anything else
        # is handed to the primitive-category parser.
        if text.startswith("("):
            return augParseCategory(text[1:-1], primitives, families, current_var)
        return parsePrimitiveCategory(
            PRIM_RE.match(text).groups(), primitives, families, current_var
        )

    head, remainder = nextCategory(line)
    result, var = parse_operand(head, var)

    # Each pass consumes one "<slash><restrictions><operand>" group and
    # wraps the category built so far as the result of a new function.
    while remainder != "":
        slash, first_mod, second_mod, remainder = APP_RE.match(remainder).groups()
        direction = parseApplication((slash, first_mod, second_mod))

        operand, remainder = nextCategory(remainder)
        argument, var = parse_operand(operand, var)
        result = FunctionalCategory(result, argument, direction)

    return (result, var)
# Deprecated alias kept for old callers: it only forwards ``lex_str`` to
# fromstring(), so fromstring()'s default for ``include_semantics`` applies.
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    return fromstring(lex_str)
def compute_type_raised_semantics(semantics):
    """
    Build the type-raised form of ``semantics``.

    The innermost non-lambda term of ``semantics`` is replaced by an
    application of a fresh function variable to that term, and the whole
    expression is wrapped in a lambda binding that variable.

    NOTE(review): when ``semantics`` is a lambda expression, its
    innermost lambda has its ``term`` attribute reassigned below, so the
    object passed in is modified in place -- confirm callers do not
    reuse it.
    """
    # Walk down nested lambdas; afterwards ``core`` is the first term
    # that is not a lambda and ``parent`` the lambda directly above it
    # (None if ``semantics`` was not a lambda at all).
    core = semantics
    parent = None
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    # Start from a variable named "F" and keep asking for a new one for
    # as long as the candidate occurs free in the core term.
    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    # Put the wrapped core back where the original core was.
    if parent is not None:
        parent.term = core
    else:
        semantics = core

    return LambdaExpression(var, semantics)
def compute_substitution_semantics(function, argument):
    """
    Combine ``function`` and ``argument`` for the substitution combinator.

    ``function`` has to be a lambda whose body is again a lambda, and
    ``argument`` has to be a lambda; both are checked with ``assert``.
    The result binds ``function``'s variable over ``function.term``
    applied to ``argument`` applied to that same variable, with each
    application simplified.
    """
    takes_two = isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    )
    assert takes_two, (
        "`" + str(function) + "` must be a lambda expression with 2 arguments"
    )
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    shared = function.variable
    applied_argument = ApplicationExpression(
        argument, VariableExpression(shared)
    ).simplify()
    body = ApplicationExpression(function.term, applied_argument).simplify()

    return LambdaExpression(shared, body)
def chatbots():
    """
    Show the numbered list of bots, keep prompting until a valid number
    is entered, then start the chosen bot.
    """
    print("Which chatbot would you like to talk to?")
    botcount = len(bots)
    for number, entry in enumerate(bots, start=1):
        print(" %d: %s" % (number, entry[1]))

    selected = None
    while selected is None:
        answer = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
        # Accept only digits that name one of the listed entries.
        if answer.isdigit() and 0 <= int(answer) - 1 < botcount:
            selected = int(answer) - 1
        else:
            print(" Error: bad chatbot number")

    start_bot = bots[selected][0]
    start_bot()
+ +pairs = ( + ( + r"I need (.*)", + ( + "Why do you need %1?", + "Would it really help you to get %1?", + "Are you sure you need %1?", + ), + ), + ( + r"Why don\'t you (.*)", + ( + "Do you really think I don't %1?", + "Perhaps eventually I will %1.", + "Do you really want me to %1?", + ), + ), + ( + r"Why can\'t I (.*)", + ( + "Do you think you should be able to %1?", + "If you could %1, what would you do?", + "I don't know -- why can't you %1?", + "Have you really tried?", + ), + ), + ( + r"I can\'t (.*)", + ( + "How do you know you can't %1?", + "Perhaps you could %1 if you tried.", + "What would it take for you to %1?", + ), + ), + ( + r"I am (.*)", + ( + "Did you come to me because you are %1?", + "How long have you been %1?", + "How do you feel about being %1?", + ), + ), + ( + r"I\'m (.*)", + ( + "How does being %1 make you feel?", + "Do you enjoy being %1?", + "Why do you tell me you're %1?", + "Why do you think you're %1?", + ), + ), + ( + r"Are you (.*)", + ( + "Why does it matter whether I am %1?", + "Would you prefer it if I were not %1?", + "Perhaps you believe I am %1.", + "I may be %1 -- what do you think?", + ), + ), + ( + r"What (.*)", + ( + "Why do you ask?", + "How would an answer to that help you?", + "What do you think?", + ), + ), + ( + r"How (.*)", + ( + "How do you suppose?", + "Perhaps you can answer your own question.", + "What is it you're really asking?", + ), + ), + ( + r"Because (.*)", + ( + "Is that the real reason?", + "What other reasons come to mind?", + "Does that reason apply to anything else?", + "If %1, what else must be true?", + ), + ), + ( + r"(.*) sorry (.*)", + ( + "There are many times when no apology is needed.", + "What feelings do you have when you apologize?", + ), + ), + ( + r"Hello(.*)", + ( + "Hello... I'm glad you could drop by today.", + "Hi there... 
how are you today?", + "Hello, how are you feeling today?", + ), + ), + ( + r"I think (.*)", + ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"), + ), + ( + r"(.*) friend (.*)", + ( + "Tell me more about your friends.", + "When you think of a friend, what comes to mind?", + "Why don't you tell me about a childhood friend?", + ), + ), + (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")), + ( + r"(.*) computer(.*)", + ( + "Are you really talking about me?", + "Does it seem strange to talk to a computer?", + "How do computers make you feel?", + "Do you feel threatened by computers?", + ), + ), + ( + r"Is it (.*)", + ( + "Do you think it is %1?", + "Perhaps it's %1 -- what do you think?", + "If it were %1, what would you do?", + "It could well be that %1.", + ), + ), + ( + r"It is (.*)", + ( + "You seem very certain.", + "If I told you that it probably isn't %1, what would you feel?", + ), + ), + ( + r"Can you (.*)", + ( + "What makes you think I can't %1?", + "If I could %1, then what?", + "Why do you ask if I can %1?", + ), + ), + ( + r"Can I (.*)", + ( + "Perhaps you don't want to %1.", + "Do you want to be able to %1?", + "If you could %1, would you?", + ), + ), + ( + r"You are (.*)", + ( + "Why do you think I am %1?", + "Does it please you to think that I'm %1?", + "Perhaps you would like me to be %1.", + "Perhaps you're really talking about yourself?", + ), + ), + ( + r"You\'re (.*)", + ( + "Why do you say I am %1?", + "Why do you think I am %1?", + "Are we talking about you, or me?", + ), + ), + ( + r"I don\'t (.*)", + ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"), + ), + ( + r"I feel (.*)", + ( + "Good, tell me more about these feelings.", + "Do you often feel %1?", + "When do you usually feel %1?", + "When you feel %1, what do you do?", + ), + ), + ( + r"I have (.*)", + ( + "Why do you tell me that you've %1?", + "Have you really %1?", + "Now that you have %1, what will you do next?", + ), + ), 
+ ( + r"I would (.*)", + ( + "Could you explain why you would %1?", + "Why would you %1?", + "Who else knows that you would %1?", + ), + ), + ( + r"Is there (.*)", + ( + "Do you think there is %1?", + "It's likely that there is %1.", + "Would you like there to be %1?", + ), + ), + ( + r"My (.*)", + ( + "I see, your %1.", + "Why do you say that your %1?", + "When your %1, how do you feel?", + ), + ), + ( + r"You (.*)", + ( + "We should be discussing you, not me.", + "Why do you say that about me?", + "Why do you care whether I %1?", + ), + ), + (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")), + ( + r"I want (.*)", + ( + "What would it mean to you if you got %1?", + "Why do you want %1?", + "What would you do if you got %1?", + "If you got %1, then what would you do?", + ), + ), + ( + r"(.*) mother(.*)", + ( + "Tell me more about your mother.", + "What was your relationship with your mother like?", + "How do you feel about your mother?", + "How does this relate to your feelings today?", + "Good family relations are important.", + ), + ), + ( + r"(.*) father(.*)", + ( + "Tell me more about your father.", + "How did your father make you feel?", + "How do you feel about your father?", + "Does your relationship with your father relate to your feelings today?", + "Do you have trouble showing affection with your family?", + ), + ), + ( + r"(.*) child(.*)", + ( + "Did you have close friends as a child?", + "What is your favorite childhood memory?", + "Do you remember any dreams or nightmares from childhood?", + "Did the other children sometimes tease you?", + "How do you think your childhood experiences relate to your feelings today?", + ), + ), + ( + r"(.*)\?", + ( + "Why do you ask that?", + "Please consider whether you can answer your own question.", + "Perhaps the answer lies within yourself?", + "Why don't you tell me?", + ), + ), + ( + r"quit", + ( + "Thank you for talking with me.", + "Good-bye.", + "Thank you, that will be $150. 
def eliza_chat():
    """Print the Eliza banner and opening line, then start the conversation."""
    banner = (
        "Therapist\n---------",
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "Hello. How are you feeling today?",
    )
    for text in banner:
        print(text)

    eliza_chatbot.converse()
kekekekeke ^_^ tell me more!", + "ur%1? neat!! kekeke >_<", + ), + ), + ( + r"(.*) don\'t you (.*)", + ( + r"u think I can%2??! really?? kekeke \<_\<", + "what do u mean%2??!", + "i could if i wanted, don't you think!! kekeke", + ), + ), + (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")), + ( + r"do (you|u) (.*)\??", + ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"), + ), + ( + r"(.*)\?", + ( + "man u ask lots of questions!", + "booooring! how old r u??", + "boooooring!! ur not very fun", + ), + ), + ( + r"(cos|because) (.*)", + ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"), + ), + ( + r"why can\'t [iI] (.*)", + ( + "i dunno! y u askin me for!", + "try harder, silly! hee! ^_^", + "i dunno! but when i can't%1 i jump up and down!", + ), + ), + ( + r"I can\'t (.*)", + ( + "u can't what??! >_<", + "that's ok! i can't%1 either! kekekekeke ^_^", + "try harder, silly! hee! ^&^", + ), + ), + ( + r"(.*) (like|love|watch) anime", + ( + "omg i love anime!! do u like sailor moon??! ^&^", + "anime yay! anime rocks sooooo much!", + "oooh anime! i love anime more than anything!", + "anime is the bestest evar! evangelion is the best!", + "hee anime is the best! do you have ur fav??", + ), + ), + ( + r"I (like|love|watch|play) (.*)", + ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"), + ), + ( + r"anime sucks|(.*) (hate|detest) anime", + ( + "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*", + "no way! anime is the best ever!", + "nuh-uh, anime is the best!", + ), + ), + ( + r"(are|r) (you|u) (.*)", + ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"), + ), + ( + r"what (.*)", + ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"), + ), + (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)), + (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)), + ( + r"quit", + ( + "mom says i have to go eat dinner now :,( bye!!", + "awww u have to go?? 
def iesha_chat():
    """
    Run an interactive Iesha session on the console.

    Prints the bot's title, the usage instructions and Iesha's greeting,
    then hands control to ``iesha_chatbot.converse()``.
    """
    intro = (
        "Iesha the TeenBoT\n---------",
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "hi!! i'm iesha! who r u??!",
    )
    for text in intro:
        print(text)

    iesha_chatbot.converse()
def rude_chat():
    """
    Run an interactive session with the rude chatbot on the console.

    Prints the usage instructions and the bot's grudging greeting, then
    hands control to ``rude_chatbot.converse()``.
    """
    intro = (
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "I suppose I should say hello.",
    )
    for text in intro:
        print(text)

    rude_chatbot.converse()
If he is in superior strength, evade him.", + "If the campaign is protracted, the resources of the State will not be equal to the strain.", + "Attack him where he is unprepared, appear where you are not expected.", + "There is no instance of a country having benefited from prolonged warfare.", + ), + ), + ( + r"[D-Fd-f](.*)", + ( + "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.", + "Bring war material with you from home, but forage on the enemy.", + "In war, then, let your great object be victory, not lengthy campaigns.", + "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.", + ), + ), + ( + r"[G-Ig-i](.*)", + ( + "Heaven signifies night and day, cold and heat, times and seasons.", + "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", + "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.", + "One may know how to conquer without being able to do it.", + ), + ), + ( + r"[J-Lj-l](.*)", + ( + "There are three ways in which a ruler can bring misfortune upon his army.", + "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.", + "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.", + "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. 
This shakes the confidence of the soldiers.", + "There are five essentials for victory", + "He will win who knows when to fight and when not to fight.", + "He will win who knows how to handle both superior and inferior forces.", + "He will win whose army is animated by the same spirit throughout all its ranks.", + "He will win who, prepared himself, waits to take the enemy unprepared.", + "He will win who has military capacity and is not interfered with by the sovereign.", + ), + ), + ( + r"[M-Om-o](.*)", + ( + "If you know the enemy and know yourself, you need not fear the result of a hundred battles.", + "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.", + "If you know neither the enemy nor yourself, you will succumb in every battle.", + "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.", + ), + ), + ( + r"[P-Rp-r](.*)", + ( + "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.", + "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.", + "He wins his battles by making no mistakes. 
Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.", + "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.", + "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.", + ), + ), + ( + r"[S-Us-u](.*)", + ( + "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.", + "Hence his victories bring him neither reputation for wisdom nor credit for courage.", + "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.", + "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.", + "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.", + "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.", + ), + ), + ( + r"[V-Zv-z](.*)", + ( + "It is a matter of life and death, a road either to safety or to ruin.", + "Hold out baits to entice the enemy. 
class Chat:
    """
    A regular-expression driven chatbot.

    Each user statement is tried against an ordered list of compiled
    patterns. The first pattern that matches selects a group of candidate
    responses, one of which is picked at random. ``%N`` markers (a percent
    sign followed by one digit) in the chosen response are replaced by the
    text captured by group ``N``, after that text has been lowercased and
    passed through the first/second person reflection table.
    """

    def __init__(self, pairs, reflections=None):
        """
        Initialize the chatbot. Pairs is a list of patterns and responses. Each
        pattern is a regular expression matching the user's statement or question,
        e.g. r'I like (.*)'. For each such pattern a list of possible responses
        is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material
        which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
        the numbered positions in the responses, e.g. %1.

        :type pairs: list of tuple
        :param pairs: The patterns and responses
        :type reflections: dict
        :param reflections: A mapping between first and second person expressions.
            Keys are looked up against lowercased text. ``None`` (the default)
            means "no reflections".
        :rtype: None
        """

        self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
        # A fresh dict per instance instead of a shared mutable default.
        self._reflections = {} if reflections is None else reflections
        self._regex = self._compile_reflections()

    def _compile_reflections(self):
        """
        Build one alternation regex that matches any reflection key as a
        whole word. Longer keys come first so that "i am" wins over "i".

        :rtype: re.Pattern
        """
        sorted_refl = sorted(self._reflections, key=len, reverse=True)
        return re.compile(
            r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
        )

    def _substitute(self, str):
        """
        Substitute words in the string, according to the specified reflections,
        e.g. "I'm" -> "you are". The input is lowercased first.

        :type str: str
        :param str: The string to be mapped
        :rtype: str
        """

        lowered = str.lower()
        if not self._reflections:
            # With no keys the compiled pattern is r"\b()\b", which matches the
            # empty string at every word boundary; the lookup below would then
            # fail with KeyError(''). There is nothing to reflect anyway.
            return lowered
        return self._regex.sub(lambda mo: self._reflections[mo.group(0)], lowered)

    def _wildcards(self, response, match):
        """
        Expand every ``%N`` marker in ``response`` with the reflected text of
        group ``N`` of ``match``.

        The expansion is done in a single pass, so a ``%`` that comes from the
        user's own words is never re-interpreted as a marker, and a ``%`` that
        is not followed by a digit is left untouched. A group that did not
        take part in the match expands to the empty string.

        :type response: str
        :param response: The response template
        :type match: re.Match
        :param match: The match of the user's statement against the pattern
        :rtype: str
        :raise IndexError: if the template refers to a group the pattern
            does not define
        """

        def expand(marker):
            captured = match.group(int(marker.group(1)))
            if captured is None:
                return ""
            return self._substitute(captured)

        # A function replacement also avoids backslash-escape processing of
        # the substituted text.
        return re.sub(r"%([0-9])", expand, response)

    def respond(self, str):
        """
        Generate a response to the user input.

        :type str: str
        :param str: The string to be mapped
        :rtype: str
        :return: The response, or ``None`` when no pattern matches the input
        """

        # check each pattern
        for (pattern, response) in self._pairs:
            match = pattern.match(str)

            # did the pattern match?
            if match:
                resp = random.choice(response)  # pick a random response
                resp = self._wildcards(resp, match)  # process wildcards

                # fix munged punctuation at the end
                if resp[-2:] == "?.":
                    resp = resp[:-2] + "."
                if resp[-2:] == "??":
                    resp = resp[:-2] + "?"
                return resp

    # Hold a conversation with a chatbot
    def converse(self, quit="quit"):
        """
        Read statements from standard input and print the bot's responses
        until the ``quit`` word has been entered and answered.

        End-of-file on input is treated as if ``quit`` had been typed.
        Trailing ``!`` and ``.`` characters are stripped before matching;
        input that is empty after stripping gets no response.

        :type quit: str
        :param quit: The statement that ends the conversation
        :rtype: None
        """
        user_input = ""
        while user_input != quit:
            user_input = quit
            try:
                user_input = input(">")
            except EOFError:
                print(user_input)
            # rstrip is safe on input made only of "!" and "." characters,
            # where indexing the last character of an emptied string would fail.
            user_input = user_input.rstrip("!.")
            if user_input:
                print(self.respond(user_input))
Zen Chatbot will usually answer very vaguely, or +respond to a question by asking a different question, in much the same way +as Eliza. +""" + +from nltk.chat.util import Chat, reflections + +# responses are matched top to bottom, so non-specific matches occur later +# for each match, a list of possible responses is provided +responses = ( + # Zen Chatbot opens with the line "Welcome, my child." The usual + # response will be a greeting problem: 'good' matches "good morning", + # "good day" etc, but also "good grief!" and other sentences starting + # with the word 'good' that may not be a greeting + ( + r"(hello(.*))|(good [a-zA-Z]+)", + ( + "The path to enlightenment is often difficult to see.", + "Greetings. I sense your mind is troubled. Tell me of your troubles.", + "Ask the question you have come to ask.", + "Hello. Do you seek englightenment?", + ), + ), + # "I need" and "I want" can be followed by a thing (eg 'help') + # or an action (eg 'to see you') + # + # This is a problem with this style of response - + # person: "I need you" + # chatbot: "me can be achieved by hard work and dedication of the mind" + # i.e. 'you' is not really a thing that can be mapped this way, so this + # interpretation only makes sense for some inputs + # + ( + r"i need (.*)", + ( + "%1 can be achieved by hard work and dedication of the mind.", + "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.", + "Focus your mind on%1, and you will find what you need.", + ), + ), + ( + r"i want (.*)", + ( + "Desires of the heart will distract you from the path to enlightenment.", + "Will%1 help you attain enlightenment?", + "Is%1 a desire of the mind, or of the heart?", + ), + ), + # why questions are separated into three types: + # "why..I" e.g. "why am I here?" "Why do I like cake?" + # "why..you" e.g. "why are you here?" "Why won't you tell me?" + # "why..." e.g. "Why is the sky blue?" + # problems: + # person: "Why can't you tell me?" 
+ # chatbot: "Are you sure I tell you?" + # - this style works for positives (e.g. "why do you like cake?") + # but does not work for negatives (e.g. "why don't you like cake?") + (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")), + (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")), + (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")), + # e.g. "are you listening?", "are you a duck" + ( + r"are you (.*)\?", + ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."), + ), + # e.g. "am I a duck?", "am I going to die?" + ( + r"am i (.*)\?", + ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."), + ), + # what questions, e.g. "what time is it?" + # problems: + # person: "What do you want?" + # chatbot: "Seek truth, not what do me want." + (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")), + # how questions, e.g. "how do you do?" + ( + r"how (.*)\?", + ( + "How do you suppose?", + "Will an answer to that really help in your search for enlightenment?", + "Ask yourself not how, but why.", + ), + ), + # can questions, e.g. "can you run?", "can you come over here please?" + ( + r"can you (.*)\?", + ( + "I probably can, but I may not.", + "Maybe I can%1, and maybe I cannot.", + "I can do all, and I can do nothing.", + ), + ), + # can questions, e.g. "can I have some cake?", "can I know truth?" + ( + r"can i (.*)\?", + ( + "You can%1 if you believe you can%1, and have a pure spirit.", + "Seek truth and you will know if you can%1.", + ), + ), + # e.g. "It is raining" - implies the speaker is certain of a fact + ( + r"it is (.*)", + ( + "How can you be certain that%1, when you do not even know yourself?", + "Whether it is%1 or not does not change the way the world is.", + ), + ), + # e.g. "is there a doctor in the house?" + ( + r"is there (.*)\?", + ("There is%1 if you believe there is.", "It is possible that there is%1."), + ), + # e.g. 
"is it possible?", "is this true?" + (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")), + # non-specific question + ( + r"(.*)\?", + ( + "Do you think %1?", + "You seek the truth. Does the truth seek you?", + "If you intentionally pursue the answers to your questions, the answers become hard to see.", + "The answer to your question cannot be told. It must be experienced.", + ), + ), + # expression of hate of form "I hate you" or "Kelly hates cheese" + ( + r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)", + ( + "Perhaps it is not about hating %2, but about hate from within.", + "Weeds only grow when we dislike them", + "Hate is a very strong emotion.", + ), + ), + # statement containing the word 'truth' + ( + r"(.*) truth(.*)", + ( + "Seek truth, and truth will seek you.", + "Remember, it is not the spoon which bends - only yourself.", + "The search for truth is a long journey.", + ), + ), + # desire to do an action + # e.g. "I want to go shopping" + ( + r"i want to (.*)", + ("You may %1 if your heart truly desires to.", "You may have to %1."), + ), + # desire for an object + # e.g. "I want a pony" + ( + r"i want (.*)", + ( + "Does your heart truly desire %1?", + "Is this a desire of the heart, or of the mind?", + ), + ), + # e.g. "I can't wait" or "I can't do this" + ( + r"i can\'t (.*)", + ( + "What we can and can't do is a limitation of the mind.", + "There are limitations of the body, and limitations of the mind.", + "Have you tried to%1 with a clear mind?", + ), + ), + # "I think.." indicates uncertainty. e.g. "I think so." + # problem: exceptions... + # e.g. "I think, therefore I am" + ( + r"i think (.*)", + ( + "Uncertainty in an uncertain world.", + "Indeed, how can we be certain of anything in such uncertain times.", + "Are you not, in fact, certain that%1?", + ), + ), + # "I feel...emotions/sick/light-headed..." + ( + r"i feel (.*)", + ( + "Your body and your emotions are both symptoms of your mind." 
+ "What do you believe is the root of such feelings?", + "Feeling%1 can be a sign of your state-of-mind.", + ), + ), + # exclaimation mark indicating emotion + # e.g. "Wow!" or "No!" + ( + r"(.*)!", + ( + "I sense that you are feeling emotional today.", + "You need to calm your emotions.", + ), + ), + # because [statement] + # e.g. "because I said so" + ( + r"because (.*)", + ( + "Does knowning the reasons behind things help you to understand" + " the things themselves?", + "If%1, what else must be true?", + ), + ), + # yes or no - raise an issue of certainty/correctness + ( + r"(yes)|(no)", + ( + "Is there certainty in an uncertain world?", + "It is better to be right than to be certain.", + ), + ), + # sentence containing word 'love' + ( + r"(.*)love(.*)", + ( + "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.", + "Free love!", + ), + ), + # sentence containing word 'understand' - r + ( + r"(.*)understand(.*)", + ( + "If you understand, things are just as they are;" + " if you do not understand, things are just as they are.", + "Imagination is more important than knowledge.", + ), + ), + # 'I', 'me', 'my' - person is talking about themself. + # this breaks down when words contain these - eg 'Thyme', 'Irish' + ( + r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)", + ( + "'I', 'me', 'my'... these are selfish expressions.", + "Have you ever considered that you might be a selfish person?", + "Try to consider others, not just yourself.", + "Think not just of yourself, but of others.", + ), + ), + # 'you' starting a sentence + # e.g. "you stink!" + ( + r"you (.*)", + ("My path is not of concern to you.", "I am but one, and you but one more."), + ), + # say goodbye with some extra Zen wisdom. + ( + r"exit", + ( + "Farewell. The obstacle is the path.", + "Farewell. Life is a journey, not a destination.", + "Good bye. 
def zen_chat():
    """
    Run an interactive Zen Chatbot session on the console.

    Prints a 75-column banner with the title, a motto and the usage hints,
    followed by the bot's opening line, then hands control to
    ``zen_chatbot.converse()``.
    """
    width = 75
    rule = "*" * width
    intro = (
        rule,
        "Zen Chatbot!".center(width),
        rule,
        '"Look beyond mere words and letters - look into your mind"'.center(width),
        "* Talk your way to truth with Zen Chatbot.",
        "* Type 'quit' when you have had enough.",
        rule,
        "Welcome, my child.",
    )
    for text in intro:
        print(text)

    zen_chatbot.converse()
For example, the chunk structure for base noun phrase +chunks in the sentence "I saw the big dog on the hill" is:: + + (SENTENCE: + (NP: <I>) + <saw> + (NP: <the> <big> <dog>) + <on> + (NP: <the> <hill>)) + +To convert a chunk structure back to a list of tokens, simply use the +chunk structure's ``leaves()`` method. + +This module defines ``ChunkParserI``, a standard interface for +chunking texts; and ``RegexpChunkParser``, a regular-expression based +implementation of that interface. It also defines ``ChunkScore``, a +utility class for scoring chunk parsers. + +RegexpChunkParser +================= + +``RegexpChunkParser`` is an implementation of the chunk parser interface +that uses regular-expressions over tags to chunk a text. Its +``parse()`` method first constructs a ``ChunkString``, which encodes a +particular chunking of the input text. Initially, nothing is +chunked. ``parse.RegexpChunkParser`` then applies a sequence of +``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies +the chunking that it encodes. Finally, the ``ChunkString`` is +transformed back into a chunk structure, which is returned. + +``RegexpChunkParser`` can only be used to chunk a single kind of phrase. +For example, you can use a ``RegexpChunkParser`` to chunk the noun +phrases in a text, or the verb phrases in a text; but you cannot +use it to simultaneously chunk both noun phrases and verb phrases in +the same text. (This is a limitation of ``RegexpChunkParser``, not of +chunk parsers in general.) + +RegexpChunkRules +---------------- + +A ``RegexpChunkRule`` is a transformational rule that updates the +chunking of a text by modifying its ``ChunkString``. Each +``RegexpChunkRule`` defines the ``apply()`` method, which modifies +the chunking encoded by a ``ChunkString``. The +``RegexpChunkRule`` class itself can be used to implement any +transformational rule based on regular expressions.
There are +also a number of subclasses, which can be used to implement +simpler types of rules: + + - ``ChunkRule`` chunks anything that matches a given regular + expression. + - ``StripRule`` strips anything that matches a given regular + expression. + - ``UnChunkRule`` will un-chunk any chunk that matches a given + regular expression. + - ``MergeRule`` can be used to merge two contiguous chunks. + - ``SplitRule`` can be used to split a single chunk into two + smaller chunks. + - ``ExpandLeftRule`` will expand a chunk to incorporate new + unchunked material on the left. + - ``ExpandRightRule`` will expand a chunk to incorporate new + unchunked material on the right. + +Tag Patterns +~~~~~~~~~~~~ + +A ``RegexpChunkRule`` uses a modified version of regular +expression patterns, called "tag patterns". Tag patterns are +used to match sequences of tags. Examples of tag patterns are:: + + r'(<DT>|<JJ>|<NN>)+' + r'<NN>+' + r'<NN.*>' + +The differences between regular expression patterns and tag +patterns are: + + - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so + ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not + ``'<NN'`` followed by one or more repetitions of ``'>'``. + - Whitespace in tag patterns is ignored. So + ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``. + - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so + ``'<NN.*>'`` matches any single tag starting with ``'NN'``. + +The function ``tag_pattern2re_pattern`` can be used to transform +a tag pattern to an equivalent regular expression pattern. + +Efficiency +---------- + +Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a +rate of about 300 tokens/second, with a moderately complex rule set. + +There may be problems if ``RegexpChunkParser`` is used with more than +5,000 tokens at a time. In particular, evaluation of some regular +expressions may cause the Python regular expression engine to +exceed its maximum recursion depth. We have attempted to minimize +these problems, but it is impossible to avoid them completely. We +therefore recommend that you apply the chunk parser to a single +sentence at a time. + +Emacs Tip +--------- + +If you evaluate the following elisp expression in emacs, it will +colorize a ``ChunkString`` when you use an interactive python shell +with emacs or xemacs ("C-c !"):: + + (let () + (defconst comint-mode-font-lock-keywords + '(("<[^>]+>" 0 'font-lock-reference-face) + ("[{}]" 0 'font-lock-function-name-face))) + (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock)))) + +You can evaluate this code by copying it to a temporary buffer, +placing the cursor after the last close parenthesis, and typing +"``C-x C-e``". You should evaluate it before running the interactive +session. The change will last until you close emacs. + +Unresolved Issues +----------------- + +If we use the ``re`` module for regular expressions, Python's +regular expression engine generates "maximum recursion depth +exceeded" errors when processing very large texts, even for +regular expressions that should not require any recursion. We +therefore use the ``pre`` module instead. But note that ``pre`` +does not include Unicode support, so this module will not work +with unicode strings.
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.

    :param tagged_tokens: The tagged tokens handed to the chunker's ``parse()``
    :param binary: If true, load the binary chunker pickle; otherwise load
        the multiclass one
    :return: Whatever the loaded chunker's ``parse()`` returns
    """
    model_path = _BINARY_NE_CHUNKER if binary else _MULTICLASS_NE_CHUNKER
    return load(model_path).parse(tagged_tokens)


def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.

    :param tagged_sentences: The tagged sentences handed to the chunker's
        ``parse_sents()``
    :param binary: If true, load the binary chunker pickle; otherwise load
        the multiclass one
    :return: Whatever the loaded chunker's ``parse_sents()`` returns
    """
    model_path = _BINARY_NE_CHUNKER if binary else _MULTICLASS_NE_CHUNKER
    return load(model_path).parse_sents(tagged_sentences)
+ + :type gold: list(Tree) + :param gold: The list of chunked sentences to score the chunker on. + :rtype: ChunkScore + """ + chunkscore = ChunkScore() + for correct in gold: + chunkscore.score(correct, self.parse(correct.leaves())) + return chunkscore diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py b/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa0eebb58e030995b1480586260c2b79bb5b313 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py @@ -0,0 +1,352 @@ +# Natural Language Toolkit: Chunk parsing API +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Named entity chunker +""" + +import os +import pickle +import re +from xml.etree import ElementTree as ET + +from nltk.tag import ClassifierBasedTagger, pos_tag + +try: + from nltk.classify import MaxentClassifier +except ImportError: + pass + +from nltk.chunk.api import ChunkParserI +from nltk.chunk.util import ChunkScore +from nltk.data import find +from nltk.tokenize import word_tokenize +from nltk.tree import Tree + + +class NEChunkParserTagger(ClassifierBasedTagger): + """ + The IOB tagger used by the chunk parser. 
    def _feature_detector(self, tokens, index, history):
        """
        Build the feature dictionary describing ``tokens[index]``.

        :param tokens: The sentence being tagged.  Each entry is indexed
            as ``tok[0]`` (used as the word) and ``tok[1]`` (used as the
            part-of-speech tag).
        :param index: Position in ``tokens`` of the token to describe.
        :param history: Entries for the positions before ``index``;
            read at ``index - 1`` and ``index - 2``.  Presumably the
            tags already assigned to earlier tokens -- confirm against
            ``ClassifierBasedTagger``.
        :return: A dict mapping feature names to values.
        """
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        # Left context.  Anything that would fall before the start of
        # the sentence is represented as None.
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            # NOTE(review): this branch takes element [0] of the previous
            # history entry, whereas the general branch below uses the
            # whole entry; it also leaves prevshape as None.  Left as-is
            # because the feature values presumably must match what any
            # already-trained model saw -- confirm before changing.
            prevtag = history[index - 1][0]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            # shape() is applied to the already-lowercased previous word.
            prevshape = shape(prevword)
        # Right context.  Unlike the left context, the following POS
        # tags are lowercased rather than passed through simplify_pos().
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6
        # Only a subset of the context computed above is used here: the
        # prevprev*/nextnext* values do not appear in the dictionary.
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
def shape(word):
    """
    Classify the surface form of ``word`` into a coarse shape label.

    The checks run in order and the first hit wins:

    - ``"number"``: the word starts with digits (optionally followed by
      a decimal part), or consists entirely of a decimal such as
      ``.5``.  Only the second alternative of the pattern is anchored
      with ``$``, so a word that merely *begins* with digits (e.g.
      ``"2nd"``) is also reported as a number.
    - ``"punct"``: the word consists only of non-word characters.
    - ``"upcase"`` / ``"downcase"`` / ``"mixedcase"``: the word consists
      only of word characters and is title-cased, lower-cased, or
      neither, respectively.
    - ``"other"``: anything else, including the empty string.

    :type word: str
    :rtype: str
    """
    flags = re.UNICODE
    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, flags):
        return "number"
    if re.match(r"\W+$", word, flags):
        return "punct"
    if not re.match(r"\w+$", word, flags):
        return "other"
    if word.istitle():
        return "upcase"
    return "downcase" if word.islower() else "mixedcase"
entities.append((s, e, typ)) + + # Read the text file, and mark the entities. + with open(textfile) as infile: + text = infile.read() + + # Strip XML tags, since they don't count towards the indices + text = re.sub("<(?!/?TEXT)[^>]+>", "", text) + + # Blank out anything before/after + def subfunc(m): + return " " * (m.end() - m.start() - 6) + + text = re.sub(r"[\s\S]*", subfunc, text) + text = re.sub(r"[\s\S]*", "", text) + + # Simplify quotes + text = re.sub("``", ' "', text) + text = re.sub("''", '" ', text) + + entity_types = {typ for (s, e, typ) in entities} + + # Binary distinction (NE or not NE) + if fmt == "binary": + i = 0 + toks = Tree("S", []) + for (s, e, typ) in sorted(entities): + if s < i: + s = i # Overlapping! Deal with this better? + if e <= s: + continue + toks.extend(word_tokenize(text[i:s])) + toks.append(Tree("NE", text[s:e].split())) + i = e + toks.extend(word_tokenize(text[i:])) + yield toks + + # Multiclass distinction (NE type) + elif fmt == "multiclass": + i = 0 + toks = Tree("S", []) + for (s, e, typ) in sorted(entities): + if s < i: + s = i # Overlapping! Deal with this better? + if e <= s: + continue + toks.extend(word_tokenize(text[i:s])) + toks.append(Tree(typ, text[s:e].split())) + i = e + toks.extend(word_tokenize(text[i:])) + yield toks + + else: + raise ValueError("bad fmt value") + + +# This probably belongs in a more general-purpose location (as does +# the parse_to_tagged function). 
def cmp_chunks(correct, guessed):
    """
    Print a token-by-token comparison of two chunkings of a sentence.

    Both trees are converted to tagged token lists with
    ``NEChunkParser._parse_to_tagged`` and walked in lockstep.  Every
    position where either tag is not ``"O"`` is printed as
    ``<correct tag> <guessed tag> <word>``.  A run of positions where
    both tags are ``"O"`` is abbreviated: its first token is printed,
    followed by a single ``...`` row.

    :param correct: The gold-standard chunk tree.
    :param guessed: The chunk tree produced by the chunker.
    :rtype: None
    """
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    # The word shown is the one from ``guessed``, as in the original
    # loop where the second ``w`` binding overwrote the first.
    for (_, ct), (w, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print(f" {ct:15} {gt:15} {w}")
                # All three fields use automatic numbering: mixing "{}"
                # with an explicit "{2}" makes str.format raise ValueError.
                print(" {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f" {ct:15} {gt:15} {w}")
class ChunkString:
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text.  This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces.  An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` are created from tagged texts (i.e., lists of
    ``tokens`` whose type is ``TaggedType``).  Initially, nothing is
    chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation.  These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding.  This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces.  An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _debug: The debug level.  See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_STRIP_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in strips.
    """

    CHUNK_TAG_CHAR = r"[^\{\}<>]"
    CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR

    IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
    IN_STRIP_PATTERN = r"(?=[^\}]*(\{|$))"

    # These are used by _verify
    _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
    _STRIP = r"(%s+?)+?" % CHUNK_TAG
    _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
    _BRACKETS = re.compile(r"[^\{\}]+")
    _BALANCED_BRACKETS = re.compile(r"(\{\})*$")

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text in ``chunk_struct``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``.  The
            valid levels are:

            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
              each transformation.
            - 3: full check on to_chunkstruct and full check after
              each transformation.

            We recommend you use at least level 1.  You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        """
        self._root_label = chunk_struct.label()
        # Shallow copy of the top-level children (tokens and subtrees).
        self._pieces = chunk_struct[:]
        tags = [self._tag(tok) for tok in self._pieces]
        self._str = "<" + "><".join(tags) + ">"
        self._debug = debug_level

    def _tag(self, tok):
        """
        Return the tag used to represent ``tok`` in the string
        encoding: element 1 of a tuple token, or the label of a
        ``Tree``.

        :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
        """
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError("chunk structures must contain tagged " "tokens or trees")

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked.  If this is false (or any value ``<= 0``),
            ``_verify`` will check to make sure that ``s`` encodes a
            chunked version of *some* list of tokens.  If this is true,
            then ``_verify`` will also check to make sure that the tags
            in ``s`` match those in ``_pieces``.

        :raise ValueError: if the string representation ``s`` is
            invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError(
                "Transformation generated invalid " "chunkstring:\n %s" % s
            )

        # Check that parens are balanced.  If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.
        brackets = ChunkString._BRACKETS.sub("", s)
        for i in range(1 + len(brackets) // 5000):
            substr = brackets[i * 5000 : i * 5000 + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    "Transformation generated invalid " "chunkstring:\n %s" % s
                )

        if verify_tags <= 0:
            return

        # Splitting on runs of delimiters leaves an empty string at each
        # end; drop those to get the bare tag sequence.
        tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError(
                "Transformation generated invalid " "chunkstring: tag changed"
            )

    def to_chunkstruct(self, chunk_label="CHUNK"):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :param chunk_label: The label given to each chunk subtree.
        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Splitting on braces yields an alternating list of pieces:
        # outside a chunk, inside a chunk, outside, ...
        pieces = []
        index = 0
        piece_in_chunk = False
        for piece in re.split("[{}]", self._str):

            # Find the list of tokens contained in this piece.
            length = piece.count("<")
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``.  In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags.  Furthermore, this transformation may not result in
        improper bracketing.  Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced.  This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring.  Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.  In that case the encoding is left
            unchanged.
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}").  Remove them, so they don't
        # interfere with other transformations.
        s = re.sub(r"\{\}", "", s)

        # Make sure that the transformation was legal.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return "<ChunkString: %s>" % repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        spaced = re.sub(r">(?!\})", r"> ", self._str)
        spaced = re.sub(r"([^\{])<", r"\1 <", spaced)
        if spaced.startswith("<"):
            spaced = " " + spaced
        return spaced
+ + :type regexp: regexp or str + :param regexp: The regular expression for this ``RegexpChunkRule``. + When this rule is applied to a ``ChunkString``, any + substring that matches ``regexp`` will be replaced using + the replacement string ``repl``. Note that this must be a + normal regular expression, not a tag pattern. + :type repl: str + :param repl: The replacement expression for this ``RegexpChunkRule``. + When this rule is applied to a ``ChunkString``, any substring + that matches ``regexp`` will be replaced using ``repl``. + :type descr: str + :param descr: A short description of the purpose and/or effect + of this rule. + """ + if isinstance(regexp, str): + regexp = re.compile(regexp) + self._repl = repl + self._descr = descr + self._regexp = regexp + + def apply(self, chunkstr): + # Keep docstring generic so we can inherit it. + """ + Apply this rule to the given ``ChunkString``. See the + class reference documentation for a description of what it + means to apply a rule. + + :type chunkstr: ChunkString + :param chunkstr: The chunkstring to which this rule is applied. + :rtype: None + :raise ValueError: If this transformation generated an + invalid chunkstring. + """ + chunkstr.xform(self._regexp, self._repl) + + def descr(self): + """ + Return a short description of the purpose and/or effect of + this rule. + + :rtype: str + """ + return self._descr + + def __repr__(self): + """ + Return a string representation of this rule. It has the form:: + + }'->''> + + Note that this representation does not include the + description string; that string can be accessed + separately with the ``descr()`` method. + + :rtype: str + """ + return ( + "" + + repr(self._repl) + + ">" + ) + + @staticmethod + def fromstring(s): + """ + Create a RegexpChunkRule from a string description. 
+ Currently, the following formats are supported:: + + {regexp} # chunk rule + }regexp{ # strip rule + regexp}{regexp # split rule + regexp{}regexp # merge rule + + Where ``regexp`` is a regular expression for the rule. Any + text following the comment marker (``#``) will be used as + the rule's description: + + >>> from nltk.chunk.regexp import RegexpChunkRule + >>> RegexpChunkRule.fromstring('{
    ?+}') + ?+'> + """ + # Split off the comment (but don't split on '\#') + m = re.match(r"(?P(\\.|[^#])*)(?P#.*)?", s) + rule = m.group("rule").strip() + comment = (m.group("comment") or "")[1:].strip() + + # Pattern bodies: chunk, strip, split, merge + try: + if not rule: + raise ValueError("Empty chunk pattern") + if rule[0] == "{" and rule[-1] == "}": + return ChunkRule(rule[1:-1], comment) + elif rule[0] == "}" and rule[-1] == "{": + return StripRule(rule[1:-1], comment) + elif "}{" in rule: + left, right = rule.split("}{") + return SplitRule(left, right, comment) + elif "{}" in rule: + left, right = rule.split("{}") + return MergeRule(left, right, comment) + elif re.match("[^{}]*{[^{}]*}[^{}]*", rule): + left, chunk, right = re.split("[{}]", rule) + return ChunkRuleWithContext(left, chunk, right, comment) + else: + raise ValueError("Illegal chunk pattern: %s" % rule) + except (ValueError, re.error) as e: + raise ValueError("Illegal chunk pattern: %s" % rule) from e + + +class ChunkRule(RegexpChunkRule): + """ + A rule specifying how to add chunks to a ``ChunkString``, using a + matching tag pattern. When applied to a ``ChunkString``, it will + find any substring that matches this tag pattern and that is not + already part of a chunk, and create a new chunk containing that + substring. + """ + + def __init__(self, tag_pattern, descr): + """ + Construct a new ``ChunkRule``. + + :type tag_pattern: str + :param tag_pattern: This rule's tag pattern. When + applied to a ``ChunkString``, this rule will + chunk any substring that matches this tag pattern and that + is not already part of a chunk. + :type descr: str + :param descr: A short description of the purpose and/or effect + of this rule. 
+ """ + self._pattern = tag_pattern + regexp = re.compile( + "(?P%s)%s" + % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_STRIP_PATTERN) + ) + RegexpChunkRule.__init__(self, regexp, r"{\g}", descr) + + def __repr__(self): + """ + Return a string representation of this rule. It has the form:: + + '> + + Note that this representation does not include the + description string; that string can be accessed + separately with the ``descr()`` method. + + :rtype: str + """ + return "" + + +class StripRule(RegexpChunkRule): + """ + A rule specifying how to remove strips to a ``ChunkString``, + using a matching tag pattern. When applied to a + ``ChunkString``, it will find any substring that matches this + tag pattern and that is contained in a chunk, and remove it + from that chunk, thus creating two new chunks. + """ + + def __init__(self, tag_pattern, descr): + """ + Construct a new ``StripRule``. + + :type tag_pattern: str + :param tag_pattern: This rule's tag pattern. When + applied to a ``ChunkString``, this rule will + find any substring that matches this tag pattern and that + is contained in a chunk, and remove it from that chunk, + thus creating two new chunks. + :type descr: str + :param descr: A short description of the purpose and/or effect + of this rule. + """ + self._pattern = tag_pattern + regexp = re.compile( + "(?P%s)%s" + % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN) + ) + RegexpChunkRule.__init__(self, regexp, r"}\g{", descr) + + def __repr__(self): + """ + Return a string representation of this rule. It has the form:: + + '> + + Note that this representation does not include the + description string; that string can be accessed + separately with the ``descr()`` method. + + :rtype: str + """ + return "" + + +class UnChunkRule(RegexpChunkRule): + """ + A rule specifying how to remove chunks to a ``ChunkString``, + using a matching tag pattern. 
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose end
    matches left pattern, and immediately followed by a chunk whose
    beginning matches right pattern.  It will then merge those two
    chunks into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chunk
            whose beginning matches ``right_tag_pattern``.  It will
            then merge those two chunks into a single chunk.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chunk
            whose beginning matches this pattern.  It will
            then merge those two chunks into a single chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match "<left>}{" only when followed by the right pattern; the
        # replacement keeps the left tags and drops the "}{" boundary.
        regexp = re.compile(
            "(?P<left>%s)}{(?=%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, regexp, r"\g<left>", descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<MergeRule: "
            + repr(self._left_tag_pattern)
            + ", "
            + repr(self._right_tag_pattern)
            + ">"
        )
It will + then split the chunk into two new chunks at the point + between these two matching patterns. + :type left_tag_pattern: str + :param left_tag_pattern: This rule's left tag + pattern. When applied to a ``ChunkString``, this rule will + find any chunk containing a substring that matches this + pattern followed by ``right_tag_pattern``. It will then + split the chunk into two new chunks at the point between + these two matching patterns. + :type descr: str + :param descr: A short description of the purpose and/or effect + of this rule. + """ + # Ensure that the individual patterns are coherent. E.g., if + # left='(' and right=')', then this will raise an exception: + re.compile(tag_pattern2re_pattern(left_tag_pattern)) + re.compile(tag_pattern2re_pattern(right_tag_pattern)) + + self._left_tag_pattern = left_tag_pattern + self._right_tag_pattern = right_tag_pattern + regexp = re.compile( + "(?P%s)(?=%s)" + % ( + tag_pattern2re_pattern(left_tag_pattern), + tag_pattern2re_pattern(right_tag_pattern), + ) + ) + RegexpChunkRule.__init__(self, regexp, r"\g}{", descr) + + def __repr__(self): + """ + Return a string representation of this rule. It has the form:: + + ', '
    '> + + Note that this representation does not include the + description string; that string can be accessed + separately with the ``descr()`` method. + + :rtype: str + """ + return ( + "" + ) + + +class ExpandLeftRule(RegexpChunkRule): + """ + A rule specifying how to expand chunks in a ``ChunkString`` to the left, + using two matching tag patterns: a left pattern, and a right pattern. + When applied to a ``ChunkString``, it will find any chunk whose beginning + matches right pattern, and immediately preceded by a strip whose + end matches left pattern. It will then expand the chunk to incorporate + the new material on the left. + """ + + def __init__(self, left_tag_pattern, right_tag_pattern, descr): + """ + Construct a new ``ExpandRightRule``. + + :type right_tag_pattern: str + :param right_tag_pattern: This rule's right tag + pattern. When applied to a ``ChunkString``, this + rule will find any chunk whose beginning matches + ``right_tag_pattern``, and immediately preceded by a strip + whose end matches this pattern. It will + then merge those two chunks into a single chunk. + :type left_tag_pattern: str + :param left_tag_pattern: This rule's left tag + pattern. When applied to a ``ChunkString``, this + rule will find any chunk whose beginning matches + this pattern, and immediately preceded by a strip + whose end matches ``left_tag_pattern``. It will + then expand the chunk to incorporate the new material on the left. + + :type descr: str + :param descr: A short description of the purpose and/or effect + of this rule. + """ + # Ensure that the individual patterns are coherent. 
E.g., if + # left='(' and right=')', then this will raise an exception: + re.compile(tag_pattern2re_pattern(left_tag_pattern)) + re.compile(tag_pattern2re_pattern(right_tag_pattern)) + + self._left_tag_pattern = left_tag_pattern + self._right_tag_pattern = right_tag_pattern + regexp = re.compile( + r"(?P%s)\{(?P%s)" + % ( + tag_pattern2re_pattern(left_tag_pattern), + tag_pattern2re_pattern(right_tag_pattern), + ) + ) + RegexpChunkRule.__init__(self, regexp, r"{\g\g", descr) + + def __repr__(self): + """ + Return a string representation of this rule. It has the form:: + + ', ''> + + Note that this representation does not include the + description string; that string can be accessed + separately with the ``descr()`` method. + + :rtype: str + """ + return ( + "" + ) + + +class ExpandRightRule(RegexpChunkRule): + """ + A rule specifying how to expand chunks in a ``ChunkString`` to the + right, using two matching tag patterns: a left pattern, and a + right pattern. When applied to a ``ChunkString``, it will find any + chunk whose end matches left pattern, and immediately followed by + a strip whose beginning matches right pattern. It will then + expand the chunk to incorporate the new material on the right. + """ + + def __init__(self, left_tag_pattern, right_tag_pattern, descr): + """ + Construct a new ``ExpandRightRule``. + + :type right_tag_pattern: str + :param right_tag_pattern: This rule's right tag + pattern. When applied to a ``ChunkString``, this + rule will find any chunk whose end matches + ``left_tag_pattern``, and immediately followed by a strip + whose beginning matches this pattern. It will + then merge those two chunks into a single chunk. + :type left_tag_pattern: str + :param left_tag_pattern: This rule's left tag + pattern. When applied to a ``ChunkString``, this + rule will find any chunk whose end matches + this pattern, and immediately followed by a strip + whose beginning matches ``right_tag_pattern``. 
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using
    three matching tag patterns: one for the left context, one for the
    chunk, and one for the right context.  When applied to a
    ``ChunkString``, it will find any substring that matches the chunk
    tag pattern, is surrounded by substrings that match the two
    context patterns, and is not already part of a chunk; and create a
    new chunk containing the substring that matched the chunk tag
    pattern.

    Caveat: Both the left and right context are consumed when this
    rule matches; therefore, if you need to find overlapping matches,
    you will need to apply your rule more than once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: A tag pattern that must match
            the left context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: A tag pattern that must match for this
            rule to apply.  If the rule does apply, then this pattern
            also identifies the substring that will be made into a chunk.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: A tag pattern that must match
            the right context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        :raise ValueError: If any pattern is rejected by
            ``tag_pattern2re_pattern``.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
        re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_context_tag_pattern))

        self._left_context_tag_pattern = left_context_tag_pattern
        self._chunk_tag_pattern = chunk_tag_pattern
        self._right_context_tag_pattern = right_context_tag_pattern

        # Each part is a named group so the replacement can re-emit the
        # contexts untouched and wrap only the chunk part in braces.
        # The trailing IN_STRIP_PATTERN is appended verbatim from
        # ``ChunkString``.
        regexp = re.compile(
            "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
            % (
                tag_pattern2re_pattern(left_context_tag_pattern),
                tag_pattern2re_pattern(chunk_tag_pattern),
                tag_pattern2re_pattern(right_context_tag_pattern),
                ChunkString.IN_STRIP_PATTERN,
            )
        )
        replacement = r"\g<left>{\g<chunk>}\g<right>"
        RegexpChunkRule.__init__(self, regexp, replacement, descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<ChunkRuleWithContext: {!r}, {!r}, {!r}>".format(
            self._left_context_tag_pattern,
            self._chunk_tag_pattern,
            self._right_context_tag_pattern,
        )
    | '`` is equivalent to ``'
    |'`` + - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so + ``''`` matches any single tag starting with ``'NN'``. + + In particular, ``tag_pattern2re_pattern`` performs the following + transformations on the given pattern: + + - Replace '.' with '[^<>{}]' + - Remove any whitespace + - Add extra parens around '<' and '>', to make '<' and '>' act + like parentheses. E.g., so that in '+', the '+' has scope + over the entire ''; and so that in '', the '|' has + scope over 'NN' and 'IN', but not '<' or '>'. + - Check to make sure the resulting pattern is valid. + + :type tag_pattern: str + :param tag_pattern: The tag pattern to convert to a regular + expression pattern. + :raise ValueError: If ``tag_pattern`` is not a valid tag pattern. + In particular, ``tag_pattern`` should not include braces; and it + should not contain nested or mismatched angle-brackets. + :rtype: str + :return: A regular expression pattern corresponding to + ``tag_pattern``. + """ + # Clean up the regular expression + tag_pattern = re.sub(r"\s", "", tag_pattern) + tag_pattern = re.sub(r"<", "(<(", tag_pattern) + tag_pattern = re.sub(r">", ")>)", tag_pattern) + + # Check the regular expression + if not CHUNK_TAG_PATTERN.match(tag_pattern): + raise ValueError("Bad tag pattern: %r" % tag_pattern) + + # Replace "." with CHUNK_TAG_CHAR. + # We have to do this after, since it adds {}[]<>s, which would + # confuse CHUNK_TAG_PATTERN. + # PRE doesn't have lookback assertions, so reverse twice, and do + # the pattern backwards (with lookahead assertions). This can be + # made much cleaner once we can switch back to SRE. 
+ def reverse_str(str): + lst = list(str) + lst.reverse() + return "".join(lst) + + tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) + reversed = reverse_str(tag_pattern) + reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed) + tag_pattern = reverse_str(reversed) + + return tag_pattern + + +# ////////////////////////////////////////////////////// +# RegexpChunkParser +# ////////////////////////////////////////////////////// + + +class RegexpChunkParser(ChunkParserI): + """ + A regular expression based chunk parser. ``RegexpChunkParser`` uses a + sequence of "rules" to find chunks of a single type within a + text. The chunking of the text is encoded using a ``ChunkString``, + and each rule acts by modifying the chunking in the + ``ChunkString``. The rules are all implemented using regular + expression matching and substitution. + + The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``, + ``StripRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``) + define the rules that are used by ``RegexpChunkParser``. Each rule + defines an ``apply()`` method, which modifies the chunking encoded + by a given ``ChunkString``. + + :type _rules: list(RegexpChunkRule) + :ivar _rules: The list of rules that should be applied to a text. + :type _trace: int + :ivar _trace: The default level of tracing. + + """ + + def __init__(self, rules, chunk_label="NP", root_label="S", trace=0): + """ + Construct a new ``RegexpChunkParser``. + + :type rules: list(RegexpChunkRule) + :param rules: The sequence of rules that should be used to + generate the chunking for a tagged text. + :type chunk_label: str + :param chunk_label: The node value that should be used for + chunk subtrees. This is typically a short string + describing the type of information contained by the chunk, + such as ``"NP"`` for base noun phrases. + :type root_label: str + :param root_label: The node value that should be used for the + top node of the chunk structure. 
+ :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + ``1`` will generate normal tracing output; and ``2`` or + higher will generate verbose tracing output. + """ + self._rules = rules + self._trace = trace + self._chunk_label = chunk_label + self._root_label = root_label + + def _trace_apply(self, chunkstr, verbose): + """ + Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in + turn. Generate trace output between each rule. If ``verbose`` + is true, then generate verbose output. + + :type chunkstr: ChunkString + :param chunkstr: The chunk string to which each rule should be + applied. + :type verbose: bool + :param verbose: Whether output should be verbose. + :rtype: None + """ + print("# Input:") + print(chunkstr) + for rule in self._rules: + rule.apply(chunkstr) + if verbose: + print("#", rule.descr() + " (" + repr(rule) + "):") + else: + print("#", rule.descr() + ":") + print(chunkstr) + + def _notrace_apply(self, chunkstr): + """ + Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in + turn. + + :param chunkstr: The chunk string to which each rule should be + applied. + :type chunkstr: ChunkString + :rtype: None + """ + + for rule in self._rules: + rule.apply(chunkstr) + + def parse(self, chunk_struct, trace=None): + """ + :type chunk_struct: Tree + :param chunk_struct: the chunk structure to be (further) chunked + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + ``1`` will generate normal tracing output; and ``2`` or + higher will generate verbose tracing output. This value + overrides the trace level value that was given to the + constructor. + :rtype: Tree + :return: a chunk structure that encodes the chunks in a given + tagged sentence. A chunk is a non-overlapping linguistic + group, such as a noun phrase. 
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

     NP:
       {<DT|JJ>}          # chunk determiners and adjectives
       }<[\.VI].*>+{      # strip any tag beginning with V, I, or .
       <.*>}{<DT>         # split a chunk at a determiner
       <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                          # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _grammar: str or list(RegexpChunkParser)
    :ivar _grammar: The grammar exactly as passed to the constructor.
    :type _stages: list(RegexpChunkParser)
    :ivar _stages: The parsing stages corresponding to the grammar,
        in the order in which they are applied.
    :type _loop: int
    :ivar _loop: The number of times the full list of stages is run.
    :type _trace: int
    :ivar _trace: The default level of tracing.
    """

    def __init__(self, grammar, root_label="S", loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :raise TypeError: If ``grammar`` is neither a string nor an
            iterable consisting only of ``RegexpChunkParser`` objects.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, str):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = "Expected string or list of RegexpChunkParsers for the grammar."
            try:
                grammar = list(grammar)
            except Exception as e:
                # ``Exception`` rather than ``BaseException``: a
                # KeyboardInterrupt or SystemExit raised while iterating
                # must not be disguised as a TypeError.
                raise TypeError(type_err) from e
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.

        Each line is stripped; a line containing an unescaped ``:``
        closes the stage collected so far and opens a new one whose
        label is the text before the colon.  Blank lines and lines that
        start with ``#`` are skipped; every other line is turned into a
        rule with ``RegexpChunkRule.fromstring``.

        :raise ValueError: (via ``_add_stage``) if rules appear before
            any stage marker.
        """
        rules = []
        lhs = None
        # "nonterminal" is everything up to the first unescaped colon,
        # "rule" is whatever follows it on the same line.
        pattern = regex.compile("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))")
        for line in grammar.split("\n"):
            line = line.strip()

            # New stage begins if there's an unescaped ':'
            m = pattern.match(line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group("nonterminal").strip()
                rules = []
                line = m.group("rule").strip()

            # Skip blank & comment-only lines
            if line == "" or line.startswith("#"):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.

        Nothing is added when ``rules`` is empty.

        :raise ValueError: If there are rules but no stage label.
        """
        if rules:
            if not lhs:
                raise ValueError("Expected stage marker (eg NP:)")
            parser = RegexpChunkParser(
                rules, chunk_label=lhs, root_label=root_label, trace=trace
            )
            self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.

        Every stage is applied in order, feeding the output of one stage
        into the next, and the whole sequence is repeated ``loop`` times.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output (the result of the last stage).
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        for _ in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``: a header line followed by the ``str()`` of
            each stage, one per line.
        :rtype: str
        """
        header = "chunk.RegexpParser with %d stages:" % len(self._stages)
        return "\n".join([header] + [str(parser) for parser in self._stages])
+ + :param chunkparser: The chunkparser to be tested + :type chunkparser: ChunkParserI + :param text: The chunked tagged text that should be used for + evaluation. + :type text: str + """ + from nltk import chunk + from nltk.tree import Tree + + # Evaluate our chunk parser. + chunkscore = chunk.ChunkScore() + + for sentence in text.split("\n"): + print(sentence) + sentence = sentence.strip() + if not sentence: + continue + gold = chunk.tagstr2tree(sentence) + tokens = gold.leaves() + test = chunkparser.parse(Tree("S", tokens), trace=1) + chunkscore.score(gold, test) + print() + + print("/" + ("=" * 75) + "\\") + print("Scoring", chunkparser) + print("-" * 77) + print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ") + print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ") + print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100)) + + # Missed chunks. + if chunkscore.missed(): + print("Missed:") + missed = chunkscore.missed() + for chunk in missed[:10]: + print(" ", " ".join(map(str, chunk))) + if len(chunkscore.missed()) > 10: + print(" ...") + + # Incorrect chunks. + if chunkscore.incorrect(): + print("Incorrect:") + incorrect = chunkscore.incorrect() + for chunk in incorrect[:10]: + print(" ", " ".join(map(str, chunk))) + if len(chunkscore.incorrect()) > 10: + print(" ...") + + print("\\" + ("=" * 75) + "/") + print() + + +def demo(): + """ + A demonstration for the ``RegexpChunkParser`` class. A single text is + parsed with four different chunk parsers, using a variety of rules + and strategies. + """ + + from nltk import Tree, chunk + + text = """\ + [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. + [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. + [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. + """ + + print("*" * 75) + print("Evaluation text:") + print(text) + print("*" * 75) + print() + + grammar = r""" + NP: # NP stage + {
    ?*} # chunk determiners, adjectives and nouns + {+} # chunk proper nouns + """ + cp = chunk.RegexpParser(grammar) + demo_eval(cp, text) + + grammar = r""" + NP: + {<.*>} # start by chunking each tag + }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods + {} # merge det/adj with nouns + """ + cp = chunk.RegexpParser(grammar) + demo_eval(cp, text) + + grammar = r""" + NP: {
    ?*} # chunk determiners, adjectives and nouns + VP: {?} # VP = verb words + """ + cp = chunk.RegexpParser(grammar) + demo_eval(cp, text) + + grammar = r""" + NP: {<.*>*} # start by chunking everything + }<[\.VI].*>+{ # strip any verbs, prepositions or periods + <.*>}{
    # separate on determiners + PP: {} # PP = preposition + noun phrase + VP: {*} # VP = verb words + NPs and PPs + """ + cp = chunk.RegexpParser(grammar) + demo_eval(cp, text) + + # Evaluation + + from nltk.corpus import conll2000 + + print() + print("Demonstration of empty grammar:") + + cp = chunk.RegexpParser("") + print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",)))) + + print() + print("Demonstration of accuracy evaluation using CoNLL tags:") + + grammar = r""" + NP: + {<.*>} # start by chunking each tag + }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods + {} # merge det/adj with nouns + """ + cp = chunk.RegexpParser(grammar) + print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5])) + + print() + print("Demonstration of tagged token input") + + grammar = r""" + NP: {<.*>*} # start by chunking everything + }<[\.VI].*>+{ # strip any verbs, prepositions or periods + <.*>}{
    # separate on determiners + PP: {} # PP = preposition + noun phrase + VP: {*} # VP = verb words + NPs and PPs + """ + cp = chunk.RegexpParser(grammar) + print( + cp.parse( + [ + ("the", "DT"), + ("little", "JJ"), + ("cat", "NN"), + ("sat", "VBD"), + ("on", "IN"), + ("the", "DT"), + ("mat", "NN"), + (".", "."), + ] + ) + ) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py new file mode 100644 index 0000000000000000000000000000000000000000..93c67a376c4a27d86088bf915c89fd8773dae818 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py @@ -0,0 +1,643 @@ +# Natural Language Toolkit: Chunk format conversions +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird (minor additions) +# URL: +# For license information, see LICENSE.TXT + +import re + +from nltk.metrics import accuracy as _accuracy +from nltk.tag.mapping import map_tag +from nltk.tag.util import str2tuple +from nltk.tree import Tree + +##////////////////////////////////////////////////////// +## EVALUATION +##////////////////////////////////////////////////////// + + +def accuracy(chunker, gold): + """ + Score the accuracy of the chunker against the gold standard. + Strip the chunk information from the gold standard and rechunk it using + the chunker, then compute the accuracy score. + + :type chunker: ChunkParserI + :param chunker: The chunker being evaluated. + :type gold: tree + :param gold: The chunk structures to score the chunker on. 
class ChunkScore:
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    Keyword arguments accepted by the constructor:

        - max_tp_examples, max_fp_examples, max_fn_examples: intended
          caps on the number of true-positive / false-positive /
          false-negative examples to keep (default 100 each).  They are
          stored on the instance, but no method of this class consults
          them; they never affect the numerical metrics.

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    Statistics are recomputed lazily: ``score`` only marks them stale,
    and the accessor methods refresh them on demand.

    :type _correct: set
    :ivar _correct: All ``(position, chunk)`` pairs from the gold data
    :type _guessed: set
    :ivar _guessed: All ``(position, chunk)`` pairs from the guesses
    :type _tp: set
    :ivar _tp: Set of true positives
    :type _fp: set
    :ivar _fp: Set of false positives
    :type _fn: set
    :ivar _fn: Set of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        """Recompute the tp/fp/fn sets and counts if ``score`` ran since."""
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    @staticmethod
    def _in_input_order(chunks):
        """
        Return the chunk part of each ``(position, chunk)`` pair, ordered
        by position, i.e. by (sentence counter, offset in sentence).
        """
        # Sort on the position key only, so the chunks themselves never
        # need to be comparable.
        return [c[1] for c in sorted(chunks, key=lambda c: c[0])]

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.  Returns 1 when no tags have been scored.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when nothing was
        guessed.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when there are no
        gold chunks.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.  Returns 0 when either precision
        or recall is 0.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fn)  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fp)  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._correct)  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._guessed)  # discard position information

    def __len__(self):
        """Return the number of gold chunks (true positives + false negatives)."""
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # NOTE: inside an f-string a percent sign is an ordinary
        # character, so a single "%" is what prints one percent sign.
        return (
            "ChunkParse score:\n"
            + (f" IOB Accuracy: {self.accuracy() * 100:5.1f}%\n")
            + (f" Precision: {self.precision() * 100:5.1f}%\n")
            + (f" Recall: {self.recall() * 100:5.1f}%\n")
            + (f" F-Measure: {self.f_measure() * 100:5.1f}%")
        )
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Build a chunk structure for one sentence given as a CoNLL 2000
    style IOB string (one ``word tag IOB-tag`` entry per line).

    Chunks whose type is not listed in ``chunk_types`` are flattened
    into the root, i.e. their tokens are treated as "outside" tokens.

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; ``None``
        keeps every chunk type.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raise ValueError: if a non-blank line does not match the expected
        ``word tag IOB-tag`` layout.
    """
    root = Tree(root_label, [])
    # The stack holds the root and, while a chunk is open, that chunk.
    stack = [root]

    for lineno, line in enumerate(s.split("\n")):
        # Blank lines carry no token.
        if not line.strip():
            continue

        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError(f"Error on line {lineno:d}")
        word, tag, state, chunk_type = parsed.groups()

        # Chunk types the caller did not ask for count as "outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # An "Inside" tag whose type differs from the open node's label
        # implicitly starts a new chunk.  This must be decided before the
        # open chunk (if any) is closed below.
        mismatch_I = state == "I" and chunk_type != stack[-1].label()

        # "Begin", "Outside" and mismatched "Inside" all end the open chunk.
        if (state in "BO" or mismatch_I) and len(stack) == 2:
            stack.pop()

        # "Begin" and mismatched "Inside" open a new chunk under the root.
        if state == "B" or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # The token goes into whatever node is currently open.
        stack[-1].append((word, tag))

    return root
def tree2conllstr(t):
    """
    Render a tree in the CoNLL IOB string format: one line per token,
    holding the word, its tag and its IOB tag separated by single spaces.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return "\n".join(" ".join(entry) for entry in tree2conlltags(t))
def demo():
    """
    Print a short demonstration: parse a bracketed tagged sentence,
    parse a CoNLL-style sentence keeping only NP and PP chunks, and
    print the latter back in CoNLL format.
    """
    import nltk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    bracketed_tree = nltk.chunk.tagstr2tree(bracketed, chunk_label="NP")
    bracketed_tree.pprint()
    print()

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    # VP and ADJP chunks are not requested, so they are flattened.
    conll_tree = conllstr2tree(conll_text, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()
+ +- to classify documents by topic +- to classify ambiguous words by which word sense is intended +- to classify acoustic signals by which phoneme they represent +- to classify sentences by their author + +Features +======== +In order to decide which category label is appropriate for a given +token, classifiers examine one or more 'features' of the token. These +"features" are typically chosen by hand, and indicate which aspects +of the token are relevant to the classification decision. For +example, a document classifier might use a separate feature for each +word, recording how often that word occurred in the document. + +Featuresets +=========== +The features describing a token are encoded using a "featureset", +which is a dictionary that maps from "feature names" to "feature +values". Feature names are unique strings that indicate what aspect +of the token is encoded by the feature. Examples include +``'prevword'``, for a feature whose value is the previous word; and +``'contains-word(library)'`` for a feature that is true when a document +contains the word ``'library'``. Feature values are typically +booleans, numbers, or strings, depending on which feature they +describe. + +Featuresets are typically constructed using a "feature detector" +(also known as a "feature extractor"). A feature detector is a +function that takes a token (and sometimes information about its +context) as its input, and returns a featureset describing that token. +For example, the following feature detector converts a document +(stored as a list of words) to a featureset describing the set of +words included in the document: + + >>> # Define a feature detector function. + >>> def document_features(document): + ... return dict([('contains-word(%s)' % w, True) for w in document]) + +Feature detectors are typically applied to each token before it is fed +to the classifier: + + >>> # Classify each Gutenberg document. 
+ >>> from nltk.corpus import gutenberg + >>> for fileid in gutenberg.fileids(): # doctest: +SKIP + ... doc = gutenberg.words(fileid) # doctest: +SKIP + ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP + +The parameters that a feature detector expects will vary, depending on +the task and the needs of the feature detector. For example, a +feature detector for word sense disambiguation (WSD) might take as its +input a sentence, and the index of a word that should be classified, +and return a featureset for that word. The following feature detector +for WSD includes features describing the left and right contexts of +the target word: + + >>> def wsd_features(sentence, index): + ... featureset = {} + ... for i in range(max(0, index-3), index): + ... featureset['left-context(%s)' % sentence[i]] = True + ... for i in range(index+1, min(index+4, len(sentence))): + ... featureset['right-context(%s)' % sentence[i]] = True + ... return featureset + +Training Classifiers +==================== +Most classifiers are built by training them on a list of hand-labeled +examples, known as the "training set". Training sets are represented +as lists of ``(featuredict, label)`` tuples. 
+""" + +from nltk.classify.api import ClassifierI, MultiClassifierI +from nltk.classify.decisiontree import DecisionTreeClassifier +from nltk.classify.maxent import ( + BinaryMaxentFeatureEncoding, + ConditionalExponentialClassifier, + MaxentClassifier, + TypedMaxentFeatureEncoding, +) +from nltk.classify.megam import call_megam, config_megam +from nltk.classify.naivebayes import NaiveBayesClassifier +from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier +from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features +from nltk.classify.scikitlearn import SklearnClassifier +from nltk.classify.senna import Senna +from nltk.classify.textcat import TextCat +from nltk.classify.util import accuracy, apply_features, log_likelihood +from nltk.classify.weka import WekaClassifier, config_weka diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py new file mode 100644 index 0000000000000000000000000000000000000000..3760a46447f682f6a79ab161a968639dc2c35307 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py @@ -0,0 +1,195 @@ +# Natural Language Toolkit: Classifier Interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird (minor additions) +# URL: +# For license information, see LICENSE.TXT + +""" +Interfaces for labeling tokens with category labels (or "class labels"). + +``ClassifierI`` is a standard interface for "single-category +classification", in which the set of categories is known, the number +of categories is finite, and each text belongs to exactly one +category. + +``MultiClassifierI`` is a standard interface for "multi-category +classification", which is like single-category classification except +that each text belongs to zero or more categories. 
class ClassifierI:
    """
    Interface for classifiers that assign exactly one category label
    (or "class") to each token.  Labels are typically strs or ints, but
    can be any immutable type.  The set of labels that the classifier
    chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        # Without a subclass-provided batch method there is nothing to
        # delegate to.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(one) for one in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(one) for one in featuresets]


class MultiClassifierI:
    """
    Interface for classifiers that assign zero or more category labels
    to each token.  Labels are typically strs or ints, but can be any
    immutable type.  The set of labels that the multi-classifier chooses
    from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(one) for one in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(one) for one in featuresets]
class DecisionTreeClassifier(ClassifierI):
    """
    A decision tree node together with the subtree rooted at it.

    A node without a feature name is a leaf and always answers with its
    own label.  Any other node reads the value of its feature from the
    featureset and delegates to the child stored for that value, to the
    default child if one exists, or otherwise answers with its own label.
    """

    def __init__(self, label, feature_name=None, decisions=None, default=None):
        """
        :param label: The most likely label for tokens that reach
            this node in the decision tree.  If this decision tree
            has no children, then this label will be assigned to
            any token that reaches this decision tree.
        :param feature_name: The name of the feature that this
            decision tree selects for.
        :param decisions: A dictionary mapping from feature values
            for the feature identified by ``feature_name`` to
            child decision trees.
        :param default: The child that will be used if the value of
            feature ``feature_name`` does not match any of the keys in
            ``decisions``.  This is used when constructing binary
            decision trees.
        """
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
        self._default = default

    def labels(self):
        """
        :return: the distinct labels carried by this node and all of its
            descendants, in no particular order.
        :rtype: list
        """
        labels = [self._label]
        if self._decisions is not None:
            for dt in self._decisions.values():
                labels.extend(dt.labels())
        if self._default is not None:
            labels.extend(self._default.labels())
        return list(set(labels))

    def classify(self, featureset):
        """
        :return: the label this tree assigns to ``featureset``.
        """
        # Decision leaf:
        if self._fname is None:
            return self._label

        # Decision tree:
        fval = featureset.get(self._fname)
        if fval in self._decisions:
            return self._decisions[fval].classify(featureset)
        elif self._default is not None:
            return self._default.classify(featureset)
        else:
            return self._label

    def error(self, labeled_featuresets):
        """
        :param labeled_featuresets: a sized collection of
            ``(featureset, label)`` pairs.
        :return: the fraction of pairs whose label differs from the one
            this tree assigns to the featureset.
        :rtype: float
        """
        errors = 0
        for featureset, label in labeled_featuresets:
            if self.classify(featureset) != label:
                errors += 1
        return errors / len(labeled_featuresets)

    def pretty_format(self, width=70, prefix="", depth=4):
        """
        Return a string containing a pretty-printed version of this
        decision tree.  Each line in this string corresponds to a
        single decision tree node or leaf, and indentation is used to
        display the structure of the decision tree.

        :param width: target line width used to size the dotted leaders.
        :param prefix: text prepended to every line (grows with nesting).
        :param depth: number of tree levels to print.
        """
        # [xx] display default!!
        if self._fname is None:
            n = width - len(prefix) - 15
            return "{}{} {}\n".format(prefix, "." * n, self._label)
        s = ""
        for fval, result in sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        ):
            hdr = f"{prefix}{self._fname}={fval}? "
            n = width - 15 - len(hdr)
            s += "{}{} {}\n".format(hdr, "." * (n), result._label)
            if result._fname is not None and depth > 1:
                s += result.pretty_format(width, prefix + " ", depth - 1)
        if self._default is not None:
            n = width - len(prefix) - 21
            s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
            if self._default._fname is not None and depth > 1:
                s += self._default.pretty_format(width, prefix + " ", depth - 1)
        return s

    def pseudocode(self, prefix="", depth=4):
        """
        Return a string representation of this decision tree that
        expresses the decisions it makes as a nested set of pseudocode
        if statements.

        :param prefix: text prepended to every line (grows with nesting).
        :param depth: number of tree levels to expand; deeper subtrees
            are summarised by their node label.
        """
        if self._fname is None:
            return f"{prefix}return {self._label!r}\n"
        s = ""
        for fval, result in sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        ):
            s += f"{prefix}if {self._fname} == {fval!r}: "
            if result._fname is not None and depth > 1:
                s += "\n" + result.pseudocode(prefix + " ", depth - 1)
            else:
                s += f"return {result._label!r}\n"
        if self._default is not None:
            if len(self._decisions) == 1:
                s += "{}if {} != {!r}: ".format(
                    prefix, self._fname, list(self._decisions.keys())[0]
                )
            else:
                s += f"{prefix}else: "
            if self._default._fname is not None and depth > 1:
                s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
            else:
                s += f"return {self._default._label!r}\n"
        return s

    def __str__(self):
        return self.pretty_format()

    @staticmethod
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Build a decision tree from ``(featureset, label)`` pairs by
        picking the best single-feature stump and then refining it.

        :param labeled_featuresets: a sized, re-iterable collection of
            ``(featureset, label)`` pairs.
        :param entropy_cutoff: a branch is only refined while the label
            entropy of the tokens reaching it exceeds this value.
        :param depth_cutoff: maximum number of levels to grow.
        :param support_cutoff: a node reached by this many tokens or
            fewer is not refined.
        :param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        :param feature_values: for binary trees, a mapping from feature
            name to the values to consider; collected from the data when
            omitted.
        :param verbose: If true, print the stump chosen at each step.
        """
        # Collect a list of all feature names.
        feature_names = {
            fname for featureset, _ in labeled_featuresets for fname in featureset
        }

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, _ in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose
            )
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose
            )

        # Refine the stump.
        tree.refine(
            labeled_featuresets,
            entropy_cutoff,
            depth_cutoff - 1,
            support_cutoff,
            binary,
            feature_values,
            verbose,
        )

        # Return it
        return tree

    @staticmethod
    def leaf(labeled_featuresets):
        """
        :return: a leaf node labelled with the most frequent label in
            ``labeled_featuresets``.
        """
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(label)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        """
        :return: a one-level tree that branches on every observed value
            of ``feature_name``; each branch is a leaf carrying the most
            frequent label among the tokens with that value, and the
            node itself carries the overall most frequent label.
        """
        majority_label = FreqDist(
            label for (featureset, label) in labeled_featuresets
        ).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        # The loop variable must not reuse the name holding the majority
        # label: a ``for`` target stays bound after the loop, which would
        # leave the node labelled with the last sample's label instead.
        for featureset, label in labeled_featuresets:
            feature_value = featureset.get(feature_name)
            freqs[feature_value][label] += 1

        decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
        return DecisionTreeClassifier(majority_label, feature_name, decisions)

    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Replace, in place, each child of this node by a freshly trained
        subtree when the labels of the tokens reaching that child have
        entropy above ``entropy_cutoff``.  Nothing happens for leaves,
        when ``depth_cutoff`` is exhausted, or when the node is reached
        by ``support_cutoff`` tokens or fewer.  The parameters have the
        same meaning as in ``train()``.
        """
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:
            return
        if depth_cutoff <= 0:
            return
        # Only values of existing keys are rebound below, so assigning
        # while iterating over the dictionary is safe.
        for fval in self._decisions:
            fval_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == fval
            ]

            label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )

    @staticmethod
    def best_stump(feature_names, labeled_featuresets, verbose=False):
        """
        :return: the stump (over ``feature_names``) with the lowest
            error on ``labeled_featuresets``, or a plain leaf if no
            stump beats the leaf's error.
        """
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            stump_error = stump.error(labeled_featuresets)
            if stump_error < best_error:
                best_error = stump_error
                best_stump = stump
        if verbose:
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), best_stump._fname, best_error
                )
            )
        return best_stump

    @staticmethod
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        """
        :return: a one-level binary tree that tests whether
            ``feature_name`` equals ``feature_value``.  The matching
            branch and the default branch are leaves carrying the most
            frequent label on their side; a side without observations
            falls back on the overall most frequent label.
        """
        majority_label = FreqDist(
            label for (featureset, label) in labeled_featuresets
        ).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        # As in ``stump()``, keep the loop variable distinct from the
        # majority label so the latter survives the loop.
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1

        decisions = {}
        # The default child must be a classifier node, not a bare label:
        # classify(), labels() and pretty_format() all call methods or
        # read attributes on it.
        default = DecisionTreeClassifier(majority_label)
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(majority_label, feature_name, decisions, default)

    @staticmethod
    def best_binary_stump(
        feature_names, labeled_featuresets, feature_values, verbose=False
    ):
        """
        :return: the binary stump (over every feature name and each of
            its values in ``feature_values``) with the lowest error on
            ``labeled_featuresets``, or a plain leaf if no stump beats
            the leaf's error.
        """
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            for fval in feature_values[fname]:
                stump = DecisionTreeClassifier.binary_stump(
                    fname, fval, labeled_featuresets
                )
                stump_error = stump.error(labeled_featuresets)
                if stump_error < best_error:
                    best_error = stump_error
                    best_stump = stump
        if verbose:
            if best_stump._decisions:
                descr = "{}={}".format(
                    best_stump._fname, list(best_stump._decisions.keys())[0]
                )
            else:
                descr = "(default)"
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), descr, best_error
                )
            )
        return best_stump
Project +# Author: Edward Loper +# Dmitry Chichkov (TypedMaxentFeatureEncoding) +# URL: +# For license information, see LICENSE.TXT + +""" +A classifier model based on maximum entropy modeling framework. This +framework considers all of the probability distributions that are +empirically consistent with the training data; and chooses the +distribution with the highest entropy. A probability distribution is +"empirically consistent" with a set of training data if its estimated +frequency with which a class and a feature vector value co-occur is +equal to the actual frequency in the data. + +Terminology: 'feature' +====================== +The term *feature* is usually used to refer to some property of an +unlabeled token. For example, when performing word sense +disambiguation, we might define a ``'prevword'`` feature whose value is +the word preceding the target word. However, in the context of +maxent modeling, the term *feature* is typically used to refer to a +property of a "labeled" token. In order to prevent confusion, we +will introduce two distinct terms to disambiguate these two different +concepts: + + - An "input-feature" is a property of an unlabeled token. + - A "joint-feature" is a property of a labeled token. + +In the rest of the ``nltk.classify`` module, the term "features" is +used to refer to what we will call "input-features" in this module. + +In literature that describes and discusses maximum entropy models, +input-features are typically called "contexts", and joint-features +are simply referred to as "features". + +Converting Input-Features to Joint-Features +------------------------------------------- +In maximum entropy models, joint-features are required to have numeric +values. 
Typically, each input-feature ``input_feat`` is mapped to a +set of joint-features of the form: + +| joint_feat(token, label) = { 1 if input_feat(token) == feat_val +| { and label == some_label +| { +| { 0 otherwise + +For all values of ``feat_val`` and ``some_label``. This mapping is +performed by classes that implement the ``MaxentFeatureEncodingI`` +interface. +""" +try: + import numpy +except ImportError: + pass + +import os +import tempfile +from collections import defaultdict + +from nltk.classify.api import ClassifierI +from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file +from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file +from nltk.classify.util import CutoffChecker, accuracy, log_likelihood +from nltk.data import gzip_open_unicode +from nltk.probability import DictionaryProbDist +from nltk.util import OrderedDict + +__docformat__ = "epytext en" + +###################################################################### +# { Classifier Model +###################################################################### + + +class MaxentClassifier(ClassifierI): + """ + A maximum entropy classifier (also known as a "conditional + exponential classifier"). This classifier is parameterized by a + set of "weights", which are used to combine the joint-features + that are generated from a featureset by an "encoding". In + particular, the encoding maps each ``(featureset, label)`` pair to + a vector. The probability of each label is then computed using + the following equation:: + + dotprod(weights, encode(fs,label)) + prob(fs|label) = --------------------------------------------------- + sum(dotprod(weights, encode(fs,l)) for l in labels) + + Where ``dotprod`` is the dot product:: + + dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) + """ + + def __init__(self, encoding, weights, logarithmic=True): + """ + Construct a new maxent classifier model. 
Typically, new + classifier models are created using the ``train()`` method. + + :type encoding: MaxentFeatureEncodingI + :param encoding: An encoding that is used to convert the + featuresets that are given to the ``classify`` method into + joint-feature vectors, which are used by the maxent + classifier model. + + :type weights: list of float + :param weights: The feature weight vector for this classifier. + + :type logarithmic: bool + :param logarithmic: If false, then use non-logarithmic weights. + """ + self._encoding = encoding + self._weights = weights + self._logarithmic = logarithmic + # self._logarithmic = False + assert encoding.length() == len(weights) + + def labels(self): + return self._encoding.labels() + + def set_weights(self, new_weights): + """ + Set the feature weight vector for this classifier. + :param new_weights: The new feature weight vector. + :type new_weights: list of float + """ + self._weights = new_weights + assert self._encoding.length() == len(new_weights) + + def weights(self): + """ + :return: The feature weight vector for this classifier. 
+ :rtype: list of float + """ + return self._weights + + def classify(self, featureset): + return self.prob_classify(featureset).max() + + def prob_classify(self, featureset): + prob_dict = {} + for label in self._encoding.labels(): + feature_vector = self._encoding.encode(featureset, label) + + if self._logarithmic: + total = 0.0 + for (f_id, f_val) in feature_vector: + total += self._weights[f_id] * f_val + prob_dict[label] = total + + else: + prod = 1.0 + for (f_id, f_val) in feature_vector: + prod *= self._weights[f_id] ** f_val + prob_dict[label] = prod + + # Normalize the dictionary to give a probability distribution + return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True) + + def explain(self, featureset, columns=4): + """ + Print a table showing the effect of each of the features in + the given feature set, and how they combine to determine the + probabilities of each label for that featureset. + """ + descr_width = 50 + TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f" + + pdist = self.prob_classify(featureset) + labels = sorted(pdist.samples(), key=pdist.prob, reverse=True) + labels = labels[:columns] + print( + " Feature".ljust(descr_width) + + "".join("%8s" % (("%s" % l)[:7]) for l in labels) + ) + print(" " + "-" * (descr_width - 2 + 8 * len(labels))) + sums = defaultdict(int) + for i, label in enumerate(labels): + feature_vector = self._encoding.encode(featureset, label) + feature_vector.sort( + key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True + ) + for (f_id, f_val) in feature_vector: + if self._logarithmic: + score = self._weights[f_id] * f_val + else: + score = self._weights[f_id] ** f_val + descr = self._encoding.describe(f_id) + descr = descr.split(" and label is ")[0] # hack + descr += " (%s)" % f_val # hack + if len(descr) > 47: + descr = descr[:44] + "..." 
+ print(TEMPLATE % (descr, i * 8 * " ", score)) + sums[label] += score + print(" " + "-" * (descr_width - 1 + 8 * len(labels))) + print( + " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels) + ) + print( + " PROBS:".ljust(descr_width) + + "".join("%8.3f" % pdist.prob(l) for l in labels) + ) + + def most_informative_features(self, n=10): + """ + Generates the ranked list of informative features from most to least. + """ + if hasattr(self, "_most_informative_features"): + return self._most_informative_features[:n] + else: + self._most_informative_features = sorted( + list(range(len(self._weights))), + key=lambda fid: abs(self._weights[fid]), + reverse=True, + ) + return self._most_informative_features[:n] + + def show_most_informative_features(self, n=10, show="all"): + """ + :param show: all, neg, or pos (for negative-only or positive-only) + :type show: str + :param n: The no. of top features + :type n: int + """ + # Use None the full list of ranked features. + fids = self.most_informative_features(None) + if show == "pos": + fids = [fid for fid in fids if self._weights[fid] > 0] + elif show == "neg": + fids = [fid for fid in fids if self._weights[fid] < 0] + for fid in fids[:n]: + print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}") + + def __repr__(self): + return "" % ( + len(self._encoding.labels()), + self._encoding.length(), + ) + + #: A list of the algorithm names that are accepted for the + #: ``train()`` method's ``algorithm`` parameter. + ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"] + + @classmethod + def train( + cls, + train_toks, + algorithm=None, + trace=3, + encoding=None, + labels=None, + gaussian_prior_sigma=0, + **cutoffs, + ): + """ + Train a new maxent classifier based on the given corpus of + training samples. This classifier will have its weights + chosen to maximize entropy while remaining empirically + consistent with the training corpus. 
+ + :rtype: MaxentClassifier + :return: The new maxent classifier + + :type train_toks: list + :param train_toks: Training data, represented as a list of + pairs, the first member of which is a featureset, + and the second of which is a classification label. + + :type algorithm: str + :param algorithm: A case-insensitive string, specifying which + algorithm should be used to train the classifier. The + following algorithms are currently available. + + - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``), + Improved Iterative Scaling (``'IIS'``) + - External Libraries (requiring megam): + LM-BFGS algorithm, with training performed by Megam (``'megam'``) + + The default algorithm is ``'IIS'``. + + :type trace: int + :param trace: The level of diagnostic tracing output to produce. + Higher values produce more verbose output. + :type encoding: MaxentFeatureEncodingI + :param encoding: A feature encoding, used to convert featuresets + into feature vectors. If none is specified, then a + ``BinaryMaxentFeatureEncoding`` will be built based on the + features that are attested in the training corpus. + :type labels: list(str) + :param labels: The set of possible labels. If none is given, then + the set of all labels attested in the training data will be + used instead. + :param gaussian_prior_sigma: The sigma value for a gaussian + prior on model weights. Currently, this is supported by + ``megam``. For other algorithms, its value is ignored. + :param cutoffs: Arguments specifying various conditions under + which the training should be halted. (Some of the cutoff + conditions are not supported by some algorithms.) + + - ``max_iter=v``: Terminate after ``v`` iterations. + - ``min_ll=v``: Terminate after the negative average + log-likelihood drops under ``v``. + - ``min_lldelta=v``: Terminate if a single iteration improves + log likelihood by less than ``v``. 
+ """ + if algorithm is None: + algorithm = "iis" + for key in cutoffs: + if key not in ( + "max_iter", + "min_ll", + "min_lldelta", + "max_acc", + "min_accdelta", + "count_cutoff", + "norm", + "explicit", + "bernoulli", + ): + raise TypeError("Unexpected keyword arg %r" % key) + algorithm = algorithm.lower() + if algorithm == "iis": + return train_maxent_classifier_with_iis( + train_toks, trace, encoding, labels, **cutoffs + ) + elif algorithm == "gis": + return train_maxent_classifier_with_gis( + train_toks, trace, encoding, labels, **cutoffs + ) + elif algorithm == "megam": + return train_maxent_classifier_with_megam( + train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs + ) + elif algorithm == "tadm": + kwargs = cutoffs + kwargs["trace"] = trace + kwargs["encoding"] = encoding + kwargs["labels"] = labels + kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma + return TadmMaxentClassifier.train(train_toks, **kwargs) + else: + raise ValueError("Unknown algorithm %s" % algorithm) + + +#: Alias for MaxentClassifier. +ConditionalExponentialClassifier = MaxentClassifier + + +###################################################################### +# { Feature Encodings +###################################################################### + + +class MaxentFeatureEncodingI: + """ + A mapping that converts a set of input-feature values to a vector + of joint-feature values, given a label. This conversion is + necessary to translate featuresets into a format that can be used + by maximum entropy models. + + The set of joint-features used by a given encoding is fixed, and + each index in the generated joint-feature vectors corresponds to a + single joint-feature. The length of the generated joint-feature + vectors is therefore constant (for a given encoding). 
+ + Because the joint-feature vectors generated by + ``MaxentFeatureEncodingI`` are typically very sparse, they are + represented as a list of ``(index, value)`` tuples, specifying the + value of each non-zero joint-feature. + + Feature encodings are generally created using the ``train()`` + method, which generates an appropriate encoding based on the + input-feature values and labels that are present in a given + corpus. + """ + + def encode(self, featureset, label): + """ + Given a (featureset, label) pair, return the corresponding + vector of joint-feature values. This vector is represented as + a list of ``(index, value)`` tuples, specifying the value of + each non-zero joint-feature. + + :type featureset: dict + :rtype: list(tuple(int, int)) + """ + raise NotImplementedError() + + def length(self): + """ + :return: The size of the fixed-length joint-feature vectors + that are generated by this encoding. + :rtype: int + """ + raise NotImplementedError() + + def labels(self): + """ + :return: A list of the \"known labels\" -- i.e., all labels + ``l`` such that ``self.encode(fs,l)`` can be a nonzero + joint-feature vector for some value of ``fs``. + :rtype: list + """ + raise NotImplementedError() + + def describe(self, fid): + """ + :return: A string describing the value of the joint-feature + whose index in the generated feature vectors is ``fid``. + :rtype: str + """ + raise NotImplementedError() + + def train(cls, train_toks): + """ + Construct and return new feature encoding, based on a given + training corpus ``train_toks``. + + :type train_toks: list(tuple(dict, str)) + :param train_toks: Training data, represented as a list of + pairs, the first member of which is a feature dictionary, + and the second of which is a classification label. 
+ """ + raise NotImplementedError() + + +class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI): + """ + A feature encoding that calls a user-supplied function to map a + given featureset/label pair to a sparse joint-feature vector. + """ + + def __init__(self, func, length, labels): + """ + Construct a new feature encoding based on the given function. + + :type func: (callable) + :param func: A function that takes two arguments, a featureset + and a label, and returns the sparse joint feature vector + that encodes them:: + + func(featureset, label) -> feature_vector + + This sparse joint feature vector (``feature_vector``) is a + list of ``(index,value)`` tuples. + + :type length: int + :param length: The size of the fixed-length joint-feature + vectors that are generated by this encoding. + + :type labels: list + :param labels: A list of the \"known labels\" for this + encoding -- i.e., all labels ``l`` such that + ``self.encode(fs,l)`` can be a nonzero joint-feature vector + for some value of ``fs``. + """ + self._length = length + self._func = func + self._labels = labels + + def encode(self, featureset, label): + return self._func(featureset, label) + + def length(self): + return self._length + + def labels(self): + return self._labels + + def describe(self, fid): + return "no description available" + + +class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI): + """ + A feature encoding that generates vectors containing a binary + joint-features of the form: + + | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) + | { + | { 0 otherwise + + Where ``fname`` is the name of an input-feature, ``fval`` is a value + for that input-feature, and ``label`` is a label. + + Typically, these features are constructed based on a training + corpus, using the ``train()`` method. This method will create one + feature for each combination of ``fname``, ``fval``, and ``label`` + that occurs at least once in the training corpus. 
+ + The ``unseen_features`` parameter can be used to add "unseen-value + features", which are used whenever an input feature has a value + that was not encountered in the training corpus. These features + have the form: + + | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) + | { and l == label + | { + | { 0 otherwise + + Where ``is_unseen(fname, fval)`` is true if the encoding does not + contain any joint features that are true when ``fs[fname]==fval``. + + The ``alwayson_features`` parameter can be used to add "always-on + features", which have the form:: + + | joint_feat(fs, l) = { 1 if (l == label) + | { + | { 0 otherwise + + These always-on features allow the maxent model to directly model + the prior probabilities of each label. + """ + + def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): + """ + :param labels: A list of the \"known labels\" for this encoding. + + :param mapping: A dictionary mapping from ``(fname,fval,label)`` + tuples to corresponding joint-feature indexes. These + indexes must be the set of integers from 0...len(mapping). + If ``mapping[fname,fval,label]=id``, then + ``self.encode(..., fname:fval, ..., label)[id]`` is 1; + otherwise, it is 0. + + :param unseen_features: If true, then include unseen value + features in the generated joint-feature vectors. + + :param alwayson_features: If true, then include always-on + features in the generated joint-feature vectors. 
+ """ + if set(mapping.values()) != set(range(len(mapping))): + raise ValueError( + "Mapping values must be exactly the " + "set of integers from 0...len(mapping)" + ) + + self._labels = list(labels) + """A list of attested labels.""" + + self._mapping = mapping + """dict mapping from (fname,fval,label) -> fid""" + + self._length = len(mapping) + """The length of generated joint feature vectors.""" + + self._alwayson = None + """dict mapping from label -> fid""" + + self._unseen = None + """dict mapping from fname -> fid""" + + if alwayson_features: + self._alwayson = { + label: i + self._length for (i, label) in enumerate(labels) + } + self._length += len(self._alwayson) + + if unseen_features: + fnames = {fname for (fname, fval, label) in mapping} + self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} + self._length += len(fnames) + + def encode(self, featureset, label): + # Inherit docs. + encoding = [] + + # Convert input-features to joint-features: + for fname, fval in featureset.items(): + # Known feature name & value: + if (fname, fval, label) in self._mapping: + encoding.append((self._mapping[fname, fval, label], 1)) + + # Otherwise, we might want to fire an "unseen-value feature". + elif self._unseen: + # Have we seen this fname/fval combination with any label? + for label2 in self._labels: + if (fname, fval, label2) in self._mapping: + break # we've seen this fname/fval combo + # We haven't -- fire the unseen-value feature + else: + if fname in self._unseen: + encoding.append((self._unseen[fname], 1)) + + # Add always-on features: + if self._alwayson and label in self._alwayson: + encoding.append((self._alwayson[label], 1)) + + return encoding + + def describe(self, f_id): + # Inherit docs. 
+ if not isinstance(f_id, int): + raise TypeError("describe() expected an int") + try: + self._inv_mapping + except AttributeError: + self._inv_mapping = [-1] * len(self._mapping) + for (info, i) in self._mapping.items(): + self._inv_mapping[i] = info + + if f_id < len(self._mapping): + (fname, fval, label) = self._inv_mapping[f_id] + return f"{fname}=={fval!r} and label is {label!r}" + elif self._alwayson and f_id in self._alwayson.values(): + for (label, f_id2) in self._alwayson.items(): + if f_id == f_id2: + return "label is %r" % label + elif self._unseen and f_id in self._unseen.values(): + for (fname, f_id2) in self._unseen.items(): + if f_id == f_id2: + return "%s is unseen" % fname + else: + raise ValueError("Bad feature id") + + def labels(self): + # Inherit docs. + return self._labels + + def length(self): + # Inherit docs. + return self._length + + @classmethod + def train(cls, train_toks, count_cutoff=0, labels=None, **options): + """ + Construct and return new feature encoding, based on a given + training corpus ``train_toks``. See the class description + ``BinaryMaxentFeatureEncoding`` for a description of the + joint-features that will be included in this encoding. + + :type train_toks: list(tuple(dict, str)) + :param train_toks: Training data, represented as a list of + pairs, the first member of which is a feature dictionary, + and the second of which is a classification label. + + :type count_cutoff: int + :param count_cutoff: A cutoff value that is used to discard + rare joint-features. If a joint-feature's value is 1 + fewer than ``count_cutoff`` times in the training corpus, + then that joint-feature is not included in the generated + encoding. + + :type labels: list + :param labels: A list of labels that should be used by the + classifier. If not specified, then the set of labels + attested in ``train_toks`` will be used. + + :param options: Extra parameters for the constructor, such as + ``unseen_features`` and ``alwayson_features``. 
+ """ + mapping = {} # maps (fname, fval, label) -> fid + seen_labels = set() # The set of labels we've encountered + count = defaultdict(int) # maps (fname, fval) -> count + + for (tok, label) in train_toks: + if labels and label not in labels: + raise ValueError("Unexpected label %s" % label) + seen_labels.add(label) + + # Record each of the features. + for (fname, fval) in tok.items(): + + # If a count cutoff is given, then only add a joint + # feature once the corresponding (fname, fval, label) + # tuple exceeds that cutoff. + count[fname, fval] += 1 + if count[fname, fval] >= count_cutoff: + if (fname, fval, label) not in mapping: + mapping[fname, fval, label] = len(mapping) + + if labels is None: + labels = seen_labels + return cls(labels, mapping, **options) + + +class GISEncoding(BinaryMaxentFeatureEncoding): + """ + A binary feature encoding which adds one new joint-feature to the + joint-features defined by ``BinaryMaxentFeatureEncoding``: a + correction feature, whose value is chosen to ensure that the + sparse vector always sums to a constant non-negative number. This + new feature is used to ensure two preconditions for the GIS + training algorithm: + + - At least one feature vector index must be nonzero for every + token. + - The feature vector must sum to a constant non-negative number + for every token. + """ + + def __init__( + self, labels, mapping, unseen_features=False, alwayson_features=False, C=None + ): + """ + :param C: The correction constant. The value of the correction + feature is based on this value. In particular, its value is + ``C - sum([v for (f,v) in encoding])``. 
+ :seealso: ``BinaryMaxentFeatureEncoding.__init__`` + """ + BinaryMaxentFeatureEncoding.__init__( + self, labels, mapping, unseen_features, alwayson_features + ) + if C is None: + C = len({fname for (fname, fval, label) in mapping}) + 1 + self._C = C + + @property + def C(self): + """The non-negative constant that all encoded feature vectors + will sum to.""" + return self._C + + def encode(self, featureset, label): + # Get the basic encoding. + encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) + base_length = BinaryMaxentFeatureEncoding.length(self) + + # Add a correction feature. + total = sum(v for (f, v) in encoding) + if total >= self._C: + raise ValueError("Correction feature is not high enough!") + encoding.append((base_length, self._C - total)) + + # Return the result + return encoding + + def length(self): + return BinaryMaxentFeatureEncoding.length(self) + 1 + + def describe(self, f_id): + if f_id == BinaryMaxentFeatureEncoding.length(self): + return "Correction feature (%s)" % self._C + else: + return BinaryMaxentFeatureEncoding.describe(self, f_id) + + +class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding): + def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): + self._mapping = OrderedDict(mapping) + self._label_mapping = OrderedDict() + BinaryMaxentFeatureEncoding.__init__( + self, labels, self._mapping, unseen_features, alwayson_features + ) + + def encode(self, featureset, label): + encoding = [] + for feature, value in featureset.items(): + if (feature, label) not in self._mapping: + self._mapping[(feature, label)] = len(self._mapping) + if value not in self._label_mapping: + if not isinstance(value, int): + self._label_mapping[value] = len(self._label_mapping) + else: + self._label_mapping[value] = value + encoding.append( + (self._mapping[(feature, label)], self._label_mapping[value]) + ) + return encoding + + def labels(self): + return self._labels + + def describe(self, fid): + 
for (feature, label) in self._mapping: + if self._mapping[(feature, label)] == fid: + return (feature, label) + + def length(self): + return len(self._mapping) + + @classmethod + def train(cls, train_toks, count_cutoff=0, labels=None, **options): + mapping = OrderedDict() + if not labels: + labels = [] + + # This gets read twice, so compute the values in case it's lazy. + train_toks = list(train_toks) + + for (featureset, label) in train_toks: + if label not in labels: + labels.append(label) + + for (featureset, label) in train_toks: + for label in labels: + for feature in featureset: + if (feature, label) not in mapping: + mapping[(feature, label)] = len(mapping) + + return cls(labels, mapping, **options) + + +class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI): + """ + A feature encoding that generates vectors containing integer, + float and binary joint-features of the form: + + Binary (for string and boolean features): + + | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) + | { + | { 0 otherwise + + Value (for integer and float features): + + | joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) + | { and (l == label) + | { + | { not encoded otherwise + + Where ``fname`` is the name of an input-feature, ``fval`` is a value + for that input-feature, and ``label`` is a label. + + Typically, these features are constructed based on a training + corpus, using the ``train()`` method. + + For string and boolean features [type(fval) not in (int, float)] + this method will create one feature for each combination of + ``fname``, ``fval``, and ``label`` that occurs at least once in the + training corpus. + + For integer and float features [type(fval) in (int, float)] this + method will create one feature for each combination of ``fname`` + and ``label`` that occurs at least once in the training corpus. 
+ + For binary features the ``unseen_features`` parameter can be used + to add "unseen-value features", which are used whenever an input + feature has a value that was not encountered in the training + corpus. These features have the form: + + | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) + | { and l == label + | { + | { 0 otherwise + + Where ``is_unseen(fname, fval)`` is true if the encoding does not + contain any joint features that are true when ``fs[fname]==fval``. + + The ``alwayson_features`` parameter can be used to add "always-on + features", which have the form: + + | joint_feat(fs, l) = { 1 if (l == label) + | { + | { 0 otherwise + + These always-on features allow the maxent model to directly model + the prior probabilities of each label. + """ + + def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): + """ + :param labels: A list of the \"known labels\" for this encoding. + + :param mapping: A dictionary mapping from ``(fname,fval,label)`` + tuples to corresponding joint-feature indexes. These + indexes must be the set of integers from 0...len(mapping). + If ``mapping[fname,fval,label]=id``, then + ``self.encode({..., fname:fval, ...``, label)[id]} is 1; + otherwise, it is 0. + + :param unseen_features: If true, then include unseen value + features in the generated joint-feature vectors. + + :param alwayson_features: If true, then include always-on + features in the generated joint-feature vectors. 
+ """ + if set(mapping.values()) != set(range(len(mapping))): + raise ValueError( + "Mapping values must be exactly the " + "set of integers from 0...len(mapping)" + ) + + self._labels = list(labels) + """A list of attested labels.""" + + self._mapping = mapping + """dict mapping from (fname,fval,label) -> fid""" + + self._length = len(mapping) + """The length of generated joint feature vectors.""" + + self._alwayson = None + """dict mapping from label -> fid""" + + self._unseen = None + """dict mapping from fname -> fid""" + + if alwayson_features: + self._alwayson = { + label: i + self._length for (i, label) in enumerate(labels) + } + self._length += len(self._alwayson) + + if unseen_features: + fnames = {fname for (fname, fval, label) in mapping} + self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} + self._length += len(fnames) + + def encode(self, featureset, label): + # Inherit docs. + encoding = [] + + # Convert input-features to joint-features: + for fname, fval in featureset.items(): + if isinstance(fval, (int, float)): + # Known feature name & value: + if (fname, type(fval), label) in self._mapping: + encoding.append((self._mapping[fname, type(fval), label], fval)) + else: + # Known feature name & value: + if (fname, fval, label) in self._mapping: + encoding.append((self._mapping[fname, fval, label], 1)) + + # Otherwise, we might want to fire an "unseen-value feature". + elif self._unseen: + # Have we seen this fname/fval combination with any label? + for label2 in self._labels: + if (fname, fval, label2) in self._mapping: + break # we've seen this fname/fval combo + # We haven't -- fire the unseen-value feature + else: + if fname in self._unseen: + encoding.append((self._unseen[fname], 1)) + + # Add always-on features: + if self._alwayson and label in self._alwayson: + encoding.append((self._alwayson[label], 1)) + + return encoding + + def describe(self, f_id): + # Inherit docs. 
+ if not isinstance(f_id, int): + raise TypeError("describe() expected an int") + try: + self._inv_mapping + except AttributeError: + self._inv_mapping = [-1] * len(self._mapping) + for (info, i) in self._mapping.items(): + self._inv_mapping[i] = info + + if f_id < len(self._mapping): + (fname, fval, label) = self._inv_mapping[f_id] + return f"{fname}=={fval!r} and label is {label!r}" + elif self._alwayson and f_id in self._alwayson.values(): + for (label, f_id2) in self._alwayson.items(): + if f_id == f_id2: + return "label is %r" % label + elif self._unseen and f_id in self._unseen.values(): + for (fname, f_id2) in self._unseen.items(): + if f_id == f_id2: + return "%s is unseen" % fname + else: + raise ValueError("Bad feature id") + + def labels(self): + # Inherit docs. + return self._labels + + def length(self): + # Inherit docs. + return self._length + + @classmethod + def train(cls, train_toks, count_cutoff=0, labels=None, **options): + """ + Construct and return new feature encoding, based on a given + training corpus ``train_toks``. See the class description + ``TypedMaxentFeatureEncoding`` for a description of the + joint-features that will be included in this encoding. + + Note: recognized feature values types are (int, float), over + types are interpreted as regular binary features. + + :type train_toks: list(tuple(dict, str)) + :param train_toks: Training data, represented as a list of + pairs, the first member of which is a feature dictionary, + and the second of which is a classification label. + + :type count_cutoff: int + :param count_cutoff: A cutoff value that is used to discard + rare joint-features. If a joint-feature's value is 1 + fewer than ``count_cutoff`` times in the training corpus, + then that joint-feature is not included in the generated + encoding. + + :type labels: list + :param labels: A list of labels that should be used by the + classifier. If not specified, then the set of labels + attested in ``train_toks`` will be used. 
+ + :param options: Extra parameters for the constructor, such as + ``unseen_features`` and ``alwayson_features``. + """ + mapping = {} # maps (fname, fval, label) -> fid + seen_labels = set() # The set of labels we've encountered + count = defaultdict(int) # maps (fname, fval) -> count + + for (tok, label) in train_toks: + if labels and label not in labels: + raise ValueError("Unexpected label %s" % label) + seen_labels.add(label) + + # Record each of the features. + for (fname, fval) in tok.items(): + if type(fval) in (int, float): + fval = type(fval) + # If a count cutoff is given, then only add a joint + # feature once the corresponding (fname, fval, label) + # tuple exceeds that cutoff. + count[fname, fval] += 1 + if count[fname, fval] >= count_cutoff: + if (fname, fval, label) not in mapping: + mapping[fname, fval, label] = len(mapping) + + if labels is None: + labels = seen_labels + return cls(labels, mapping, **options) + + +###################################################################### +# { Classifier Trainer: Generalized Iterative Scaling +###################################################################### + + +def train_maxent_classifier_with_gis( + train_toks, trace=3, encoding=None, labels=None, **cutoffs +): + """ + Train a new ``ConditionalExponentialClassifier``, using the given + training samples, using the Generalized Iterative Scaling + algorithm. This ``ConditionalExponentialClassifier`` will encode + the model that maximizes entropy from all the models that are + empirically consistent with ``train_toks``. + + :see: ``train_maxent_classifier()`` for parameter descriptions. + """ + cutoffs.setdefault("max_iter", 100) + cutoffchecker = CutoffChecker(cutoffs) + + # Construct an encoding from the training data. + if encoding is None: + encoding = GISEncoding.train(train_toks, labels=labels) + + if not hasattr(encoding, "C"): + raise TypeError( + "The GIS algorithm requires an encoding that " + "defines C (e.g., GISEncoding)." 
+ ) + + # Cinv is the inverse of the sum of each joint feature vector. + # This controls the learning rate: higher Cinv (or lower C) gives + # faster learning. + Cinv = 1.0 / encoding.C + + # Count how many times each feature occurs in the training data. + empirical_fcount = calculate_empirical_fcount(train_toks, encoding) + + # Check for any features that are not attested in train_toks. + unattested = set(numpy.nonzero(empirical_fcount == 0)[0]) + + # Build the classifier. Start with weight=0 for each attested + # feature, and weight=-infinity for each unattested feature. + weights = numpy.zeros(len(empirical_fcount), "d") + for fid in unattested: + weights[fid] = numpy.NINF + classifier = ConditionalExponentialClassifier(encoding, weights) + + # Take the log of the empirical fcount. + log_empirical_fcount = numpy.log2(empirical_fcount) + del empirical_fcount + + if trace > 0: + print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) + if trace > 2: + print() + print(" Iteration Log Likelihood Accuracy") + print(" ---------------------------------------") + + # Train the classifier. + try: + while True: + if trace > 2: + ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) + acc = cutoffchecker.acc or accuracy(classifier, train_toks) + iternum = cutoffchecker.iter + print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) + + # Use the model to estimate the number of times each + # feature should occur in the training data. + estimated_fcount = calculate_estimated_fcount( + classifier, train_toks, encoding + ) + + # Take the log of estimated fcount (avoid taking log(0).) + for fid in unattested: + estimated_fcount[fid] += 1 + log_estimated_fcount = numpy.log2(estimated_fcount) + del estimated_fcount + + # Update the classifier weights + weights = classifier.weights() + weights += (log_empirical_fcount - log_estimated_fcount) * Cinv + classifier.set_weights(weights) + + # Check the log-likelihood & accuracy cutoffs. 
+ if cutoffchecker.check(classifier, train_toks): + break + + except KeyboardInterrupt: + print(" Training stopped: keyboard interrupt") + except: + raise + + if trace > 2: + ll = log_likelihood(classifier, train_toks) + acc = accuracy(classifier, train_toks) + print(f" Final {ll:14.5f} {acc:9.3f}") + + # Return the classifier. + return classifier + + +def calculate_empirical_fcount(train_toks, encoding): + fcount = numpy.zeros(encoding.length(), "d") + + for tok, label in train_toks: + for (index, val) in encoding.encode(tok, label): + fcount[index] += val + + return fcount + + +def calculate_estimated_fcount(classifier, train_toks, encoding): + fcount = numpy.zeros(encoding.length(), "d") + + for tok, label in train_toks: + pdist = classifier.prob_classify(tok) + for label in pdist.samples(): + prob = pdist.prob(label) + for (fid, fval) in encoding.encode(tok, label): + fcount[fid] += prob * fval + + return fcount + + +###################################################################### +# { Classifier Trainer: Improved Iterative Scaling +###################################################################### + + +def train_maxent_classifier_with_iis( + train_toks, trace=3, encoding=None, labels=None, **cutoffs +): + """ + Train a new ``ConditionalExponentialClassifier``, using the given + training samples, using the Improved Iterative Scaling algorithm. + This ``ConditionalExponentialClassifier`` will encode the model + that maximizes entropy from all the models that are empirically + consistent with ``train_toks``. + + :see: ``train_maxent_classifier()`` for parameter descriptions. + """ + cutoffs.setdefault("max_iter", 100) + cutoffchecker = CutoffChecker(cutoffs) + + # Construct an encoding from the training data. + if encoding is None: + encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) + + # Count how many times each feature occurs in the training data. 
+ empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks) + + # Find the nf map, and related variables nfarray and nfident. + # nf is the sum of the features for a given labeled text. + # nfmap compresses this sparse set of values to a dense list. + # nfarray performs the reverse operation. nfident is + # nfarray multiplied by an identity matrix. + nfmap = calculate_nfmap(train_toks, encoding) + nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d") + nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) + + # Check for any features that are not attested in train_toks. + unattested = set(numpy.nonzero(empirical_ffreq == 0)[0]) + + # Build the classifier. Start with weight=0 for each attested + # feature, and weight=-infinity for each unattested feature. + weights = numpy.zeros(len(empirical_ffreq), "d") + for fid in unattested: + weights[fid] = numpy.NINF + classifier = ConditionalExponentialClassifier(encoding, weights) + + if trace > 0: + print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) + if trace > 2: + print() + print(" Iteration Log Likelihood Accuracy") + print(" ---------------------------------------") + + # Train the classifier. + try: + while True: + if trace > 2: + ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) + acc = cutoffchecker.acc or accuracy(classifier, train_toks) + iternum = cutoffchecker.iter + print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) + + # Calculate the deltas for this iteration, using Newton's method. + deltas = calculate_deltas( + train_toks, + classifier, + unattested, + empirical_ffreq, + nfmap, + nfarray, + nftranspose, + encoding, + ) + + # Use the deltas to update our weights. + weights = classifier.weights() + weights += deltas + classifier.set_weights(weights) + + # Check the log-likelihood & accuracy cutoffs. 
+ if cutoffchecker.check(classifier, train_toks): + break + + except KeyboardInterrupt: + print(" Training stopped: keyboard interrupt") + except: + raise + + if trace > 2: + ll = log_likelihood(classifier, train_toks) + acc = accuracy(classifier, train_toks) + print(f" Final {ll:14.5f} {acc:9.3f}") + + # Return the classifier. + return classifier + + +def calculate_nfmap(train_toks, encoding): + """ + Construct a map that can be used to compress ``nf`` (which is + typically sparse). + + *nf(feature_vector)* is the sum of the feature values for + *feature_vector*. + + This represents the number of features that are active for a + given labeled text. This method finds all values of *nf(t)* + that are attested for at least one token in the given list of + training tokens; and constructs a dictionary mapping these + attested values to a continuous range *0...N*. For example, + if the only values of *nf()* that were attested were 3, 5, and + 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``. + + :return: A map that can be used to compress ``nf`` to a dense + vector. + :rtype: dict(int -> int) + """ + # Map from nf to indices. This allows us to use smaller arrays. + nfset = set() + for tok, _ in train_toks: + for label in encoding.labels(): + nfset.add(sum(val for (id, val) in encoding.encode(tok, label))) + return {nf: i for (i, nf) in enumerate(nfset)} + + +def calculate_deltas( + train_toks, + classifier, + unattested, + ffreq_empirical, + nfmap, + nfarray, + nftranspose, + encoding, +): + r""" + Calculate the update values for the classifier weights for + this iteration of IIS. 
These update weights are the value of + ``delta`` that solves the equation:: + + ffreq_empirical[i] + = + SUM[fs,l] (classifier.prob_classify(fs).prob(l) * + feature_vector(fs,l)[i] * + exp(delta[i] * nf(feature_vector(fs,l)))) + + Where: + - *(fs,l)* is a (featureset, label) tuple from ``train_toks`` + - *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` + - *nf(vector)* = ``sum([val for (id,val) in vector])`` + + This method uses Newton's method to solve this equation for + *delta[i]*. In particular, it starts with a guess of + ``delta[i]`` = 1; and iteratively updates ``delta`` with: + + | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) + + until convergence, where *sum1* and *sum2* are defined as: + + | sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) + | sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l))) + | f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) . + | feature_vector(fs,l)[i] . + | exp(delta[i] . nf(feature_vector(fs,l)))) + + Note that *sum1* and *sum2* depend on ``delta``; so they need + to be re-computed each iteration. + + The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are + used to generate a dense encoding for *nf(ltext)*. This + allows ``_deltas`` to calculate *sum1* and *sum2* using + matrices, which yields a significant performance improvement. + + :param train_toks: The set of training tokens. + :type train_toks: list(tuple(dict, str)) + :param classifier: The current classifier. + :type classifier: ClassifierI + :param ffreq_empirical: An array containing the empirical + frequency for each feature. The *i*\ th element of this + array is the empirical frequency for feature *i*. + :type ffreq_empirical: sequence of float + :param unattested: An array that is 1 for features that are + not attested in the training data; and 0 for features that + are attested. In other words, ``unattested[i]==0`` iff + ``ffreq_empirical[i]==0``. 
+ :type unattested: sequence of int + :param nfmap: A map that can be used to compress ``nf`` to a dense + vector. + :type nfmap: dict(int -> int) + :param nfarray: An array that can be used to uncompress ``nf`` + from a dense vector. + :type nfarray: array(float) + :param nftranspose: The transpose of ``nfarray`` + :type nftranspose: array(float) + """ + # These parameters control when we decide that we've + # converged. It probably should be possible to set these + # manually, via keyword arguments to train. + NEWTON_CONVERGE = 1e-12 + MAX_NEWTON = 300 + + deltas = numpy.ones(encoding.length(), "d") + + # Precompute the A matrix: + # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) + # over all label,fs s.t. num_features[label,fs]=nf + A = numpy.zeros((len(nfmap), encoding.length()), "d") + + for tok, label in train_toks: + dist = classifier.prob_classify(tok) + + for label in encoding.labels(): + # Generate the feature vector + feature_vector = encoding.encode(tok, label) + # Find the number of active features + nf = sum(val for (id, val) in feature_vector) + # Update the A matrix + for (id, val) in feature_vector: + A[nfmap[nf], id] += dist.prob(label) * val + A /= len(train_toks) + + # Iteratively solve for delta. Use the following variables: + # - nf_delta[x][y] = nfarray[x] * delta[y] + # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) + # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) + # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) + # exp(delta[i]nf) + # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) + # nf exp(delta[i]nf) + for rangenum in range(MAX_NEWTON): + nf_delta = numpy.outer(nfarray, deltas) + exp_nf_delta = 2**nf_delta + nf_exp_nf_delta = nftranspose * exp_nf_delta + sum1 = numpy.sum(exp_nf_delta * A, axis=0) + sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) + + # Avoid division by zero. + for fid in unattested: + sum2[fid] += 1 + + # Update the deltas. 
+ deltas -= (ffreq_empirical - sum1) / -sum2 + + # We can stop once we converge. + n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas)) + if n_error < NEWTON_CONVERGE: + return deltas + + return deltas + + +###################################################################### +# { Classifier Trainer: megam +###################################################################### + +# [xx] possible extension: add support for using implicit file format; +# this would need to put requirements on what encoding is used. But +# we may need this for other maxent classifier trainers that require +# implicit formats anyway. +def train_maxent_classifier_with_megam( + train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs +): + """ + Train a new ``ConditionalExponentialClassifier``, using the given + training samples, using the external ``megam`` library. This + ``ConditionalExponentialClassifier`` will encode the model that + maximizes entropy from all the models that are empirically + consistent with ``train_toks``. + + :see: ``train_maxent_classifier()`` for parameter descriptions. + :see: ``nltk.classify.megam`` + """ + + explicit = True + bernoulli = True + if "explicit" in kwargs: + explicit = kwargs["explicit"] + if "bernoulli" in kwargs: + bernoulli = kwargs["bernoulli"] + + # Construct an encoding from the training data. + if encoding is None: + # Count cutoff can also be controlled by megam with the -minfc + # option. Not sure where the best place for it is. + count_cutoff = kwargs.get("count_cutoff", 0) + encoding = BinaryMaxentFeatureEncoding.train( + train_toks, count_cutoff, labels=labels, alwayson_features=True + ) + elif labels is not None: + raise ValueError("Specify encoding or labels, not both") + + # Write a training file for megam. 
+ try: + fd, trainfile_name = tempfile.mkstemp(prefix="nltk-") + with open(trainfile_name, "w") as trainfile: + write_megam_file( + train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli + ) + os.close(fd) + except (OSError, ValueError) as e: + raise ValueError("Error while creating megam training file: %s" % e) from e + + # Run megam on the training file. + options = [] + options += ["-nobias", "-repeat", "10"] + if explicit: + options += ["-explicit"] + if not bernoulli: + options += ["-fvals"] + if gaussian_prior_sigma: + # Lambda is just the precision of the Gaussian prior, i.e. it's the + # inverse variance, so the parameter conversion is 1.0/sigma**2. + # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf + inv_variance = 1.0 / gaussian_prior_sigma**2 + else: + inv_variance = 0 + options += ["-lambda", "%.2f" % inv_variance, "-tune"] + if trace < 3: + options += ["-quiet"] + if "max_iter" in kwargs: + options += ["-maxi", "%s" % kwargs["max_iter"]] + if "ll_delta" in kwargs: + # [xx] this is actually a perplexity delta, not a log + # likelihood delta + options += ["-dpp", "%s" % abs(kwargs["ll_delta"])] + if hasattr(encoding, "cost"): + options += ["-multilabel"] # each possible la + options += ["multiclass", trainfile_name] + stdout = call_megam(options) + # print('./megam_i686.opt ', ' '.join(options)) + # Delete the training file + try: + os.remove(trainfile_name) + except OSError as e: + print(f"Warning: unable to delete {trainfile_name}: {e}") + + # Parse the generated weight vector. + weights = parse_megam_weights(stdout, encoding.length(), explicit) + + # Convert from base-e to base-2 weights. 
+ weights *= numpy.log2(numpy.e) + + # Build the classifier + return MaxentClassifier(encoding, weights) + + +###################################################################### +# { Classifier Trainer: tadm +###################################################################### + + +class TadmMaxentClassifier(MaxentClassifier): + @classmethod + def train(cls, train_toks, **kwargs): + algorithm = kwargs.get("algorithm", "tao_lmvm") + trace = kwargs.get("trace", 3) + encoding = kwargs.get("encoding", None) + labels = kwargs.get("labels", None) + sigma = kwargs.get("gaussian_prior_sigma", 0) + count_cutoff = kwargs.get("count_cutoff", 0) + max_iter = kwargs.get("max_iter") + ll_delta = kwargs.get("min_lldelta") + + # Construct an encoding from the training data. + if not encoding: + encoding = TadmEventMaxentFeatureEncoding.train( + train_toks, count_cutoff, labels=labels + ) + + trainfile_fd, trainfile_name = tempfile.mkstemp( + prefix="nltk-tadm-events-", suffix=".gz" + ) + weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-") + + trainfile = gzip_open_unicode(trainfile_name, "w") + write_tadm_file(train_toks, encoding, trainfile) + trainfile.close() + + options = [] + options.extend(["-monitor"]) + options.extend(["-method", algorithm]) + if sigma: + options.extend(["-l2", "%.6f" % sigma**2]) + if max_iter: + options.extend(["-max_it", "%d" % max_iter]) + if ll_delta: + options.extend(["-fatol", "%.6f" % abs(ll_delta)]) + options.extend(["-events_in", trainfile_name]) + options.extend(["-params_out", weightfile_name]) + if trace < 3: + options.extend(["2>&1"]) + else: + options.extend(["-summary"]) + + call_tadm(options) + + with open(weightfile_name) as weightfile: + weights = parse_tadm_weights(weightfile) + + os.remove(trainfile_name) + os.remove(weightfile_name) + + # Convert from base-e to base-2 weights. 
+ weights *= numpy.log2(numpy.e) + + # Build the classifier + return cls(encoding, weights) + + +###################################################################### +# { Demo +###################################################################### +def demo(): + from nltk.classify.util import names_demo + + classifier = names_demo(MaxentClassifier.train) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc42875e3151a04aa458e04f3abb44c4583bda9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py @@ -0,0 +1,184 @@ +# Natural Language Toolkit: Interface to Megam Classifier +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +A set of functions used to interface with the external megam_ maxent +optimization package. Before megam can be used, you should tell NLTK where it +can find the megam binary, using the ``config_megam()`` function. Typical +usage: + + >>> from nltk.classify import megam + >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP + [Found megam: ...] + +Use with MaxentClassifier. Example below, see MaxentClassifier documentation +for details. + + nltk.classify.MaxentClassifier.train(corpus, 'megam') + +.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html +""" +import subprocess + +from nltk.internals import find_binary + +try: + import numpy +except ImportError: + numpy = None + +###################################################################### +# { Configuration +###################################################################### + +_megam_bin = None + + +def config_megam(bin=None): + """ + Configure NLTK's interface to the ``megam`` maxent optimization + package. + + :param bin: The full path to the ``megam`` binary. 
If not specified, + then nltk will search the system for a ``megam`` binary; and if + one is not found, it will raise a ``LookupError`` exception. + :type bin: str + """ + global _megam_bin + _megam_bin = find_binary( + "megam", + bin, + env_vars=["MEGAM"], + binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"], + url="https://www.umiacs.umd.edu/~hal/megam/index.html", + ) + + +###################################################################### +# { Megam Interface Functions +###################################################################### + + +def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True): + """ + Generate an input file for ``megam`` based on the given corpus of + classified tokens. + + :type train_toks: list(tuple(dict, str)) + :param train_toks: Training data, represented as a list of + pairs, the first member of which is a feature dictionary, + and the second of which is a classification label. + + :type encoding: MaxentFeatureEncodingI + :param encoding: A feature encoding, used to convert featuresets + into feature vectors. May optionally implement a cost() method + in order to assign different costs to different class predictions. + + :type stream: stream + :param stream: The stream to which the megam input file should be + written. + + :param bernoulli: If true, then use the 'bernoulli' format. I.e., + all joint features have binary values, and are listed iff they + are true. Otherwise, list feature values explicitly. If + ``bernoulli=False``, then you must call ``megam`` with the + ``-fvals`` option. + + :param explicit: If true, then use the 'explicit' format. I.e., + list the features that would fire for any of the possible + labels, for each token. If ``explicit=True``, then you must + call ``megam`` with the ``-explicit`` option. + """ + # Look up the set of labels. 
+ labels = encoding.labels() + labelnum = {label: i for (i, label) in enumerate(labels)} + + # Write the file, which contains one line per instance. + for featureset, label in train_toks: + # First, the instance number (or, in the weighted multiclass case, the cost of each label). + if hasattr(encoding, "cost"): + stream.write( + ":".join(str(encoding.cost(featureset, label, l)) for l in labels) + ) + else: + stream.write("%d" % labelnum[label]) + + # For implicit file formats, just list the features that fire + # for this instance's actual label. + if not explicit: + _write_megam_features(encoding.encode(featureset, label), stream, bernoulli) + + # For explicit formats, list the features that would fire for + # any of the possible labels. + else: + for l in labels: + stream.write(" #") + _write_megam_features(encoding.encode(featureset, l), stream, bernoulli) + + # End of the instance. + stream.write("\n") + + +def parse_megam_weights(s, features_count, explicit=True): + """ + Given the stdout output generated by ``megam`` when training a + model, return a ``numpy`` array containing the corresponding weight + vector. This function does not currently handle bias features. + """ + if numpy is None: + raise ValueError("This function requires that numpy be installed") + assert explicit, "non-explicit not supported yet" + lines = s.strip().split("\n") + weights = numpy.zeros(features_count, "d") + for line in lines: + if line.strip(): + fid, weight = line.split() + weights[int(fid)] = float(weight) + return weights + + +def _write_megam_features(vector, stream, bernoulli): + if not vector: + raise ValueError( + "MEGAM classifier requires the use of an " "always-on feature." + ) + for (fid, fval) in vector: + if bernoulli: + if fval == 1: + stream.write(" %s" % fid) + elif fval != 0: + raise ValueError( + "If bernoulli=True, then all" "features must be binary." 
+ ) + else: + stream.write(f" {fid} {fval}") + + +def call_megam(args): + """ + Call the ``megam`` binary with the given arguments. + """ + if isinstance(args, str): + raise TypeError("args should be a list of strings") + if _megam_bin is None: + config_megam() + + # Call megam via a subprocess + cmd = [_megam_bin] + args + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + (stdout, stderr) = p.communicate() + + # Check the return code. + if p.returncode != 0: + print() + print(stderr) + raise OSError("megam command failed!") + + if isinstance(stdout, str): + return stdout + else: + return stdout.decode("utf-8") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e9c232e27bcdcd38d7766c3bb841e2c8acab58 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py @@ -0,0 +1,260 @@ +# Natural Language Toolkit: Naive Bayes Classifiers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +A classifier based on the Naive Bayes algorithm. In order to find the +probability for a label, this algorithm first uses the Bayes rule to +express P(label|features) in terms of P(label) and P(features|label): + +| P(label) * P(features|label) +| P(label|features) = ------------------------------ +| P(features) + +The algorithm then makes the 'naive' assumption that all features are +independent, given the label: + +| P(label) * P(f1|label) * ... * P(fn|label) +| P(label|features) = -------------------------------------------- +| P(features) + +Rather than computing P(features) explicitly, the algorithm just +calculates the numerator for each label, and normalizes them so they +sum to one: + +| P(label) * P(f1|label) * ... * P(fn|label) +| P(label|features) = -------------------------------------------- +| SUM[l]( P(l) * P(f1|l) * ... 
* P(fn|l) ) +""" + +from collections import defaultdict + +from nltk.classify.api import ClassifierI +from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs + +##////////////////////////////////////////////////////// +## Naive Bayes Classifier +##////////////////////////////////////////////////////// + + +class NaiveBayesClassifier(ClassifierI): + """ + A Naive Bayes classifier. Naive Bayes classifiers are + paramaterized by two probability distributions: + + - P(label) gives the probability that an input will receive each + label, given no information about the input's features. + + - P(fname=fval|label) gives the probability that a given feature + (fname) will receive a given value (fval), given that the + label (label). + + If the classifier encounters an input with a feature that has + never been seen with any label, then rather than assigning a + probability of 0 to all labels, it will ignore that feature. + + The feature value 'None' is reserved for unseen feature values; + you generally should not use 'None' as a feature value for one of + your own features. + """ + + def __init__(self, label_probdist, feature_probdist): + """ + :param label_probdist: P(label), the probability distribution + over labels. It is expressed as a ``ProbDistI`` whose + samples are labels. I.e., P(label) = + ``label_probdist.prob(label)``. + + :param feature_probdist: P(fname=fval|label), the probability + distribution for feature values, given labels. It is + expressed as a dictionary whose keys are ``(label, fname)`` + pairs and whose values are ``ProbDistI`` objects over feature + values. I.e., P(fname=fval|label) = + ``feature_probdist[label,fname].prob(fval)``. If a given + ``(label,fname)`` is not a key in ``feature_probdist``, then + it is assumed that the corresponding P(fname=fval|label) + is 0 for all values of ``fval``. 
+ """ + self._label_probdist = label_probdist + self._feature_probdist = feature_probdist + self._labels = list(label_probdist.samples()) + + def labels(self): + return self._labels + + def classify(self, featureset): + return self.prob_classify(featureset).max() + + def prob_classify(self, featureset): + # Discard any feature names that we've never seen before. + # Otherwise, we'll just assign a probability of 0 to + # everything. + featureset = featureset.copy() + for fname in list(featureset.keys()): + for label in self._labels: + if (label, fname) in self._feature_probdist: + break + else: + # print('Ignoring unseen feature %s' % fname) + del featureset[fname] + + # Find the log probability of each label, given the features. + # Start with the log probability of the label itself. + logprob = {} + for label in self._labels: + logprob[label] = self._label_probdist.logprob(label) + + # Then add in the log probability of features given labels. + for label in self._labels: + for (fname, fval) in featureset.items(): + if (label, fname) in self._feature_probdist: + feature_probs = self._feature_probdist[label, fname] + logprob[label] += feature_probs.logprob(fval) + else: + # nb: This case will never come up if the + # classifier was created by + # NaiveBayesClassifier.train(). + logprob[label] += sum_logs([]) # = -INF. + + return DictionaryProbDist(logprob, normalize=True, log=True) + + def show_most_informative_features(self, n=10): + # Determine the most relevant features, and display them. 
+ cpdist = self._feature_probdist + print("Most Informative Features") + + for (fname, fval) in self.most_informative_features(n): + + def labelprob(l): + return cpdist[l, fname].prob(fval) + + labels = sorted( + (l for l in self._labels if fval in cpdist[l, fname].samples()), + key=lambda element: (-labelprob(element), element), + reverse=True, + ) + if len(labels) == 1: + continue + l0 = labels[0] + l1 = labels[-1] + if cpdist[l0, fname].prob(fval) == 0: + ratio = "INF" + else: + ratio = "%8.1f" % ( + cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval) + ) + print( + "%24s = %-14r %6s : %-6s = %s : 1.0" + % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio) + ) + + def most_informative_features(self, n=100): + """ + Return a list of the 'most informative' features used by this + classifier. For the purpose of this function, the + informativeness of a feature ``(fname,fval)`` is equal to the + highest value of P(fname=fval|label), for any label, divided by + the lowest value of P(fname=fval|label), for any label: + + | max[ P(fname=fval|label1) / P(fname=fval|label2) ] + """ + if hasattr(self, "_most_informative_features"): + return self._most_informative_features[:n] + else: + # The set of (fname, fval) pairs used by this classifier. + features = set() + # The max & min probability associated w/ each (fname, fval) + # pair. Maps (fname,fval) -> float. + maxprob = defaultdict(lambda: 0.0) + minprob = defaultdict(lambda: 1.0) + + for (label, fname), probdist in self._feature_probdist.items(): + for fval in probdist.samples(): + feature = (fname, fval) + features.add(feature) + p = probdist.prob(fval) + maxprob[feature] = max(p, maxprob[feature]) + minprob[feature] = min(p, minprob[feature]) + if minprob[feature] == 0: + features.discard(feature) + + # Convert features to a list, & sort it by how informative + # features are. 
+ self._most_informative_features = sorted( + features, + key=lambda feature_: ( + minprob[feature_] / maxprob[feature_], + feature_[0], + feature_[1] in [None, False, True], + str(feature_[1]).lower(), + ), + ) + return self._most_informative_features[:n] + + @classmethod + def train(cls, labeled_featuresets, estimator=ELEProbDist): + """ + :param labeled_featuresets: A list of classified featuresets, + i.e., a list of tuples ``(featureset, label)``. + """ + label_freqdist = FreqDist() + feature_freqdist = defaultdict(FreqDist) + feature_values = defaultdict(set) + fnames = set() + + # Count up how many times each feature value occurred, given + # the label and featurename. + for featureset, label in labeled_featuresets: + label_freqdist[label] += 1 + for fname, fval in featureset.items(): + # Increment freq(fval|label, fname) + feature_freqdist[label, fname][fval] += 1 + # Record that fname can take the value fval. + feature_values[fname].add(fval) + # Keep a list of all feature names. + fnames.add(fname) + + # If a feature didn't have a value given for an instance, then + # we assume that it gets the implicit value 'None.' This loop + # counts up the number of 'missing' feature values for each + # (label,fname) pair, and increments the count of the fval + # 'None' by that amount. + for label in label_freqdist: + num_samples = label_freqdist[label] + for fname in fnames: + count = feature_freqdist[label, fname].N() + # Only add a None key when necessary, i.e. if there are + # any samples with feature 'fname' missing. 
+ if num_samples - count > 0: + feature_freqdist[label, fname][None] += num_samples - count + feature_values[fname].add(None) + + # Create the P(label) distribution + label_probdist = estimator(label_freqdist) + + # Create the P(fval|label, fname) distribution + feature_probdist = {} + for ((label, fname), freqdist) in feature_freqdist.items(): + probdist = estimator(freqdist, bins=len(feature_values[fname])) + feature_probdist[label, fname] = probdist + + return cls(label_probdist, feature_probdist) + + +##////////////////////////////////////////////////////// +## Demo +##////////////////////////////////////////////////////// + + +def demo(): + from nltk.classify.util import names_demo + + classifier = names_demo(NaiveBayesClassifier.train) + classifier.show_most_informative_features() + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py new file mode 100644 index 0000000000000000000000000000000000000000..23797f0970848ce9e3617b16dbf54352e5f1523c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py @@ -0,0 +1,180 @@ +# Natural Language Toolkit: Positive Naive Bayes Classifier +# +# Copyright (C) 2012 NLTK Project +# Author: Alessandro Presta +# URL: +# For license information, see LICENSE.TXT + +""" +A variant of the Naive Bayes Classifier that performs binary classification with +partially-labeled training sets. In other words, assume we want to build a classifier +that assigns each example to one of two complementary classes (e.g., male names and +female names). +If we have a training set with labeled examples for both classes, we can use a +standard Naive Bayes Classifier. However, consider the case when we only have labeled +examples for one of the classes, and other, unlabeled, examples. 
+Then, assuming a prior distribution on the two labels, we can use the unlabeled set +to estimate the frequencies of the various features. + +Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1 +and unlabeled examples. We are also given an estimate of P(1). + +We compute P(feature|1) exactly as in the standard case. + +To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are +assuming that the unlabeled examples are drawn according to the given prior distribution) +and then express the conditional probability as: + +| P(feature) - P(feature|1) * P(1) +| P(feature|0) = ---------------------------------- +| P(0) + +Example: + + >>> from nltk.classify import PositiveNaiveBayesClassifier + +Some sentences about sports: + + >>> sports_sentences = [ 'The team dominated the game', + ... 'They lost the ball', + ... 'The game was intense', + ... 'The goalkeeper catched the ball', + ... 'The other team controlled the ball' ] + +Mixed topics, including sports: + + >>> various_sentences = [ 'The President did not comment', + ... 'I lost the keys', + ... 'The team won the game', + ... 'Sara has two kids', + ... 'The ball went off the court', + ... 'They had the ball for the whole game', + ... 'The show is over' ] + +The features of a sentence are simply the words it contains: + + >>> def features(sentence): + ... words = sentence.lower().split() + ... return dict(('contains(%s)' % w, True) for w in words) + +We use the sports sentences as positive examples, the mixed ones ad unlabeled examples: + + >>> positive_featuresets = map(features, sports_sentences) + >>> unlabeled_featuresets = map(features, various_sentences) + >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, + ... unlabeled_featuresets) + +Is the following sentence about sports? + + >>> classifier.classify(features('The cat is on the table')) + False + +What about this one? 
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    """
    Naive Bayes classifier whose model is estimated from positive
    featuresets plus unlabeled featuresets; its labels are ``True``
    and ``False``.
    """

    @staticmethod
    def train(
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        Estimate a classifier from positive-only and unlabeled data.

        :param positive_featuresets: An iterable of featuresets that are
            known positive examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose
            label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of
            the label ``True`` (default 0.5).

        :param estimator: Callable invoked as ``estimator(freqdist, bins=n)``
            to turn a per-feature frequency distribution into a probability
            distribution (default ``ELEProbDist``).

        :return: A ``PositiveNaiveBayesClassifier`` built from the estimated
            label and feature distributions.
        """
        known_values = defaultdict(set)
        all_fnames = set()

        def tally(featuresets):
            # Count every (feature name, value) pair and report how many
            # featuresets were consumed.
            counts = defaultdict(FreqDist)
            seen = 0
            for one_featureset in featuresets:
                for name, value in one_featureset.items():
                    counts[name][value] += 1
                    known_values[name].add(value)
                    all_fnames.add(name)
                seen += 1
            return counts, seen

        pos_counts, n_pos = tally(positive_featuresets)
        unl_counts, n_unl = tally(unlabeled_featuresets)

        # A feature that is absent from an instance is treated as having
        # the implicit value ``None`` for that instance.
        for name in all_fnames:
            pos_dist = pos_counts[name]
            pos_missing = n_pos - pos_dist.N()
            pos_dist[None] += pos_missing
            known_values[name].add(None)

        for name in all_fnames:
            unl_dist = unl_counts[name]
            unl_missing = n_unl - unl_dist.N()
            unl_dist[None] += unl_missing
            known_values[name].add(None)

        neg_prior = 1.0 - positive_prob_prior

        # P(label)
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: neg_prior}
        )

        # P(fval | label=True, fname), estimated directly from the positives.
        feature_probdist = {
            (True, name): estimator(dist, bins=len(known_values[name]))
            for name, dist in pos_counts.items()
        }

        # P(fval | label=False, fname), derived from the overall distribution:
        #   (P(fval) - P(True) * P(fval | True)) / P(False)
        for name, dist in unl_counts.items():
            overall = estimator(dist, bins=len(known_values[name]))
            negative = {}
            for value in known_values[name]:
                raw = (
                    overall.prob(value)
                    - positive_prob_prior * feature_probdist[True, name].prob(value)
                ) / neg_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative[value] = max(raw, 0.0)
            feature_probdist[False, name] = DictionaryProbDist(
                negative, normalize=True
            )

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
In + particular, if ``labeled=False``, then the returned list-like + object's values are equal to:: + + [feature_func(tok) for tok in toks] + + If ``labeled=True``, then the returned list-like object's values + are equal to:: + + [(feature_func(tok), label) for (tok, label) in toks] + + The primary purpose of this function is to avoid the memory + overhead involved in storing all the featuresets for every token + in a corpus. Instead, these featuresets are constructed lazily, + as-needed. The reduction in memory overhead can be especially + significant when the underlying list of tokens is itself lazy (as + is the case with many corpus readers). + + :param feature_func: The function that will be applied to each + token. It should return a featureset -- i.e., a dict + mapping feature names to feature values. + :param toks: The list of tokens to which ``feature_func`` should be + applied. If ``labeled=False``, then the list elements will be + passed directly to ``feature_func()``. If ``labeled=True``, + then the list elements should be tuples ``(tok,label)``, and + ``tok`` will be passed to ``feature_func()``. + :param labeled: If true, then ``toks`` contains labeled tokens -- + i.e., tuples of the form ``(tok, label)``. (Default: + auto-detect based on types.) + """ + if labeled is None: + labeled = toks and isinstance(toks[0], (tuple, list)) + if labeled: + + def lazy_func(labeled_token): + return (feature_func(labeled_token[0]), labeled_token[1]) + + return LazyMap(lazy_func, toks) + else: + return LazyMap(feature_func, toks) + + +def attested_labels(tokens): + """ + :return: A list of all labels that are attested in the given list + of tokens. + :rtype: list of (immutable) + :param tokens: The list of classified tokens from which to extract + labels. A classified token has the form ``(token, label)``. 
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        """
        :param cutoffs: A dict of cutoff settings. Recognised keys are
            ``max_iter``, ``min_ll``, ``min_lldelta``, ``max_acc`` and
            ``min_accdelta``. The dict is copied; the caller's object is
            never modified.
        """
        # Normalise the private copy (not the caller's dict): ``min_ll`` is
        # stored as a non-positive value and ``min_lldelta`` as non-negative.
        self.cutoffs = cutoffs.copy()
        if "min_ll" in self.cutoffs:
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None  # log likelihood seen on the previous check
        self.acc = None  # accuracy seen on the previous check
        self.iter = 1

    def check(self, classifier, train_toks):
        """
        Advance the iteration counter and report whether any cutoff is hit.

        :param classifier: The classifier being trained.
        :param train_toks: The labeled training tokens used to score it.
        :return: ``True`` if training should stop, ``False`` otherwise.
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            # Compare against ``None`` explicitly: a previous value of
            # exactly 0 is a legitimate measurement, not "no measurement".
            if (
                "min_lldelta" in cutoffs
                and self.ll is not None
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # Accuracy cutoffs must be measured with accuracy, not with
            # log likelihood.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc is not None
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
nltk.corpus import names + + # Construct a list of classified names, using the names corpus. + namelist = [(name, "male") for name in names.words("male.txt")] + [ + (name, "female") for name in names.words("female.txt") + ] + + # Randomly split the names into a test & train set. + random.seed(123456) + random.shuffle(namelist) + train = namelist[:5000] + test = namelist[5000:5500] + + # Train up a classifier. + print("Training classifier...") + classifier = trainer([(features(n), g) for (n, g) in train]) + + # Run the classifier on the test data. + print("Testing classifier...") + acc = accuracy(classifier, [(features(n), g) for (n, g) in test]) + print("Accuracy: %6.4f" % acc) + + # For classifiers that can find probabilities, show the log + # likelihood and some sample probability distributions. + try: + test_featuresets = [features(n) for (n, g) in test] + pdists = classifier.prob_classify_many(test_featuresets) + ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] + print("Avg. 
def partial_names_demo(trainer, features=names_demo_features):
    """
    Demonstrate a positive/unlabeled trainer on the names corpus.

    Male names serve as the positive examples; a mix of male and female
    names serves as the unlabeled examples. The trained classifier is
    scored on a held-out, correctly labeled test set.

    :param trainer: Callable invoked as ``trainer(positive, unlabeled)``
        that returns a classifier.
    :param features: Function mapping a name to a featureset.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # zip() returns an iterator in Python 3 and cannot be sliced
        # directly; materialise it first, as names_demo does.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
def config_weka(classpath=None):
    """
    Locate ``weka.jar`` and remember its path in ``_weka_classpath``.

    :param classpath: Explicit path to use as the Weka classpath. When
        given, it replaces any previously configured value.
    :raises LookupError: If no classpath is configured and ``weka.jar``
        cannot be found in ``WEKAHOME`` or the default search directories.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Search a copy so that prepending WEKAHOME does not permanently
        # grow the module-level default list on every call.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            candidate = os.path.join(path, "weka.jar")
            if os.path.exists(candidate):
                _weka_classpath = candidate
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(f"[Found Weka: {_weka_classpath} (version {version})]")
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                # Stop at the first hit so that earlier entries (WEKAHOME
                # in particular) take priority over later ones.
                break

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "https://www.cs.waikato.ac.nz/ml/weka/"
        )
+ return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n")) + + finally: + for f in os.listdir(temp_dir): + os.remove(os.path.join(temp_dir, f)) + os.rmdir(temp_dir) + + def parse_weka_distribution(self, s): + probs = [float(v) for v in re.split("[*,]+", s) if v.strip()] + probs = dict(zip(self._formatter.labels(), probs)) + return DictionaryProbDist(probs) + + def parse_weka_output(self, lines): + # Strip unwanted text from stdout + for i, line in enumerate(lines): + if line.strip().startswith("inst#"): + lines = lines[i:] + break + + if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]: + return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()] + elif lines[0].split() == [ + "inst#", + "actual", + "predicted", + "error", + "distribution", + ]: + return [ + self.parse_weka_distribution(line.split()[-1]) + for line in lines[1:] + if line.strip() + ] + + # is this safe:? + elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]): + return [line.split()[1] for line in lines if line.strip()] + + else: + for line in lines[:10]: + print(line) + raise ValueError( + "Unhandled output format -- your version " + "of weka may not be supported.\n" + " Header: %s" % lines[0] + ) + + # [xx] full list of classifiers (some may be abstract?): + # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule, + # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48, + # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic, + # LogisticBase, M5Base, MultilayerPerceptron, + # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial, + # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART, + # PreConstructedLinearModel, Prism, RandomForest, + # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor, + # RuleNode, SimpleLinearRegression, SimpleLogistic, + # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI, + # VotedPerceptron, Winnow, ZeroR + + _CLASSIFIER_CLASS = { + 
class ARFF_Formatter:
    """
    Converts featuresets and labeled featuresets to ARFF-formatted
    strings, appropriate for input into Weka.

    Features and classes can be specified manually in the constructor, or may
    be determined from data using ``from_train``.
    """

    def __init__(self, labels, features):
        """
        :param labels: A list of all class labels that can be generated.
        :param features: A list of feature specifications, where
            each feature specification is a tuple (fname, ftype);
            and ftype is an ARFF type string such as NUMERIC or
            STRING.
        """
        self._labels = labels
        self._features = features

    def format(self, tokens):
        """Returns a string representation of ARFF output for the given data."""
        return self.header_section() + self.data_section(tokens)

    def labels(self):
        """Returns the list of classes."""
        return list(self._labels)

    def write(self, outfile, tokens):
        """
        Writes ARFF data to a file for the given data.

        :param outfile: A filename, or an object with a ``write`` method.
            In both cases the file object is closed before returning.
        :param tokens: The featuresets (or labeled featuresets) to write.
        """
        # Format first so a formatting error cannot leave an open handle.
        text = self.format(tokens)
        if not hasattr(outfile, "write"):
            outfile = open(outfile, "w")
        try:
            outfile.write(text)
        finally:
            # Always release the handle, even if the write itself fails.
            outfile.close()

    @staticmethod
    def from_train(tokens):
        """
        Constructs an ARFF_Formatter instance with class labels and feature
        types determined from the given data. Handles boolean, numeric and
        string (note: not nominal) types.

        :param tokens: An iterable of ``(featureset, label)`` pairs.
        :raises ValueError: If a feature value has an unsupported type, or
            if one feature is seen with two different types.
        """
        labels = set()
        features = {}
        for tok, label in tokens:
            # Collect the set of all attested labels.
            labels.add(label)

            # Determine the types of all features.
            for fname, fval in tok.items():
                # bool must be tested before int: bool is a subclass of int.
                if isinstance(fval, bool):
                    ftype = "{True, False}"
                elif isinstance(fval, (int, float)):
                    ftype = "NUMERIC"
                elif isinstance(fval, str):
                    ftype = "STRING"
                elif fval is None:
                    continue  # can't tell the type.
                else:
                    # Report the offending value's type; ``ftype`` is not
                    # (re)assigned on this path.
                    raise ValueError("Unsupported value type %r" % type(fval))

                if features.get(fname, ftype) != ftype:
                    raise ValueError(f"Inconsistent type for {fname}")
                features[fname] = ftype
        features = sorted(features.items())

        return ARFF_Formatter(labels, features)

    def header_section(self):
        """Returns an ARFF header as a string."""
        parts = [
            # Header comment.
            "% Weka ARFF file\n",
            "% Generated automatically by NLTK\n",
            "%% %s\n\n" % time.ctime(),
            # Relation name
            "@RELATION rel\n\n",
        ]

        # Input attribute specifications
        parts.extend(
            "@ATTRIBUTE %-30r %s\n" % (fname, ftype) for fname, ftype in self._features
        )

        # Label attribute specification. Labels are converted with str() so
        # that non-string labels (e.g. booleans) do not break the join.
        label_spec = ",".join(str(label) for label in self._labels)
        parts.append("@ATTRIBUTE %-30r {%s}\n" % ("-label-", label_spec))

        return "".join(parts)

    def data_section(self, tokens, labeled=None):
        """
        Returns the ARFF data section for the given data.

        :param tokens: a list of featuresets (dicts) or labelled featuresets
            which are tuples (featureset, label).
        :param labeled: Indicates whether the given tokens are labeled
            or not. If None, then the tokens will be assumed to be
            labeled if the first token's value is a tuple or list.
        """
        # Check if the tokens are labeled or unlabeled. If unlabeled,
        # then use 'None'
        if labeled is None:
            labeled = tokens and isinstance(tokens[0], (tuple, list))
        if not labeled:
            tokens = [(tok, None) for tok in tokens]

        # Data section: one comma-separated row per token, label last.
        rows = ["\n@DATA\n"]
        for tok, label in tokens:
            cells = [self._fmt_arff_val(tok.get(fname)) for fname, _ in self._features]
            cells.append(self._fmt_arff_val(label))
            rows.append(",".join(cells) + "\n")

        return "".join(rows)

    def _fmt_arff_val(self, fval):
        """Format a single value for the data section (``None`` becomes ``?``)."""
        if fval is None:
            return "?"
        if isinstance(fval, (bool, int)):
            return "%s" % fval
        # Floats, strings and anything else are written via repr().
        return "%r" % fval
This process +repeats until the cluster memberships stabilise. This is a hill-climbing +algorithm which may converge to a local maximum. Hence the clustering is +often repeated with random initial means and the most commonly occurring +output means are chosen. + +The GAAC clusterer starts with each of the *N* vectors as singleton clusters. +It then iteratively merges pairs of clusters which have the closest centroids. +This continues until there is only one cluster. The order of merges gives rise +to a dendrogram - a tree with the earlier merges lower than later merges. The +membership of a given number of clusters *c*, *1 <= c <= N*, can be found by +cutting the dendrogram at depth *c*. + +The Gaussian EM clusterer models the vectors as being produced by a mixture +of k Gaussian sources. The parameters of these sources (prior probability, +mean and covariance matrix) are then found to maximise the likelihood of the +given data. This is done with the expectation maximisation algorithm. It +starts with k arbitrarily chosen means, priors and covariance matrices. It +then calculates the membership probabilities for each vector in each of the +clusters - this is the 'E' step. The cluster parameters are then updated in +the 'M' step using the maximum likelihood estimate from the cluster membership +probabilities. This process continues until the likelihood of the data does +not significantly increase. + +They all extend the ClusterI interface which defines common operations +available with each clusterer. These operations include: + +- cluster: clusters a sequence of vectors +- classify: assign a vector to a cluster +- classification_probdist: give the probability distribution over cluster memberships + +The currently existing clusterers also extend cluster.VectorSpaceClusterer, an +abstract class which allows for singular value decomposition (SVD) and vector +normalisation. 
class ClusterI(metaclass=ABCMeta):
    """
    Abstract interface that every clusterer implements.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Learn the clustering parameters from ``vectors`` and assign the
        vectors to clusters. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classify ``token`` into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Return the likelihood (a float) that ``vector`` belongs to the
        cluster ``label``: 1.0 when ``classify`` picks that cluster and
        0.0 otherwise.
        """
        return 1.0 if self.classify(vector) == label else 0.0

    def classification_probdist(self, vector):
        """
        Classify ``vector`` and return a probability distribution over the
        cluster identifiers, obtained by normalising the per-cluster
        likelihoods.
        """
        scores = {}
        total = 0.0
        for name in self.cluster_names():
            score = self.likelihood(vector, name)
            scores[name] = score
            total += score
        # Scale every likelihood by the total so the values sum to one.
        for name in self.cluster_names():
            scores[name] /= total
        return DictionaryProbDist(scores)

    @abstractmethod
    def num_clusters(self):
        """
        Return the number of clusters.
        """

    def cluster_names(self):
        """
        Return the names of the clusters (the integers from 0 up to, but
        not including, ``num_clusters()``).
        :rtype: list
        """
        count = self.num_clusters()
        return list(range(count))

    def cluster_name(self, index):
        """
        Return the name of the cluster at ``index``.
        """
        return index
+ + :param initial_means: the means of the gaussian cluster centers + :type initial_means: [seq of] numpy array or seq of SparseArray + :param priors: the prior probability for each cluster + :type priors: numpy array or seq of float + :param covariance_matrices: the covariance matrix for each cluster + :type covariance_matrices: [seq of] numpy array + :param conv_threshold: maximum change in likelihood before deemed + convergent + :type conv_threshold: int or float + :param bias: variance bias used to ensure non-singular covariance + matrices + :type bias: float + :param normalise: should vectors be normalised to length 1 + :type normalise: boolean + :param svd_dimensions: number of dimensions to use in reducing vector + dimensionsionality with SVD + :type svd_dimensions: int + """ + VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) + self._means = numpy.array(initial_means, numpy.float64) + self._num_clusters = len(initial_means) + self._conv_threshold = conv_threshold + self._covariance_matrices = covariance_matrices + self._priors = priors + self._bias = bias + + def num_clusters(self): + return self._num_clusters + + def cluster_vectorspace(self, vectors, trace=False): + assert len(vectors) > 0 + + # set the parameters to initial values + dimensions = len(vectors[0]) + means = self._means + priors = self._priors + if not priors: + priors = self._priors = ( + numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters + ) + covariances = self._covariance_matrices + if not covariances: + covariances = self._covariance_matrices = [ + numpy.identity(dimensions, numpy.float64) + for i in range(self._num_clusters) + ] + + # do the E and M steps until the likelihood plateaus + lastl = self._loglikelihood(vectors, priors, means, covariances) + converged = False + + while not converged: + if trace: + print("iteration; loglikelihood", lastl) + # E-step, calculate hidden variables, h[i,j] + h = numpy.zeros((len(vectors), self._num_clusters), 
numpy.float64) + for i in range(len(vectors)): + for j in range(self._num_clusters): + h[i, j] = priors[j] * self._gaussian( + means[j], covariances[j], vectors[i] + ) + h[i, :] /= sum(h[i, :]) + + # M-step, update parameters - cvm, p, mean + for j in range(self._num_clusters): + covariance_before = covariances[j] + new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64) + new_mean = numpy.zeros(dimensions, numpy.float64) + sum_hj = 0.0 + for i in range(len(vectors)): + delta = vectors[i] - means[j] + new_covariance += h[i, j] * numpy.multiply.outer(delta, delta) + sum_hj += h[i, j] + new_mean += h[i, j] * vectors[i] + covariances[j] = new_covariance / sum_hj + means[j] = new_mean / sum_hj + priors[j] = sum_hj / len(vectors) + + # bias term to stop covariance matrix being singular + covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64) + + # calculate likelihood - FIXME: may be broken + l = self._loglikelihood(vectors, priors, means, covariances) + + # check for convergence + if abs(lastl - l) < self._conv_threshold: + converged = True + lastl = l + + def classify_vectorspace(self, vector): + best = None + for j in range(self._num_clusters): + p = self._priors[j] * self._gaussian( + self._means[j], self._covariance_matrices[j], vector + ) + if not best or p > best[0]: + best = (p, j) + return best[1] + + def likelihood_vectorspace(self, vector, cluster): + cid = self.cluster_names().index(cluster) + return self._priors[cluster] * self._gaussian( + self._means[cluster], self._covariance_matrices[cluster], vector + ) + + def _gaussian(self, mean, cvm, x): + m = len(mean) + assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape) + try: + det = numpy.linalg.det(cvm) + inv = numpy.linalg.inv(cvm) + a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0) + dx = x - mean + print(dx, inv) + b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx) + return a * numpy.exp(b) + except OverflowError: + # happens when the exponent is negative 
infinity - i.e. b = 0 + # i.e. the inverse of cvm is huge (cvm is almost zero) + return 0 + + def _loglikelihood(self, vectors, priors, means, covariances): + llh = 0.0 + for vector in vectors: + p = 0 + for j in range(len(priors)): + p += priors[j] * self._gaussian(means[j], covariances[j], vector) + llh += numpy.log(p) + return llh + + def __repr__(self): + return "" % list(self._means) + + +def demo(): + """ + Non-interactive demonstration of the clusterers with simple 2-D data. + """ + + from nltk import cluster + + # example from figure 14.10, page 519, Manning and Schutze + + vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] + means = [[4, 2], [4, 2.01]] + + clusterer = cluster.EMClusterer(means, bias=0.1) + clusters = clusterer.cluster(vectors, True, trace=True) + + print("Clustered:", vectors) + print("As: ", clusters) + print() + + for c in range(2): + print("Cluster:", c) + print("Prior: ", clusterer._priors[c]) + print("Mean: ", clusterer._means[c]) + print("Covar: ", clusterer._covariance_matrices[c]) + print() + + # classify a new vector + vector = numpy.array([2, 2]) + print("classify(%s):" % vector, end=" ") + print(clusterer.classify(vector)) + + # show the classification probabilities + vector = numpy.array([2, 2]) + print("classification_probdist(%s):" % vector) + pdist = clusterer.classification_probdist(vector) + for sample in pdist.samples(): + print(f"{sample} => {pdist.prob(sample) * 100:.0f}%") + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py b/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py new file mode 100644 index 0000000000000000000000000000000000000000..037ad6bf58701c6ab983a5187c5f75fd947a12fb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py @@ -0,0 +1,170 @@ +# Natural Language Toolkit: Group Average Agglomerative Clusterer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Trevor Cohn +# URL: +# For license information, see LICENSE.TXT + +try: 
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative starts with each of the N vectors as singleton
    clusters. It then iteratively merges pairs of clusters which have the
    closest centroids.  This continues until there is only one cluster. The
    order of merges gives rise to a dendrogram: a tree with the earlier merges
    lower than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendrogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        :param num_clusters: the number of clusters at which merging stops
        :param normalise: passed on to ``VectorSpaceClusterer.__init__``
        :param svd_dimensions: passed on to ``VectorSpaceClusterer.__init__``
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendrogram = None
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Creates a fresh dendrogram over float copies of ``vectors`` and
        then delegates to ``VectorSpaceClusterer.cluster``.
        """
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Repeatedly merges the closest pair of clusters, recording every
        merge in the dendrogram, until ``max(num_clusters, 1)`` clusters
        remain; then recomputes the centroids.

        :param vectors: the vectors to cluster
        :param trace: if true, print each merge
        """
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N
        cluster_count = N
        index_map = numpy.arange(N)

        # construct the similarity matrix; only cells with row < column are
        # filled, every other cell stays at infinity so argmin skips it
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        """
        Rewrites the distances of the merged cluster ``i`` in ``dist`` as the
        size-weighted average of the old distances of ``i`` and ``j``.
        """
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # NOTE(review): everything from here down to the ``assert`` in
        # update_clusters was absent from the reviewed text (it looks as if
        # a markup stripper swallowed the span between "x<i" and "> 0").
        # It is restored from the weighted-average rule described above and
        # the row < column layout built in cluster_vectorspace -- confirm
        # against the upstream release before merging.

        # update for x<i
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        """
        Recomputes one centroid per dendrogram group and sets the cluster
        count to the number of centroids found.

        :param num_clusters: the number of groups to cut the dendrogram into
        """
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        """
        Returns the index of the centroid with the smallest cosine distance
        to ``vector``.
        """
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if best is None or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype:  Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        """
        Returns the current number of clusters.
        """
        return self._num_clusters

    def __repr__(self):
        # The format string must contain a placeholder: "" % value raises
        # TypeError ("not all arguments converted").
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrary chosen means then allocates
    each vector to the cluster with the closest mean. It then recalculates the
    means of each cluster as the centroid of the vectors in the cluster. This
    process repeats until the cluster memberships stabilise. This is a
    hill-climbing algorithm which may converge to a local maximum. Hence the
    clustering is often repeated with random initial means and the most
    commonly occurring output means are chosen.
    """

    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param  num_means:  the number of means to use (may use fewer)
        :type   num_means:  int
        :param  distance:   measure of distance between two vectors
        :type   distance:   function taking two vectors and returning a float
        :param  repeats:    number of randomised clustering trials to use
        :type   repeats:    int
        :param  conv_test:  maximum variation in mean differences before
                            deemed convergent
        :type   conv_test:  number
        :param  initial_means: set of k initial means
        :type   initial_means: sequence of vectors
        :param  normalise:  should vectors be normalised to length 1
        :type   normalise:  boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionality with SVD
        :type svd_dimensions: int
        :param  rng:        random number generator (or None)
        :type   rng:        Random
        :param avoid_empty_clusters: include current centroid in computation
                                     of next one; avoids undefined behavior
                                     when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # fixed starting means make every trial identical, so the two
        # options are mutually exclusive
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Runs ``repeats`` k-means trials and keeps the set of means that is
        closest, in summed distance, to the means of all other trials.

        :param vectors: the vectors to cluster
        :param trace: if true, print progress information
        """
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            # Draw fresh starting means for every trial after the first.
            # (The old test ``trial > 1`` let the second trial start from
            # the first trial's converged means, so it merely repeated it.)
            if not self._means or trial > 0:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # effect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means

    def _cluster_vectorspace(self, vectors, trace=False):
        """
        Runs a single k-means trial starting from ``self._means`` and leaves
        the converged means in ``self._means``.  Does nothing unless there
        are more vectors than means.
        """
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace:
                    print("iteration")

                # recalculate cluster means by computing the centroid of each cluster
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means

    def classify_vectorspace(self, vector):
        """
        Returns the index of the mean closest to ``vector`` according to the
        configured distance function (the first one wins on ties), or None
        when there are no means.
        """
        best_distance = best_index = None
        for index, mean in enumerate(self._means):
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        """
        Returns the number of current means, or the requested number of
        means when none are set.
        """
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        """
        Returns the sum of the distances between corresponding vectors of
        the two sequences.
        """
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean):
        """
        Returns the centroid of ``cluster``.

        With ``avoid_empty_clusters`` the previous ``mean`` is counted as one
        extra member, so an empty cluster keeps its mean.  Otherwise an
        empty cluster is an error.

        :param cluster: the vectors currently assigned to the cluster
        :param mean: the cluster's current mean
        :raise AssertionError: if the cluster is empty and
            ``avoid_empty_clusters`` is off
        """
        # ``a = a + b`` is used instead of ``a += b`` throughout: in-place
        # addition on a plain-list mean would extend the list rather than
        # add element-wise, and on an integer array it cannot absorb floats.
        if self._avoid_empty_clusters:
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid = centroid + vector
            return centroid / (1 + len(cluster))
        else:
            if not len(cluster):
                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
                sys.stderr.write(
                    "Try setting argument 'avoid_empty_clusters' to True\n"
                )
                # explicit raise: ``assert False`` disappears under -O
                raise AssertionError("no centroid defined for empty cluster")
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid = centroid + vector
            return centroid / len(cluster)

    def __repr__(self):
        # The format string must contain placeholders: "" % (a, b) raises
        # TypeError ("not all arguments converted").
        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
__name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe36d011aa3239a95d906dae4974487750bcaa1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py @@ -0,0 +1,529 @@ +# Natural Language Toolkit: Corpus Readers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +# TODO this docstring isn't up-to-date! +""" +NLTK corpus readers. The modules in this package provide functions +that can be used to read corpus files in a variety of formats. These +functions can be used to read both the corpus files that are +distributed in the NLTK corpus package, and corpus files that are part +of external corpora. + +Available Corpora +================= + +Please see https://www.nltk.org/nltk_data/ for a complete list. +Install corpora using nltk.download(). + +Corpus Reader Functions +======================= +Each corpus module defines one or more "corpus reader functions", +which can be used to read documents from that corpus. These functions +take an argument, ``item``, which is used to indicate which document +should be read from the corpus: + +- If ``item`` is one of the unique identifiers listed in the corpus + module's ``items`` variable, then the corresponding document will + be loaded from the NLTK corpus package. +- If ``item`` is a filename, then that file will be read. + +Additionally, corpus reader functions can be given lists of item +names; in which case, they will return a concatenation of the +corresponding documents. + +Corpus reader functions are named based on the type of information +they return. 
Some common examples, and their return types, are: + +- words(): list of str +- sents(): list of (list of str) +- paras(): list of (list of (list of str)) +- tagged_words(): list of (str,str) tuple +- tagged_sents(): list of (list of (str,str)) +- tagged_paras(): list of (list of (list of (str,str))) +- chunked_sents(): list of (Tree w/ (str,str) leaves) +- parsed_sents(): list of (Tree with str leaves) +- parsed_paras(): list of (list of (Tree with str leaves)) +- xml(): A single xml ElementTree +- raw(): unprocessed corpus contents + +For example, to read a list of the words in the Brown Corpus, use +``nltk.corpus.brown.words()``: + + >>> from nltk.corpus import brown + >>> print(", ".join(brown.words())) # doctest: +ELLIPSIS + The, Fulton, County, Grand, Jury, said, ... + +""" + +import re + +from nltk.corpus.reader import * +from nltk.corpus.util import LazyCorpusLoader +from nltk.tokenize import RegexpTokenizer + +abc: PlaintextCorpusReader = LazyCorpusLoader( + "abc", + PlaintextCorpusReader, + r"(?!\.).*\.txt", + encoding=[("science", "latin_1"), ("rural", "utf8")], +) +alpino: AlpinoCorpusReader = LazyCorpusLoader( + "alpino", AlpinoCorpusReader, tagset="alpino" +) +bcp47: BCP47CorpusReader = LazyCorpusLoader( + "bcp47", BCP47CorpusReader, r"(cldr|iana)/*" +) +brown: CategorizedTaggedCorpusReader = LazyCorpusLoader( + "brown", + CategorizedTaggedCorpusReader, + r"c[a-z]\d\d", + cat_file="cats.txt", + tagset="brown", + encoding="ascii", +) +cess_cat: BracketParseCorpusReader = LazyCorpusLoader( + "cess_cat", + BracketParseCorpusReader, + r"(?!\.).*\.tbf", + tagset="unknown", + encoding="ISO-8859-15", +) +cess_esp: BracketParseCorpusReader = LazyCorpusLoader( + "cess_esp", + BracketParseCorpusReader, + r"(?!\.).*\.tbf", + tagset="unknown", + encoding="ISO-8859-15", +) +cmudict: CMUDictCorpusReader = LazyCorpusLoader( + "cmudict", CMUDictCorpusReader, ["cmudict"] +) +comtrans: AlignedCorpusReader = LazyCorpusLoader( + "comtrans", AlignedCorpusReader, 
r"(?!\.).*\.txt" +) +comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader( + "comparative_sentences", + ComparativeSentencesCorpusReader, + r"labeledSentences\.txt", + encoding="latin-1", +) +conll2000: ConllChunkCorpusReader = LazyCorpusLoader( + "conll2000", + ConllChunkCorpusReader, + ["train.txt", "test.txt"], + ("NP", "VP", "PP"), + tagset="wsj", + encoding="ascii", +) +conll2002: ConllChunkCorpusReader = LazyCorpusLoader( + "conll2002", + ConllChunkCorpusReader, + r".*\.(test|train).*", + ("LOC", "PER", "ORG", "MISC"), + encoding="utf-8", +) +conll2007: DependencyCorpusReader = LazyCorpusLoader( + "conll2007", + DependencyCorpusReader, + r".*\.(test|train).*", + encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")], +) +crubadan: CrubadanCorpusReader = LazyCorpusLoader( + "crubadan", CrubadanCorpusReader, r".*\.txt" +) +dependency_treebank: DependencyCorpusReader = LazyCorpusLoader( + "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii" +) +extended_omw: CorpusReader = LazyCorpusLoader( + "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8" +) +floresta: BracketParseCorpusReader = LazyCorpusLoader( + "floresta", + BracketParseCorpusReader, + r"(?!\.).*\.ptb", + "#", + tagset="unknown", + encoding="ISO-8859-15", +) +framenet15: FramenetCorpusReader = LazyCorpusLoader( + "framenet_v15", + FramenetCorpusReader, + [ + "frRelation.xml", + "frameIndex.xml", + "fulltextIndex.xml", + "luIndex.xml", + "semTypes.xml", + ], +) +framenet: FramenetCorpusReader = LazyCorpusLoader( + "framenet_v17", + FramenetCorpusReader, + [ + "frRelation.xml", + "frameIndex.xml", + "fulltextIndex.xml", + "luIndex.xml", + "semTypes.xml", + ], +) +gazetteers: WordListCorpusReader = LazyCorpusLoader( + "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2" +) +genesis: PlaintextCorpusReader = LazyCorpusLoader( + "genesis", + PlaintextCorpusReader, + r"(?!\.).*\.txt", + encoding=[ + 
("finnish|french|german", "latin_1"), + ("swedish", "cp865"), + (".*", "utf_8"), + ], +) +gutenberg: PlaintextCorpusReader = LazyCorpusLoader( + "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" +) +ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*") +inaugural: PlaintextCorpusReader = LazyCorpusLoader( + "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" +) +# [XX] This should probably just use TaggedCorpusReader: +indian: IndianCorpusReader = LazyCorpusLoader( + "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8" +) + +jeita: ChasenCorpusReader = LazyCorpusLoader( + "jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8" +) +knbc: KNBCorpusReader = LazyCorpusLoader( + "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp" +) +lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader( + "lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp" +) +mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader( + "mac_morpho", + MacMorphoCorpusReader, + r"(?!\.).*\.txt", + tagset="unknown", + encoding="latin-1", +) +machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader( + "machado", + PortugueseCategorizedPlaintextCorpusReader, + r"(?!\.).*\.txt", + cat_pattern=r"([a-z]*)/.*", + encoding="latin-1", +) +masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader( + "masc_tagged", + CategorizedTaggedCorpusReader, + r"(spoken|written)/.*\.txt", + cat_file="categories.txt", + tagset="wsj", + encoding="utf-8", + sep="_", +) +movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader( + "movie_reviews", + CategorizedPlaintextCorpusReader, + r"(?!\.).*\.txt", + cat_pattern=r"(neg|pos)/.*", + encoding="ascii", +) +multext_east: MTECorpusReader = LazyCorpusLoader( + "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8" +) +names: WordListCorpusReader = LazyCorpusLoader( + "names", WordListCorpusReader, r"(?!\.).*\.txt", 
encoding="ascii" +) +nps_chat: NPSChatCorpusReader = LazyCorpusLoader( + "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj" +) +opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader( + "opinion_lexicon", + OpinionLexiconCorpusReader, + r"(\w+)\-words\.txt", + encoding="ISO-8859-2", +) +ppattach: PPAttachmentCorpusReader = LazyCorpusLoader( + "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"] +) +product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader( + "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" +) +product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader( + "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" +) +pros_cons: ProsConsCorpusReader = LazyCorpusLoader( + "pros_cons", + ProsConsCorpusReader, + r"Integrated(Cons|Pros)\.txt", + cat_pattern=r"Integrated(Cons|Pros)\.txt", + encoding="ISO-8859-2", +) +ptb: CategorizedBracketParseCorpusReader = ( + LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions + "ptb", + CategorizedBracketParseCorpusReader, + r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG", + cat_file="allcats.txt", + tagset="wsj", + ) +) +qc: StringCategoryCorpusReader = LazyCorpusLoader( + "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2" +) +reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader( + "reuters", + CategorizedPlaintextCorpusReader, + "(training|test).*", + cat_file="cats.txt", + encoding="ISO-8859-2", +) +rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml") +senseval: SensevalCorpusReader = LazyCorpusLoader( + "senseval", SensevalCorpusReader, r"(?!\.).*\.pos" +) +sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader( + "sentence_polarity", + CategorizedSentencesCorpusReader, + r"rt-polarity\.(neg|pos)", + cat_pattern=r"rt-polarity\.(neg|pos)", + encoding="utf-8", +) +sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader( + 
"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8" +) +shakespeare: XMLCorpusReader = LazyCorpusLoader( + "shakespeare", XMLCorpusReader, r"(?!\.).*\.xml" +) +sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader( + "sinica_treebank", + SinicaTreebankCorpusReader, + ["parsed"], + tagset="unknown", + encoding="utf-8", +) +state_union: PlaintextCorpusReader = LazyCorpusLoader( + "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2" +) +stopwords: WordListCorpusReader = LazyCorpusLoader( + "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8" +) +subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader( + "subjectivity", + CategorizedSentencesCorpusReader, + r"(quote.tok.gt9|plot.tok.gt9)\.5000", + cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]}, + encoding="latin-1", +) +swadesh: SwadeshCorpusReader = LazyCorpusLoader( + "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8" +) +swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader( + "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8" +) +swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader( + "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8" +) +switchboard: SwitchboardCorpusReader = LazyCorpusLoader( + "switchboard", SwitchboardCorpusReader, tagset="wsj" +) +timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader) +timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader( + "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii" +) +toolbox: ToolboxCorpusReader = LazyCorpusLoader( + "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)" +) +treebank: BracketParseCorpusReader = LazyCorpusLoader( + "treebank/combined", + BracketParseCorpusReader, + r"wsj_.*\.mrg", + tagset="wsj", + encoding="ascii", +) +treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader( + 
"treebank/tagged", + ChunkedCorpusReader, + r"wsj_.*\.pos", + sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True), + para_block_reader=tagged_treebank_para_block_reader, + tagset="wsj", + encoding="ascii", +) +treebank_raw: PlaintextCorpusReader = LazyCorpusLoader( + "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2" +) +twitter_samples: TwitterCorpusReader = LazyCorpusLoader( + "twitter_samples", TwitterCorpusReader, r".*\.json" +) +udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader) +udhr2: PlaintextCorpusReader = LazyCorpusLoader( + "udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8" +) +universal_treebanks: ConllCorpusReader = LazyCorpusLoader( + "universal_treebanks_v20", + ConllCorpusReader, + r".*\.conll", + columntypes=( + "ignore", + "words", + "ignore", + "ignore", + "pos", + "ignore", + "ignore", + "ignore", + "ignore", + "ignore", + ), +) +verbnet: VerbnetCorpusReader = LazyCorpusLoader( + "verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml" +) +webtext: PlaintextCorpusReader = LazyCorpusLoader( + "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2" +) +wordnet: WordNetCorpusReader = LazyCorpusLoader( + "wordnet", + WordNetCorpusReader, + LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), +) +wordnet31: WordNetCorpusReader = LazyCorpusLoader( + "wordnet31", + WordNetCorpusReader, + LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), +) +wordnet2021: WordNetCorpusReader = LazyCorpusLoader( + "wordnet2021", + WordNetCorpusReader, + LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), +) +wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader( + "wordnet_ic", WordNetICCorpusReader, r".*\.dat" +) +words: WordListCorpusReader = LazyCorpusLoader( + "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" +) + +# defined after treebank +propbank: PropbankCorpusReader = 
LazyCorpusLoader( + "propbank", + PropbankCorpusReader, + "prop.txt", + r"frames/.*\.xml", + "verbs.txt", + lambda filename: re.sub(r"^wsj/\d\d/", "", filename), + treebank, +) # Must be defined *after* treebank corpus. +nombank: NombankCorpusReader = LazyCorpusLoader( + "nombank.1.0", + NombankCorpusReader, + "nombank.1.0", + r"frames/.*\.xml", + "nombank.1.0.words", + lambda filename: re.sub(r"^wsj/\d\d/", "", filename), + treebank, +) # Must be defined *after* treebank corpus. +propbank_ptb: PropbankCorpusReader = LazyCorpusLoader( + "propbank", + PropbankCorpusReader, + "prop.txt", + r"frames/.*\.xml", + "verbs.txt", + lambda filename: filename.upper(), + ptb, +) # Must be defined *after* ptb corpus. +nombank_ptb: NombankCorpusReader = LazyCorpusLoader( + "nombank.1.0", + NombankCorpusReader, + "nombank.1.0", + r"frames/.*\.xml", + "nombank.1.0.words", + lambda filename: filename.upper(), + ptb, +) # Must be defined *after* ptb corpus. +semcor: SemcorCorpusReader = LazyCorpusLoader( + "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet +) # Must be defined *after* wordnet corpus. 
def demo():
    """Call ``demo()`` on a fixed list of corpora, one after another."""
    # This is out-of-date:
    # (chat80 stays excluded here, exactly as it was commented out before.)
    demo_corpora = (
        abc,
        brown,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
    )
    for corpus in demo_corpora:
        corpus.demo()
100644 index 0000000000000000000000000000000000000000..dd52f3ede614b84dfddb8f8cbfe45ca3f9013f8d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/europarl_raw.py @@ -0,0 +1,56 @@ +# Natural Language Toolkit: Europarl Corpus Readers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Nitin Madnani +# URL: +# For license information, see LICENSE.TXT + +import re + +from nltk.corpus.reader import * +from nltk.corpus.util import LazyCorpusLoader + +# Create a new corpus reader instance for each European language +danish: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8" +) + +dutch: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8" +) + +english: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8" +) + +finnish: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8" +) + +french: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8" +) + +german: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8" +) + +greek: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8" +) + +italian: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8" +) + +portuguese: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8" +) + +spanish: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8" +) + +swedish: EuroparlCorpusReader = LazyCorpusLoader( + "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8" +) 
class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.
    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: passed through to ``XMLCorpusReader``.
        :param fileids: passed through to ``XMLCorpusReader``.
        :param lazy: If true (the default), the accessors return lazy
            sequences (``LazyMap`` / ``LazyConcatenation``); otherwise they
            return a plain list with one entry per file.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def _word_views(
        self, fileids, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """
        Shared driver behind ``words``, ``tagged_words``, ``sents`` and
        ``tagged_sents``: run ``_get_words`` over every selected file.

        :return: a list with one ``_get_words`` result per file when the
            reader is not lazy, otherwise a ``LazyConcatenation`` of the
            per-file results.
        """

        def get_words(fileid):
            return self._get_words(
                fileid, speaker, sent, stem, relation, pos, strip_space, replace
            )

        if not self._lazy:
            return [get_words(fileid) for fileid in self.abspaths(fileids)]
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return self._word_views(
            fileids, speaker, None, stem, relation, False, strip_space, replace
        )

    def tagged_words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return self._word_views(
            fileids, speaker, None, stem, relation, True, strip_space, replace
        )

    def sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of word strings.
        :rtype: list(list(str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return self._word_views(
            fileids, speaker, True, stem, relation, False, strip_space, replace
        )

    def tagged_sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return self._word_views(
            fileids, speaker, True, stem, relation, True, strip_space, replace
        )

    def corpus(self, fileids=None):
        """
        :return: for each given file, a dict of the attributes found on the
            root element of its XML document
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_corpus, self.abspaths(fileids))

    def _get_corpus(self, fileid):
        """Return the root element's attributes of ``fileid`` as a dict."""
        xmldoc = ElementTree.parse(fileid).getroot()
        return dict(xmldoc.items())

    def participants(self, fileids=None):
        """
        :return: for each given file, a dict keyed by participant id whose
            values are dicts of ``(participant_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_participants, self.abspaths(fileids))

    def _get_participants(self, fileid):
        """Collect the attributes of every ``participant`` element, by id."""

        # multidimensional dicts
        def dictOfDicts():
            return defaultdict(dictOfDicts)

        xmldoc = ElementTree.parse(fileid).getroot()
        # getting participants' data
        pat = dictOfDicts()
        for participant in xmldoc.findall(
            f".//{{{NS}}}Participants/{{{NS}}}participant"
        ):
            for (key, value) in participant.items():
                pat[participant.get("id")][key] = value
        return pat

    def age(self, fileids=None, speaker="CHI", month=False):
        """
        :return: the given file(s) as string or int
        :rtype: list or int

        :param speaker: id of the participant whose age is looked up.
        :param month: If true, return months instead of year-month-date
        """

        def get_age(fileid):
            return self._get_age(fileid, speaker, month)

        if not self._lazy:
            return [get_age(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(get_age, self.abspaths(fileids))

    def _get_age(self, fileid, speaker, month):
        """
        Return the ``age`` attribute of the participant whose id equals
        ``speaker`` (converted to months when ``month`` is true), or None
        when no such participant exists or the age cannot be converted.
        """
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            try:
                if pat.get("id") == speaker:
                    age = pat.get("age")
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            except (TypeError, AttributeError):
                return None

    def convert_age(self, age_year):
        "Calculate age in months from a string in CHILDES format"
        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        age_month = int(m.group(1)) * 12 + int(m.group(2))
        try:
            # more than 15 days counts as one further month
            if int(m.group(3)) > 15:
                age_month += 1
        # the day field may be empty, in which case int("") raises
        except ValueError:
            pass
        return age_month

    def MLU(self, fileids=None, speaker="CHI"):
        """
        :return: the given file(s) as a floating number
        :rtype: list(float)
        """

        def get_MLU(fileid):
            return self._getMLU(fileid, speaker=speaker)

        if not self._lazy:
            return [get_MLU(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(get_MLU, self.abspaths(fileids))

    def _getMLU(self, fileid, speaker):
        """
        Compute morphemes-per-utterance for ``speaker`` in ``fileid``.

        Utterances are read stemmed, tagged and with replacements applied.
        Empty utterances, utterances containing an ``"unk"`` tag and
        utterances identical to the previously kept one are skipped.
        Returns 0 when no utterance is left to divide by.
        """
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            if sent == []:
                continue
            # if the sentence is the same as the last sent
            if sent == lastSent:
                continue
            results.append([word for (word, pos) in sent])
            # count number of fillers
            if {"co", None}.intersection(posList):
                numFillers += posList.count("co")
                numFillers += posList.count(None)
                sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu

    @staticmethod
    def _gra_label(gra):
        """Join a ``gra`` element's index, head and relation with ``|``."""
        return gra.get("index") + "|" + gra.get("head") + "|" + gra.get("relation")

    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """
        Parse ``fileid`` and extract the words of every utterance (``u``
        element) spoken by ``speaker``.

        :return: a ``LazyMap`` over the collected items. When ``sent`` or
            ``relation`` is true there is one list per utterance; otherwise
            all words are in one flat sequence. Each word is a string, a
            ``(word, tag)`` tuple when ``pos`` or ``relation`` is set, or a
            longer tuple with relation labels when ``relation`` is True.
        """
        # ensure we have a list of speakers
        if isinstance(speaker, str) and speaker != "ALL":
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            # select speakers
            if not (speaker == "ALL" or xmlsent.get("who") in speaker):
                continue
            sents = []
            for xmlword in xmlsent.findall(".//{%s}w" % NS):
                suffixStem = None
                suffixTag = None
                # getting replaced words
                # Element truth-testing is deprecated; "has children" is
                # spelled out with len() so the outcome stays the same.
                # NOTE(review): both lookups search the whole utterance
                # rather than the current word -- confirm that is intended.
                if replace:
                    replacement = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement")
                    if replacement is not None and len(replacement):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    else:
                        wk = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                        if wk is not None and len(wk):
                            xmlword = wk
                # get text
                word = xmlword.text if xmlword.text else ""
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find(".//{%s}stem" % NS)
                        word = xmlstem.text
                    except AttributeError:
                        # no stem element: keep the surface form
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(
                            f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                        )
                        word += "-" + xmlinfl.text
                    except (AttributeError, TypeError):
                        # no inflection element, or no text to concatenate
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                            % (NS, NS, NS, NS)
                        )
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError):
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                            % (NS, NS, NS, NS, NS)
                        )
                        xmlsuffixpos2 = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                            % (NS, NS, NS, NS, NS)
                        )
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except (AttributeError, IndexError, TypeError):
                        # no suffix pos information for this word
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # ``gra`` elements of type "grt" hold the gold standard and
                # extend the tuple to six fields; any other ``gra`` gives
                # the three-field (word, tag, relation) form.
                if relation == True:  # noqa: E712 -- equality test kept as-is
                    for xmlstem_rel in xmlword.findall(
                        f".//{{{NS}}}mor/{{{NS}}}gra"
                    ):
                        if not xmlstem_rel.get("type") == "grt":
                            word = (
                                word[0],
                                word[1],
                                self._gra_label(xmlstem_rel),
                            )
                        else:
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                self._gra_label(xmlstem_rel),
                            )
                    # NOTE(review): the value built below is never read
                    # afterwards; kept only to mirror the existing logic.
                    try:
                        for xmlpost_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                        ):
                            if not xmlpost_rel.get("type") == "grt":
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    self._gra_label(xmlpost_rel),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    self._gra_label(xmlpost_rel),
                                )
                    except (AttributeError, IndexError, TypeError):
                        pass
                sents.append(word)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
        return LazyMap(lambda x: x, results)

    # Ready-to-use browser opener

    #: The base URL for viewing files on the childes website. This
    #: shouldn't need to be changed, unless CHILDES changes the configuration
    #: of their server or unless the user sets up their own corpus webserver.
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="

    def webview_file(self, fileid, urlbase=None):
        """Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it.  This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks if "childes", possibly followed by
        "data-xml", appears on the path consisting of the corpus root
        plus fileid; then (as a special case) if "Eng-USA" does. If neither
        one is found, we use the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
        """

        import webbrowser

        if urlbase:
            path = urlbase + "/" + fileid
        else:
            full = self.root + "/" + fileid
            full = re.sub(r"\\", "/", full)
            if "/childes/" in full.lower():
                # Discard /data-xml/ if present
                path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
            elif "eng-usa" in full.lower():
                # The inline flag has to lead the pattern: a global flag in
                # the middle of an expression is an error on Python 3.11+.
                path = "Eng-USA/" + re.findall(r"(?i)/Eng-USA/(.*)\.xml", full)[0]
            else:
                path = fileid

        # Strip ".xml" and add ".cha", as necessary:
        if path.endswith(".xml"):
            path = path[:-4]

        if not path.endswith(".cha"):
            path = path + ".cha"

        url = self.childes_url_base + path

        webbrowser.open_new_tab(url)
        print("Opening in browser:", url)
        # Pausing is a good idea, but it's up to the user...
def demo(corpus_root=None):
    """
    Print a short tour of the first five files of a CHILDES corpus.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: path to a portion of the CHILDES XML corpus. When
        omitted, ``corpora/childes/data-xml/Eng-USA/`` is looked up with
        ``nltk.data.find``.
    """
    try:
        # The lookup sits inside the ``try`` so that a missing corpus ends
        # in the help text below instead of an unhandled LookupError.
        if not corpus_root:
            from nltk.data import find

            corpus_root = find("corpora/childes/data-xml/Eng-USA/")

        # fileids is a regexp, so the dot before "xml" is escaped
        childes = CHILDESCorpusReader(corpus_root, r".*\.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for corpora holding chunked (and optionally tagged) documents.

    A file is processed in three stages, each of which is pluggable: a
    block reader splits it into paragraphs, a sentence tokenizer splits
    each paragraph into sentences, and a string-to-chunktree function
    parses each sentence.  The defaults split paragraphs on blank lines,
    take one sentence per line, and parse with ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Trailing positional arguments handed to every corpus view built by
        # this reader: (str2chunktree, sent_tokenizer, para_block_reader,
        # tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _views(self, fileids, tagged, by_sent, by_para, chunked, **view_kwargs):
        """Build one ``ChunkedCorpusView`` per file and concatenate them."""
        views = []
        for (path, enc) in self.abspaths(fileids, True):
            views.append(
                ChunkedCorpusView(
                    path,
                    enc,
                    tagged,
                    by_sent,
                    by_para,
                    chunked,
                    *self._cv_args,
                    **view_kwargs,
                )
            )
        return concat(views)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(fileids, 1, 0, 0, 0, target_tagset=tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._views(fileids, 1, 1, 0, 0, target_tagset=tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(fileids, 1, 1, 1, 0, target_tagset=tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._views(fileids, 1, 0, 0, 1, target_tagset=tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._views(fileids, 1, 1, 0, 1, target_tagset=tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._views(fileids, 1, 1, 1, 1, target_tagset=tagset)

    def _read_block(self, stream):
        """Parse each blank-line-delimited block of ``stream`` with ``tagstr2tree``."""
        trees = []
        for text in read_blankline_block(stream):
            trees.append(tagstr2tree(text))
        return trees


class ChunkedCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view over one chunked file.

    The four flags ``tagged``, ``group_by_sent``, ``group_by_para`` and
    ``chunked`` decide whether tags and chunk structure are kept and
    whether the output is nested by sentence and by paragraph.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # output-shape flags
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # pluggable processing stages
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        # tagsets forwarded to the string-to-chunktree function
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read the next paragraph block(s) from ``stream`` and shape them per the flags."""
        block = []
        # nest when grouping is requested, otherwise splice in place
        add_para = block.append if self._group_by_para else block.extend
        for para_str in self._para_block_reader(stream):
            para = []
            add_sent = para.append if self._group_by_sent else para.extend
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )
                if not self._tagged:
                    # caller did not ask for tags
                    sent = self._untag(sent)
                if not self._chunked:
                    # caller did not ask for chunk structure
                    sent = sent.leaves()
                add_sent(sent)
            add_para(para)
        return block

    def _untag(self, tree):
        """Replace every ``(word, tag)`` leaf of ``tree`` by its word, in place."""
        for index, node in enumerate(tree):
            if isinstance(node, Tree):
                self._untag(node)
                continue
            if not isinstance(node, tuple):
                raise ValueError("expected child to be Tree or tuple")
            tree[index] = node[0]
        return tree
+ +Phonemes: There are 39 phonemes, as shown below: + +Phoneme Example Translation Phoneme Example Translation +------- ------- ----------- ------- ------- ----------- +AA odd AA D AE at AE T +AH hut HH AH T AO ought AO T +AW cow K AW AY hide HH AY D +B be B IY CH cheese CH IY Z +D dee D IY DH thee DH IY +EH Ed EH D ER hurt HH ER T +EY ate EY T F fee F IY +G green G R IY N HH he HH IY +IH it IH T IY eat IY T +JH gee JH IY K key K IY +L lee L IY M me M IY +N knee N IY NG ping P IH NG +OW oat OW T OY toy T OY +P pee P IY R read R IY D +S sea S IY SH she SH IY +T tea T IY TH theta TH EY T AH +UH hood HH UH D UW two T UW +V vee V IY W we W IY +Y yield Y IY L D Z zee Z IY +ZH seizure S IY ZH ER +""" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.util import Index + + +class CMUDictCorpusReader(CorpusReader): + def entries(self): + """ + :return: the cmudict lexicon as a list of entries + containing (word, transcriptions) tuples. + """ + return concat( + [ + StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc) + for fileid, enc in self.abspaths(None, True) + ] + ) + + def words(self): + """ + :return: a list of all words defined in the cmudict lexicon. + """ + return [word.lower() for (word, _) in self.entries()] + + def dict(self): + """ + :return: the cmudict lexicon as a dictionary, whose keys are + lowercase words and whose values are lists of pronunciations. + """ + return dict(Index(self.entries())) + + +def read_cmudict_block(stream): + entries = [] + while len(entries) < 100: # Read 100 at a time. + line = stream.readline() + if line == "": + return entries # end of file. 
+ pieces = line.split() + entries.append((pieces[0].lower(), pieces[2:])) + return entries diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py new file mode 100644 index 0000000000000000000000000000000000000000..917772ec095b266ba70b0f776d0cbd94b673ae97 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py @@ -0,0 +1,309 @@ +# Natural Language Toolkit: Comparative Sentence Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Pierpaolo Pantone <24alsecondo@gmail.com> +# URL: +# For license information, see LICENSE.TXT + +""" +CorpusReader for the Comparative Sentence Dataset. + +- Comparative Sentence Dataset information - + +Annotated by: Nitin Jindal and Bing Liu, 2006. + Department of Computer Science + University of Illinois at Chicago + +Contact: Nitin Jindal, njindal@cs.uic.edu + Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub) + +Distributed with permission. + +Related papers: + +- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents". + Proceedings of the ACM SIGIR International Conference on Information Retrieval + (SIGIR-06), 2006. + +- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations". + Proceedings of Twenty First National Conference on Artificial Intelligence + (AAAI-2006), 2006. + +- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". + Proceedings of the 22nd International Conference on Computational Linguistics + (Coling-2008), Manchester, 18-22 August, 2008. 
+""" +import re + +from nltk.corpus.reader.api import * +from nltk.tokenize import * + +# Regular expressions for dataset components +STARS = re.compile(r"^\*+$") +COMPARISON = re.compile(r"") +CLOSE_COMPARISON = re.compile(r"") +GRAD_COMPARISON = re.compile(r"") +NON_GRAD_COMPARISON = re.compile(r"") +ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") +KEYWORD = re.compile(r"\(([^\(]*)\)$") + + +class Comparison: + """ + A Comparison represents a comparative sentence and its constituents. + """ + + def __init__( + self, + text=None, + comp_type=None, + entity_1=None, + entity_2=None, + feature=None, + keyword=None, + ): + """ + :param text: a string (optionally tokenized) containing a comparison. + :param comp_type: an integer defining the type of comparison expressed. + Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative), + 4 (Non-gradable). + :param entity_1: the first entity considered in the comparison relation. + :param entity_2: the second entity considered in the comparison relation. + :param feature: the feature considered in the comparison relation. + :param keyword: the word or phrase which is used for that comparative relation. + """ + self.text = text + self.comp_type = comp_type + self.entity_1 = entity_1 + self.entity_2 = entity_2 + self.feature = feature + self.keyword = keyword + + def __repr__(self): + return ( + 'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", ' + 'feature="{}", keyword="{}")' + ).format( + self.text, + self.comp_type, + self.entity_1, + self.entity_2, + self.feature, + self.keyword, + ) + + +class ComparativeSentencesCorpusReader(CorpusReader): + """ + Reader for the Comparative Sentence Dataset by Jindal and Liu (2006). 
+ + >>> from nltk.corpus import comparative_sentences + >>> comparison = comparative_sentences.comparisons()[0] + >>> comparison.text # doctest: +NORMALIZE_WHITESPACE + ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', + 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", + 'had', '.'] + >>> comparison.entity_2 + 'models' + >>> (comparison.feature, comparison.keyword) + ('rewind', 'more') + >>> len(comparative_sentences.comparisons()) + 853 + """ + + CorpusView = StreamBackedCorpusView + + def __init__( + self, + root, + fileids, + word_tokenizer=WhitespaceTokenizer(), + sent_tokenizer=None, + encoding="utf8", + ): + """ + :param root: The root directory for this corpus. + :param fileids: a list or regexp specifying the fileids in this corpus. + :param word_tokenizer: tokenizer for breaking sentences or paragraphs + into words. Default: `WhitespaceTokenizer` + :param sent_tokenizer: tokenizer for breaking paragraphs into sentences. + :param encoding: the encoding that should be used to read the corpus. + """ + + CorpusReader.__init__(self, root, fileids, encoding) + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._readme = "README.txt" + + def comparisons(self, fileids=None): + """ + Return all comparisons in the corpus. + + :param fileids: a list or regexp specifying the ids of the files whose + comparisons have to be returned. + :return: the given file(s) as a list of Comparison objects. + :rtype: list(Comparison) + """ + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(path, self._read_comparison_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def keywords(self, fileids=None): + """ + Return a set of all keywords used in the corpus. + + :param fileids: a list or regexp specifying the ids of the files whose + keywords have to be returned. 
+ :return: the set of keywords and comparative phrases used in the corpus. + :rtype: set(str) + """ + all_keywords = concat( + [ + self.CorpusView(path, self._read_keyword_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + keywords_set = {keyword.lower() for keyword in all_keywords if keyword} + return keywords_set + + def keywords_readme(self): + """ + Return the list of words and constituents considered as clues of a + comparison (from listOfkeywords.txt). + """ + keywords = [] + with self.open("listOfkeywords.txt") as fp: + raw_text = fp.read() + for line in raw_text.split("\n"): + if not line or line.startswith("//"): + continue + keywords.append(line.strip()) + return keywords + + def sents(self, fileids=None): + """ + Return all sentences in the corpus. + + :param fileids: a list or regexp specifying the ids of the files whose + sentences have to be returned. + :return: all sentences of the corpus as lists of tokens (or as plain + strings, if no word tokenizer is specified). + :rtype: list(list(str)) or list(str) + """ + return concat( + [ + self.CorpusView(path, self._read_sent_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def words(self, fileids=None): + """ + Return all words and punctuation symbols in the corpus. + + :param fileids: a list or regexp specifying the ids of the files whose + words have to be returned. + :return: the given file(s) as a list of words and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + self.CorpusView(path, self._read_word_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def _read_comparison_block(self, stream): + while True: + line = stream.readline() + if not line: + return [] # end of file. 
+ comparison_tags = re.findall(COMPARISON, line) + if comparison_tags: + grad_comparisons = re.findall(GRAD_COMPARISON, line) + non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line) + # Advance to the next line (it contains the comparative sentence) + comparison_text = stream.readline().strip() + if self._word_tokenizer: + comparison_text = self._word_tokenizer.tokenize(comparison_text) + # Skip the next line (it contains closing comparison tags) + stream.readline() + # If gradable comparisons are found, create Comparison instances + # and populate their fields + comparison_bundle = [] + if grad_comparisons: + # Each comparison tag has its own relations on a separate line + for comp in grad_comparisons: + comp_type = int(re.match(r"", comp).group(1)) + comparison = Comparison( + text=comparison_text, comp_type=comp_type + ) + line = stream.readline() + entities_feats = ENTITIES_FEATS.findall(line) + if entities_feats: + for (code, entity_feat) in entities_feats: + if code == "1": + comparison.entity_1 = entity_feat.strip() + elif code == "2": + comparison.entity_2 = entity_feat.strip() + elif code == "3": + comparison.feature = entity_feat.strip() + keyword = KEYWORD.findall(line) + if keyword: + comparison.keyword = keyword[0] + comparison_bundle.append(comparison) + # If non-gradable comparisons are found, create a simple Comparison + # instance for each one + if non_grad_comparisons: + for comp in non_grad_comparisons: + # comp_type in this case should always be 4. 
+ comp_type = int(re.match(r"", comp).group(1)) + comparison = Comparison( + text=comparison_text, comp_type=comp_type + ) + comparison_bundle.append(comparison) + # Flatten the list of comparisons before returning them + # return concat([comparison_bundle]) + return comparison_bundle + + def _read_keyword_block(self, stream): + keywords = [] + for comparison in self._read_comparison_block(stream): + keywords.append(comparison.keyword) + return keywords + + def _read_sent_block(self, stream): + while True: + line = stream.readline() + if re.match(STARS, line): + while True: + line = stream.readline() + if re.match(STARS, line): + break + continue + if ( + not re.findall(COMPARISON, line) + and not ENTITIES_FEATS.findall(line) + and not re.findall(CLOSE_COMPARISON, line) + ): + if self._sent_tokenizer: + return [ + self._word_tokenizer.tokenize(sent) + for sent in self._sent_tokenizer.tokenize(line) + ] + else: + return [self._word_tokenizer.tokenize(line)] + + def _read_word_block(self, stream): + words = [] + for sent in self._read_sent_block(stream): + words.extend(sent) + return words diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py new file mode 100644 index 0000000000000000000000000000000000000000..57b586e1d7972ea890a70c8c445b374bb81167fc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py @@ -0,0 +1,579 @@ +# Natural Language Toolkit: CONLL Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Read CoNLL-style chunk fileids. +""" + +import textwrap + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tag import map_tag +from nltk.tree import Tree +from nltk.util import LazyConcatenation, LazyMap + + +class ConllCorpusReader(CorpusReader): + """ + A corpus reader for CoNLL-style files. 
These files consist of a + series of sentences, separated by blank lines. Each sentence is + encoded using a table (or "grid") of values, where each line + corresponds to a single word, and each column corresponds to an + annotation type. The set of columns used by CoNLL-style files can + vary from corpus to corpus; the ``ConllCorpusReader`` constructor + therefore takes an argument, ``columntypes``, which is used to + specify the columns that are used by a given corpus. By default + columns are split by consecutive whitespaces, with the + ``separator`` argument you can set a string to split by (e.g. + ``\'\t\'``). + + + @todo: Add support for reading from corpora where different + parallel files contain different columns. + @todo: Possibly add caching of the grid corpus view? This would + allow the same grid view to be used by different data access + methods (eg words() and parsed_sents() could both share the + same grid corpus view object). + @todo: Better support for -DOCSTART-. Currently, we just ignore + it, but it could be used to define methods that retrieve a + document at a time (eg parsed_documents()). + """ + + # ///////////////////////////////////////////////////////////////// + # Column Types + # ///////////////////////////////////////////////////////////////// + + WORDS = "words" #: column type for words + POS = "pos" #: column type for part-of-speech tags + TREE = "tree" #: column type for parse trees + CHUNK = "chunk" #: column type for chunk structures + NE = "ne" #: column type for named entities + SRL = "srl" #: column type for semantic role labels + IGNORE = "ignore" #: column type for column that should be ignored + + #: A list of all column types supported by the conll corpus reader. 
+ COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) + + # ///////////////////////////////////////////////////////////////// + # Constructor + # ///////////////////////////////////////////////////////////////// + + def __init__( + self, + root, + fileids, + columntypes, + chunk_types=None, + root_label="S", + pos_in_tree=False, + srl_includes_roleset=True, + encoding="utf8", + tree_class=Tree, + tagset=None, + separator=None, + ): + for columntype in columntypes: + if columntype not in self.COLUMN_TYPES: + raise ValueError("Bad column type %r" % columntype) + if isinstance(chunk_types, str): + chunk_types = [chunk_types] + self._chunk_types = chunk_types + self._colmap = {c: i for (i, c) in enumerate(columntypes)} + self._pos_in_tree = pos_in_tree + self._root_label = root_label # for chunks + self._srl_includes_roleset = srl_includes_roleset + self._tree_class = tree_class + CorpusReader.__init__(self, root, fileids, encoding) + self._tagset = tagset + self.sep = separator + + # ///////////////////////////////////////////////////////////////// + # Data Access Methods + # ///////////////////////////////////////////////////////////////// + + def words(self, fileids=None): + self._require(self.WORDS) + return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids))) + + def sents(self, fileids=None): + self._require(self.WORDS) + return LazyMap(self._get_words, self._grids(fileids)) + + def tagged_words(self, fileids=None, tagset=None): + self._require(self.WORDS, self.POS) + + def get_tagged_words(grid): + return self._get_tagged_words(grid, tagset) + + return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids))) + + def tagged_sents(self, fileids=None, tagset=None): + self._require(self.WORDS, self.POS) + + def get_tagged_words(grid): + return self._get_tagged_words(grid, tagset) + + return LazyMap(get_tagged_words, self._grids(fileids)) + + def chunked_words(self, fileids=None, chunk_types=None, tagset=None): + self._require(self.WORDS, 
self.POS, self.CHUNK) + if chunk_types is None: + chunk_types = self._chunk_types + + def get_chunked_words(grid): # capture chunk_types as local var + return self._get_chunked_words(grid, chunk_types, tagset) + + return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids))) + + def chunked_sents(self, fileids=None, chunk_types=None, tagset=None): + self._require(self.WORDS, self.POS, self.CHUNK) + if chunk_types is None: + chunk_types = self._chunk_types + + def get_chunked_words(grid): # capture chunk_types as local var + return self._get_chunked_words(grid, chunk_types, tagset) + + return LazyMap(get_chunked_words, self._grids(fileids)) + + def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None): + self._require(self.WORDS, self.POS, self.TREE) + if pos_in_tree is None: + pos_in_tree = self._pos_in_tree + + def get_parsed_sent(grid): # capture pos_in_tree as local var + return self._get_parsed_sent(grid, pos_in_tree, tagset) + + return LazyMap(get_parsed_sent, self._grids(fileids)) + + def srl_spans(self, fileids=None): + self._require(self.SRL) + return LazyMap(self._get_srl_spans, self._grids(fileids)) + + def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True): + self._require(self.WORDS, self.POS, self.TREE, self.SRL) + if pos_in_tree is None: + pos_in_tree = self._pos_in_tree + + def get_srl_instances(grid): # capture pos_in_tree as local var + return self._get_srl_instances(grid, pos_in_tree) + + result = LazyMap(get_srl_instances, self._grids(fileids)) + if flatten: + result = LazyConcatenation(result) + return result + + def iob_words(self, fileids=None, tagset=None): + """ + :return: a list of word/tag/IOB tuples + :rtype: list(tuple) + :param fileids: the list of fileids that make up this corpus + :type fileids: None or str or list + """ + self._require(self.WORDS, self.POS, self.CHUNK) + + def get_iob_words(grid): + return self._get_iob_words(grid, tagset) + + return LazyConcatenation(LazyMap(get_iob_words, 
self._grids(fileids))) + + def iob_sents(self, fileids=None, tagset=None): + """ + :return: a list of lists of word/tag/IOB tuples + :rtype: list(list) + :param fileids: the list of fileids that make up this corpus + :type fileids: None or str or list + """ + self._require(self.WORDS, self.POS, self.CHUNK) + + def get_iob_words(grid): + return self._get_iob_words(grid, tagset) + + return LazyMap(get_iob_words, self._grids(fileids)) + + # ///////////////////////////////////////////////////////////////// + # Grid Reading + # ///////////////////////////////////////////////////////////////// + + def _grids(self, fileids=None): + # n.b.: we could cache the object returned here (keyed on + # fileids), which would let us reuse the same corpus view for + # different things (eg srl and parse trees). + return concat( + [ + StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def _read_grid_block(self, stream): + grids = [] + for block in read_blankline_block(stream): + block = block.strip() + if not block: + continue + + grid = [line.split(self.sep) for line in block.split("\n")] + + # If there's a docstart row, then discard. ([xx] eventually it + # would be good to actually use it) + if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-": + del grid[0] + + # Check that the grid is consistent. + for row in grid: + if len(row) != len(grid[0]): + raise ValueError("Inconsistent number of columns:\n%s" % block) + grids.append(grid) + return grids + + # ///////////////////////////////////////////////////////////////// + # Transforms + # ///////////////////////////////////////////////////////////////// + # given a grid, transform it into some representation (e.g., + # a list of words or a parse tree). 
+ + def _get_words(self, grid): + return self._get_column(grid, self._colmap["words"]) + + def _get_tagged_words(self, grid, tagset=None): + pos_tags = self._get_column(grid, self._colmap["pos"]) + if tagset and tagset != self._tagset: + pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] + return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags)) + + def _get_iob_words(self, grid, tagset=None): + pos_tags = self._get_column(grid, self._colmap["pos"]) + if tagset and tagset != self._tagset: + pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] + return list( + zip( + self._get_column(grid, self._colmap["words"]), + pos_tags, + self._get_column(grid, self._colmap["chunk"]), + ) + ) + + def _get_chunked_words(self, grid, chunk_types, tagset=None): + # n.b.: this method is very similar to conllstr2tree. + words = self._get_column(grid, self._colmap["words"]) + pos_tags = self._get_column(grid, self._colmap["pos"]) + if tagset and tagset != self._tagset: + pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] + chunk_tags = self._get_column(grid, self._colmap["chunk"]) + + stack = [Tree(self._root_label, [])] + + for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): + if chunk_tag == "O": + state, chunk_type = "O", "" + else: + (state, chunk_type) = chunk_tag.split("-") + # If it's a chunk we don't care about, treat it as O. + if chunk_types is not None and chunk_type not in chunk_types: + state = "O" + # Treat a mismatching I like a B. + if state == "I" and chunk_type != stack[-1].label(): + state = "B" + # For B or I: close any open chunks + if state in "BO" and len(stack) == 2: + stack.pop() + # For B: start a new chunk. + if state == "B": + new_chunk = Tree(chunk_type, []) + stack[-1].append(new_chunk) + stack.append(new_chunk) + # Add the word token. 
+ stack[-1].append((word, pos_tag)) + + return stack[0] + + def _get_parsed_sent(self, grid, pos_in_tree, tagset=None): + words = self._get_column(grid, self._colmap["words"]) + pos_tags = self._get_column(grid, self._colmap["pos"]) + if tagset and tagset != self._tagset: + pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] + parse_tags = self._get_column(grid, self._colmap["tree"]) + + treestr = "" + for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): + if word == "(": + word = "-LRB-" + if word == ")": + word = "-RRB-" + if pos_tag == "(": + pos_tag = "-LRB-" + if pos_tag == ")": + pos_tag = "-RRB-" + (left, right) = parse_tag.split("*") + right = right.count(")") * ")" # only keep ')'. + treestr += f"{left} ({pos_tag} {word}) {right}" + try: + tree = self._tree_class.fromstring(treestr) + except (ValueError, IndexError): + tree = self._tree_class.fromstring(f"({self._root_label} {treestr})") + + if not pos_in_tree: + for subtree in tree.subtrees(): + for i, child in enumerate(subtree): + if ( + isinstance(child, Tree) + and len(child) == 1 + and isinstance(child[0], str) + ): + subtree[i] = (child[0], child.label()) + + return tree + + def _get_srl_spans(self, grid): + """ + list of list of (start, end), tag) tuples + """ + if self._srl_includes_roleset: + predicates = self._get_column(grid, self._colmap["srl"] + 1) + start_col = self._colmap["srl"] + 2 + else: + predicates = self._get_column(grid, self._colmap["srl"]) + start_col = self._colmap["srl"] + 1 + + # Count how many predicates there are. This tells us how many + # columns to expect for SRL data. 
+ num_preds = len([p for p in predicates if p != "-"]) + + spanlists = [] + for i in range(num_preds): + col = self._get_column(grid, start_col + i) + spanlist = [] + stack = [] + for wordnum, srl_tag in enumerate(col): + (left, right) = srl_tag.split("*") + for tag in left.split("("): + if tag: + stack.append((tag, wordnum)) + for i in range(right.count(")")): + (tag, start) = stack.pop() + spanlist.append(((start, wordnum + 1), tag)) + spanlists.append(spanlist) + + return spanlists + + def _get_srl_instances(self, grid, pos_in_tree): + tree = self._get_parsed_sent(grid, pos_in_tree) + spanlists = self._get_srl_spans(grid) + if self._srl_includes_roleset: + predicates = self._get_column(grid, self._colmap["srl"] + 1) + rolesets = self._get_column(grid, self._colmap["srl"]) + else: + predicates = self._get_column(grid, self._colmap["srl"]) + rolesets = [None] * len(predicates) + + instances = ConllSRLInstanceList(tree) + for wordnum, predicate in enumerate(predicates): + if predicate == "-": + continue + # Decide which spanlist to use. Don't assume that they're + # sorted in the same order as the predicates (even though + # they usually are). + for spanlist in spanlists: + for (start, end), tag in spanlist: + if wordnum in range(start, end) and tag in ("V", "C-V"): + break + else: + continue + break + else: + raise ValueError("No srl column found for %r" % predicate) + instances.append( + ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist) + ) + + return instances + + # ///////////////////////////////////////////////////////////////// + # Helper Methods + # ///////////////////////////////////////////////////////////////// + + def _require(self, *columntypes): + for columntype in columntypes: + if columntype not in self._colmap: + raise ValueError( + "This corpus does not contain a %s " "column." 
% columntype + ) + + @staticmethod + def _get_column(grid, column_index): + return [grid[i][column_index] for i in range(len(grid))] + + +class ConllSRLInstance: + """ + An SRL instance from a CoNLL corpus, which identifies and + providing labels for the arguments of a single verb. + """ + + # [xx] add inst.core_arguments, inst.argm_arguments? + + def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans): + self.verb = [] + """A list of the word indices of the words that compose the + verb whose arguments are identified by this instance. + This will contain multiple word indices when multi-word + verbs are used (e.g. 'turn on').""" + + self.verb_head = verb_head + """The word index of the head word of the verb whose arguments + are identified by this instance. E.g., for a sentence that + uses the verb 'turn on,' ``verb_head`` will be the word index + of the word 'turn'.""" + + self.verb_stem = verb_stem + + self.roleset = roleset + + self.arguments = [] + """A list of ``(argspan, argid)`` tuples, specifying the location + and type for each of the arguments identified by this + instance. ``argspan`` is a tuple ``start, end``, indicating + that the argument consists of the ``words[start:end]``.""" + + self.tagged_spans = tagged_spans + """A list of ``(span, id)`` tuples, specifying the location and + type for each of the arguments, as well as the verb pieces, + that make up this instance.""" + + self.tree = tree + """The parse tree for the sentence containing this instance.""" + + self.words = tree.leaves() + """A list of the words in the sentence containing this + instance.""" + + # Fill in the self.verb and self.arguments values. 
+ for (start, end), tag in tagged_spans: + if tag in ("V", "C-V"): + self.verb += list(range(start, end)) + else: + self.arguments.append(((start, end), tag)) + + def __repr__(self): + # Originally, its: + ##plural = 's' if len(self.arguments) != 1 else '' + plural = "s" if len(self.arguments) != 1 else "" + return "" % ( + (self.verb_stem, len(self.arguments), plural) + ) + + def pprint(self): + verbstr = " ".join(self.words[i][0] for i in self.verb) + hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n" + s = "" + for i, word in enumerate(self.words): + if isinstance(word, tuple): + word = word[0] + for (start, end), argid in self.arguments: + if i == start: + s += "[%s " % argid + if i == end: + s += "] " + if i in self.verb: + word = "<<%s>>" % word + s += word + " " + return hdr + textwrap.fill( + s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" " + ) + + +class ConllSRLInstanceList(list): + """ + Set of instances for a single sentence + """ + + def __init__(self, tree, instances=()): + self.tree = tree + list.__init__(self, instances) + + def __str__(self): + return self.pprint() + + def pprint(self, include_tree=False): + # Sanity check: trees should be the same + for inst in self: + if inst.tree != self.tree: + raise ValueError("Tree mismatch!") + + # If desired, add trees: + if include_tree: + words = self.tree.leaves() + pos = [None] * len(words) + synt = ["*"] * len(words) + self._tree2conll(self.tree, 0, words, pos, synt) + + s = "" + for i in range(len(words)): + # optional tree columns + if include_tree: + s += "%-20s " % words[i] + s += "%-8s " % pos[i] + s += "%15s*%-8s " % tuple(synt[i].split("*")) + + # verb head column + for inst in self: + if i == inst.verb_head: + s += "%-20s " % inst.verb_stem + break + else: + s += "%-20s " % "-" + # Remaining columns: self + for inst in self: + argstr = "*" + for (start, end), argid in inst.tagged_spans: + if i == start: + argstr = f"({argid}{argstr}" + if i == (end - 1): + argstr += ")" + 
s += "%-12s " % argstr + s += "\n" + return s + + def _tree2conll(self, tree, wordnum, words, pos, synt): + assert isinstance(tree, Tree) + if len(tree) == 1 and isinstance(tree[0], str): + pos[wordnum] = tree.label() + assert words[wordnum] == tree[0] + return wordnum + 1 + elif len(tree) == 1 and isinstance(tree[0], tuple): + assert len(tree[0]) == 2 + pos[wordnum], pos[wordnum] = tree[0] + return wordnum + 1 + else: + synt[wordnum] = f"({tree.label()}{synt[wordnum]}" + for child in tree: + wordnum = self._tree2conll(child, wordnum, words, pos, synt) + synt[wordnum - 1] += ")" + return wordnum + + +class ConllChunkCorpusReader(ConllCorpusReader): + """ + A ConllCorpusReader whose data file contains three columns: words, + pos, and chunk. + """ + + def __init__( + self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None + ): + ConllCorpusReader.__init__( + self, + root, + fileids, + ("words", "pos", "chunk"), + chunk_types=chunk_types, + encoding=encoding, + tagset=tagset, + separator=separator, + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py new file mode 100644 index 0000000000000000000000000000000000000000..3e0f2750e7336b328e70c4fa2f346049caa9184c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py @@ -0,0 +1,106 @@ +# Natural Language Toolkit: An Crubadan N-grams Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Avital Pekker +# +# URL: +# For license information, see LICENSE.TXT + +""" +An NLTK interface for the n-gram statistics gathered from +the corpora for each language using An Crubadan. + +There are multiple potential applications for the data but +this reader was created with the goal of using it in the +context of language identification. 
+ +For details about An Crubadan, this data, and its potential uses, see: +http://borel.slu.edu/crubadan/index.html +""" + +import re +from os import path + +from nltk.corpus.reader import CorpusReader +from nltk.data import ZipFilePathPointer +from nltk.probability import FreqDist + + +class CrubadanCorpusReader(CorpusReader): + """ + A corpus reader used to access language An Crubadan n-gram files. + """ + + _LANG_MAPPER_FILE = "table.txt" + _all_lang_freq = {} + + def __init__(self, root, fileids, encoding="utf8", tagset=None): + super().__init__(root, fileids, encoding="utf8") + self._lang_mapping_data = [] + self._load_lang_mapping_data() + + def lang_freq(self, lang): + """Return n-gram FreqDist for a specific language + given ISO 639-3 language code""" + + if lang not in self._all_lang_freq: + self._all_lang_freq[lang] = self._load_lang_ngrams(lang) + + return self._all_lang_freq[lang] + + def langs(self): + """Return a list of supported languages as ISO 639-3 codes""" + return [row[1] for row in self._lang_mapping_data] + + def iso_to_crubadan(self, lang): + """Return internal Crubadan code based on ISO 639-3 code""" + for i in self._lang_mapping_data: + if i[1].lower() == lang.lower(): + return i[0] + + def crubadan_to_iso(self, lang): + """Return ISO 639-3 code given internal Crubadan code""" + for i in self._lang_mapping_data: + if i[0].lower() == lang.lower(): + return i[1] + + def _load_lang_mapping_data(self): + """Load language mappings between codes and description from table.txt""" + if isinstance(self.root, ZipFilePathPointer): + raise RuntimeError( + "Please install the 'crubadan' corpus first, use nltk.download()" + ) + + mapper_file = path.join(self.root, self._LANG_MAPPER_FILE) + if self._LANG_MAPPER_FILE not in self.fileids(): + raise RuntimeError("Could not find language mapper file: " + mapper_file) + + with open(mapper_file, encoding="utf-8") as raw: + strip_raw = raw.read().strip() + + self._lang_mapping_data = [row.split("\t") for row in 
strip_raw.split("\n")] + + def _load_lang_ngrams(self, lang): + """Load single n-gram language file given the ISO 639-3 language code + and return its FreqDist""" + + if lang not in self.langs(): + raise RuntimeError("Unsupported language.") + + crubadan_code = self.iso_to_crubadan(lang) + ngram_file = path.join(self.root, crubadan_code + "-3grams.txt") + + if not path.isfile(ngram_file): + raise RuntimeError("No N-gram file found for requested language.") + + counts = FreqDist() + with open(ngram_file, encoding="utf-8") as f: + for line in f: + data = line.split(" ") + + ngram = data[1].strip("\n") + freq = int(data[0]) + + counts[ngram] = freq + + return counts diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py new file mode 100644 index 0000000000000000000000000000000000000000..30aedf7d1ee3cd4e2e3276ee4f5c8816b9d353d5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py @@ -0,0 +1,115 @@ +# Natural Language Toolkit: Dependency Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Kepa Sarasola +# Iker Manterola +# +# URL: +# For license information, see LICENSE.TXT + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.parse import DependencyGraph +from nltk.tokenize import * + + +class DependencyCorpusReader(SyntaxCorpusReader): + def __init__( + self, + root, + fileids, + encoding="utf8", + word_tokenizer=TabTokenizer(), + sent_tokenizer=RegexpTokenizer("\n", gaps=True), + para_block_reader=read_blankline_block, + ): + SyntaxCorpusReader.__init__(self, root, fileids, encoding) + + ######################################################### + + def words(self, fileids=None): + return concat( + [ + DependencyCorpusView(fileid, False, False, False, encoding=enc) + for fileid, enc in self.abspaths(fileids, include_encoding=True) + ] + ) + + def tagged_words(self, fileids=None): + return concat( + [ + 
class DependencyCorpusView(StreamBackedCorpusView):
    """
    Corpus view that reads one blank-line-delimited sentence per block and
    returns it as words, (word, tag) pairs, or the raw sentence text,
    depending on the flags given at construction time.
    """

    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # defines the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        """
        :param corpus_file: File to read, forwarded to the base view.
        :param tagged: If true, keep (word, tag) pairs; otherwise only words.
        :param group_by_sent: If true, each block yields one sentence item;
            otherwise the sentence's elements are yielded individually.
        :param dependencies: If true, the sentence text is returned without
            being split into fields.
        :param chunk_types: Stored on the instance; not used by this class.
        :param encoding: Encoding forwarded to the base view.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read the next sentence from ``stream``.

        :return: A list of tokens for this block; empty when no sentence
            text is available.
        :raises ValueError: if the first line of the sentence does not have
            3, 4 or 10 tab-separated fields (only when fields are parsed).
        """
        # Read the next sentence.  Guard against an empty result rather than
        # indexing blindly (presumably returned when only blank lines remain
        # before end of file -- verify against read_blankline_block).
        blocks = read_blankline_block(stream)
        if not blocks:
            return []
        sent = blocks[0].strip()

        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # Nothing left to tokenize: contribute no tokens instead of raising.
        if not sent:
            return []

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            field_count = len(lines[0])
            if field_count in (3, 4):
                sent = [(line[0], line[1]) for line in lines]
            elif field_count == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        return list(sent)
+ """ + l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n") + yield l0 + + def _(line): + il0 = 0 + while line and il0 < len(l0) - 1: + yield line[: len(l0[il0])] + line = line[len(l0[il0]) :] + il0 += 1 + if line: # Remaining stuff on this line past the end of the mimicked line. + # So just textwrap this line. + yield from textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n") + + for l in lines[1:]: + yield list(_(l)) + + +def _pretty_longstring(defstr, prefix="", wrap_at=65): + + """ + Helper function for pretty-printing a long string. + + :param defstr: The string to be printed. + :type defstr: str + :return: A nicely formatted string representation of the long string. + :rtype: str + """ + + outstr = "" + for line in textwrap.fill(defstr, wrap_at).split("\n"): + outstr += prefix + line + "\n" + return outstr + + +def _pretty_any(obj): + + """ + Helper function for pretty-printing any AttrDict object. + + :param obj: The obj to be printed. + :type obj: AttrDict + :return: A nicely formatted string representation of the AttrDict object. + :rtype: str + """ + + outstr = "" + for k in obj: + if isinstance(obj[k], str) and len(obj[k]) > 65: + outstr += f"[{k}]\n" + outstr += "{}".format(_pretty_longstring(obj[k], prefix=" ")) + outstr += "\n" + else: + outstr += f"[{k}] {obj[k]}\n" + + return outstr + + +def _pretty_semtype(st): + + """ + Helper function for pretty-printing a semantic type. + + :param st: The semantic type to be printed. + :type st: AttrDict + :return: A nicely formatted string representation of the semantic type. 
+ :rtype: str + """ + + semkeys = st.keys() + if len(semkeys) == 1: + return "" + + outstr = "" + outstr += "semantic type ({0.ID}): {0.name}\n".format(st) + if "abbrev" in semkeys: + outstr += f"[abbrev] {st.abbrev}\n" + if "definition" in semkeys: + outstr += "[definition]\n" + outstr += _pretty_longstring(st.definition, " ") + outstr += f"[rootType] {st.rootType.name}({st.rootType.ID})\n" + if st.superType is None: + outstr += "[superType] \n" + else: + outstr += f"[superType] {st.superType.name}({st.superType.ID})\n" + outstr += f"[subTypes] {len(st.subTypes)} subtypes\n" + outstr += ( + " " + + ", ".join(f"{x.name}({x.ID})" for x in st.subTypes) + + "\n" * (len(st.subTypes) > 0) + ) + return outstr + + +def _pretty_frame_relation_type(freltyp): + + """ + Helper function for pretty-printing a frame relation type. + + :param freltyp: The frame relation type to be printed. + :type freltyp: AttrDict + :return: A nicely formatted string representation of the frame relation type. + :rtype: str + """ + outstr = " {0.subFrameName}>".format( + freltyp + ) + return outstr + + +def _pretty_frame_relation(frel): + + """ + Helper function for pretty-printing a frame relation. + + :param frel: The frame relation to be printed. + :type frel: AttrDict + :return: A nicely formatted string representation of the frame relation. + :rtype: str + """ + outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format( + frel + ) + return outstr + + +def _pretty_fe_relation(ferel): + + """ + Helper function for pretty-printing an FE relation. + + :param ferel: The FE relation to be printed. + :type ferel: AttrDict + :return: A nicely formatted string representation of the FE relation. 
+ :rtype: str + """ + outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format( + ferel + ) + return outstr + + +def _pretty_lu(lu): + + """ + Helper function for pretty-printing a lexical unit. + + :param lu: The lu to be printed. + :type lu: AttrDict + :return: A nicely formatted string representation of the lexical unit. + :rtype: str + """ + + lukeys = lu.keys() + outstr = "" + outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu) + if "definition" in lukeys: + outstr += "[definition]\n" + outstr += _pretty_longstring(lu.definition, " ") + if "frame" in lukeys: + outstr += f"\n[frame] {lu.frame.name}({lu.frame.ID})\n" + if "incorporatedFE" in lukeys: + outstr += f"\n[incorporatedFE] {lu.incorporatedFE}\n" + if "POS" in lukeys: + outstr += f"\n[POS] {lu.POS}\n" + if "status" in lukeys: + outstr += f"\n[status] {lu.status}\n" + if "totalAnnotated" in lukeys: + outstr += f"\n[totalAnnotated] {lu.totalAnnotated} annotated examples\n" + if "lexemes" in lukeys: + outstr += "\n[lexemes] {}\n".format( + " ".join(f"{lex.name}/{lex.POS}" for lex in lu.lexemes) + ) + if "semTypes" in lukeys: + outstr += f"\n[semTypes] {len(lu.semTypes)} semantic types\n" + outstr += ( + " " * (len(lu.semTypes) > 0) + + ", ".join(f"{x.name}({x.ID})" for x in lu.semTypes) + + "\n" * (len(lu.semTypes) > 0) + ) + if "URL" in lukeys: + outstr += f"\n[URL] {lu.URL}\n" + if "subCorpus" in lukeys: + subc = [x.name for x in lu.subCorpus] + outstr += f"\n[subCorpus] {len(lu.subCorpus)} subcorpora\n" + for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"): + outstr += f" {line}\n" + if "exemplars" in lukeys: + outstr += "\n[exemplars] {} sentences across all subcorpora\n".format( + len(lu.exemplars) + ) + + return outstr + + +def _pretty_exemplars(exemplars, lu): + """ + Helper function for pretty-printing a list of exemplar sentences for a lexical unit. 
+ + :param sent: The list of exemplar sentences to be printed. + :type sent: list(AttrDict) + :return: An index of the text of the exemplar sentences. + :rtype: str + """ + + outstr = "" + outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu) + for i, sent in enumerate(exemplars): + outstr += f"[{i}] {sent.text}\n" + outstr += "\n" + return outstr + + +def _pretty_fulltext_sentences(sents): + """ + Helper function for pretty-printing a list of annotated sentences for a full-text document. + + :param sent: The list of sentences to be printed. + :type sent: list(AttrDict) + :return: An index of the text of the sentences. + :rtype: str + """ + + outstr = "" + outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents) + outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format( + sents + ) + outstr += f"[sentence]\n" + for i, sent in enumerate(sents.sentence): + outstr += f"[{i}] {sent.text}\n" + outstr += "\n" + return outstr + + +def _pretty_fulltext_sentence(sent): + """ + Helper function for pretty-printing an annotated sentence from a full-text document. + + :param sent: The sentence to be printed. + :type sent: list(AttrDict) + :return: The text of the sentence with annotation set indices on frame targets. + :rtype: str + """ + + outstr = "" + outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format( + sent, sent.doc.get("name", sent.doc.description) + ) + outstr += f"\n[POS] {len(sent.POS)} tags\n" + outstr += f"\n[POS_tagset] {sent.POS_tagset}\n\n" + outstr += "[text] + [annotationSet]\n\n" + outstr += sent._ascii() # -> _annotation_ascii() + outstr += "\n" + return outstr + + +def _pretty_pos(aset): + """ + Helper function for pretty-printing a sentence with its POS tags. + + :param aset: The POS annotation set of the sentence to be printed. + :type sent: list(AttrDict) + :return: The text of the sentence and its POS tags. 
+ :rtype: str + """ + + outstr = "" + outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format( + aset + ) + + # list the target spans and their associated aset index + overt = sorted(aset.POS) + + sent = aset.sent + s0 = sent.text + s1 = "" + s2 = "" + i = 0 + adjust = 0 + for j, k, lbl in overt: + assert j >= i, ("Overlapping targets?", (j, k, lbl)) + s1 += " " * (j - i) + "-" * (k - j) + if len(lbl) > (k - j): + # add space in the sentence to make room for the annotation index + amt = len(lbl) - (k - j) + s0 = ( + s0[: k + adjust] + "~" * amt + s0[k + adjust :] + ) # '~' to prevent line wrapping + s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] + adjust += amt + s2 += " " * (j - i) + lbl.ljust(k - j) + i = k + + long_lines = [s0, s1, s2] + + outstr += "\n\n".join( + map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) + ).replace("~", " ") + outstr += "\n" + return outstr + + +def _pretty_annotation(sent, aset_level=False): + """ + Helper function for pretty-printing an exemplar sentence for a lexical unit. + + :param sent: An annotation set or exemplar sentence to be printed. + :param aset_level: If True, 'sent' is actually an annotation set within a sentence. + :type sent: AttrDict + :return: A nicely formatted string representation of the exemplar sentence + with its target, frame, and FE annotations. + :rtype: str + """ + + sentkeys = sent.keys() + outstr = "annotation set" if aset_level else "exemplar sentence" + outstr += f" ({sent.ID}):\n" + if aset_level: # TODO: any UNANN exemplars? + outstr += f"\n[status] {sent.status}\n" + for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"): + if k in sentkeys: + outstr += f"[{k}] {sent[k]}\n" + outstr += ( + "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) + if sent.LU + else "\n[LU] Not found!" 
+ ) + outstr += "\n[frame] ({0.ID}) {0.name}\n".format( + sent.frame + ) # redundant with above, but .frame is convenient + if not aset_level: + outstr += "\n[annotationSet] {} annotation sets\n".format( + len(sent.annotationSet) + ) + outstr += f"\n[POS] {len(sent.POS)} tags\n" + outstr += f"\n[POS_tagset] {sent.POS_tagset}\n" + outstr += "\n[GF] {} relation{}\n".format( + len(sent.GF), "s" if len(sent.GF) != 1 else "" + ) + outstr += "\n[PT] {} phrase{}\n".format( + len(sent.PT), "s" if len(sent.PT) != 1 else "" + ) + """ + Special Layers + -------------- + + The 'NER' layer contains, for some of the data, named entity labels. + + The 'WSL' (word status layer) contains, for some of the data, + spans which should not in principle be considered targets (NT). + + The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent), + pleonastic 'it' (Null), and existential 'there' (Exist). + On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml). + + The 'Sent' layer appears to contain labels that the annotator has flagged the + sentence with for their convenience: values include + 'sense1', 'sense2', 'sense3', etc.; + 'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent', + 'keepS', 'deleteS', 'reexamine' + (sometimes they are duplicated for no apparent reason). + + The POS-specific layers may contain the following kinds of spans: + Asp (aspectual particle), Non-Asp (non-aspectual particle), + Cop (copula), Supp (support), Ctrlr (controller), + Gov (governor), X. Gov and X always cooccur. + + >>> from nltk.corpus import framenet as fn + >>> def f(luRE, lyr, ignore=set()): + ... for i,ex in enumerate(fn.exemplars(luRE)): + ... if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore: + ... 
def _annotation_ascii(sent):
    """
    Build the width-limited ASCII visualization of the annotations on a
    sentence or FE annotation set.

    Dispatches to ``_annotation_ascii_frames()`` for full-text sentences and
    for sentences with more than two annotation sets, and to
    ``_annotation_ascii_FEs()`` otherwise.  This will be attached as a
    method to appropriate AttrDict instances and called in the full
    pretty-printing of the instance.
    """
    # a full-text sentence
    if sent._type == "fulltext_sentence":
        return _annotation_ascii_frames(sent)

    # a sentence with multiple targets
    # (multiple targets = >2 annotation sets, because the first annotation set is POS.)
    if "annotationSet" in sent and len(sent.annotationSet) > 2:
        return _annotation_ascii_frames(sent)

    # an FE annotation set, or an LU sentence with 1 target
    return _annotation_ascii_FEs(sent)
+ Line-wrapped to limit the display width. + """ + # list the target spans and their associated aset index + overt = [] + for a, aset in enumerate(sent.annotationSet[1:]): + for j, k in aset.Target: + indexS = f"[{a + 1}]" + if aset.status == "UNANN" or aset.LU.status == "Problem": + indexS += " " + if aset.status == "UNANN": + indexS += "!" # warning indicator that there is a frame annotation but no FE annotation + if aset.LU.status == "Problem": + indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status) + overt.append((j, k, aset.LU.frame.name, indexS)) + overt = sorted(overt) + + duplicates = set() + for o, (j, k, fname, asetIndex) in enumerate(overt): + if o > 0 and j <= overt[o - 1][1]: + # multiple annotation sets on the same target + # (e.g. due to a coordination construction or multiple annotators) + if ( + overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname + ): # same target, same frame + # splice indices together + combinedIndex = ( + overt[o - 1][3] + asetIndex + ) # e.g., '[1][2]', '[1]! [2]' + combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ") + overt[o - 1] = overt[o - 1][:3] + (combinedIndex,) + duplicates.add(o) + else: # different frames, same or overlapping targets + s = sent.text + for j, k, fname, asetIndex in overt: + s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname + s += "\n(Unable to display sentence with targets marked inline due to overlap)" + return s + for o in reversed(sorted(duplicates)): + del overt[o] + + s0 = sent.text + s1 = "" + s11 = "" + s2 = "" + i = 0 + adjust = 0 + fAbbrevs = OrderedDict() + for j, k, fname, asetIndex in overt: + if not j >= i: + assert j >= i, ( + "Overlapping targets?" 
+ + ( + " UNANN" + if any(aset.status == "UNANN" for aset in sent.annotationSet[1:]) + else "" + ), + (j, k, asetIndex), + ) + s1 += " " * (j - i) + "*" * (k - j) + short = fname[: k - j] + if (k - j) < len(fname): + r = 0 + while short in fAbbrevs: + if fAbbrevs[short] == fname: + break + r += 1 + short = fname[: k - j - 1] + str(r) + else: # short not in fAbbrevs + fAbbrevs[short] = fname + s11 += " " * (j - i) + short.ljust(k - j) + if len(asetIndex) > (k - j): + # add space in the sentence to make room for the annotation index + amt = len(asetIndex) - (k - j) + s0 = ( + s0[: k + adjust] + "~" * amt + s0[k + adjust :] + ) # '~' to prevent line wrapping + s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] + s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :] + adjust += amt + s2 += " " * (j - i) + asetIndex.ljust(k - j) + i = k + + long_lines = [s0, s1, s11, s2] + + outstr = "\n\n".join( + map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) + ).replace("~", " ") + outstr += "\n" + if fAbbrevs: + outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")" + assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash" + + return outstr + + +def _annotation_ascii_FE_layer(overt, ni, feAbbrevs): + """Helper for _annotation_ascii_FEs().""" + s1 = "" + s2 = "" + i = 0 + for j, k, fename in overt: + s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j) + short = fename[: k - j] + if len(fename) > len(short): + r = 0 + while short in feAbbrevs: + if feAbbrevs[short] == fename: + break + r += 1 + short = fename[: k - j - 1] + str(r) + else: # short not in feAbbrevs + feAbbrevs[short] = fename + s2 += " " * (j - i) + short.ljust(k - j) + i = k + + sNI = "" + if ni: + sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]" + return [s1, s2, sNI] + + +def _annotation_ascii_FEs(sent): + """ + ASCII string rendering of the sentence along with a single target and its FEs. 
+ Secondary and tertiary FE layers are included if present. + 'sent' can be an FE annotation set or an LU sentence with a single target. + Line-wrapped to limit the display width. + """ + feAbbrevs = OrderedDict() + posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula]) + posspec_separate = False + for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"): + if lyr in sent and sent[lyr]: + for a, b, lbl in sent[lyr]: + if ( + lbl == "X" + ): # skip this, which covers an entire phrase typically containing the target and all its FEs + # (but do display the Gov) + continue + if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b): + # overlap between one of the POS-specific layers and first FE layer + posspec_separate = ( + True # show POS-specific layers on a separate line + ) + posspec.append( + (a, b, lbl.lower().replace("-", "")) + ) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names + if posspec_separate: + POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs) + FE1 = _annotation_ascii_FE_layer( + sorted(sent.FE[0] + (posspec if not posspec_separate else [])), + sent.FE[1], + feAbbrevs, + ) + FE2 = FE3 = None + if "FE2" in sent: + FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs) + if "FE3" in sent: + FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs) + + for i, j in sent.Target: + FE1span, FE1name, FE1exp = FE1 + if len(FE1span) < j: + FE1span += " " * (j - len(FE1span)) + if len(FE1name) < j: + FE1name += " " * (j - len(FE1name)) + FE1[1] = FE1name + FE1[0] = ( + FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:] + ) + long_lines = [sent.text] + if posspec_separate: + long_lines.extend(POSSPEC[:2]) + long_lines.extend([FE1[0], FE1[1] + FE1[2]]) # lines with no length limit + if FE2: + long_lines.extend([FE2[0], FE2[1] + FE2[2]]) + if FE3: + long_lines.extend([FE3[0], FE3[1] + FE3[2]]) + long_lines.append("") + outstr = "\n".join( + 
def _pretty_frame(frame):
    """
    Helper function for pretty-printing a frame.

    Prints, in order: ID and name, URL, definition, semantic types, frame
    relations, lexical units, frame elements grouped by core type, and the
    frame element core sets.

    :param frame: The frame to be printed.
    :type frame: AttrDict
    :return: A nicely formatted string representation of the frame.
    :rtype: str
    """
    # Display order for the frame-element groups.
    core_type_order = ("Core", "Core-Unexpressed", "Peripheral", "Extra-Thematic")

    def core_type_rank(core_type):
        # Core types outside the list sort after the known ones instead of
        # aborting the whole printout with ValueError.
        try:
            return core_type_order.index(core_type)
        except ValueError:
            return len(core_type_order)

    parts = []
    parts.append("frame ({0.ID}): {0.name}\n\n".format(frame))
    parts.append(f"[URL] {frame.URL}\n\n")
    parts.append("[definition]\n")
    parts.append(_pretty_longstring(frame.definition, " ") + "\n")

    parts.append(f"[semTypes] {len(frame.semTypes)} semantic types\n")
    # Indent and newline are emitted only when there is at least one type.
    has_sem_types = len(frame.semTypes) > 0
    parts.append(
        " " * has_sem_types
        + ", ".join(f"{x.name}({x.ID})" for x in frame.semTypes)
        + "\n" * has_sem_types
    )

    parts.append(
        "\n[frameRelations] {} frame relations\n".format(len(frame.frameRelations))
    )
    parts.append(
        " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n"
    )

    parts.append(f"\n[lexUnit] {len(frame.lexUnit)} lexical units\n")
    lustrs = [f"{luName} ({lu.ID})" for luName, lu in sorted(frame.lexUnit.items())]
    parts.append("{}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" ")))

    parts.append(f"\n[FE] {len(frame.FE)} frame elements\n")
    fes = {}
    for feName, fe in sorted(frame.FE.items()):
        fes.setdefault(fe.coreType, []).append(f"{feName} ({fe.ID})")
    for ct in sorted(fes, key=core_type_rank):
        parts.append("{:>16}: {}\n".format(ct, ", ".join(sorted(fes[ct]))))

    parts.append(
        "\n[FEcoreSets] {} frame element core sets\n".format(len(frame.FEcoreSets))
    )
    parts.append(
        " "
        + "\n ".join(
            ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets
        )
        + "\n"
    )

    return "".join(parts)
Taken from here: + https://stackoverflow.com/a/14620633/8879 + + >>> foo = {'a':1, 'b':2, 'c':3} + >>> bar = AttrDict(foo) + >>> pprint(dict(bar)) + {'a': 1, 'b': 2, 'c': 3} + >>> bar.b + 2 + >>> bar.d = 4 + >>> pprint(dict(bar)) + {'a': 1, 'b': 2, 'c': 3, 'd': 4} + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # self.__dict__ = self + + def __setattr__(self, name, value): + self[name] = value + + def __getattr__(self, name): + if name == "_short_repr": + return self._short_repr + return self[name] + + def __getitem__(self, name): + v = super().__getitem__(name) + if isinstance(v, Future): + return v._data() + return v + + def _short_repr(self): + if "_type" in self: + if self["_type"].endswith("relation"): + return self.__repr__() + try: + return "<{} ID={} name={}>".format( + self["_type"], self["ID"], self["name"] + ) + except KeyError: + try: # no ID--e.g., for _type=lusubcorpus + return "<{} name={}>".format(self["_type"], self["name"]) + except KeyError: # no name--e.g., for _type=lusentence + return "<{} ID={}>".format(self["_type"], self["ID"]) + else: + return self.__repr__() + + def _str(self): + outstr = "" + + if "_type" not in self: + outstr = _pretty_any(self) + elif self["_type"] == "frame": + outstr = _pretty_frame(self) + elif self["_type"] == "fe": + outstr = _pretty_fe(self) + elif self["_type"] == "lu": + outstr = _pretty_lu(self) + elif self["_type"] == "luexemplars": # list of ALL exemplars for LU + outstr = _pretty_exemplars(self, self[0].LU) + elif ( + self["_type"] == "fulltext_annotation" + ): # list of all sentences for full-text doc + outstr = _pretty_fulltext_sentences(self) + elif self["_type"] == "lusentence": + outstr = _pretty_annotation(self) + elif self["_type"] == "fulltext_sentence": + outstr = _pretty_fulltext_sentence(self) + elif self["_type"] in ("luannotationset", "fulltext_annotationset"): + outstr = _pretty_annotation(self, aset_level=True) + elif self["_type"] == "posannotationset": + 
class Future:
    """
    Wraps and acts as a proxy for a value to be loaded lazily (on demand).
    Adapted from https://gist.github.com/sergey-miryanov/2935416

    The loader is called at most once, the first time the value is needed;
    length, truthiness, item access, attribute access, ``str`` and ``repr``
    are all forwarded to the loaded value.
    """

    def __init__(self, loader, *args, **kwargs):
        """
        :param loader: when called with no arguments, returns the value to be stored
        :type loader: callable
        """
        super().__init__(*args, **kwargs)
        self._loader = loader
        self._d = None

    def _data(self):
        """Return the wrapped value, invoking the loader on first use."""
        if callable(self._loader):
            self._d = self._loader()
            self._loader = None  # the data is now cached
        return self._d

    def __bool__(self):
        # Python 3 consults __bool__; without it, truthiness fell back to
        # __len__ and failed for loaded values that have no length.
        return bool(self._data())

    # Python 2 spelling, kept so existing explicit callers still work.
    __nonzero__ = __bool__

    def __len__(self):
        return len(self._data())

    def __setitem__(self, key, value):
        return self._data().__setitem__(key, value)

    def __getitem__(self, key):
        return self._data().__getitem__(key)

    def __getattr__(self, key):
        # Only reached when normal lookup fails.  If our own slots are
        # missing (instance created without running __init__), report that
        # instead of recursing through _data() forever.
        if key in ("_loader", "_d"):
            raise AttributeError(key)
        # getattr() also works for values that define no __getattr__ hook.
        return getattr(self._data(), key)

    def __str__(self):
        return self._data().__str__()

    def __repr__(self):
        return self._data().__repr__()
class PrettyLazyMap(LazyMap):
    """
    A ``LazyMap`` whose repr shows the ``_short_repr()`` of only its first
    several elements rather than of the whole sequence.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a list-like string representation built from each element's
        ``_short_repr()``; once the running length exceeds
        ``_MAX_REPR_SIZE`` and more than two elements have been rendered,
        the output is truncated with ``...``.
        """
        shown = []
        used = 5
        for item in self:
            # key difference from inherited version: call to _short_repr()
            text = item._short_repr()
            shown.append(text)
            used += len(text) + 2
            if len(shown) > 2 and used > self._MAX_REPR_SIZE:
                # The element that crossed the limit is dropped.
                return "[%s, ...]" % ", ".join(shown[:-1])
        return "[%s]" % ", ".join(shown)
class PrettyLazyConcatenation(LazyConcatenation):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a list-like string representation of this view, built from
        each element's ``_short_repr()``.  Once the running length exceeds
        ``_MAX_REPR_SIZE`` (and more than two elements were rendered), the
        last rendered element is dropped and ``...`` is shown instead.
        """
        shown = []
        running_length = 5
        for item in self:
            # key difference from inherited version: call to _short_repr()
            text = item._short_repr()
            shown.append(text)
            running_length += len(text) + 2
            if running_length > self._MAX_REPR_SIZE and len(shown) > 2:
                kept = shown[:-1]
                return "[%s, ...]" % ", ".join(kept)
        return "[%s]" % ", ".join(shown)

    def __add__(self, other):
        """Return a list concatenating self with other."""
        combined = itertools.chain(self, other)
        return PrettyLazyIteratorList(combined)

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        combined = itertools.chain(other, self)
        return PrettyLazyIteratorList(combined)
def __init__(self, root, fileids):
    """
    Initialise the reader and its (initially unset) look-up indexes.

    :param root: forwarded unchanged to ``XMLCorpusReader.__init__``
    :param fileids: forwarded unchanged to ``XMLCorpusReader.__init__``
    """
    XMLCorpusReader.__init__(self, root, fileids)

    # framenet corpus sub dirs: the directories containing the xml files
    # for frames, for lexical units, and for fulltext annotation files
    self._frame_dir, self._lu_dir, self._fulltext_dir = "frame", "lu", "fulltext"

    # location of latest development version of FrameNet
    self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data"

    self._readme = "README.txt"

    # Indexes used for faster look-ups; each stays None until it is built.
    self._frame_idx = self._lu_idx = self._fulltext_idx = self._semtypes = None
    # Relation indexes:
    #   _freltyp_idx -- frame relation types (Inheritance, Using, etc.)
    #   _frel_idx    -- frame-to-frame relation instances
    #   _ferel_idx   -- FE-to-FE relation instances
    #   _frel_f_idx  -- frame-to-frame relations associated with each frame
    self._freltyp_idx = self._frel_idx = None
    self._ferel_idx = self._frel_f_idx = None

    self._cached_frames = {}  # name -> ID
https://arxiv.org/abs/1703.07438 + +Use the following methods to access data in FrameNet. +Provide a method name to `help()` for more information. + +FRAMES +====== + +frame() to look up a frame by its exact name or ID +frames() to get frames matching a name pattern +frames_by_lemma() to get frames containing an LU matching a name pattern +frame_ids_and_names() to get a mapping from frame IDs to names + +FRAME ELEMENTS +============== + +fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained + by a frame name pattern + +LEXICAL UNITS +============= + +lu() to look up an LU by its ID +lus() to get lexical units matching a name pattern, optionally constrained by frame +lu_ids_and_names() to get a mapping from LU IDs to names + +RELATIONS +========= + +frame_relation_types() to get the different kinds of frame-to-frame relations + (Inheritance, Subframe, Using, etc.). +frame_relations() to get the relation instances, optionally constrained by + frame(s) or relation type +fe_relations() to get the frame element pairs belonging to a frame-to-frame relation + +SEMANTIC TYPES +============== + +semtypes() to get the different kinds of semantic types that can be applied to + FEs, LUs, and entire frames +semtype() to look up a particular semtype by name, ID, or abbreviation +semtype_inherits() to check whether two semantic types have a subtype-supertype + relationship in the semtype hierarchy +propagate_semtypes() to apply inference rules that distribute semtypes over relations + between FEs + +ANNOTATIONS +=========== + +annotations() to get annotation sets, in which a token in a sentence is annotated + with a lexical unit in a frame, along with its frame elements and their syntactic properties; + can be constrained by LU name pattern and limited to lexicographic exemplars or full-text. + Sentences of full-text annotation can have multiple annotation sets. 
def _buildframeindex(self):
    """
    (Re)build ``self._frame_idx``: one entry per ``frame`` element of
    ``frameIndex.xml``, keyed by the entry's ``"ID"``.
    """
    # always load frame relations before frames,
    # otherwise weird ordering effects might result in incomplete information
    if not self._frel_idx:
        self._buildrelationindex()

    # The total number of Frames in Framenet is fairly small (~1200) so
    # this index should not be very large
    index = self._frame_idx = {}
    with XMLCorpusView(
        self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
    ) as view:
        for entry in view:
            index[entry["ID"]] = entry
def _buildrelationindex(self):
    """
    (Re)build the four relation indexes from ``frRelation.xml``:

    - ``_freltyp_idx``: relation type ID -> relation type
    - ``_frel_idx``: frame relation ID -> frame relation
    - ``_frel_f_idx``: frame ID -> set of IDs of the frame relations in
      which that frame is the super- or sub-frame
    - ``_ferel_idx``: FE relation ID -> FE relation

    Frames and FEs are attached to the relation objects as ``Future``
    wrappers rather than being loaded here.
    """
    # print('building relation index...', file=sys.stderr)
    self._freltyp_idx = {}
    self._frel_idx = {}
    self._frel_f_idx = defaultdict(set)
    self._ferel_idx = {}

    with XMLCorpusView(
        self.abspath("frRelation.xml"),
        "frameRelations/frameRelationType",
        self._handle_framerelationtype_elt,
    ) as view:
        for freltyp in view:
            self._freltyp_idx[freltyp.ID] = freltyp
            for frel in freltyp.frameRelations:
                # The outer lambda is called immediately so that each inner
                # lambda captures this relation's own frame ID, not whatever
                # the loop variable refers to when the Future is resolved.
                # The same Future is stored under the generic attribute and
                # under the relation-type-specific key.
                supF = frel.superFrame = frel[freltyp.superFrameName] = Future(
                    (lambda fID: lambda: self.frame_by_id(fID))(frel.supID)
                )
                subF = frel.subFrame = frel[freltyp.subFrameName] = Future(
                    (lambda fID: lambda: self.frame_by_id(fID))(frel.subID)
                )
                self._frel_idx[frel.ID] = frel
                # index the relation under both of its frames
                self._frel_f_idx[frel.supID].add(frel.ID)
                self._frel_f_idx[frel.subID].add(frel.ID)
                for ferel in frel.feRelations:
                    # FE relations share the frame Futures of their parent relation
                    ferel.superFrame = supF
                    ferel.subFrame = subF
                    # As above: bind the current ``ferel`` at definition time.
                    ferel.superFE = Future(
                        (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(
                            ferel
                        )
                    )
                    ferel.subFE = Future(
                        (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel)
                    )
                    self._ferel_idx[ferel.ID] = ferel
    # print('...done building relation index', file=sys.stderr)
+ + The dict that is returned from this function will contain the + following keys: + + - '_type' : 'fulltextannotation' + - 'sentence' : a list of sentences in the document + - Each item in the list is a dict containing the following keys: + - 'ID' : the ID number of the sentence + - '_type' : 'sentence' + - 'text' : the text of the sentence + - 'paragNo' : the paragraph number + - 'sentNo' : the sentence number + - 'docID' : the document ID number + - 'corpID' : the corpus ID number + - 'aPos' : the annotation position + - 'annotationSet' : a list of annotation layers for the sentence + - Each item in the list is a dict containing the following keys: + - 'ID' : the ID number of the annotation set + - '_type' : 'annotationset' + - 'status' : either 'MANUAL' or 'UNANN' + - 'luName' : (only if status is 'MANUAL') + - 'luID' : (only if status is 'MANUAL') + - 'frameID' : (only if status is 'MANUAL') + - 'frameName': (only if status is 'MANUAL') + - 'layer' : a list of labels for the layer + - Each item in the layer is a dict containing the following keys: + - '_type': 'layer' + - 'rank' + - 'name' + - 'label' : a list of labels in the layer + - Each item is a dict containing the following keys: + - 'start' + - 'end' + - 'name' + - 'feID' (optional) + + :param fn_docid: The Framenet id number of the document + :type fn_docid: int + :return: Information about the annotated document + :rtype: dict + """ + try: + xmlfname = self._fulltext_idx[fn_docid].filename + except TypeError: # happens when self._fulltext_idx == None + # build the index + self._buildcorpusindex() + xmlfname = self._fulltext_idx[fn_docid].filename + except KeyError as e: # probably means that fn_docid was not in the index + raise FramenetError(f"Unknown document id: {fn_docid}") from e + + # construct the path name for the xml file containing the document info + locpath = os.path.join(f"{self._root}", self._fulltext_dir, xmlfname) + + # Grab the top-level xml element containing the fulltext 
def frame_by_id(self, fn_fid, ignorekeys=[]):
    """
    Get the details for the specified Frame using the frame's id
    number.

    Usage examples:

    >>> from nltk.corpus import framenet as fn
    >>> f = fn.frame_by_id(256)
    >>> f.ID
    256
    >>> f.name
    'Medical_specialties'
    >>> f.definition # doctest: +NORMALIZE_WHITESPACE
    "This frame includes words that name medical specialties and is closely related to the
    Medical_professionals frame.  The FE Type characterizing a sub-are in a Specialty may also be
    expressed. 'Ralph practices paediatric oncology.'"

    :param fn_fid: The Framenet id number of the frame
    :type fn_fid: int
    :param ignorekeys: The keys to ignore. These keys will not be
        included in the output. (optional)  Only forwarded to
        ``frame_by_name()``; a frame object that is already cached in the
        index is returned as-is.
    :type ignorekeys: list(str)
    :return: Information about a frame
    :rtype: dict
    :raises FramenetError: if the id is not in the frame index

    Also see the ``frame()`` function for details about what is
    contained in the dict that is returned.
    """
    # Build the index up front rather than inside an ``except TypeError``
    # handler: a KeyError raised within that handler would not be seen by
    # the ``except KeyError`` clause below, so an unknown id on the very
    # first look-up used to surface as a bare KeyError.
    if self._frame_idx is None:
        self._buildframeindex()

    # get the name of the frame with this id number
    try:
        fentry = self._frame_idx[fn_fid]
        if "_type" in fentry:
            return fentry  # full frame object is cached
        name = fentry["name"]
    except KeyError as e:
        raise FramenetError(f"Unknown frame id: {fn_fid}") from e

    return self.frame_by_name(name, ignorekeys, check_cache=False)
+ + Usage examples: + + >>> from nltk.corpus import framenet as fn + >>> f = fn.frame_by_name('Medical_specialties') + >>> f.ID + 256 + >>> f.name + 'Medical_specialties' + >>> f.definition # doctest: +NORMALIZE_WHITESPACE + "This frame includes words that name medical specialties and is closely related to the + Medical_professionals frame. The FE Type characterizing a sub-are in a Specialty may also be + expressed. 'Ralph practices paediatric oncology.'" + + :param fn_fname: The name of the frame + :type fn_fname: str + :param ignorekeys: The keys to ignore. These keys will not be + included in the output. (optional) + :type ignorekeys: list(str) + :return: Information about a frame + :rtype: dict + + Also see the ``frame()`` function for details about what is + contained in the dict that is returned. + """ + + if check_cache and fn_fname in self._cached_frames: + return self._frame_idx[self._cached_frames[fn_fname]] + elif not self._frame_idx: + self._buildframeindex() + + # construct the path name for the xml file containing the Frame info + locpath = os.path.join(f"{self._root}", self._frame_dir, fn_fname + ".xml") + # print(locpath, file=sys.stderr) + # Grab the xml for the frame + try: + with XMLCorpusView(locpath, "frame") as view: + elt = view[0] + except OSError as e: + raise FramenetError(f"Unknown frame: {fn_fname}") from e + + fentry = self._handle_frame_elt(elt, ignorekeys) + assert fentry + + fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml" + + # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs + for st in fentry.semTypes: + if st.rootType.name == "Lexical_type": + for lu in fentry.lexUnit.values(): + if not any( + x is st for x in lu.semTypes + ): # identity containment check + lu.semTypes.append(st) + + self._frame_idx[fentry.ID] = fentry + self._cached_frames[fentry.name] = fentry.ID + """ + # now set up callables to resolve the LU pointers lazily. 
def frame(self, fn_fid_or_fname, ignorekeys=[]):
    """
    Look up a single Frame by either its name or its id number.

    A ``str`` argument is handed to ``frame_by_name()``; any other
    argument is handed to ``frame_by_id()``.

    Usage examples:

    >>> from nltk.corpus import framenet as fn
    >>> f = fn.frame(256)
    >>> f.name
    'Medical_specialties'
    >>> f = fn.frame('Medical_specialties')
    >>> f.ID
    256
    >>> # ensure non-ASCII character in definition doesn't trigger an encoding error:
    >>> fn.frame('Imposing_obligation') # doctest: +ELLIPSIS
    frame (1494): Imposing_obligation...


    The dict that is returned from this function will contain the
    following information about the Frame:

    - 'name'       : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.)
    - 'definition' : textual definition of the Frame
    - 'ID'         : the internal ID number of the Frame
    - 'semTypes'   : a list of semantic types for this frame
       - Each item in the list is a dict containing the following keys:
          - 'name' : can be used with the semtype() function
          - 'ID'   : can be used with the semtype() function

    - 'lexUnit'    : a dict containing all of the LUs for this frame.
                     The keys in this dict are the names of the LUs and
                     the value for each key is itself a dict containing
                     info about the LU (see the lu() function for more info.)

    - 'FE' : a dict containing the Frame Elements that are part of this frame
             The keys in this dict are the names of the FEs (e.g. 'Body_system')
             and the values are dicts containing the following keys

          - 'definition' : The definition of the FE
          - 'name'       : The name of the FE e.g. 'Body_system'
          - 'ID'         : The id number
          - '_type'      : 'fe'
          - 'abbrev'     : Abbreviation e.g. 'bod'
          - 'coreType'   : one of "Core", "Peripheral", or "Extra-Thematic"
          - 'semType'    : if not None, a dict with the following two keys:
             - 'name' : name of the semantic type. can be used with
                        the semtype() function
             - 'ID'   : id number of the semantic type. can be used with
                        the semtype() function
          - 'requiresFE' : if not None, a dict with the following two keys:
             - 'name' : the name of another FE in this frame
             - 'ID'   : the id of the other FE in this frame
          - 'excludesFE' : if not None, a dict with the following two keys:
             - 'name' : the name of another FE in this frame
             - 'ID'   : the id of the other FE in this frame

    - 'frameRelation'      : a list of objects describing frame relations
    - 'FEcoreSets'  : a list of Frame Element core sets for this frame
       - Each item in the list is a list of FE objects

    :param fn_fid_or_fname: The Framenet name or id number of the frame
    :type fn_fid_or_fname: int or str
    :param ignorekeys: The keys to ignore. These keys will not be
        included in the output. (optional)
    :type ignorekeys: list(str)
    :return: Information about a frame
    :rtype: dict
    """
    # pick the look-up method from the argument type, then delegate
    by_name = isinstance(fn_fid_or_fname, str)
    lookup = self.frame_by_name if by_name else self.frame_by_id
    return lookup(fn_fid_or_fname, ignorekeys)
+ + >>> from nltk.corpus import framenet as fn + >>> from nltk.corpus.reader.framenet import PrettyList + >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS + [, ] + + :return: A list of frame objects. + :rtype: list(AttrDict) + """ + return PrettyList( + f + for f in self.frames() + if any(re.search(pat, luName) for luName in f.lexUnit) + ) + + def lu_basic(self, fn_luid): + """ + Returns basic information about the LU whose id is + ``fn_luid``. This is basically just a wrapper around the + ``lu()`` function with "subCorpus" info excluded. + + >>> from nltk.corpus import framenet as fn + >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True) + >>> # ellipses account for differences between FN 1.5 and 1.7 + >>> lu # doctest: +ELLIPSIS + {'ID': 256, + 'POS': 'V', + 'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml', + '_type': 'lu', + 'cBy': ..., + 'cDate': '02/08/2001 01:27:50 PST Thu', + 'definition': 'COD: be aware of beforehand; predict.', + 'definitionMarkup': 'COD: be aware of beforehand; predict.', + 'frame': , + 'lemmaID': 15082, + 'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}], + 'name': 'foresee.v', + 'semTypes': [], + 'sentenceCount': {'annotated': ..., 'total': ...}, + 'status': 'FN1_Sent'} + + :param fn_luid: The id number of the desired LU + :type fn_luid: int + :return: Basic information about the lexical unit + :rtype: dict + """ + return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"]) + + def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None): + """ + Access a lexical unit by its ID. luName, frameID, and frameName are used + only in the event that the LU does not have a file in the database + (which is the case for LUs with "Problem" status); in this case, + a placeholder LU is created which just contains its name, ID, and frame. 
+ + + Usage examples: + + >>> from nltk.corpus import framenet as fn + >>> fn.lu(256).name + 'foresee.v' + >>> fn.lu(256).definition + 'COD: be aware of beforehand; predict.' + >>> fn.lu(256).frame.name + 'Expectation' + >>> list(map(PrettyDict, fn.lu(256).lexemes)) + [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}] + + >>> fn.lu(227).exemplars[23] # doctest: +NORMALIZE_WHITESPACE + exemplar sentence (352962): + [sentNo] 0 + [aPos] 59699508 + + [LU] (227) guess.v in Coming_to_believe + + [frame] (23) Coming_to_believe + + [annotationSet] 2 annotation sets + + [POS] 18 tags + + [POS_tagset] BNC + + [GF] 3 relations + + [PT] 3 phrases + + [Other] 1 entry + + [text] + [Target] + [FE] + + When he was inside the house , Culley noticed the characteristic + ------------------ + Content + + he would n't have guessed at . + -- ******* -- + Co C1 [Evidence:INI] + (Co=Cognizer, C1=Content) + + + + The dict that is returned from this function will contain most of the + following information about the LU. Note that some LUs do not contain + all of these pieces of information - particularly 'totalAnnotated' and + 'incorporatedFE' may be missing in some LUs: + + - 'name' : the name of the LU (e.g. 'merger.n') + - 'definition' : textual definition of the LU + - 'ID' : the internal ID number of the LU + - '_type' : 'lu' + - 'status' : e.g. 'Created' + - 'frame' : Frame that this LU belongs to + - 'POS' : the part of speech of this LU (e.g. 'N') + - 'totalAnnotated' : total number of examples annotated with this LU + - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment') + - 'sentenceCount' : a dict with the following two keys: + - 'annotated': number of sentences annotated with this LU + - 'total' : total number of sentences with this LU + + - 'lexemes' : a list of dicts describing the lemma of this LU. + Each dict in the list contains these keys: + + - 'POS' : part of speech e.g. 'N' + - 'name' : either single-lexeme e.g. 
'merger' or + multi-lexeme e.g. 'a little' + - 'order': the order of the lexeme in the lemma (starting from 1) + - 'headword': a boolean ('true' or 'false') + - 'breakBefore': Can this lexeme be separated from the previous lexeme? + Consider: "take over.v" as in:: + + Germany took over the Netherlands in 2 days. + Germany took the Netherlands over in 2 days. + + In this case, 'breakBefore' would be "true" for the lexeme + "over". Contrast this with "take after.v" as in:: + + Mary takes after her grandmother. + *Mary takes her grandmother after. + + In this case, 'breakBefore' would be "false" for the lexeme "after" + + - 'lemmaID' : Can be used to connect lemmas in different LUs + - 'semTypes' : a list of semantic type objects for this LU + - 'subCorpus' : a list of subcorpora + - Each item in the list is a dict containing the following keys: + - 'name' : + - 'sentence' : a list of sentences in the subcorpus + - each item in the list is a dict with the following keys: + - 'ID': + - 'sentNo': + - 'text': the text of the sentence + - 'aPos': + - 'annotationSet': a list of annotation sets + - each item in the list is a dict with the following keys: + - 'ID': + - 'status': + - 'layer': a list of layers + - each layer is a dict containing the following keys: + - 'name': layer name (e.g. 'BNC') + - 'rank': + - 'label': a list of labels for the layer + - each label is a dict containing the following keys: + - 'start': start pos of label in sentence 'text' (0-based) + - 'end': end pos of label in sentence 'text' (0-based) + - 'name': name of label (e.g. 'NN1') + + Under the hood, this implementation looks up the lexical unit information + in the *frame* definition file. That file does not contain + corpus annotations, so the LU files will be accessed on demand if those are + needed. In principle, valence patterns could be loaded here too, + though these are not currently supported. 
+ + :param fn_luid: The id number of the lexical unit + :type fn_luid: int + :param ignorekeys: The keys to ignore. These keys will not be + included in the output. (optional) + :type ignorekeys: list(str) + :return: All information about the lexical unit + :rtype: dict + """ + # look for this LU in cache + if not self._lu_idx: + self._buildluindex() + OOV = object() + luinfo = self._lu_idx.get(fn_luid, OOV) + if luinfo is OOV: + # LU not in the index. We create a placeholder by falling back to + # luName, frameID, and frameName. However, this will not be listed + # among the LUs for its frame. + self._warn( + "LU ID not found: {} ({}) in {} ({})".format( + luName, fn_luid, frameName, frameID + ) + ) + luinfo = AttrDict( + { + "_type": "lu", + "ID": fn_luid, + "name": luName, + "frameID": frameID, + "status": "Problem", + } + ) + f = self.frame_by_id(luinfo.frameID) + assert f.name == frameName, (f.name, frameName) + luinfo["frame"] = f + self._lu_idx[fn_luid] = luinfo + elif "_type" not in luinfo: + # we only have an index entry for the LU. loading the frame will replace this. + f = self.frame_by_id(luinfo.frameID) + luinfo = self._lu_idx[fn_luid] + if ignorekeys: + return AttrDict({k: v for k, v in luinfo.items() if k not in ignorekeys}) + + return luinfo + + def _lu_file(self, lu, ignorekeys=[]): + """ + Augment the LU information that was loaded from the frame file + with additional information from the LU file. 
def _loadsemtypes(self):
    """
    Create the semantic types index (``self._semtypes``) from
    ``semTypes.xml`` and link the semantic types into a hierarchy.

    After loading, every semtype with a super type has ``superType``
    replaced by the actual super type object and is appended to that
    object's ``subTypes``; every semtype reachable from a root gets
    ``rootType`` set to that root.
    """
    self._semtypes = AttrDict()
    with XMLCorpusView(
        self.abspath("semTypes.xml"),
        "semTypes/semType",
        self._handle_semtype_elt,
    ) as view:
        for semtype in view:
            st_name, st_abbrev, st_id = semtype["name"], semtype["abbrev"], semtype["ID"]
            # Both name and abbrev should be able to retrieve the
            # ID. The ID will retrieve the semantic type dict itself.
            self._semtypes[st_name] = st_id
            self._semtypes[st_abbrev] = st_id
            self._semtypes[st_id] = semtype

    # now that all individual semtype XML is loaded, we can link them together
    roots = []
    for semtype in self.semtypes():
        if semtype.superType:
            parent = self.semtype(semtype.superType.supID)
            semtype.superType = parent
            parent.subTypes.append(semtype)
            continue
        # no super type: this semtype is the root of its own hierarchy
        if semtype not in roots:
            roots.append(semtype)
        semtype.rootType = semtype

    # Breadth-first walk from the roots, handing each root down to its
    # descendants.  ``pending`` only grows; ``cursor`` marks the next item.
    pending = list(roots)
    assert pending
    cursor = 0
    while cursor < len(pending):
        current = pending[cursor]
        cursor += 1
        for child in current.subTypes:
            child.rootType = current.rootType
            pending.append(child)
    # self.propagate_semtypes()  # apply inferencing over FE relations
+ (Not done by default because it requires loading all frame files, + which takes several seconds. If this needed to be fast, it could be rewritten + to traverse the neighboring relations on demand for each FE semtype.) + + >>> from nltk.corpus import framenet as fn + >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) + >>> fn.propagate_semtypes() + >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) + >>> y-x > 1000 + True + """ + if not self._semtypes: + self._loadsemtypes() + if not self._ferel_idx: + self._buildrelationindex() + changed = True + i = 0 + nPropagations = 0 + while changed: + # make a pass and see if anything needs to be propagated + i += 1 + changed = False + for ferel in self.fe_relations(): + superST = ferel.superFE.semType + subST = ferel.subFE.semType + try: + if superST and superST is not subST: + # propagate downward + assert subST is None or self.semtype_inherits(subST, superST), ( + superST.name, + ferel, + subST.name, + ) + if subST is None: + ferel.subFE.semType = subST = superST + changed = True + nPropagations += 1 + if ( + ferel.type.name in ["Perspective_on", "Subframe", "Precedes"] + and subST + and subST is not superST + ): + # propagate upward + assert superST is None, (superST.name, ferel, subST.name) + ferel.superFE.semType = superST = subST + changed = True + nPropagations += 1 + except AssertionError as ex: + # bug in the data! 
def semtype(self, key):
    """
    Look up a semantic type by name, abbreviation, or id number.

    >>> from nltk.corpus import framenet as fn
    >>> fn.semtype(233).name
    'Temperature'
    >>> fn.semtype(233).abbrev
    'Temp'
    >>> fn.semtype('Temperature').ID
    233

    :param key: The name, abbreviation, or id number of the semantic type
    :type key: string or int
    :return: Information about a semantic type
    :rtype: dict
    """

    def fetch(index_key):
        # A TypeError is what indexing raises while ``self._semtypes`` is
        # still None; load the semtypes and retry once.
        try:
            return self._semtypes[index_key]
        except TypeError:
            self._loadsemtypes()
            return self._semtypes[index_key]

    # names and abbreviations map to the id; the id maps to the semtype
    stid = key if isinstance(key, int) else fetch(key)
    return fetch(stid)
def frame_ids_and_names(self, name=None):
    """
    Return a dict mapping frame IDs to frame names, taken from the frame
    index (built first if it is empty or unset).  This is much faster than
    looking up each frame definition if only the names and IDs are needed.

    :param name: optional regular expression; when given, only frames whose
        name it matches (via ``re.search``) are included
    :type name: str
    :return: mapping from frame ID to frame name
    :rtype: dict
    """
    if not self._frame_idx:
        self._buildframeindex()

    selected = {}
    for frame_id, entry in self._frame_idx.items():
        frame_name = entry.name
        if name is None or re.search(name, frame_name) is not None:
            selected[frame_id] = frame_name
    return selected
+ if frame is not None: + if isinstance(frame, int): + frames = [self.frame(frame)] + elif isinstance(frame, str): + frames = self.frames(frame) + else: + frames = [frame] + else: + frames = self.frames() + + return PrettyList( + fe + for f in frames + for fename, fe in f.FE.items() + if name is None or re.search(name, fename, re.I) + ) + + def lus(self, name=None, frame=None): + """ + Obtain details for lexical units. + Optionally restrict by lexical unit name pattern, and/or to a certain frame + or frames whose name matches a pattern. + + >>> from nltk.corpus import framenet as fn + >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp. + True + >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True) + [, + , + ] + >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID'))) + [, ] + + A brief intro to Lexical Units (excerpted from "FrameNet II: + Extended Theory and Practice" by Ruppenhofer et. al., 2010): + + A lexical unit (LU) is a pairing of a word with a meaning. For + example, the "Apply_heat" Frame describes a common situation + involving a Cook, some Food, and a Heating Instrument, and is + _evoked_ by words such as bake, blanch, boil, broil, brown, + simmer, steam, etc. These frame-evoking words are the LUs in the + Apply_heat frame. Each sense of a polysemous word is a different + LU. + + We have used the word "word" in talking about LUs. The reality + is actually rather complex. When we say that the word "bake" is + polysemous, we mean that the lemma "bake.v" (which has the + word-forms "bake", "bakes", "baked", and "baking") is linked to + three different frames: + + - Apply_heat: "Michelle baked the potatoes for 45 minutes." + + - Cooking_creation: "Michelle baked her mother a cake for her birthday." + + - Absorb_heat: "The potatoes have to bake for more than 30 minutes." + + These constitute three different LUs, with different + definitions. 
+ + Multiword expressions such as "given name" and hyphenated words + like "shut-eye" can also be LUs. Idiomatic phrases such as + "middle of nowhere" and "give the slip (to)" are also defined as + LUs in the appropriate frames ("Isolated_places" and "Evading", + respectively), and their internal structure is not analyzed. + + Framenet provides multiple annotated examples of each sense of a + word (i.e. each LU). Moreover, the set of examples + (approximately 20 per LU) illustrates all of the combinatorial + possibilities of the lexical unit. + + Each LU is linked to a Frame, and hence to the other words which + evoke that Frame. This makes the FrameNet database similar to a + thesaurus, grouping together semantically similar words. + + In the simplest case, frame-evoking words are verbs such as + "fried" in: + + "Matilde fried the catfish in a heavy iron skillet." + + Sometimes event nouns may evoke a Frame. For example, + "reduction" evokes "Cause_change_of_scalar_position" in: + + "...the reduction of debt levels to $665 million from $2.6 billion." + + Adjectives may also evoke a Frame. For example, "asleep" may + evoke the "Sleep" frame as in: + + "They were asleep for hours." + + Many common nouns, such as artifacts like "hat" or "tower", + typically serve as dependents rather than clearly evoking their + own frames. + + :param name: A regular expression pattern used to search the LU + names. Note that LU names take the form of a dotted + string (e.g. "run.v" or "a little.adv") in which a + lemma precedes the "." and a POS follows the + dot. The lemma may be composed of a single lexeme + (e.g. "run") or of multiple lexemes (e.g. "a + little"). If 'name' is not given, then all LUs will + be returned. 
+ + The valid POSes are: + + v - verb + n - noun + a - adjective + adv - adverb + prep - preposition + num - numbers + intj - interjection + art - article + c - conjunction + scon - subordinating conjunction + + :type name: str + :type frame: str or int or frame + :return: A list of selected (or all) lexical units + :rtype: list of LU objects (dicts). See the lu() function for info + about the specifics of LU objects. + + """ + if not self._lu_idx: + self._buildluindex() + + if name is not None: # match LUs, then restrict by frame + result = PrettyList( + self.lu(luID) for luID, luName in self.lu_ids_and_names(name).items() + ) + if frame is not None: + if isinstance(frame, int): + frameIDs = {frame} + elif isinstance(frame, str): + frameIDs = {f.ID for f in self.frames(frame)} + else: + frameIDs = {frame.ID} + result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs) + elif frame is not None: # all LUs in matching frames + if isinstance(frame, int): + frames = [self.frame(frame)] + elif isinstance(frame, str): + frames = self.frames(frame) + else: + frames = [frame] + result = PrettyLazyIteratorList( + iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)) + ) + else: # all LUs + luIDs = [ + luID + for luID, lu in self._lu_idx.items() + if lu.status not in self._bad_statuses + ] + result = PrettyLazyMap(self.lu, luIDs) + return result + + def lu_ids_and_names(self, name=None): + """ + Uses the LU index, which is much faster than looking up each LU definition + if only the names and IDs are needed. + """ + if not self._lu_idx: + self._buildluindex() + return { + luID: luinfo.name + for luID, luinfo in self._lu_idx.items() + if luinfo.status not in self._bad_statuses + and (name is None or re.search(name, luinfo.name) is not None) + } + + def docs_metadata(self, name=None): + """ + Return an index of the annotated documents in Framenet. 
+ + Details for a specific annotated document can be obtained using this + class's doc() function and pass it the value of the 'ID' field. + + >>> from nltk.corpus import framenet as fn + >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp. + True + >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \ + 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank']) + True + + :param name: A regular expression pattern used to search the + file name of each annotated document. The document's + file name contains the name of the corpus that the + document is from, followed by two underscores "__" + followed by the document name. So, for example, the + file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is + from the corpus named "LUCorpus-v0.3" and the + document name is "20000410_nyt-NEW.xml". + :type name: str + :return: A list of selected (or all) annotated documents + :rtype: list of dicts, where each dict object contains the following + keys: + + - 'name' + - 'ID' + - 'corpid' + - 'corpname' + - 'description' + - 'filename' + """ + try: + ftlist = PrettyList(self._fulltext_idx.values()) + except AttributeError: + self._buildcorpusindex() + ftlist = PrettyList(self._fulltext_idx.values()) + + if name is None: + return ftlist + else: + return PrettyList( + x for x in ftlist if re.search(name, x["filename"]) is not None + ) + + def docs(self, name=None): + """ + Return a list of the annotated full-text documents in FrameNet, + optionally filtered by a regex to be matched against the document name. + """ + return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name)) + + def sents(self, exemplars=True, full_text=True): + """ + Annotated sentences matching the specified criteria. 
+ """ + if exemplars: + if full_text: + return self.exemplars() + self.ft_sents() + else: + return self.exemplars() + elif full_text: + return self.ft_sents() + + def annotations(self, luNamePattern=None, exemplars=True, full_text=True): + """ + Frame annotation sets matching the specified criteria. + """ + + if exemplars: + epart = PrettyLazyIteratorList( + sent.frameAnnotation for sent in self.exemplars(luNamePattern) + ) + else: + epart = [] + + if full_text: + if luNamePattern is not None: + matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys()) + ftpart = PrettyLazyIteratorList( + aset + for sent in self.ft_sents() + for aset in sent.annotationSet[1:] + if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs + ) + else: + ftpart = [] + + if exemplars: + if full_text: + return epart + ftpart + else: + return epart + elif full_text: + return ftpart + + def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None): + """ + Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that + are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance. + 'fe' may be a name pattern or FE instance; if specified, 'fe2' may also + be specified to retrieve sentences with both overt FEs (in either order). + """ + if fe is None and fe2 is not None: + raise FramenetError("exemplars(..., fe=None, fe2=) is not allowed") + elif fe is not None and fe2 is not None: + if not isinstance(fe2, str): + if isinstance(fe, str): + # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame. 
+ fe, fe2 = fe2, fe + elif fe.frame is not fe2.frame: # ensure frames match + raise FramenetError( + "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)" + ) + if frame is None and fe is not None and not isinstance(fe, str): + frame = fe.frame + + # narrow down to frames matching criteria + + lusByFrame = defaultdict( + list + ) # frame name -> matching LUs, if luNamePattern is specified + if frame is not None or luNamePattern is not None: + if frame is None or isinstance(frame, str): + if luNamePattern is not None: + frames = set() + for lu in self.lus(luNamePattern, frame=frame): + frames.add(lu.frame.ID) + lusByFrame[lu.frame.name].append(lu) + frames = LazyMap(self.frame, list(frames)) + else: + frames = self.frames(frame) + else: + if isinstance(frame, int): + frames = [self.frame(frame)] + else: # frame object + frames = [frame] + + if luNamePattern is not None: + lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)} + + if fe is not None: # narrow to frames that define this FE + if isinstance(fe, str): + frames = PrettyLazyIteratorList( + f + for f in frames + if fe in f.FE + or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()) + ) + else: + if fe.frame not in frames: + raise FramenetError( + "exemplars() call with inconsistent `frame` and `fe` specification" + ) + frames = [fe.frame] + + if fe2 is not None: # narrow to frames that ALSO define this FE + if isinstance(fe2, str): + frames = PrettyLazyIteratorList( + f + for f in frames + if fe2 in f.FE + or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()) + ) + # else we already narrowed it to a single frame + else: # frame, luNamePattern are None. 
fe, fe2 are None or strings + if fe is not None: + frames = {ffe.frame.ID for ffe in self.fes(fe)} + if fe2 is not None: + frames2 = {ffe.frame.ID for ffe in self.fes(fe2)} + frames = frames & frames2 + frames = LazyMap(self.frame, list(frames)) + else: + frames = self.frames() + + # we've narrowed down 'frames' + # now get exemplars for relevant LUs in those frames + + def _matching_exs(): + for f in frames: + fes = fes2 = None # FEs of interest + if fe is not None: + fes = ( + {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} + if isinstance(fe, str) + else {fe.name} + ) + if fe2 is not None: + fes2 = ( + {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} + if isinstance(fe2, str) + else {fe2.name} + ) + + for lu in ( + lusByFrame[f.name] + if luNamePattern is not None + else f.lexUnit.values() + ): + for ex in lu.exemplars: + if (fes is None or self._exemplar_of_fes(ex, fes)) and ( + fes2 is None or self._exemplar_of_fes(ex, fes2) + ): + yield ex + + return PrettyLazyIteratorList(_matching_exs()) + + def _exemplar_of_fes(self, ex, fes=None): + """ + Given an exemplar sentence and a set of FE names, return the subset of FE names + that are realized overtly in the sentence on the FE, FE2, or FE3 layer. + + If 'fes' is None, returns all overt FE names. + """ + overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set() + if "FE2" in ex: + overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set() + if "FE3" in ex: + overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set() + return overtNames & fes if fes is not None else overtNames + + def ft_sents(self, docNamePattern=None): + """ + Full-text annotation sentences, optionally filtered by document name. + """ + return PrettyLazyIteratorList( + sent for d in self.docs(docNamePattern) for sent in d.sentence + ) + + def frame_relation_types(self): + """ + Obtain a list of frame relation types. 
+ + >>> from nltk.corpus import framenet as fn + >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID')) + >>> isinstance(frts, list) + True + >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp. + True + >>> PrettyDict(frts[0], breakLines=True) + {'ID': 1, + '_type': 'framerelationtype', + 'frameRelations': [ Child=Change_of_consistency>, Child=Rotting>, ...], + 'name': 'Inheritance', + 'subFrameName': 'Child', + 'superFrameName': 'Parent'} + + :return: A list of all of the frame relation types in framenet + :rtype: list(dict) + """ + if not self._freltyp_idx: + self._buildrelationindex() + return self._freltyp_idx.values() + + def frame_relations(self, frame=None, frame2=None, type=None): + """ + :param frame: (optional) frame object, name, or ID; only relations involving + this frame will be returned + :param frame2: (optional; 'frame' must be a different frame) only show relations + between the two specified frames, in either direction + :param type: (optional) frame relation type (name or object); show only relations + of this type + :type frame: int or str or AttrDict + :return: A list of all of the frame relations in framenet + :rtype: list(dict) + + >>> from nltk.corpus import framenet as fn + >>> frels = fn.frame_relations() + >>> isinstance(frels, list) + True + >>> len(frels) in (1676, 2070) # FN 1.5 and 1.7, resp. + True + >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True) + [ Child=Cooking_creation>, + Child=Cooking_creation>, + ReferringEntry=Cooking_creation>] + >>> PrettyList(fn.frame_relations(274), breakLines=True) + [ Child=Dodging>, + Child=Evading>, ...] + >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True) + [ Child=Cooking_creation>, + Child=Cooking_creation>, ...] 
+ >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance')) + [ Child=Cooking_creation>] + >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True) # doctest: +NORMALIZE_WHITESPACE + [ Child=Cooking_creation>, + ReferringEntry=Cooking_creation>] + """ + relation_type = type + + if not self._frel_idx: + self._buildrelationindex() + + rels = None + + if relation_type is not None: + if not isinstance(relation_type, dict): + type = [rt for rt in self.frame_relation_types() if rt.name == type][0] + assert isinstance(type, dict) + + # lookup by 'frame' + if frame is not None: + if isinstance(frame, dict) and "frameRelations" in frame: + rels = PrettyList(frame.frameRelations) + else: + if not isinstance(frame, int): + if isinstance(frame, dict): + frame = frame.ID + else: + frame = self.frame_by_name(frame).ID + rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]] + + # filter by 'type' + if type is not None: + rels = [rel for rel in rels if rel.type is type] + elif type is not None: + # lookup by 'type' + rels = type.frameRelations + else: + rels = self._frel_idx.values() + + # filter by 'frame2' + if frame2 is not None: + if frame is None: + raise FramenetError( + "frame_relations(frame=None, frame2=) is not allowed" + ) + if not isinstance(frame2, int): + if isinstance(frame2, dict): + frame2 = frame2.ID + else: + frame2 = self.frame_by_name(frame2).ID + if frame == frame2: + raise FramenetError( + "The two frame arguments to frame_relations() must be different frames" + ) + rels = [ + rel + for rel in rels + if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2 + ] + + return PrettyList( + sorted( + rels, + key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName), + ) + ) + + def fe_relations(self): + """ + Obtain a list of frame element relations. 
+ + >>> from nltk.corpus import framenet as fn + >>> ferels = fn.fe_relations() + >>> isinstance(ferels, list) + True + >>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp. + True + >>> PrettyDict(ferels[0], breakLines=True) # doctest: +NORMALIZE_WHITESPACE + {'ID': 14642, + '_type': 'ferelation', + 'frameRelation': Child=Lively_place>, + 'subFE': , + 'subFEName': 'Degree', + 'subFrame': , + 'subID': 11370, + 'supID': 2271, + 'superFE': , + 'superFEName': 'Degree', + 'superFrame': , + 'type': } + + :return: A list of all of the frame element relations in framenet + :rtype: list(dict) + """ + if not self._ferel_idx: + self._buildrelationindex() + return PrettyList( + sorted( + self._ferel_idx.values(), + key=lambda ferel: ( + ferel.type.ID, + ferel.frameRelation.superFrameName, + ferel.superFEName, + ferel.frameRelation.subFrameName, + ferel.subFEName, + ), + ) + ) + + def semtypes(self): + """ + Obtain a list of semantic types. + + >>> from nltk.corpus import framenet as fn + >>> stypes = fn.semtypes() + >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp. + True + >>> sorted(stypes[0].keys()) + ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType'] + + :return: A list of all of the semantic types in framenet + :rtype: list(dict) + """ + if not self._semtypes: + self._loadsemtypes() + return PrettyList( + self._semtypes[i] for i in self._semtypes if isinstance(i, int) + ) + + def _load_xml_attributes(self, d, elt): + """ + Extracts a subset of the attributes from the given element and + returns them in a dictionary. + + :param d: A dictionary in which to store the attributes. 
+ :type d: dict + :param elt: An ElementTree Element + :type elt: Element + :return: Returns the input dict ``d`` possibly including attributes from ``elt`` + :rtype: dict + """ + + d = type(d)(d) + + try: + attr_dict = elt.attrib + except AttributeError: + return d + + if attr_dict is None: + return d + + # Ignore these attributes when loading attributes from an xml node + ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest + "xsi", + "schemaLocation", + "xmlns", + "bgColor", + "fgColor", + ] + + for attr in attr_dict: + + if any(attr.endswith(x) for x in ignore_attrs): + continue + + val = attr_dict[attr] + if val.isdigit(): + d[attr] = int(val) + else: + d[attr] = val + + return d + + def _strip_tags(self, data): + """ + Gets rid of all tags and newline characters from the given input + + :return: A cleaned-up version of the input string + :rtype: str + """ + + try: + r""" + # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.) 
+ m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data) + if m: + print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr) + """ + + data = data.replace("", "") + data = data.replace("", "") + data = re.sub('', "", data) + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "'") + data = data.replace("", "'") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + data = data.replace("", "") + + # Get rid of and tags + data = data.replace("", "") + data = data.replace("", "") + + data = data.replace("\n", " ") + except AttributeError: + pass + + return data + + def _handle_elt(self, elt, tagspec=None): + """Extracts and returns the attributes of the given element""" + return self._load_xml_attributes(AttrDict(), elt) + + def _handle_fulltextindex_elt(self, elt, tagspec=None): + """ + Extracts corpus/document info from the fulltextIndex.xml file. + + Note that this function "flattens" the information contained + in each of the "corpus" elements, so that each "document" + element will contain attributes for the corpus and + corpusid. Also, each of the "document" items will contain a + new attribute called "filename" that is the base file name of + the xml file for the document in the "fulltext" subdir of the + Framenet corpus. 
+ """ + ftinfo = self._load_xml_attributes(AttrDict(), elt) + corpname = ftinfo.name + corpid = ftinfo.ID + retlist = [] + for sub in elt: + if sub.tag.endswith("document"): + doc = self._load_xml_attributes(AttrDict(), sub) + if "name" in doc: + docname = doc.name + else: + docname = doc.description + doc.filename = f"{corpname}__{docname}.xml" + doc.URL = ( + self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename + ) + doc.corpname = corpname + doc.corpid = corpid + retlist.append(doc) + + return retlist + + def _handle_frame_elt(self, elt, ignorekeys=[]): + """Load the info for a Frame from a frame xml file""" + frinfo = self._load_xml_attributes(AttrDict(), elt) + + frinfo["_type"] = "frame" + frinfo["definition"] = "" + frinfo["definitionMarkup"] = "" + frinfo["FE"] = PrettyDict() + frinfo["FEcoreSets"] = [] + frinfo["lexUnit"] = PrettyDict() + frinfo["semTypes"] = [] + for k in ignorekeys: + if k in frinfo: + del frinfo[k] + + for sub in elt: + if sub.tag.endswith("definition") and "definition" not in ignorekeys: + frinfo["definitionMarkup"] = sub.text + frinfo["definition"] = self._strip_tags(sub.text) + elif sub.tag.endswith("FE") and "FE" not in ignorekeys: + feinfo = self._handle_fe_elt(sub) + frinfo["FE"][feinfo.name] = feinfo + feinfo["frame"] = frinfo # backpointer + elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys: + coreset = self._handle_fecoreset_elt(sub) + # assumes all FEs have been loaded before coresets + frinfo["FEcoreSets"].append( + PrettyList(frinfo["FE"][fe.name] for fe in coreset) + ) + elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys: + luentry = self._handle_framelexunit_elt(sub) + if luentry["status"] in self._bad_statuses: + # problematic LU entry; ignore it + continue + luentry["frame"] = frinfo + luentry["URL"] = ( + self._fnweb_url + + "/" + + self._lu_dir + + "/" + + "lu{}.xml".format(luentry["ID"]) + ) + luentry["subCorpus"] = Future( + (lambda lu: lambda: 
self._lu_file(lu).subCorpus)(luentry) + ) + luentry["exemplars"] = Future( + (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry) + ) + frinfo["lexUnit"][luentry.name] = luentry + if not self._lu_idx: + self._buildluindex() + self._lu_idx[luentry.ID] = luentry + elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys: + semtypeinfo = self._load_xml_attributes(AttrDict(), sub) + frinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) + + frinfo["frameRelations"] = self.frame_relations(frame=frinfo) + + # resolve 'requires' and 'excludes' links between FEs of this frame + for fe in frinfo.FE.values(): + if fe.requiresFE: + name, ID = fe.requiresFE.name, fe.requiresFE.ID + fe.requiresFE = frinfo.FE[name] + assert fe.requiresFE.ID == ID + if fe.excludesFE: + name, ID = fe.excludesFE.name, fe.excludesFE.ID + fe.excludesFE = frinfo.FE[name] + assert fe.excludesFE.ID == ID + + return frinfo + + def _handle_fecoreset_elt(self, elt): + """Load fe coreset info from xml.""" + info = self._load_xml_attributes(AttrDict(), elt) + tmp = [] + for sub in elt: + tmp.append(self._load_xml_attributes(AttrDict(), sub)) + + return tmp + + def _handle_framerelationtype_elt(self, elt, *args): + """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" + info = self._load_xml_attributes(AttrDict(), elt) + info["_type"] = "framerelationtype" + info["frameRelations"] = PrettyList() + + for sub in elt: + if sub.tag.endswith("frameRelation"): + frel = self._handle_framerelation_elt(sub) + frel["type"] = info # backpointer + for ferel in frel.feRelations: + ferel["type"] = info + info["frameRelations"].append(frel) + + return info + + def _handle_framerelation_elt(self, elt): + """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" + info = self._load_xml_attributes(AttrDict(), elt) + assert info["superFrameName"] != info["subFrameName"], (elt, info) + info["_type"] = "framerelation" + info["feRelations"] = 
PrettyList() + + for sub in elt: + if sub.tag.endswith("FERelation"): + ferel = self._handle_elt(sub) + ferel["_type"] = "ferelation" + ferel["frameRelation"] = info # backpointer + info["feRelations"].append(ferel) + + return info + + def _handle_fulltextannotation_elt(self, elt): + """Load full annotation info for a document from its xml + file. The main element (fullTextAnnotation) contains a 'header' + element (which we ignore here) and a bunch of 'sentence' + elements.""" + info = AttrDict() + info["_type"] = "fulltext_annotation" + info["sentence"] = [] + + for sub in elt: + if sub.tag.endswith("header"): + continue # not used + elif sub.tag.endswith("sentence"): + s = self._handle_fulltext_sentence_elt(sub) + s.doc = info + info["sentence"].append(s) + + return info + + def _handle_fulltext_sentence_elt(self, elt): + """Load information from the given 'sentence' element. Each + 'sentence' element contains a "text" and "annotationSet" sub + elements.""" + info = self._load_xml_attributes(AttrDict(), elt) + info["_type"] = "fulltext_sentence" + info["annotationSet"] = [] + info["targets"] = [] + target_spans = set() + info["_ascii"] = types.MethodType( + _annotation_ascii, info + ) # attach a method for this instance + info["text"] = "" + + for sub in elt: + if sub.tag.endswith("text"): + info["text"] = self._strip_tags(sub.text) + elif sub.tag.endswith("annotationSet"): + a = self._handle_fulltextannotationset_elt( + sub, is_pos=(len(info["annotationSet"]) == 0) + ) + if "cxnID" in a: # ignoring construction annotations for now + continue + a.sent = info + a.text = info.text + info["annotationSet"].append(a) + if "Target" in a: + for tspan in a.Target: + if tspan in target_spans: + self._warn( + 'Duplicate target span "{}"'.format( + info.text[slice(*tspan)] + ), + tspan, + "in sentence", + info["ID"], + info.text, + ) + # this can happen in cases like "chemical and biological weapons" + # being annotated as "chemical weapons" and "biological weapons" + else: 
+ target_spans.add(tspan) + info["targets"].append((a.Target, a.luName, a.frameName)) + + assert info["annotationSet"][0].status == "UNANN" + info["POS"] = info["annotationSet"][0].POS + info["POS_tagset"] = info["annotationSet"][0].POS_tagset + return info + + def _handle_fulltextannotationset_elt(self, elt, is_pos=False): + """Load information from the given 'annotationSet' element. Each + 'annotationSet' contains several "layer" elements.""" + + info = self._handle_luannotationset_elt(elt, is_pos=is_pos) + if not is_pos: + info["_type"] = "fulltext_annotationset" + if "cxnID" not in info: # ignoring construction annotations for now + info["LU"] = self.lu( + info.luID, + luName=info.luName, + frameID=info.frameID, + frameName=info.frameName, + ) + info["frame"] = info.LU.frame + return info + + def _handle_fulltextlayer_elt(self, elt): + """Load information from the given 'layer' element. Each + 'layer' contains several "label" elements.""" + info = self._load_xml_attributes(AttrDict(), elt) + info["_type"] = "layer" + info["label"] = [] + + for sub in elt: + if sub.tag.endswith("label"): + l = self._load_xml_attributes(AttrDict(), sub) + info["label"].append(l) + + return info + + def _handle_framelexunit_elt(self, elt): + """Load the lexical unit info from an xml element in a frame's xml file.""" + luinfo = AttrDict() + luinfo["_type"] = "lu" + luinfo = self._load_xml_attributes(luinfo, elt) + luinfo["definition"] = "" + luinfo["definitionMarkup"] = "" + luinfo["sentenceCount"] = PrettyDict() + luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes + luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes + + for sub in elt: + if sub.tag.endswith("definition"): + luinfo["definitionMarkup"] = sub.text + luinfo["definition"] = self._strip_tags(sub.text) + elif sub.tag.endswith("sentenceCount"): + luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub) + elif sub.tag.endswith("lexeme"): + lexemeinfo = 
self._load_xml_attributes(PrettyDict(), sub) + if not isinstance(lexemeinfo.name, str): + # some lexeme names are ints by default: e.g., + # thousand.num has lexeme with name="1000" + lexemeinfo.name = str(lexemeinfo.name) + luinfo["lexemes"].append(lexemeinfo) + elif sub.tag.endswith("semType"): + semtypeinfo = self._load_xml_attributes(PrettyDict(), sub) + luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) + + # sort lexemes by 'order' attribute + # otherwise, e.g., 'write down.v' may have lexemes in wrong order + luinfo["lexemes"].sort(key=lambda x: x.order) + + return luinfo + + def _handle_lexunit_elt(self, elt, ignorekeys): + """ + Load full info for a lexical unit from its xml file. + This should only be called when accessing corpus annotations + (which are not included in frame files). + """ + luinfo = self._load_xml_attributes(AttrDict(), elt) + luinfo["_type"] = "lu" + luinfo["definition"] = "" + luinfo["definitionMarkup"] = "" + luinfo["subCorpus"] = PrettyList() + luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes + luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes + for k in ignorekeys: + if k in luinfo: + del luinfo[k] + + for sub in elt: + if sub.tag.endswith("header"): + continue # not used + elif sub.tag.endswith("valences"): + continue # not used + elif sub.tag.endswith("definition") and "definition" not in ignorekeys: + luinfo["definitionMarkup"] = sub.text + luinfo["definition"] = self._strip_tags(sub.text) + elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys: + sc = self._handle_lusubcorpus_elt(sub) + if sc is not None: + luinfo["subCorpus"].append(sc) + elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys: + luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub)) + elif sub.tag.endswith("semType") and "semType" not in ignorekeys: + semtypeinfo = self._load_xml_attributes(AttrDict(), sub) + luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) + + return 
luinfo + + def _handle_lusubcorpus_elt(self, elt): + """Load a subcorpus of a lexical unit from the given xml.""" + sc = AttrDict() + try: + sc["name"] = elt.get("name") + except AttributeError: + return None + sc["_type"] = "lusubcorpus" + sc["sentence"] = [] + + for sub in elt: + if sub.tag.endswith("sentence"): + s = self._handle_lusentence_elt(sub) + if s is not None: + sc["sentence"].append(s) + + return sc + + def _handle_lusentence_elt(self, elt): + """Load a sentence from a subcorpus of an LU from xml.""" + info = self._load_xml_attributes(AttrDict(), elt) + info["_type"] = "lusentence" + info["annotationSet"] = [] + info["_ascii"] = types.MethodType( + _annotation_ascii, info + ) # attach a method for this instance + for sub in elt: + if sub.tag.endswith("text"): + info["text"] = self._strip_tags(sub.text) + elif sub.tag.endswith("annotationSet"): + annset = self._handle_luannotationset_elt( + sub, is_pos=(len(info["annotationSet"]) == 0) + ) + if annset is not None: + assert annset.status == "UNANN" or "FE" in annset, annset + if annset.status != "UNANN": + info["frameAnnotation"] = annset + # copy layer info up to current level + for k in ( + "Target", + "FE", + "FE2", + "FE3", + "GF", + "PT", + "POS", + "POS_tagset", + "Other", + "Sent", + "Verb", + "Noun", + "Adj", + "Adv", + "Prep", + "Scon", + "Art", + ): + if k in annset: + info[k] = annset[k] + info["annotationSet"].append(annset) + annset["sent"] = info + annset["text"] = info.text + return info + + def _handle_luannotationset_elt(self, elt, is_pos=False): + """Load an annotation set from a sentence in an subcorpus of an LU""" + info = self._load_xml_attributes(AttrDict(), elt) + info["_type"] = "posannotationset" if is_pos else "luannotationset" + info["layer"] = [] + info["_ascii"] = types.MethodType( + _annotation_ascii, info + ) # attach a method for this instance + + if "cxnID" in info: # ignoring construction annotations for now. 
+ return info + + for sub in elt: + if sub.tag.endswith("layer"): + l = self._handle_lulayer_elt(sub) + if l is not None: + overt = [] + ni = {} # null instantiations + + info["layer"].append(l) + for lbl in l.label: + if "start" in lbl: + thespan = (lbl.start, lbl.end + 1, lbl.name) + if l.name not in ( + "Sent", + "Other", + ): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans + assert thespan not in overt, (info.ID, l.name, thespan) + overt.append(thespan) + else: # null instantiation + if lbl.name in ni: + self._warn( + "FE with multiple NI entries:", + lbl.name, + ni[lbl.name], + lbl.itype, + ) + else: + ni[lbl.name] = lbl.itype + overt = sorted(overt) + + if l.name == "Target": + if not overt: + self._warn( + "Skipping empty Target layer in annotation set ID={}".format( + info.ID + ) + ) + continue + assert all(lblname == "Target" for i, j, lblname in overt) + if "Target" in info: + self._warn( + "Annotation set {} has multiple Target layers".format( + info.ID + ) + ) + else: + info["Target"] = [(i, j) for (i, j, _) in overt] + elif l.name == "FE": + if l.rank == 1: + assert "FE" not in info + info["FE"] = (overt, ni) + # assert False,info + else: + # sometimes there are 3 FE layers! e.g. 
Change_position_on_a_scale.fall.v + assert 2 <= l.rank <= 3, l.rank + k = "FE" + str(l.rank) + assert k not in info + info[k] = (overt, ni) + elif l.name in ("GF", "PT"): + assert l.rank == 1 + info[l.name] = overt + elif l.name in ("BNC", "PENN"): + assert l.rank == 1 + info["POS"] = overt + info["POS_tagset"] = l.name + else: + if is_pos: + if l.name not in ("NER", "WSL"): + self._warn( + "Unexpected layer in sentence annotationset:", + l.name, + ) + else: + if l.name not in ( + "Sent", + "Verb", + "Noun", + "Adj", + "Adv", + "Prep", + "Scon", + "Art", + "Other", + ): + self._warn( + "Unexpected layer in frame annotationset:", l.name + ) + info[l.name] = overt + if not is_pos and "cxnID" not in info: + if "Target" not in info: + self._warn(f"Missing target in annotation set ID={info.ID}") + assert "FE" in info + if "FE3" in info: + assert "FE2" in info + + return info + + def _handle_lulayer_elt(self, elt): + """Load a layer from an annotation set""" + layer = self._load_xml_attributes(AttrDict(), elt) + layer["_type"] = "lulayer" + layer["label"] = [] + + for sub in elt: + if sub.tag.endswith("label"): + l = self._load_xml_attributes(AttrDict(), sub) + if l is not None: + layer["label"].append(l) + return layer + + def _handle_fe_elt(self, elt): + feinfo = self._load_xml_attributes(AttrDict(), elt) + feinfo["_type"] = "fe" + feinfo["definition"] = "" + feinfo["definitionMarkup"] = "" + feinfo["semType"] = None + feinfo["requiresFE"] = None + feinfo["excludesFE"] = None + for sub in elt: + if sub.tag.endswith("definition"): + feinfo["definitionMarkup"] = sub.text + feinfo["definition"] = self._strip_tags(sub.text) + elif sub.tag.endswith("semType"): + stinfo = self._load_xml_attributes(AttrDict(), sub) + feinfo["semType"] = self.semtype(stinfo.ID) + elif sub.tag.endswith("requiresFE"): + feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub) + elif sub.tag.endswith("excludesFE"): + feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub) + + 
return feinfo + + def _handle_semtype_elt(self, elt, tagspec=None): + semt = self._load_xml_attributes(AttrDict(), elt) + semt["_type"] = "semtype" + semt["superType"] = None + semt["subTypes"] = PrettyList() + for sub in elt: + if sub.text is not None: + semt["definitionMarkup"] = sub.text + semt["definition"] = self._strip_tags(sub.text) + else: + supertypeinfo = self._load_xml_attributes(AttrDict(), sub) + semt["superType"] = supertypeinfo + # the supertype may not have been loaded yet + + return semt + + +# +# Demo +# +def demo(): + from nltk.corpus import framenet as fn + + # + # It is not necessary to explicitly build the indexes by calling + # buildindexes(). We do this here just for demo purposes. If the + # indexes are not built explicitly, they will be built as needed. + # + print("Building the indexes...") + fn.buildindexes() + + # + # Get some statistics about the corpus + # + print("Number of Frames:", len(fn.frames())) + print("Number of Lexical Units:", len(fn.lus())) + print("Number of annotated documents:", len(fn.docs())) + print() + + # + # Frames + # + print( + 'getting frames whose name matches the (case insensitive) regex: "(?i)medical"' + ) + medframes = fn.frames(r"(?i)medical") + print(f'Found {len(medframes)} Frames whose name matches "(?i)medical":') + print([(f.name, f.ID) for f in medframes]) + + # + # store the first frame in the list of frames + # + tmp_id = medframes[0].ID + m_frame = fn.frame(tmp_id) # reads all info for the frame + + # + # get the frame relations + # + print( + '\nNumber of frame relations for the "{}" ({}) frame:'.format( + m_frame.name, m_frame.ID + ), + len(m_frame.frameRelations), + ) + for fr in m_frame.frameRelations: + print(" ", fr) + + # + # get the names of the Frame Elements + # + print( + f'\nNumber of Frame Elements in the "{m_frame.name}" frame:', + len(m_frame.FE), + ) + print(" ", [x for x in m_frame.FE]) + + # + # get the names of the "Core" Frame Elements + # + print(f'\nThe "core" Frame Elements 
in the "{m_frame.name}" frame:') + print(" ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"]) + + # + # get all of the Lexical Units that are incorporated in the + # 'Ailment' FE of the 'Medical_conditions' frame (id=239) + # + print('\nAll Lexical Units that are incorporated in the "Ailment" FE:') + m_frame = fn.frame(239) + ailment_lus = [ + x + for x in m_frame.lexUnit.values() + if "incorporatedFE" in x and x.incorporatedFE == "Ailment" + ] + print(" ", [x.name for x in ailment_lus]) + + # + # get all of the Lexical Units for the frame + # + print( + f'\nNumber of Lexical Units in the "{m_frame.name}" frame:', + len(m_frame.lexUnit), + ) + print(" ", [x.name for x in m_frame.lexUnit.values()][:5], "...") + + # + # get basic info on the second LU in the frame + # + tmp_id = m_frame.lexUnit["ailment.n"].ID # grab the id of the specified LU + luinfo = fn.lu_basic(tmp_id) # get basic info on the LU + print(f"\nInformation on the LU: {luinfo.name}") + pprint(luinfo) + + # + # Get a list of all of the corpora used for fulltext annotation + # + print("\nNames of all of the corpora used for fulltext annotation:") + allcorpora = {x.corpname for x in fn.docs_metadata()} + pprint(list(allcorpora)) + + # + # Get the names of the annotated documents in the first corpus + # + firstcorp = list(allcorpora)[0] + firstcorp_docs = fn.docs(firstcorp) + print(f'\nNames of the annotated documents in the "{firstcorp}" corpus:') + pprint([x.filename for x in firstcorp_docs]) + + # + # Search for frames containing LUs whose name attribute matches a + # regexp pattern. + # + # Note: if you were going to be doing a lot of this type of + # searching, you'd want to build an index that maps from + # lemmas to frames because each time frames_by_lemma() is + # called, it has to search through ALL of the frame XML files + # in the db. 
+ print( + '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":' + ) + pprint(fn.frames_by_lemma(r"^run.v$")) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ieer.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ieer.py new file mode 100644 index 0000000000000000000000000000000000000000..8e969ecc02104ea448c715760715f3fb5f6c27fe --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ieer.py @@ -0,0 +1,116 @@ +# Natural Language Toolkit: IEER Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Corpus reader for the Information Extraction and Entity Recognition Corpus. + +NIST 1999 Information Extraction: Entity Recognition Evaluation +https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm + +This corpus contains the NEWSWIRE development test data for the +NIST 1999 IE-ER Evaluation. The files were taken from the +subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt`` +and filenames were shortened. + +The corpus contains the following files: APW_19980314, APW_19980424, +APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. +""" + +import nltk +from nltk.corpus.reader.api import * + +#: A dictionary whose keys are the names of documents in this corpus; +#: and whose values are descriptions of those documents' contents. +titles = { + "APW_19980314": "Associated Press Weekly, 14 March 1998", + "APW_19980424": "Associated Press Weekly, 24 April 1998", + "APW_19980429": "Associated Press Weekly, 29 April 1998", + "NYT_19980315": "New York Times, 15 March 1998", + "NYT_19980403": "New York Times, 3 April 1998", + "NYT_19980407": "New York Times, 7 April 1998", +} + +#: A list of all documents in this corpus. 
+documents = sorted(titles) + + +class IEERDocument: + def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""): + self.text = text + self.docno = docno + self.doctype = doctype + self.date_time = date_time + self.headline = headline + + def __repr__(self): + if self.headline: + headline = " ".join(self.headline.leaves()) + else: + headline = ( + " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..." + ) + if self.docno is not None: + return f"" + else: + return "" % headline + + +class IEERCorpusReader(CorpusReader): + """ """ + + def docs(self, fileids=None): + return concat( + [ + StreamBackedCorpusView(fileid, self._read_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def parsed_docs(self, fileids=None): + return concat( + [ + StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def _read_parsed_block(self, stream): + # TODO: figure out while empty documents are being returned + return [ + self._parse(doc) + for doc in self._read_block(stream) + if self._parse(doc).docno is not None + ] + + def _parse(self, doc): + val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT") + if isinstance(val, dict): + return IEERDocument(**val) + else: + return IEERDocument(val) + + def _read_block(self, stream): + out = [] + # Skip any preamble. 
+ while True: + line = stream.readline() + if not line: + break + if line.strip() == "": + break + out.append(line) + # Read the document + while True: + line = stream.readline() + if not line: + break + out.append(line) + if line.strip() == "": + break + # Return the document + return ["\n".join(out)] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/indian.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/indian.py new file mode 100644 index 0000000000000000000000000000000000000000..95cf74a9352c9e46b52138162771314feb041c5e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/indian.py @@ -0,0 +1,93 @@ +# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Indian Language POS-Tagged Corpus +Collected by A Kumaran, Microsoft Research, India +Distributed with permission + +Contents: + - Bangla: IIT Kharagpur + - Hindi: Microsoft Research India + - Marathi: IIT Bombay + - Telugu: IIIT Hyderabad +""" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tag import map_tag, str2tuple + + +class IndianCorpusReader(CorpusReader): + """ + List of words, one per line. Blank lines are ignored. 
+ """ + + def words(self, fileids=None): + return concat( + [ + IndianCorpusView(fileid, enc, False, False) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_words(self, fileids=None, tagset=None): + if tagset and tagset != self._tagset: + tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) + else: + tag_mapping_function = None + return concat( + [ + IndianCorpusView(fileid, enc, True, False, tag_mapping_function) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + return concat( + [ + IndianCorpusView(fileid, enc, False, True) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_sents(self, fileids=None, tagset=None): + if tagset and tagset != self._tagset: + tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) + else: + tag_mapping_function = None + return concat( + [ + IndianCorpusView(fileid, enc, True, True, tag_mapping_function) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + +class IndianCorpusView(StreamBackedCorpusView): + def __init__( + self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None + ): + self._tagged = tagged + self._group_by_sent = group_by_sent + self._tag_mapping_function = tag_mapping_function + StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) + + def read_block(self, stream): + line = stream.readline() + if line.startswith("<"): + return [] + sent = [str2tuple(word, sep="_") for word in line.split()] + if self._tag_mapping_function: + sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] + if not self._tagged: + sent = [w for (w, t) in sent] + if self._group_by_sent: + return [sent] + else: + return sent diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ipipan.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ipipan.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1a7ff6523ea4a84bd353cc3d707f75524b4661 --- /dev/null +++ 
b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ipipan.py @@ -0,0 +1,356 @@ +# Natural Language Toolkit: IPI PAN Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Konrad Goluchowski +# URL: +# For license information, see LICENSE.TXT + +import functools + +from nltk.corpus.reader.api import CorpusReader +from nltk.corpus.reader.util import StreamBackedCorpusView, concat + + +def _parse_args(fun): + @functools.wraps(fun) + def decorator(self, fileids=None, **kwargs): + kwargs.pop("tags", None) + if not fileids: + fileids = self.fileids() + return fun(self, fileids, **kwargs) + + return decorator + + +class IPIPANCorpusReader(CorpusReader): + """ + Corpus reader designed to work with corpus created by IPI PAN. + See http://korpus.pl/en/ for more details about IPI PAN corpus. + + The corpus includes information about text domain, channel and categories. + You can access possible values using ``domains()``, ``channels()`` and + ``categories()``. You can use also this metadata to filter files, e.g.: + ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``. + + The reader supports methods: words, sents, paras and their tagged versions. + You can get part of speech instead of full tag by giving "simplify_tags=True" + parameter, e.g.: ``tagged_sents(simplify_tags=True)``. + + Also you can get all tags disambiguated tags specifying parameter + "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``. + + You can get all tags that were assigned by a morphological analyzer specifying + parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``. + + The IPIPAN Corpus contains tags indicating if there is a space between two + tokens. To add special "no space" markers, you should specify parameter + "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``. 
+ As a result in place where there should be no space between two tokens new + pair ('', 'no-space') will be inserted (for tagged data) and just '' for + methods without tags. + + The corpus reader can also try to append spaces between words. To enable this + option, specify parameter "append_space=True", e.g. ``words(append_space=True)``. + As a result either ' ' or (' ', 'space') will be inserted between tokens. + + By default, xml entities like " and & are replaced by corresponding + characters. You can turn off this feature, specifying parameter + "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``. + """ + + def __init__(self, root, fileids): + CorpusReader.__init__(self, root, fileids, None, None) + + def channels(self, fileids=None): + if not fileids: + fileids = self.fileids() + return self._parse_header(fileids, "channel") + + def domains(self, fileids=None): + if not fileids: + fileids = self.fileids() + return self._parse_header(fileids, "domain") + + def categories(self, fileids=None): + if not fileids: + fileids = self.fileids() + return [ + self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm") + ] + + def fileids(self, channels=None, domains=None, categories=None): + if channels is not None and domains is not None and categories is not None: + raise ValueError( + "You can specify only one of channels, domains " + "and categories parameter at once" + ) + if channels is None and domains is None and categories is None: + return CorpusReader.fileids(self) + if isinstance(channels, str): + channels = [channels] + if isinstance(domains, str): + domains = [domains] + if isinstance(categories, str): + categories = [categories] + if channels: + return self._list_morph_files_by("channel", channels) + elif domains: + return self._list_morph_files_by("domain", domains) + else: + return self._list_morph_files_by( + "keyTerm", categories, map=self._map_category + ) + + @_parse_args + def sents(self, fileids=None, **kwargs): + 
return concat( + [ + self._view( + fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs + ) + for fileid in self._list_morph_files(fileids) + ] + ) + + @_parse_args + def paras(self, fileids=None, **kwargs): + return concat( + [ + self._view( + fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs + ) + for fileid in self._list_morph_files(fileids) + ] + ) + + @_parse_args + def words(self, fileids=None, **kwargs): + return concat( + [ + self._view(fileid, tags=False, **kwargs) + for fileid in self._list_morph_files(fileids) + ] + ) + + @_parse_args + def tagged_sents(self, fileids=None, **kwargs): + return concat( + [ + self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs) + for fileid in self._list_morph_files(fileids) + ] + ) + + @_parse_args + def tagged_paras(self, fileids=None, **kwargs): + return concat( + [ + self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs) + for fileid in self._list_morph_files(fileids) + ] + ) + + @_parse_args + def tagged_words(self, fileids=None, **kwargs): + return concat( + [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)] + ) + + def _list_morph_files(self, fileids): + return [f for f in self.abspaths(fileids)] + + def _list_header_files(self, fileids): + return [ + f.replace("morph.xml", "header.xml") + for f in self._list_morph_files(fileids) + ] + + def _parse_header(self, fileids, tag): + values = set() + for f in self._list_header_files(fileids): + values_list = self._get_tag(f, tag) + for v in values_list: + values.add(v) + return list(values) + + def _list_morph_files_by(self, tag, values, map=None): + fileids = self.fileids() + ret_fileids = set() + for f in fileids: + fp = self.abspath(f).replace("morph.xml", "header.xml") + values_list = self._get_tag(fp, tag) + for value in values_list: + if map is not None: + value = map(value) + if value in values: + ret_fileids.add(f) + return list(ret_fileids) + + def _get_tag(self, f, tag): + tags = [] + with open(f) 
as infile: + header = infile.read() + tag_end = 0 + while True: + tag_pos = header.find("<" + tag, tag_end) + if tag_pos < 0: + return tags + tag_end = header.find("", tag_pos) + tags.append(header[tag_pos + len(tag) + 2 : tag_end]) + + def _map_category(self, cat): + pos = cat.find(">") + if pos == -1: + return cat + else: + return cat[pos + 1 :] + + def _view(self, filename, **kwargs): + tags = kwargs.pop("tags", True) + mode = kwargs.pop("mode", 0) + simplify_tags = kwargs.pop("simplify_tags", False) + one_tag = kwargs.pop("one_tag", True) + disamb_only = kwargs.pop("disamb_only", True) + append_no_space = kwargs.pop("append_no_space", False) + append_space = kwargs.pop("append_space", False) + replace_xmlentities = kwargs.pop("replace_xmlentities", True) + + if len(kwargs) > 0: + raise ValueError("Unexpected arguments: %s" % kwargs.keys()) + if not one_tag and not disamb_only: + raise ValueError( + "You cannot specify both one_tag=False and " "disamb_only=False" + ) + if not tags and (simplify_tags or not one_tag or not disamb_only): + raise ValueError( + "You cannot specify simplify_tags, one_tag or " + "disamb_only with functions other than tagged_*" + ) + + return IPIPANCorpusView( + filename, + tags=tags, + mode=mode, + simplify_tags=simplify_tags, + one_tag=one_tag, + disamb_only=disamb_only, + append_no_space=append_no_space, + append_space=append_space, + replace_xmlentities=replace_xmlentities, + ) + + +class IPIPANCorpusView(StreamBackedCorpusView): + + WORDS_MODE = 0 + SENTS_MODE = 1 + PARAS_MODE = 2 + + def __init__(self, filename, startpos=0, **kwargs): + StreamBackedCorpusView.__init__(self, filename, None, startpos, None) + self.in_sentence = False + self.position = 0 + + self.show_tags = kwargs.pop("tags", True) + self.disamb_only = kwargs.pop("disamb_only", True) + self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE) + self.simplify_tags = kwargs.pop("simplify_tags", False) + self.one_tag = kwargs.pop("one_tag", True) + 
self.append_no_space = kwargs.pop("append_no_space", False) + self.append_space = kwargs.pop("append_space", False) + self.replace_xmlentities = kwargs.pop("replace_xmlentities", True) + + def read_block(self, stream): + sentence = [] + sentences = [] + space = False + no_space = False + + tags = set() + + lines = self._read_data(stream) + + while True: + + # we may have only part of last line + if len(lines) <= 1: + self._seek(stream) + lines = self._read_data(stream) + + if lines == [""]: + assert not sentences + return [] + + line = lines.pop() + self.position += len(line) + 1 + + if line.startswith('"): + if self.append_space: + no_space = True + if self.append_no_space: + if self.show_tags: + sentence.append(("", "no-space")) + else: + sentence.append("") + elif line.startswith(" +# URL: +# For license information, see LICENSE.TXT + +# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html + +import re + +from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader +from nltk.corpus.reader.util import ( + FileSystemPathPointer, + find_corpus_fileids, + read_blankline_block, +) +from nltk.parse import DependencyGraph + +# default function to convert morphlist to str for tree representation +_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS") + + +class KNBCorpusReader(SyntaxCorpusReader): + """ + This class implements: + - ``__init__``, which specifies the location of the corpus + and a method for detecting the sentence blocks in corpus files. + - ``_read_block``, which reads a block from the input stream. + - ``_word``, which takes a block and returns a list of list of words. + - ``_tag``, which takes a block and returns a list of list of tagged + words. + - ``_parse``, which takes a block and returns a list of parsed + sentences. + + The structure of tagged words: + tagged_word = (word(str), tags(tuple)) + tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) 
+ + Usage example + + >>> from nltk.corpus.util import LazyCorpusLoader + >>> knbc = LazyCorpusLoader( + ... 'knbc/corpus1', + ... KNBCorpusReader, + ... r'.*/KN.*', + ... encoding='euc-jp', + ... ) + + >>> len(knbc.sents()[0]) + 9 + + """ + + def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default): + """ + Initialize KNBCorpusReader + morphs2str is a function to convert morphlist to str for tree representation + for _parse() + """ + SyntaxCorpusReader.__init__(self, root, fileids, encoding) + self.morphs2str = morphs2str + + def _read_block(self, stream): + # blocks are split by blankline (or EOF) - default + return read_blankline_block(stream) + + def _word(self, t): + res = [] + for line in t.splitlines(): + # ignore the Bunsets headers + if not re.match(r"EOS|\*|\#|\+", line): + cells = line.strip().split(" ") + res.append(cells[0]) + + return res + + # ignores tagset argument + def _tag(self, t, tagset=None): + res = [] + for line in t.splitlines(): + # ignore the Bunsets headers + if not re.match(r"EOS|\*|\#|\+", line): + cells = line.strip().split(" ") + # convert cells to morph tuples + res.append((cells[0], " ".join(cells[1:]))) + + return res + + def _parse(self, t): + dg = DependencyGraph() + i = 0 + for line in t.splitlines(): + if line[0] in "*+": + # start of bunsetsu or tag + + cells = line.strip().split(" ", 3) + m = re.match(r"([\-0-9]*)([ADIP])", cells[1]) + + assert m is not None + + node = dg.nodes[i] + node.update({"address": i, "rel": m.group(2), "word": []}) + + dep_parent = int(m.group(1)) + + if dep_parent == -1: + dg.root = node + else: + dg.nodes[dep_parent]["deps"].append(i) + + i += 1 + elif line[0] != "#": + # normal morph + cells = line.strip().split(" ") + # convert cells to morph tuples + morph = cells[0], " ".join(cells[1:]) + dg.nodes[i - 1]["word"].append(morph) + + if self.morphs2str: + for node in dg.nodes.values(): + node["word"] = self.morphs2str(node["word"]) + + return dg.tree() + + 
+###################################################################### +# Demo +###################################################################### + + +def demo(): + + import nltk + from nltk.corpus.util import LazyCorpusLoader + + root = nltk.data.find("corpora/knbc/corpus1") + fileids = [ + f + for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") + if re.search(r"\d\-\d\-[\d]+\-[\d]+", f) + ] + + def _knbc_fileids_sort(x): + cells = x.split("-") + return (cells[0], int(cells[1]), int(cells[2]), int(cells[3])) + + knbc = LazyCorpusLoader( + "knbc/corpus1", + KNBCorpusReader, + sorted(fileids, key=_knbc_fileids_sort), + encoding="euc-jp", + ) + + print(knbc.fileids()[:10]) + print("".join(knbc.words()[:100])) + + print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2])) + + knbc.morphs2str = lambda morphs: "/".join( + "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS" + ).encode("utf-8") + + print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])) + + print( + "\n".join( + " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent) + for sent in knbc.tagged_sents()[0:2] + ) + ) + + +def test(): + + from nltk.corpus.util import LazyCorpusLoader + + knbc = LazyCorpusLoader( + "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp" + ) + assert isinstance(knbc.words()[0], str) + assert isinstance(knbc.sents()[0][0], str) + assert isinstance(knbc.tagged_words()[0], tuple) + assert isinstance(knbc.tagged_sents()[0][0], tuple) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/lin.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/lin.py new file mode 100644 index 0000000000000000000000000000000000000000..d5f8af00a8f9de16d4383e77fffbfa8ed9bad7fe --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/lin.py @@ -0,0 +1,183 @@ +# Natural Language Toolkit: Lin's Thesaurus +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Dan Blanchard +# URL: +# 
For license information, see LICENSE.txt + +import re +from collections import defaultdict +from functools import reduce + +from nltk.corpus.reader import CorpusReader + + +class LinThesaurusCorpusReader(CorpusReader): + """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin.""" + + # Compiled regular expression for extracting the key from the first line of each + # thesaurus entry + _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+') + + @staticmethod + def __defaultdict_factory(): + """Factory for creating defaultdict of defaultdict(dict)s""" + return defaultdict(dict) + + def __init__(self, root, badscore=0.0): + """ + Initialize the thesaurus. + + :param root: root directory containing thesaurus LISP files + :type root: C{string} + :param badscore: the score to give to words which do not appear in each other's sets of synonyms + :type badscore: C{float} + """ + + super().__init__(root, r"sim[A-Z]\.lsp") + self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory) + self._badscore = badscore + for path, encoding, fileid in self.abspaths( + include_encoding=True, include_fileid=True + ): + with open(path) as lin_file: + first = True + for line in lin_file: + line = line.strip() + # Start of entry + if first: + key = LinThesaurusCorpusReader._key_re.sub(r"\1", line) + first = False + # End of entry + elif line == "))": + first = True + # Lines with pairs of ngrams and scores + else: + split_line = line.split("\t") + if len(split_line) == 2: + ngram, score = split_line + self._thesaurus[fileid][key][ngram.strip('"')] = float( + score + ) + + def similarity(self, ngram1, ngram2, fileid=None): + """ + Returns the similarity score for two ngrams. + + :param ngram1: first ngram to compare + :type ngram1: C{string} + :param ngram2: second ngram to compare + :type ngram2: C{string} + :param fileid: thesaurus fileid to search in. If None, search all fileids. 
+ :type fileid: C{string} + :return: If fileid is specified, just the score for the two ngrams; otherwise, + list of tuples of fileids and scores. + """ + # Entries don't contain themselves, so make sure similarity between item and itself is 1.0 + if ngram1 == ngram2: + if fileid: + return 1.0 + else: + return [(fid, 1.0) for fid in self._fileids] + else: + if fileid: + return ( + self._thesaurus[fileid][ngram1][ngram2] + if ngram2 in self._thesaurus[fileid][ngram1] + else self._badscore + ) + else: + return [ + ( + fid, + ( + self._thesaurus[fid][ngram1][ngram2] + if ngram2 in self._thesaurus[fid][ngram1] + else self._badscore + ), + ) + for fid in self._fileids + ] + + def scored_synonyms(self, ngram, fileid=None): + """ + Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram + + :param ngram: ngram to lookup + :type ngram: C{string} + :param fileid: thesaurus fileid to search in. If None, search all fileids. + :type fileid: C{string} + :return: If fileid is specified, list of tuples of scores and synonyms; otherwise, + list of tuples of fileids and lists, where inner lists consist of tuples of + scores and synonyms. + """ + if fileid: + return self._thesaurus[fileid][ngram].items() + else: + return [ + (fileid, self._thesaurus[fileid][ngram].items()) + for fileid in self._fileids + ] + + def synonyms(self, ngram, fileid=None): + """ + Returns a list of synonyms for the current ngram. + + :param ngram: ngram to lookup + :type ngram: C{string} + :param fileid: thesaurus fileid to search in. If None, search all fileids. + :type fileid: C{string} + :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and + lists, where inner lists contain synonyms. 
+ """ + if fileid: + return self._thesaurus[fileid][ngram].keys() + else: + return [ + (fileid, self._thesaurus[fileid][ngram].keys()) + for fileid in self._fileids + ] + + def __contains__(self, ngram): + """ + Determines whether or not the given ngram is in the thesaurus. + + :param ngram: ngram to lookup + :type ngram: C{string} + :return: whether the given ngram is in the thesaurus. + """ + return reduce( + lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), + self._fileids, + False, + ) + + +###################################################################### +# Demo +###################################################################### + + +def demo(): + from nltk.corpus import lin_thesaurus as thes + + word1 = "business" + word2 = "enterprise" + print("Getting synonyms for " + word1) + print(thes.synonyms(word1)) + + print("Getting scored synonyms for " + word1) + print(thes.scored_synonyms(word1)) + + print("Getting synonyms from simN.lsp (noun subsection) for " + word1) + print(thes.synonyms(word1, fileid="simN.lsp")) + + print("Getting synonyms from simN.lsp (noun subsection) for " + word1) + print(thes.synonyms(word1, fileid="simN.lsp")) + + print(f"Similarity score for {word1} and {word2}:") + print(thes.similarity(word1, word2)) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/markdown.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/markdown.py new file mode 100644 index 0000000000000000000000000000000000000000..8df4f924e25426dbe30ef2484f3a0cb4cb1a1740 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/markdown.py @@ -0,0 +1,342 @@ +from collections import namedtuple +from functools import partial, wraps + +from nltk.corpus.reader.api import CategorizedCorpusReader +from nltk.corpus.reader.plaintext import PlaintextCorpusReader +from nltk.corpus.reader.util import concat, read_blankline_block +from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize + 
+ +def comma_separated_string_args(func): + """ + A decorator that allows a function to be called with + a single string of comma-separated values which become + individual function arguments. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + _args = list() + for arg in args: + if isinstance(arg, str): + _args.append({part.strip() for part in arg.split(",")}) + elif isinstance(arg, list): + _args.append(set(arg)) + else: + _args.append(arg) + for name, value in kwargs.items(): + if isinstance(value, str): + kwargs[name] = {part.strip() for part in value.split(",")} + return func(*_args, **kwargs) + + return wrapper + + +def read_parse_blankline_block(stream, parser): + block = read_blankline_block(stream) + if block: + return [parser.render(block[0])] + return block + + +class MarkdownBlock: + def __init__(self, content): + self.content = content + self.truncate_at = 16 + + def __repr__(self): + return f"{self.__class__.__name__}(content={repr(str(self))})" + + def __str__(self): + return ( + f"{self.content[:self.truncate_at]}" + f"{'...' 
if len(self.content) > self.truncate_at else ''}" + ) + + @property + def raw(self): + return self.content + + @property + def words(self): + return word_tokenize(self.content) + + @property + def sents(self): + return [word_tokenize(sent) for sent in sent_tokenize(self.content)] + + @property + def paras(self): + return [ + [word_tokenize(sent) for sent in sent_tokenize(para)] + for para in blankline_tokenize(self.content) + ] + + +class CodeBlock(MarkdownBlock): + def __init__(self, language, *args): + self.language = language + super().__init__(*args) + + @property + def sents(self): + return [word_tokenize(line) for line in self.content.splitlines()] + + @property + def lines(self): + return self.content.splitlines() + + @property + def paras(self): + return [ + [word_tokenize(line) for line in para.splitlines()] + for para in blankline_tokenize(self.content) + ] + + +class MarkdownSection(MarkdownBlock): + def __init__(self, heading, level, *args): + self.heading = heading + self.level = level + super().__init__(*args) + + +Image = namedtuple("Image", "label, src, title") +Link = namedtuple("Link", "label, href, title") +List = namedtuple("List", "is_ordered, items") + + +class MarkdownCorpusReader(PlaintextCorpusReader): + def __init__(self, *args, parser=None, **kwargs): + from markdown_it import MarkdownIt + from mdit_plain.renderer import RendererPlain + from mdit_py_plugins.front_matter import front_matter_plugin + + self.parser = parser + if self.parser is None: + self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain) + self.parser.use(front_matter_plugin) + + kwargs.setdefault( + "para_block_reader", partial(read_parse_blankline_block, parser=self.parser) + ) + super().__init__(*args, **kwargs) + + # This override takes care of removing markup. 
+ def _read_word_block(self, stream): + words = list() + for para in self._para_block_reader(stream): + words.extend(self._word_tokenizer.tokenize(para)) + return words + + +class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader): + """ + A reader for markdown corpora whose documents are divided into + categories based on their file identifiers. + + Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader: + https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader + """ + + def __init__(self, *args, cat_field="tags", **kwargs): + """ + Initialize the corpus reader. Categorization arguments + (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to + the ``CategorizedCorpusReader`` constructor. The remaining arguments + are passed to the ``MarkdownCorpusReader`` constructor. + """ + cat_args = ["cat_pattern", "cat_map", "cat_file"] + if not any(arg in kwargs for arg in cat_args): + # Initialize with a blank map now, + # and try to build categories from document metadata later. 
+ kwargs["cat_map"] = dict() + CategorizedCorpusReader.__init__(self, kwargs) + MarkdownCorpusReader.__init__(self, *args, **kwargs) + + # Map file IDs to categories if self._map exists but is still empty: + if self._map is not None and not self._map: + for file_id in self._fileids: + metadata = self.metadata(file_id) + if metadata: + self._map[file_id] = metadata[0].get(cat_field, []) + + ### Begin CategorizedCorpusReader Overrides + @comma_separated_string_args + def categories(self, fileids=None): + return super().categories(fileids) + + @comma_separated_string_args + def fileids(self, categories=None): + if categories is None: + return self._fileids + return super().fileids(categories) + + ### End CategorizedCorpusReader Overrides + + ### Begin MarkdownCorpusReader Overrides + @comma_separated_string_args + def raw(self, fileids=None, categories=None): + return super().raw(self._resolve(fileids, categories)) + + @comma_separated_string_args + def words(self, fileids=None, categories=None): + return super().words(self._resolve(fileids, categories)) + + @comma_separated_string_args + def sents(self, fileids=None, categories=None): + return super().sents(self._resolve(fileids, categories)) + + @comma_separated_string_args + def paras(self, fileids=None, categories=None): + return super().paras(self._resolve(fileids, categories)) + + ### End MarkdownCorpusReader Overrides + + def concatenated_view(self, reader, fileids, categories): + return concat( + [ + self.CorpusView(path, reader, encoding=enc) + for (path, enc) in self.abspaths( + self._resolve(fileids, categories), include_encoding=True + ) + ] + ) + + def metadata_reader(self, stream): + from yaml import safe_load + + return [ + safe_load(t.content) + for t in self.parser.parse(stream.read()) + if t.type == "front_matter" + ] + + @comma_separated_string_args + def metadata(self, fileids=None, categories=None): + return self.concatenated_view(self.metadata_reader, fileids, categories) + + def 
blockquote_reader(self, stream): + tokens = self.parser.parse(stream.read()) + opening_tokens = filter( + lambda t: t.level == 0 and t.type == "blockquote_open", tokens + ) + closing_tokens = filter( + lambda t: t.level == 0 and t.type == "blockquote_close", tokens + ) + blockquotes = list() + for o, c in zip(opening_tokens, closing_tokens): + opening_index = tokens.index(o) + closing_index = tokens.index(c, opening_index) + blockquotes.append(tokens[opening_index : closing_index + 1]) + return [ + MarkdownBlock( + self.parser.renderer.render(block, self.parser.options, env=None) + ) + for block in blockquotes + ] + + @comma_separated_string_args + def blockquotes(self, fileids=None, categories=None): + return self.concatenated_view(self.blockquote_reader, fileids, categories) + + def code_block_reader(self, stream): + return [ + CodeBlock( + t.info, + t.content, + ) + for t in self.parser.parse(stream.read()) + if t.level == 0 and t.type in ("fence", "code_block") + ] + + @comma_separated_string_args + def code_blocks(self, fileids=None, categories=None): + return self.concatenated_view(self.code_block_reader, fileids, categories) + + def image_reader(self, stream): + return [ + Image( + child_token.content, + child_token.attrGet("src"), + child_token.attrGet("title"), + ) + for inline_token in filter( + lambda t: t.type == "inline", self.parser.parse(stream.read()) + ) + for child_token in inline_token.children + if child_token.type == "image" + ] + + @comma_separated_string_args + def images(self, fileids=None, categories=None): + return self.concatenated_view(self.image_reader, fileids, categories) + + def link_reader(self, stream): + return [ + Link( + inline_token.children[i + 1].content, + child_token.attrGet("href"), + child_token.attrGet("title"), + ) + for inline_token in filter( + lambda t: t.type == "inline", self.parser.parse(stream.read()) + ) + for i, child_token in enumerate(inline_token.children) + if child_token.type == "link_open" + ] + + 
@comma_separated_string_args + def links(self, fileids=None, categories=None): + return self.concatenated_view(self.link_reader, fileids, categories) + + def list_reader(self, stream): + tokens = self.parser.parse(stream.read()) + opening_types = ("bullet_list_open", "ordered_list_open") + opening_tokens = filter( + lambda t: t.level == 0 and t.type in opening_types, tokens + ) + closing_types = ("bullet_list_close", "ordered_list_close") + closing_tokens = filter( + lambda t: t.level == 0 and t.type in closing_types, tokens + ) + list_blocks = list() + for o, c in zip(opening_tokens, closing_tokens): + opening_index = tokens.index(o) + closing_index = tokens.index(c, opening_index) + list_blocks.append(tokens[opening_index : closing_index + 1]) + return [ + List( + tokens[0].type == "ordered_list_open", + [t.content for t in tokens if t.content], + ) + for tokens in list_blocks + ] + + @comma_separated_string_args + def lists(self, fileids=None, categories=None): + return self.concatenated_view(self.list_reader, fileids, categories) + + def section_reader(self, stream): + section_blocks, block = list(), list() + in_heading = False + for t in self.parser.parse(stream.read()): + if t.level == 0 and t.type == "heading_open": + if block: + section_blocks.append(block) + block = list() + in_heading = True + if in_heading: + block.append(t) + return [ + MarkdownSection( + block[1].content, + block[0].markup.count("#"), + self.parser.renderer.render(block, self.parser.options, env=None), + ) + for block in section_blocks + ] + + @comma_separated_string_args + def sections(self, fileids=None, categories=None): + return self.concatenated_view(self.section_reader, fileids, categories) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/mte.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/mte.py new file mode 100644 index 0000000000000000000000000000000000000000..99190bed452095dc948e324ce5cc0f3c94c46505 --- /dev/null +++ 
b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/mte.py @@ -0,0 +1,397 @@ +""" +A reader for corpora whose documents are in MTE format. +""" +import os +import re +from functools import reduce + +from nltk.corpus.reader import TaggedCorpusReader, concat +from nltk.corpus.reader.xmldocs import XMLCorpusView + + +def xpath(root, path, ns): + return root.findall(path, ns) + + +class MTECorpusView(XMLCorpusView): + """ + Class for lazy viewing the MTE Corpus. + """ + + def __init__(self, fileid, tagspec, elt_handler=None): + XMLCorpusView.__init__(self, fileid, tagspec, elt_handler) + + def read_block(self, stream, tagspec=None, elt_handler=None): + return list( + filter( + lambda x: x is not None, + XMLCorpusView.read_block(self, stream, tagspec, elt_handler), + ) + ) + + +class MTEFileReader: + """ + Class for loading the content of the multext-east corpus. It + parses the xml files and does some tag-filtering depending on the + given method parameters. + """ + + ns = { + "tei": "https://www.tei-c.org/ns/1.0", + "xml": "https://www.w3.org/XML/1998/namespace", + } + tag_ns = "{https://www.tei-c.org/ns/1.0}" + xml_ns = "{https://www.w3.org/XML/1998/namespace}" + word_path = "TEI/text/body/div/div/p/s/(w|c)" + sent_path = "TEI/text/body/div/div/p/s" + para_path = "TEI/text/body/div/div/p" + + def __init__(self, file_path): + self.__file_path = file_path + + @classmethod + def _word_elt(cls, elt, context): + return elt.text + + @classmethod + def _sent_elt(cls, elt, context): + return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)] + + @classmethod + def _para_elt(cls, elt, context): + return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] + + @classmethod + def _tagged_word_elt(cls, elt, context): + if "ana" not in elt.attrib: + return (elt.text, "") + + if cls.__tags == "" and cls.__tagset == "msd": + return (elt.text, elt.attrib["ana"]) + elif cls.__tags == "" and cls.__tagset == "universal": + return (elt.text, 
MTETagConverter.msd_to_universal(elt.attrib["ana"])) + else: + tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$") + if tags.match(elt.attrib["ana"]): + if cls.__tagset == "msd": + return (elt.text, elt.attrib["ana"]) + else: + return ( + elt.text, + MTETagConverter.msd_to_universal(elt.attrib["ana"]), + ) + else: + return None + + @classmethod + def _tagged_sent_elt(cls, elt, context): + return list( + filter( + lambda x: x is not None, + [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)], + ) + ) + + @classmethod + def _tagged_para_elt(cls, elt, context): + return list( + filter( + lambda x: x is not None, + [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)], + ) + ) + + @classmethod + def _lemma_word_elt(cls, elt, context): + if "lemma" not in elt.attrib: + return (elt.text, "") + else: + return (elt.text, elt.attrib["lemma"]) + + @classmethod + def _lemma_sent_elt(cls, elt, context): + return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)] + + @classmethod + def _lemma_para_elt(cls, elt, context): + return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] + + def words(self): + return MTECorpusView( + self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt + ) + + def sents(self): + return MTECorpusView( + self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt + ) + + def paras(self): + return MTECorpusView( + self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt + ) + + def lemma_words(self): + return MTECorpusView( + self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt + ) + + def tagged_words(self, tagset, tags): + MTEFileReader.__tagset = tagset + MTEFileReader.__tags = tags + return MTECorpusView( + self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt + ) + + def lemma_sents(self): + return MTECorpusView( + self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt + ) + + def tagged_sents(self, 
tagset, tags): + MTEFileReader.__tagset = tagset + MTEFileReader.__tags = tags + return MTECorpusView( + self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt + ) + + def lemma_paras(self): + return MTECorpusView( + self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt + ) + + def tagged_paras(self, tagset, tags): + MTEFileReader.__tagset = tagset + MTEFileReader.__tags = tags + return MTECorpusView( + self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt + ) + + +class MTETagConverter: + """ + Class for converting msd tags to universal tags, more conversion + options are currently not implemented. + """ + + mapping_msd_universal = { + "A": "ADJ", + "S": "ADP", + "R": "ADV", + "C": "CONJ", + "D": "DET", + "N": "NOUN", + "M": "NUM", + "Q": "PRT", + "P": "PRON", + "V": "VERB", + ".": ".", + "-": "X", + } + + @staticmethod + def msd_to_universal(tag): + """ + This function converts the annotation from the Multex-East to the universal tagset + as described in Chapter 5 of the NLTK-Book + + Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so + """ + indicator = tag[0] if not tag[0] == "#" else tag[1] + + if not indicator in MTETagConverter.mapping_msd_universal: + indicator = "-" + + return MTETagConverter.mapping_msd_universal[indicator] + + +class MTECorpusReader(TaggedCorpusReader): + """ + Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East. + MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging + scheme. These tags can be converted to the Universal tagset + """ + + def __init__(self, root=None, fileids=None, encoding="utf8"): + """ + Construct a new MTECorpusreader for a set of documents + located at the given root directory. Example usage: + + >>> root = '/...path to corpus.../' + >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP + + :param root: The root directory for this corpus. 
(default points to location in multext config file) + :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml) + :param encoding: The encoding of the given files (default is utf8) + """ + TaggedCorpusReader.__init__(self, root, fileids, encoding) + self._readme = "00README.txt" + + def __fileids(self, fileids): + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + # filter wrong userinput + fileids = filter(lambda x: x in self._fileids, fileids) + # filter multext-east sourcefiles that are not compatible to the teip5 specification + fileids = filter(lambda x: x not in ["oana-bg.xml", "oana-mk.xml"], fileids) + if not fileids: + print("No valid multext-east file specified") + return fileids + + def words(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. + :return: the given file(s) as a list of words and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).words() + for f in self.__fileids(fileids) + ] + ) + + def sents(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. + :return: the given file(s) as a list of sentences or utterances, + each encoded as a list of word strings + :rtype: list(list(str)) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).sents() + for f in self.__fileids(fileids) + ] + ) + + def paras(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. 
+ :return: the given file(s) as a list of paragraphs, each encoded as a list + of sentences, which are in turn encoded as lists of word string + :rtype: list(list(list(str))) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).paras() + for f in self.__fileids(fileids) + ] + ) + + def lemma_words(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. + :return: the given file(s) as a list of words, the corresponding lemmas + and punctuation symbols, encoded as tuples (word, lemma) + :rtype: list(tuple(str,str)) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).lemma_words() + for f in self.__fileids(fileids) + ] + ) + + def tagged_words(self, fileids=None, tagset="msd", tags=""): + """ + :param fileids: A list specifying the fileids that should be used. + :param tagset: The tagset that should be used in the returned object, + either "universal" or "msd", "msd" is the default + :param tags: An MSD Tag that is used to filter all parts of the used corpus + that are not more precise or at least equal to the given tag + :return: the given file(s) as a list of tagged words and punctuation symbols + encoded as tuples (word, tag) + :rtype: list(tuple(str, str)) + """ + if tagset == "universal" or tagset == "msd": + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).tagged_words( + tagset, tags + ) + for f in self.__fileids(fileids) + ] + ) + else: + print("Unknown tagset specified.") + + def lemma_sents(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. 
+ :return: the given file(s) as a list of sentences or utterances, each + encoded as a list of tuples of the word and the corresponding + lemma (word, lemma) + :rtype: list(list(tuple(str, str))) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).lemma_sents() + for f in self.__fileids(fileids) + ] + ) + + def tagged_sents(self, fileids=None, tagset="msd", tags=""): + """ + :param fileids: A list specifying the fileids that should be used. + :param tagset: The tagset that should be used in the returned object, + either "universal" or "msd", "msd" is the default + :param tags: An MSD Tag that is used to filter all parts of the used corpus + that are not more precise or at least equal to the given tag + :return: the given file(s) as a list of sentences or utterances, each + each encoded as a list of (word,tag) tuples + :rtype: list(list(tuple(str, str))) + """ + if tagset == "universal" or tagset == "msd": + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).tagged_sents( + tagset, tags + ) + for f in self.__fileids(fileids) + ] + ) + else: + print("Unknown tagset specified.") + + def lemma_paras(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. + :return: the given file(s) as a list of paragraphs, each encoded as a + list of sentences, which are in turn encoded as a list of + tuples of the word and the corresponding lemma (word, lemma) + :rtype: list(List(List(tuple(str, str)))) + """ + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).lemma_paras() + for f in self.__fileids(fileids) + ] + ) + + def tagged_paras(self, fileids=None, tagset="msd", tags=""): + """ + :param fileids: A list specifying the fileids that should be used. 
+ :param tagset: The tagset that should be used in the returned object, + either "universal" or "msd", "msd" is the default + :param tags: An MSD Tag that is used to filter all parts of the used corpus + that are not more precise or at least equal to the given tag + :return: the given file(s) as a list of paragraphs, each encoded as a + list of sentences, which are in turn encoded as a list + of (word,tag) tuples + :rtype: list(list(list(tuple(str, str)))) + """ + if tagset == "universal" or tagset == "msd": + return concat( + [ + MTEFileReader(os.path.join(self._root, f)).tagged_paras( + tagset, tags + ) + for f in self.__fileids(fileids) + ] + ) + else: + print("Unknown tagset specified.") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/nkjp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/nkjp.py new file mode 100644 index 0000000000000000000000000000000000000000..69a2b4768418a2c2f005266d25bf7b921df14bc4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/nkjp.py @@ -0,0 +1,487 @@ +# Natural Language Toolkit: NKJP Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Gabriela Kaczka +# URL: +# For license information, see LICENSE.TXT + +import functools +import os +import re +import tempfile + +from nltk.corpus.reader.util import concat +from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView + + +def _parse_args(fun): + """ + Wraps function arguments: + if fileids not specified then function set NKJPCorpusReader paths. + """ + + @functools.wraps(fun) + def decorator(self, fileids=None, **kwargs): + if not fileids: + fileids = self._paths + return fun(self, fileids, **kwargs) + + return decorator + + +class NKJPCorpusReader(XMLCorpusReader): + WORDS_MODE = 0 + SENTS_MODE = 1 + HEADER_MODE = 2 + RAW_MODE = 3 + + def __init__(self, root, fileids=".*"): + """ + Corpus reader designed to work with National Corpus of Polish. + See http://nkjp.pl/ for more details about NKJP. 
+ use example: + import nltk + import nkjp + from nkjp import NKJPCorpusReader + x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus + x.header() + x.raw() + x.words() + x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html + x.sents() + x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) + x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) + x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) + """ + if isinstance(fileids, str): + XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml") + else: + XMLCorpusReader.__init__( + self, root, [fileid + "/header.xml" for fileid in fileids] + ) + self._paths = self.get_paths() + + def get_paths(self): + return [ + os.path.join(str(self._root), f.split("header.xml")[0]) + for f in self._fileids + ] + + def fileids(self): + """ + Returns a list of file identifiers for the fileids that make up + this corpus. + """ + return [f.split("header.xml")[0] for f in self._fileids] + + def _view(self, filename, tags=None, **kwargs): + """ + Returns a view specialised for use with particular corpus file. + """ + mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE) + if mode is NKJPCorpusReader.WORDS_MODE: + return NKJPCorpus_Morph_View(filename, tags=tags) + elif mode is NKJPCorpusReader.SENTS_MODE: + return NKJPCorpus_Segmentation_View(filename, tags=tags) + elif mode is NKJPCorpusReader.HEADER_MODE: + return NKJPCorpus_Header_View(filename, tags=tags) + elif mode is NKJPCorpusReader.RAW_MODE: + return NKJPCorpus_Text_View( + filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE + ) + + else: + raise NameError("No such mode!") + + def add_root(self, fileid): + """ + Add root if necessary to specified fileid. 
+ """ + if self.root in fileid: + return fileid + return self.root + fileid + + @_parse_args + def header(self, fileids=None, **kwargs): + """ + Returns header(s) of specified fileids. + """ + return concat( + [ + self._view( + self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs + ).handle_query() + for fileid in fileids + ] + ) + + @_parse_args + def sents(self, fileids=None, **kwargs): + """ + Returns sentences in specified fileids. + """ + return concat( + [ + self._view( + self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs + ).handle_query() + for fileid in fileids + ] + ) + + @_parse_args + def words(self, fileids=None, **kwargs): + """ + Returns words in specified fileids. + """ + + return concat( + [ + self._view( + self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs + ).handle_query() + for fileid in fileids + ] + ) + + @_parse_args + def tagged_words(self, fileids=None, **kwargs): + """ + Call with specified tags as a list, e.g. tags=['subst', 'comp']. + Returns tagged words in specified fileids. + """ + tags = kwargs.pop("tags", []) + return concat( + [ + self._view( + self.add_root(fileid), + mode=NKJPCorpusReader.WORDS_MODE, + tags=tags, + **kwargs + ).handle_query() + for fileid in fileids + ] + ) + + @_parse_args + def raw(self, fileids=None, **kwargs): + """ + Returns words in specified fileids. + """ + return concat( + [ + self._view( + self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs + ).handle_query() + for fileid in fileids + ] + ) + + +class NKJPCorpus_Header_View(XMLCorpusView): + def __init__(self, filename, **kwargs): + """ + HEADER_MODE + A stream backed corpus view specialized for use with + header.xml files in NKJP corpus. 
+ """ + self.tagspec = ".*/sourceDesc$" + XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec) + + def handle_query(self): + self._open() + header = [] + while True: + segm = XMLCorpusView.read_block(self, self._stream) + if len(segm) == 0: + break + header.extend(segm) + self.close() + return header + + def handle_elt(self, elt, context): + titles = elt.findall("bibl/title") + title = [] + if titles: + title = "\n".join(title.text.strip() for title in titles) + + authors = elt.findall("bibl/author") + author = [] + if authors: + author = "\n".join(author.text.strip() for author in authors) + + dates = elt.findall("bibl/date") + date = [] + if dates: + date = "\n".join(date.text.strip() for date in dates) + + publishers = elt.findall("bibl/publisher") + publisher = [] + if publishers: + publisher = "\n".join(publisher.text.strip() for publisher in publishers) + + idnos = elt.findall("bibl/idno") + idno = [] + if idnos: + idno = "\n".join(idno.text.strip() for idno in idnos) + + notes = elt.findall("bibl/note") + note = [] + if notes: + note = "\n".join(note.text.strip() for note in notes) + + return { + "title": title, + "author": author, + "date": date, + "publisher": publisher, + "idno": idno, + "note": note, + } + + +class XML_Tool: + """ + Helper class creating xml file to one without references to nkjp: namespace. 
+ That's needed because the XMLCorpusView assumes that one can find short substrings + of XML that are valid XML, which is not true if a namespace is declared at top level + """ + + def __init__(self, root, filename): + self.read_file = os.path.join(root, filename) + self.write_file = tempfile.NamedTemporaryFile(delete=False) + + def build_preprocessed_file(self): + try: + fr = open(self.read_file) + fw = self.write_file + line = " " + while len(line): + line = fr.readline() + x = re.split(r"nkjp:[^ ]* ", line) # in all files + ret = " ".join(x) + x = re.split("", ret) # in ann_segmentation.xml + ret = " ".join(x) + x = re.split("", ret) # in ann_segmentation.xml + ret = " ".join(x) + x = re.split("", ret) # in ann_segmentation.xml + ret = " ".join(x) + x = re.split("", ret) # in ann_segmentation.xml + ret = " ".join(x) + fw.write(ret) + fr.close() + fw.close() + return self.write_file.name + except Exception as e: + self.remove_preprocessed_file() + raise Exception from e + + def remove_preprocessed_file(self): + os.remove(self.write_file.name) + + +class NKJPCorpus_Segmentation_View(XMLCorpusView): + """ + A stream backed corpus view specialized for use with + ann_segmentation.xml files in NKJP corpus. 
+ """ + + def __init__(self, filename, **kwargs): + self.tagspec = ".*p/.*s" + # intersperse NKJPCorpus_Text_View + self.text_view = NKJPCorpus_Text_View( + filename, mode=NKJPCorpus_Text_View.SENTS_MODE + ) + self.text_view.handle_query() + # xml preprocessing + self.xml_tool = XML_Tool(filename, "ann_segmentation.xml") + # base class init + XMLCorpusView.__init__( + self, self.xml_tool.build_preprocessed_file(), self.tagspec + ) + + def get_segm_id(self, example_word): + return example_word.split("(")[1].split(",")[0] + + def get_sent_beg(self, beg_word): + # returns index of beginning letter in sentence + return int(beg_word.split(",")[1]) + + def get_sent_end(self, end_word): + # returns index of end letter in sentence + splitted = end_word.split(")")[0].split(",") + return int(splitted[1]) + int(splitted[2]) + + def get_sentences(self, sent_segm): + # returns one sentence + id = self.get_segm_id(sent_segm[0]) + segm = self.text_view.segm_dict[id] # text segment + beg = self.get_sent_beg(sent_segm[0]) + end = self.get_sent_end(sent_segm[len(sent_segm) - 1]) + return segm[beg:end] + + def remove_choice(self, segm): + ret = [] + prev_txt_end = -1 + prev_txt_nr = -1 + for word in segm: + txt_nr = self.get_segm_id(word) + # get increasing sequence of ids: in case of choice get first possibility + if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr: + ret.append(word) + prev_txt_end = self.get_sent_end(word) + prev_txt_nr = txt_nr + + return ret + + def handle_query(self): + try: + self._open() + sentences = [] + while True: + sent_segm = XMLCorpusView.read_block(self, self._stream) + if len(sent_segm) == 0: + break + for segm in sent_segm: + segm = self.remove_choice(segm) + sentences.append(self.get_sentences(segm)) + self.close() + self.xml_tool.remove_preprocessed_file() + return sentences + except Exception as e: + self.xml_tool.remove_preprocessed_file() + raise Exception from e + + def handle_elt(self, elt, context): + ret = [] + for seg in 
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.

    In ``SENTS_MODE`` the view also records every text segment in
    ``segm_dict`` (keyed by the segment's id attribute) while reading.
    """

    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        """
        :param filename: passed to ``XML_Tool`` together with ``"text.xml"``.
        :param mode: optional keyword, ``SENTS_MODE`` (default) or
            ``RAW_MODE``.
        """
        self.mode = kwargs.pop("mode", NKJPCorpus_Text_View.SENTS_MODE)
        self.tagspec = ".*/div/ab"
        self.segm_dict = {}
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """
        Read the whole text and clean up afterwards.

        The stream is always closed and the preprocessed file is always
        removed; errors propagate with their original type instead of
        being replaced by a bare ``Exception``.

        :return: the result of ``read_block`` on the opened stream.
        """
        try:
            self._open()
            return self.read_block(self._stream)
        finally:
            # Close first: the original error path deleted the
            # preprocessed file while the stream was still open.
            try:
                self.close()
            finally:
                self.xml_tool.remove_preprocessed_file()

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.

        All blocks of the underlying view are drained and their parts
        are joined with single spaces, so the list has one element.
        ``tagspec`` and ``elt_handler`` are accepted but not used.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)

        return [" ".join(txt)]

    def get_segm_id(self, elt):
        """
        :return: the value of the first attribute of ``elt`` whose name
            ends with ``"id"``, or None if there is no such attribute.
        """
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)
        return None

    def handle_elt(self, elt, context):
        """
        :return: the text of ``elt``; in ``SENTS_MODE`` the text is also
            stored in ``segm_dict`` for later sentence extraction.
        """
        # fill dictionary to use later in sents mode
        # (compare by value: identity of ints is an implementation detail)
        if self.mode == NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
def handle_elt(self, elt, context):
    """
    Extract the orthographic word from one morphosyntax element.

    The word is the text of the ``string`` child of the feature named
    ``orth``.  The ``ctag`` values found under ``interps`` / ``lex``
    decide whether it is returned:

    - a value contained in ``self.tags`` selects the word (when
      ``self.tags`` is None every word is selected);
    - otherwise a value of ``interp`` rejects the word.

    :param elt: the element to inspect.
    :param context: unused.
    :return: the word, or None when it is rejected or not selected.
    """
    orth = ""
    # if tags not specified, then always return word
    selected = self.tags is None
    rejected = False

    for feature in elt:
        feature_name = feature.get("name")

        if feature_name == "orth":
            for node in feature:
                if node.tag == "string":
                    orth = node.text
            continue

        if feature_name != "interps":
            continue

        for lex in feature:
            if lex.get("type") != "lex":
                continue
            for lex_feature in lex:
                if lex_feature.get("name") != "ctag":
                    continue
                for value_node in lex_feature:
                    if "value" not in value_node.keys():
                        continue
                    value = value_node.attrib["value"]
                    if self.tags is not None and value in self.tags:
                        selected = True
                    elif value == "interp":
                        rejected = True

    if selected and not rejected:
        return orth
    return None
def __init__(
    self,
    root,
    nomfile,
    framefiles="",
    nounsfile=None,
    parse_fileid_xform=None,
    parse_corpus=None,
    encoding="utf8",
):
    """
    :param root: The root directory for this corpus.
    :param nomfile: The name of the file containing the predicate-
        argument annotations (relative to ``root``).
    :param framefiles: A list or regexp specifying the frameset
        fileids for this corpus.
    :param nounsfile: The name of the file read by ``nouns()``
        (resolved with ``abspath``, like ``nomfile``).
    :param parse_fileid_xform: A transform that should be applied
        to the fileids in this corpus.  This should be a function
        of one argument (a fileid) that returns a string (the new
        fileid).
    :param parse_corpus: The corpus containing the parse trees
        corresponding to this corpus.  These parse trees are
        necessary to resolve the tree pointers used by nombank.
    :param encoding: Passed through to ``CorpusReader.__init__``.
    """

    # If framefiles is specified as a regexp, expand it; otherwise
    # take the given fileids as they are.  (Without the ``else`` the
    # expanded list was immediately replaced by ``list(framefiles)``,
    # i.e. by the individual characters of the regexp.)
    if isinstance(framefiles, str):
        self._fileids = find_corpus_fileids(root, framefiles)
    else:
        self._fileids = list(framefiles)
    # Initialize the corpus reader.
    CorpusReader.__init__(self, root, framefiles, encoding)

    # Record our nom file & nouns file.
    self._nomfile = nomfile
    self._nounsfile = nounsfile
    self._parse_fileid_xform = parse_fileid_xform
    self._parse_corpus = parse_corpus
def rolesets(self, baseform=None):
    """
    :param baseform: if given, only ``frames/<baseform>.xml`` is read;
        otherwise every fileid of this reader is read.
    :return: list of xml descriptions for rolesets.
    :raise ValueError: if the frameset file for ``baseform`` is not
        one of this reader's fileids.
    """
    if baseform is None:
        selected = self.fileids()
    else:
        wanted = "frames/%s.xml" % baseform
        if wanted not in self.fileids():
            raise ValueError("Frameset file for %s not found" % baseform)
        selected = [wanted]

    def load_rolesets(fileid):
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(fileid).open() as handle:
            root = ElementTree.parse(handle).getroot()
        return root.findall("predicate/roleset")

    return LazyConcatenation([load_rolesets(fileid) for fileid in selected])
class NombankInstance:
    """
    One annotated noun predicate: where it occurs (file, sentence,
    word), which sense it has, and where its arguments are.

    Instances are normally created from a line of the annotation file
    with ``NombankInstance.parse``.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):
        # The name of the file containing the parse tree for this
        # instance's sentence.
        self.fileid = fileid

        # The sentence number of this sentence within ``fileid``.
        # Indexing starts from zero.
        self.sentnum = sentnum

        # The word number of this instance's predicate within its
        # containing sentence.  Word numbers are indexed starting from
        # zero, and include traces and other empty parse elements.
        self.wordnum = wordnum

        # The baseform of the predicate.
        self.baseform = baseform

        # The sense number of the predicate.
        self.sensenumber = sensenumber

        # A ``NombankTreePointer`` indicating the position of this
        # instance's predicate within its containing sentence.
        self.predicate = predicate

        # Identifier of the predicate.
        self.predid = predid

        # A tuple of pairs (argloc, argid), specifying the location
        # and identifier for each of the predicate's arguments in the
        # containing sentence.  Argument identifiers are strings such
        # as ``'ARG0'`` or ``'ARGM-TMP'``.  The predicate itself is
        # *not* included.
        self.arguments = tuple(arguments)

        # A corpus reader for the parse trees corresponding to the
        # instances in this nombank corpus.
        self.parse_corpus = parse_corpus

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset()`` to look up information about the
        roleset."""
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        # The template was empty, so every repr was the empty string.
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        head = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        # The predicate is listed among the arguments under the id "rel".
        items = self.arguments + ((self.predicate, "rel"),)
        fields = [f"{argloc}-{argid}" for (argloc, argid) in sorted(items)]
        return " ".join([head] + fields)

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build an instance from one whitespace-separated annotation line:
        ``fileid sentnum wordnum baseform sensenumber`` followed by
        ``pointer-label`` fields, exactly one of which contains ``-rel``.

        :param s: the annotation line.
        :param parse_fileid_xform: optional function applied to the
            fileid read from the line.
        :param parse_corpus: stored on the new instance.
        :raise ValueError: if the line has fewer than six fields, does
            not contain exactly one ``-rel`` field, or has non-integer
            sentence/word numbers.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        # Separate the predicate field from the argument fields without
        # mutating the list being scanned: popping while enumerating
        # skipped the field after a match, so a second adjacent "-rel"
        # field slipped through the check below.
        rel = [p for p in pieces[5:] if "-rel" in p]
        args = [p for p in pieces[5:] if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
@staticmethod
def parse(s):
    """
    Convert the textual pointer ``s`` into a pointer object.

    ``*`` separates the pieces of a chain and is examined first, then
    ``,`` separates the pieces of a split pointer; a plain pointer is
    ``wordnum:height``.  Pieces are parsed recursively.

    :raise ValueError: if a plain pointer does not consist of exactly
        two ``:``-separated fields.
    """
    # Deal with chains (xx*yy*zz)
    chain_parts = s.split("*")
    if len(chain_parts) > 1:
        return NombankChainTreePointer(
            [NombankTreePointer.parse(part) for part in chain_parts]
        )

    # Deal with split args (xx,yy,zz)
    split_parts = s.split(",")
    if len(split_parts) > 1:
        return NombankSplitTreePointer(
            [NombankTreePointer.parse(part) for part in split_parts]
        )

    # Deal with normal pointers.
    fields = s.split(":")
    if len(fields) != 2:
        raise ValueError("bad nombank pointer %r" % s)
    wordnum, height = fields
    return NombankTreePointer(int(wordnum), int(height))
def xml_posts(self, fileids=None):
    """
    :param fileids: forwarded to ``self.abspaths``.
    :return: the concatenation of one ``XMLCorpusView`` per file over
        the ``Session/Posts/Post`` elements; when ``self._wrap_etree``
        is set, ``self._wrap_elt`` is used as the element handler.
    """
    # Only pass a handler when wrapping was requested, so that the
    # view's own default is used otherwise.
    handler_args = (self._wrap_elt,) if self._wrap_etree else ()
    views = [
        XMLCorpusView(path, "Session/Posts/Post", *handler_args)
        for path in self.abspaths(fileids)
    ]
    return concat(views)
@staticmethod
def _simplify_username(word):
    """
    Shorten a user name token: everything up to and including the
    first ``"User"`` is replaced by ``"U"``.

    :param word: the token, as ``str`` or ASCII ``bytes``.
    :return: the (possibly shortened) token as ``str``.
    """
    # Decode first: testing ``"User" in word`` on bytes raises
    # TypeError, which made the bytes branch unreachable.
    if isinstance(word, bytes):
        word = word.decode("ascii")
    if "User" in word:
        word = "U" + word.split("User", 1)[1]
    return word
"Mining and summarizing customer reviews". + Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery + & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA. + +- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and + Comparing Opinions on the Web". Proceedings of the 14th International World + Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan. +""" + +from nltk.corpus.reader import WordListCorpusReader +from nltk.corpus.reader.api import * + + +class IgnoreReadmeCorpusView(StreamBackedCorpusView): + """ + This CorpusView is used to skip the initial readme block of the corpus. + """ + + def __init__(self, *args, **kwargs): + StreamBackedCorpusView.__init__(self, *args, **kwargs) + # open self._stream + self._open() + # skip the readme block + read_blankline_block(self._stream) + # Set the initial position to the current stream position + self._filepos = [self._stream.tell()] + + +class OpinionLexiconCorpusReader(WordListCorpusReader): + """ + Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored. + + >>> from nltk.corpus import opinion_lexicon + >>> opinion_lexicon.words() + ['2-faced', '2-faces', 'abnormal', 'abolish', ...] + + The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative + words: + + >>> opinion_lexicon.negative() + ['2-faced', '2-faces', 'abnormal', 'abolish', ...] 
def _read_word_block(self, stream):
    """
    Read up to 20 lines from ``stream`` and return the words on them.

    Blank lines are skipped, as promised by the class documentation,
    and reading stops at end of file instead of polling the exhausted
    stream for the rest of the block.

    :param stream: an object with a ``readline()`` method.
    :return: the stripped, non-empty lines that were read.
    :rtype: list(str)
    """
    words = []
    for _ in range(20):  # Read 20 lines at a time.
        line = stream.readline()
        if not line:
            break  # end of file
        word = line.strip()
        if word:
            words.append(word)
    return words
def __init__(self, name, reader_cls, *args, **kwargs):
    """
    Remember how to build the corpus reader; nothing is loaded here.

    The ``nltk_data_subdir`` keyword, if present, selects the data
    subdirectory and is not forwarded to ``reader_cls``.
    """
    from nltk.corpus.reader.api import CorpusReader

    assert issubclass(reader_cls, CorpusReader)
    self.__name = name
    self.__name__ = name
    self.__reader_cls = reader_cls
    # Use the explicitly requested subdirectory when there is one,
    # otherwise 'nltk_data/corpora'.  Popping also removes the keyword,
    # which we don't need anymore.
    self.subdir = kwargs.pop("nltk_data_subdir", "corpora")
    self.__args = args
    self.__kwargs = kwargs
def _make_bound_method(func, self):
    """
    Bind the plain function ``func`` to the instance ``self``
    (used for ``_unload``).

    :param func: a function taking the instance as first argument.
    :param self: the instance to bind to.
    :return: a bound method; calling it calls ``func(self, ...)``.
    """
    # Local import keeps this helper self-contained.
    import types

    # ``types.MethodType`` is the bound-method type that the previous
    # implementation rediscovered through a throwaway class.  Its
    # three-argument (Python 2) form always raised TypeError on
    # Python 3, so only the two-argument call ever succeeded.
    return types.MethodType(func, self)
class ProductionList(ColorizedList):
    """
    A ``ColorizedList`` whose items are grammar productions, shown as
    ``LHS <arrow> RHS`` with separate text tags for nonterminals,
    the arrow and terminals.
    """

    ARROW = SymbolWidget.SYMBOLS["rightarrow"]

    def _init_colortags(self, textwidget, options):
        """Configure the three text tags used by ``_item_repr``."""
        tag_styles = (
            ("terminal", {"foreground": "#006000"}),
            ("arrow", {"font": "symbol", "underline": "0"}),
            (
                "nonterminal",
                {"foreground": "blue", "font": ("helvetica", -12, "bold")},
            ),
        )
        for tag, style in tag_styles:
            textwidget.tag_config(tag, **style)

    def _item_repr(self, item):
        """
        :param item: a production providing ``lhs()`` and ``rhs()``.
        :return: a list of ``(text, tag)`` pairs describing ``item``.
        """

        def rhs_piece(elt):
            # Nonterminals are shown by symbol, everything else by repr.
            if isinstance(elt, Nonterminal):
                return (" %s" % elt.symbol(), "nonterminal")
            return (" %r" % elt, "terminal")

        pieces = [("%s\t" % item.lhs(), "nonterminal"), (self.ARROW, "arrow")]
        pieces.extend(rhs_piece(elt) for elt in item.rhs())
        return pieces
If +there are any errors, they will be highlighted in red. + +Note that the order of the productions is significant for some +algorithms. To re-order the productions, use cut and paste to move +them. + +Use the buttons at the bottom of the window when you are done editing +the CFG: + - Ok: apply the new CFG, and exit the editor. + - Apply: apply the new CFG, and do not exit the editor. + - Reset: revert to the original CFG, and do not exit the editor. + - Cancel: revert to the original CFG, and exit the editor. + +""" + + +class CFGEditor: + """ + A dialog window for creating and editing context free grammars. + ``CFGEditor`` imposes the following restrictions: + + - All nonterminals must be strings consisting of word + characters. + - All terminals must be strings consisting of word characters + and space characters. + """ + + # Regular expressions used by _analyze_line. Precompile them, so + # we can process the text faster. + ARROW = SymbolWidget.SYMBOLS["rightarrow"] + _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))") + _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*") + _PRODUCTION_RE = re.compile( + r"(^\s*\w+\s*)" + + "(->|(" # LHS + + ARROW + + r"))\s*" + + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$" # arrow + ) # RHS + _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")") + _BOLD = ("helvetica", -12, "bold") + + def __init__(self, parent, cfg=None, set_cfg_callback=None): + self._parent = parent + if cfg is not None: + self._cfg = cfg + else: + self._cfg = CFG(Nonterminal("S"), []) + self._set_cfg_callback = set_cfg_callback + + self._highlight_matching_nonterminals = 1 + + # Create the top-level window. 
def _init_buttons(self):
    """
    Create the button row of the editor and store its frame in
    ``self._buttonframe``.

    "Ok", "Apply", "Reset" and "Cancel" are packed from the left, in
    that order; "Help" is packed on the right.
    """
    frame = self._buttonframe = Frame(self._top)
    # (label, callback, side), listed in packing order.
    button_specs = (
        ("Ok", self._ok, "left"),
        ("Apply", self._apply, "left"),
        ("Reset", self._reset, "left"),
        ("Cancel", self._cancel, "left"),
        ("Help", self._help, "right"),
    )
    for label, callback, side in button_specs:
        button = Button(
            frame, text=label, command=callback, underline=0, takefocus=0
        )
        button.pack(side=side)
+ self._textwidget = Text( + self._prodframe, background="#e0e0e0", exportselection=1 + ) + self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical") + self._textwidget.config(yscrollcommand=self._textscroll.set) + self._textscroll.config(command=self._textwidget.yview) + self._textscroll.pack(side="right", fill="y") + self._textwidget.pack(expand=1, fill="both", side="left") + + # Initialize the colorization tags. Each nonterminal gets its + # own tag, so they aren't listed here. + self._textwidget.tag_config("terminal", foreground="#006000") + self._textwidget.tag_config("arrow", font="symbol") + self._textwidget.tag_config("error", background="red") + + # Keep track of what line they're on. We use that to remember + # to re-analyze a line whenever they leave it. + self._linenum = 0 + + # Expand "->" to an arrow. + self._top.bind(">", self._replace_arrows) + + # Re-colorize lines when appropriate. + self._top.bind("<>", self._analyze) + self._top.bind("", self._check_analyze) + self._top.bind("", self._check_analyze) + + # Tab cycles focus. (why doesn't this work??) + def cycle(e, textwidget=self._textwidget): + textwidget.tk_focusNext().focus() + + self._textwidget.bind("", cycle) + + prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()] + for i in range(len(prod_tuples) - 1, 0, -1): + if prod_tuples[i][0] == prod_tuples[i - 1][0]: + if () in prod_tuples[i][1]: + continue + if () in prod_tuples[i - 1][1]: + continue + print(prod_tuples[i - 1][1]) + print(prod_tuples[i][1]) + prod_tuples[i - 1][1].extend(prod_tuples[i][1]) + del prod_tuples[i] + + for lhs, rhss in prod_tuples: + print(lhs, rhss) + s = "%s ->" % lhs + for rhs in rhss: + for elt in rhs: + if isinstance(elt, Nonterminal): + s += " %s" % elt + else: + s += " %r" % elt + s += " |" + s = s[:-2] + "\n" + self._textwidget.insert("end", s) + + self._analyze() + + # # Add the producitons to the text widget, and colorize them. 
def _check_analyze(self, *e):
    """
    React to the insertion cursor moving to a different line.

    When the cursor's line differs from the remembered one, the tags on
    the line just entered are cleared, the line just left is
    re-colorized, and the new line number is remembered. Nothing
    happens while the cursor stays on the same line.

    :param e: Ignored; accepted so the method can be used as an event
        callback.
    """
    row_text = self._textwidget.index("insert").partition(".")[0]
    current = int(row_text)
    if current == self._linenum:
        return
    previous = self._linenum
    self._clear_tags(current)
    self._analyze_line(previous)
    self._linenum = current
def _analyze_token(self, match, linenum):
    """
    Tag a single token of a production line.

    :param match: A regexp match for the token; it supplies the token
        text and its start and end columns on the line.
    :param linenum: The number of the line the token is on.
    """
    token = match.group()
    widget = self._textwidget

    if token[0] in ("'", '"'):
        # Quoted text is a terminal.
        tag = "terminal"
    elif token == "->" or token == self.ARROW:
        tag = "arrow"
    else:
        # Each nonterminal gets its own tag, created the first time
        # the nonterminal is seen.
        tag = "nonterminal_" + token
        if tag not in widget.tag_names():
            self._init_nonterminal_tag(tag)

    first, last = ("%d.%d" % (linenum, col) for col in (match.start(), match.end()))
    widget.tag_add(tag, first, last)
def _analyze(self, *e):
    """
    Replace ``->`` with arrows, then re-colorize every line of the
    buffer.

    :param e: Ignored; accepted so the method can be used as an event
        callback.
    """
    self._replace_arrows()
    last_row = int(self._textwidget.index("end").partition(".")[0])
    # Text widget line numbers are 1-based.
    for offset in range(last_row):
        self._analyze_line(offset + 1)
def _cancel(self, *e):
    """
    Revert to the original grammar and close the editor.

    The revert is best-effort: an ordinary error raised by
    ``self._reset()`` is ignored so that the window is still closed.
    Only ``Exception`` subclasses are suppressed; ``KeyboardInterrupt``
    and ``SystemExit`` propagate to the caller.

    :param e: Ignored; accepted so the method can be used as an event
        callback.
    """
    try:
        self._reset()
    except Exception:
        # Deliberately ignored: failing to revert must not stop the
        # window from closing.
        pass
    self._destroy()
+ try: + ShowText( + self._parent, + "Help: Chart Parser Demo", + (_CFGEditor_HELP).strip(), + width=75, + font="fixed", + ) + except: + ShowText( + self._parent, + "Help: Chart Parser Demo", + (_CFGEditor_HELP).strip(), + width=75, + ) + + +###################################################################### +# New Demo (built tree based on cfg) +###################################################################### + + +class CFGDemo: + def __init__(self, grammar, text): + self._grammar = grammar + self._text = text + + # Set up the main window. + self._top = Tk() + self._top.title("Context Free Grammar Demo") + + # Base font size + self._size = IntVar(self._top) + self._size.set(12) # = medium + + # Set up the key bindings + self._init_bindings(self._top) + + # Create the basic frames + frame1 = Frame(self._top) + frame1.pack(side="left", fill="y", expand=0) + self._init_menubar(self._top) + self._init_buttons(self._top) + self._init_grammar(frame1) + self._init_treelet(frame1) + self._init_workspace(self._top) + + # ////////////////////////////////////////////////// + # Initialization + # ////////////////////////////////////////////////// + + def _init_bindings(self, top): + top.bind("", self.destroy) + + def _init_menubar(self, parent): + pass + + def _init_buttons(self, parent): + pass + + def _init_grammar(self, parent): + self._prodlist = ProductionList(parent, self._grammar, width=20) + self._prodlist.pack(side="top", fill="both", expand=1) + self._prodlist.focus() + self._prodlist.add_callback("select", self._selectprod_cb) + self._prodlist.add_callback("move", self._selectprod_cb) + + def _init_treelet(self, parent): + self._treelet_canvas = Canvas(parent, background="white") + self._treelet_canvas.pack(side="bottom", fill="x") + self._treelet = None + + def _init_workspace(self, parent): + self._workspace = CanvasFrame(parent, background="white") + self._workspace.pack(side="right", fill="both", expand=1) + self._tree = None + self.reset_workspace() + 
def _markproduction(self, prod, tree=None):
    """
    Report every position at which ``prod``'s right-hand side lines up
    with a run of consecutive entries of ``tree.subtrees()``.

    A nonterminal matches a ``TreeSegmentWidget`` whose label text
    equals the nonterminal's symbol; a string matches a ``TextWidget``
    with the same text. For each starting index where the whole
    right-hand side matches, ``MATCH AT <index>`` is printed.

    :param prod: The production to look for.
    :param tree: The tree segment to search; defaults to ``self._tree``.
    """
    if tree is None:
        tree = self._tree
    subtrees = tree.subtrees()
    rhs = prod.rhs()

    # A window of len(rhs) entries may start at any index up to and
    # including len(subtrees) - len(rhs); the "+ 1" keeps the final
    # window from being skipped.
    for i in range(len(subtrees) - len(rhs) + 1):
        for j, node in enumerate(rhs):
            widget = subtrees[i + j]
            if (
                isinstance(node, Nonterminal)
                and isinstance(widget, TreeSegmentWidget)
                # symbol() must be called: comparing the bound method
                # itself to a string is never equal.
                and node.symbol() == widget.label().text()
            ):
                pass  # matching nonterminal
            elif (
                isinstance(node, str)
                and isinstance(widget, TextWidget)
                and node == widget.text()
            ):
                pass  # matching terminal
            else:
                break
        else:
            # Everything matched!
            print("MATCH AT", i)
def demo():
    """
    Open a ``CFGEditor`` on a small toy grammar and run the Tk main loop.

    Each grammar handed to the editor's callback is printed. The window
    also has a "Quit" button that destroys the root window.
    """
    from nltk import CFG

    grammar = CFG.fromstring(
        """
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """
    )

    def cb(grammar):
        print(grammar)

    top = Tk()
    # Bound to a name so the editor object stays referenced while the
    # main loop runs.
    editor = CFGEditor(top, grammar, cb)
    Label(top, text="\nTesting CFG Editor\n").pack()
    Button(top, text="Quit", command=top.destroy).pack()
    top.mainloop()
def _dispersion_points(text, words, ignore_case=False):
    """
    Compute the scatter coordinates and y-axis labels of a dispersion plot.

    Rows are numbered from the bottom, so the first target word gets the
    highest row and ends up at the top of the plot.

    :param text: The source text
    :type text: list(str) or iter(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :return: ``(xs, ys, labels)`` where ``xs[k]`` is the offset of a
        matching token, ``ys[k]`` is its row, and ``labels[row]`` is the
        target word shown on that row.
    :rtype: tuple(list(int), list(int), list(str))
    """
    # labels[row] is the word plotted on that row; deriving the row
    # lookup from the same list keeps tick labels and data in step.
    labels = list(reversed(words))
    row_of = {
        (word.casefold() if ignore_case else word): row
        for row, word in enumerate(labels)
    }
    xs, ys = [], []
    for offset, token in enumerate(text):
        row = row_of.get(token.casefold() if ignore_case else token)
        if row is not None:
            xs.append(offset)
            ys.append(row)
    return xs, ys, labels


def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    :param text: The source text
    :type text: list(str) or iter(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title shown above the plot
    :type title: str
    :return: a matplotlib Axes object that may still be modified before plotting
    :rtype: Axes
    :raise ImportError: if matplotlib is not installed
    """

    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(
            "The plot function requires matplotlib to be installed. "
            "See https://matplotlib.org/"
        ) from e

    xs, ys, labels = _dispersion_points(text, words, ignore_case)

    _, ax = plt.subplots()
    ax.plot(xs, ys, "|")
    # Tick k is labelled with the word whose occurrences were plotted on
    # row k; using the unreversed ``words`` here would attach every
    # label to the wrong row.
    ax.set_yticks(list(range(len(labels))), labels, color="C0")
    ax.set_ylim(-1, len(labels))
    ax.set_title(title)
    ax.set_xlabel("Word Offset")
    return ax
Each of these trees is called a "parse tree" for the +text (or simply a "parse"). In a "context free" grammar, the set of +parse trees for any piece of a text can depend only on that piece, and +not on the rest of the text (i.e., the piece's context). Context free +grammars are often used to find possible syntactic structures for +sentences. In this context, the leaves of a parse tree are word +tokens; and the node values are phrasal categories, such as ``NP`` +and ``VP``. + +The ``CFG`` class is used to encode context free grammars. Each +``CFG`` consists of a start symbol and a set of productions. +The "start symbol" specifies the root node value for parse trees. For example, +the start symbol for syntactic parsing is usually ``S``. Start +symbols are encoded using the ``Nonterminal`` class, which is discussed +below. + +A Grammar's "productions" specify what parent-child relationships a parse +tree can contain. Each production specifies that a particular +node can be the parent of a particular set of children. For example, +the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can +be the parent of an ``NP`` node and a ``VP`` node. + +Grammar productions are implemented by the ``Production`` class. +Each ``Production`` consists of a left hand side and a right hand +side. The "left hand side" is a ``Nonterminal`` that specifies the +node type for a potential parent; and the "right hand side" is a list +that specifies allowable children for that parent. This list +consists of ``Nonterminals`` and text types: each ``Nonterminal`` +indicates that the corresponding child may be a ``TreeToken`` with the +specified node type; and each text type indicates that the +corresponding child may be a ``Token`` with that type. + +The ``Nonterminal`` class is used to distinguish node values from leaf +values. This prevents the grammar from accidentally using a leaf +value (such as the English word "A") as the node of a subtree.
@total_ordering
class Nonterminal:
    """
    A non-terminal symbol for a context free grammar.

    ``Nonterminal`` wraps a node value (its "symbol") so that
    ``Production`` objects can tell node values apart from leaf values.
    Symbols are typically strings naming phrasal categories (such as
    ``"NP"`` or ``"VP"``), but any immutable, hashable value may be
    used. Two ``Nonterminals`` are equal when they have the same type
    and equal symbols.

    :see: ``CFG``, ``Production``
    :type _symbol: any
    :ivar _symbol: The node value corresponding to this
        ``Nonterminal``. This value must be immutable and hashable.
    """

    def __init__(self, symbol):
        """
        Construct a new non-terminal from the given symbol.

        :type symbol: any
        :param symbol: The node value corresponding to this
            ``Nonterminal``. This value must be immutable and
            hashable.
        """
        self._symbol = symbol

    def symbol(self):
        """
        Return the node value corresponding to this ``Nonterminal``.

        :rtype: (any)
        """
        return self._symbol

    def __eq__(self, other):
        """
        Return True if ``other`` has exactly the same type as this
        non-terminal and its symbol is equal to this one's symbol.

        :rtype: bool
        """
        if type(self) != type(other):
            return False
        return self._symbol == other._symbol

    def __ne__(self, other):
        """Return the negation of ``self == other``."""
        return not (self == other)

    def __lt__(self, other):
        """
        Order non-terminals by their symbols.

        Comparison against anything that is not a ``Nonterminal`` is
        handed to ``raise_unorderable_types``.
        """
        if not isinstance(other, Nonterminal):
            raise_unorderable_types("<", self, other)
        mine, theirs = self._symbol, other._symbol
        return mine < theirs

    def __hash__(self):
        """Hash a non-terminal exactly like its symbol."""
        return hash(self._symbol)

    def __repr__(self):
        """
        Return a string representation for this ``Nonterminal``:
        the symbol itself when it is a string, otherwise its repr.

        :rtype: str
        """
        value = self._symbol
        return str(value) if isinstance(value, str) else repr(value)

    def __str__(self):
        """
        Return a string representation for this ``Nonterminal``:
        the symbol itself when it is a string, otherwise its repr.

        :rtype: str
        """
        value = self._symbol
        if not isinstance(value, str):
            return repr(value)
        return str(value)

    def __div__(self, rhs):
        """
        Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
        the symbol for this nonterminal, and ``B`` is the symbol for rhs.

        :param rhs: The nonterminal used to form the right hand side
            of the new nonterminal.
        :type rhs: Nonterminal
        :rtype: Nonterminal
        """
        slashed = f"{self._symbol}/{rhs._symbol}"
        return Nonterminal(slashed)

    def __truediv__(self, rhs):
        """
        Implement the ``/`` operator by delegating to ``__div__``, so
        ``A / B`` yields a nonterminal whose symbol is ``A/B``.

        :param rhs: The nonterminal used to form the right hand side
            of the new nonterminal.
        :type rhs: Nonterminal
        :rtype: Nonterminal
        """
        return self.__div__(rhs)
+ + :rtype: bool + """ + return hasattr(item, "__hash__") and not isinstance(item, Nonterminal) + + +################################################################# +# Productions +################################################################# + + +@total_ordering +class Production: + """ + A grammar production. Each production maps a single symbol + on the "left-hand side" to a sequence of symbols on the + "right-hand side". (In the case of context-free productions, + the left-hand side must be a ``Nonterminal``, and the right-hand + side is a sequence of terminals and ``Nonterminals``.) + "terminals" can be any immutable hashable object that is + not a ``Nonterminal``. Typically, terminals are strings + representing words, such as ``"dog"`` or ``"under"``. + + :see: ``CFG`` + :see: ``DependencyGrammar`` + :see: ``Nonterminal`` + :type _lhs: Nonterminal + :ivar _lhs: The left-hand side of the production. + :type _rhs: tuple(Nonterminal, terminal) + :ivar _rhs: The right-hand side of the production. + """ + + def __init__(self, lhs, rhs): + """ + Construct a new ``Production``. + + :param lhs: The left-hand side of the new ``Production``. + :type lhs: Nonterminal + :param rhs: The right-hand side of the new ``Production``. + :type rhs: sequence(Nonterminal and terminal) + """ + if isinstance(rhs, str): + raise TypeError( + "production right hand side should be a list, " "not a string" + ) + self._lhs = lhs + self._rhs = tuple(rhs) + + def lhs(self): + """ + Return the left-hand side of this ``Production``. + + :rtype: Nonterminal + """ + return self._lhs + + def rhs(self): + """ + Return the right-hand side of this ``Production``. + + :rtype: sequence(Nonterminal and terminal) + """ + return self._rhs + + def __len__(self): + """ + Return the length of the right-hand side. 
+ + :rtype: int + """ + return len(self._rhs) + + def is_nonlexical(self): + """ + Return True if the right-hand side only contains ``Nonterminals`` + + :rtype: bool + """ + return all(is_nonterminal(n) for n in self._rhs) + + def is_lexical(self): + """ + Return True if the right-hand contain at least one terminal token. + + :rtype: bool + """ + return not self.is_nonlexical() + + def __str__(self): + """ + Return a verbose string representation of the ``Production``. + + :rtype: str + """ + result = "%s -> " % repr(self._lhs) + result += " ".join(repr(el) for el in self._rhs) + return result + + def __repr__(self): + """ + Return a concise string representation of the ``Production``. + + :rtype: str + """ + return "%s" % self + + def __eq__(self, other): + """ + Return True if this ``Production`` is equal to ``other``. + + :rtype: bool + """ + return ( + type(self) == type(other) + and self._lhs == other._lhs + and self._rhs == other._rhs + ) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, Production): + raise_unorderable_types("<", self, other) + return (self._lhs, self._rhs) < (other._lhs, other._rhs) + + def __hash__(self): + """ + Return a hash value for the ``Production``. + + :rtype: int + """ + return hash((self._lhs, self._rhs)) + + +class DependencyProduction(Production): + """ + A dependency grammar production. Each production maps a single + head word to an unordered list of one or more modifier words. + """ + + def __str__(self): + """ + Return a verbose string representation of the ``DependencyProduction``. + + :rtype: str + """ + result = f"'{self._lhs}' ->" + for elt in self._rhs: + result += f" '{elt}'" + return result + + +class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn): + """ + A probabilistic context free grammar production. 
+ A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that + has an associated probability, which represents how likely it is that + this production will be used. In particular, the probability of a + ``ProbabilisticProduction`` records the likelihood that its right-hand side is + the correct instantiation for any given occurrence of its left-hand side. + + :see: ``Production`` + """ + + def __init__(self, lhs, rhs, **prob): + """ + Construct a new ``ProbabilisticProduction``. + + :param lhs: The left-hand side of the new ``ProbabilisticProduction``. + :type lhs: Nonterminal + :param rhs: The right-hand side of the new ``ProbabilisticProduction``. + :type rhs: sequence(Nonterminal and terminal) + :param prob: Probability parameters of the new ``ProbabilisticProduction``. + """ + ImmutableProbabilisticMixIn.__init__(self, **prob) + Production.__init__(self, lhs, rhs) + + def __str__(self): + return super().__str__() + ( + " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob() + ) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self._lhs == other._lhs + and self._rhs == other._rhs + and self.prob() == other.prob() + ) + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash((self._lhs, self._rhs, self.prob())) + + +################################################################# +# Grammars +################################################################# + + +class CFG: + """ + A context-free grammar. A grammar consists of a start state and + a set of productions. The set of terminals and nonterminals is + implicitly specified by the productions. + + If you need efficient key-based access to productions, you + can use a subclass to implement it. + """ + + def __init__(self, start, productions, calculate_leftcorners=True): + """ + Create a new context-free grammar, from the given start state + and set of ``Production`` instances. 
+ + :param start: The start symbol + :type start: Nonterminal + :param productions: The list of productions that defines the grammar + :type productions: list(Production) + :param calculate_leftcorners: False if we don't want to calculate the + leftcorner relation. In that case, some optimized chart parsers won't work. + :type calculate_leftcorners: bool + """ + if not is_nonterminal(start): + raise TypeError( + "start should be a Nonterminal object," + " not a %s" % type(start).__name__ + ) + + self._start = start + self._productions = productions + self._categories = {prod.lhs() for prod in productions} + self._calculate_indexes() + self._calculate_grammar_forms() + if calculate_leftcorners: + self._calculate_leftcorners() + + def _calculate_indexes(self): + self._lhs_index = {} + self._rhs_index = {} + self._empty_index = {} + self._lexical_index = {} + for prod in self._productions: + # Left hand side. + lhs = prod._lhs + if lhs not in self._lhs_index: + self._lhs_index[lhs] = [] + self._lhs_index[lhs].append(prod) + if prod._rhs: + # First item in right hand side. + rhs0 = prod._rhs[0] + if rhs0 not in self._rhs_index: + self._rhs_index[rhs0] = [] + self._rhs_index[rhs0].append(prod) + else: + # The right hand side is empty. + self._empty_index[prod.lhs()] = prod + # Lexical tokens in the right hand side. + for token in prod._rhs: + if is_terminal(token): + self._lexical_index.setdefault(token, set()).add(prod) + + def _calculate_leftcorners(self): + # Calculate leftcorner relations, for use in optimized parsing. 
+ self._immediate_leftcorner_categories = {cat: {cat} for cat in self._categories} + self._immediate_leftcorner_words = {cat: set() for cat in self._categories} + for prod in self.productions(): + if len(prod) > 0: + cat, left = prod.lhs(), prod.rhs()[0] + if is_nonterminal(left): + self._immediate_leftcorner_categories[cat].add(left) + else: + self._immediate_leftcorner_words[cat].add(left) + + lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True) + self._leftcorners = lc + self._leftcorner_parents = invert_graph(lc) + + nr_leftcorner_categories = sum( + map(len, self._immediate_leftcorner_categories.values()) + ) + nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values())) + if nr_leftcorner_words > nr_leftcorner_categories > 10000: + # If the grammar is big, the leftcorner-word dictionary will be too large. + # In that case it is better to calculate the relation on demand. + self._leftcorner_words = None + return + + self._leftcorner_words = {} + for cat in self._leftcorners: + lefts = self._leftcorners[cat] + lc = self._leftcorner_words[cat] = set() + for left in lefts: + lc.update(self._immediate_leftcorner_words.get(left, set())) + + @classmethod + def fromstring(cls, input, encoding=None): + """ + Return the grammar instance corresponding to the input string(s). + + :param input: a grammar, either in the form of a string or as a list of strings. + """ + start, productions = read_grammar( + input, standard_nonterm_parser, encoding=encoding + ) + return cls(start, productions) + + def start(self): + """ + Return the start symbol of the grammar + + :rtype: Nonterminal + """ + return self._start + + # tricky to balance readability and efficiency here! + # can't use set operations as they don't preserve ordering + def productions(self, lhs=None, rhs=None, empty=False): + """ + Return the grammar productions, filtered by the left-hand side + or the first item in the right-hand side. 
+ + :param lhs: Only return productions with the given left-hand side. + :param rhs: Only return productions with the given first item + in the right-hand side. + :param empty: Only return productions with an empty right-hand side. + :return: A list of productions matching the given constraints. + :rtype: list(Production) + """ + if rhs and empty: + raise ValueError( + "You cannot select empty and non-empty " "productions at the same time." + ) + + # no constraints so return everything + if not lhs and not rhs: + if not empty: + return self._productions + else: + return self._empty_index.values() + + # only lhs specified so look up its index + elif lhs and not rhs: + if not empty: + return self._lhs_index.get(lhs, []) + elif lhs in self._empty_index: + return [self._empty_index[lhs]] + else: + return [] + + # only rhs specified so look up its index + elif rhs and not lhs: + return self._rhs_index.get(rhs, []) + + # intersect + else: + return [ + prod + for prod in self._lhs_index.get(lhs, []) + if prod in self._rhs_index.get(rhs, []) + ] + + def leftcorners(self, cat): + """ + Return the set of all nonterminals that the given nonterminal + can start with, including itself. + + This is the reflexive, transitive closure of the immediate + leftcorner relation: (A > B) iff (A -> B beta) + + :param cat: the parent of the leftcorners + :type cat: Nonterminal + :return: the set of all leftcorners + :rtype: set(Nonterminal) + """ + return self._leftcorners.get(cat, {cat}) + + def is_leftcorner(self, cat, left): + """ + True if left is a leftcorner of cat, where left can be a + terminal or a nonterminal. 
+ + :param cat: the parent of the leftcorner + :type cat: Nonterminal + :param left: the suggested leftcorner + :type left: Terminal or Nonterminal + :rtype: bool + """ + if is_nonterminal(left): + return left in self.leftcorners(cat) + elif self._leftcorner_words: + return left in self._leftcorner_words.get(cat, set()) + else: + return any( + left in self._immediate_leftcorner_words.get(parent, set()) + for parent in self.leftcorners(cat) + ) + + def leftcorner_parents(self, cat): + """ + Return the set of all nonterminals for which the given category + is a left corner. This is the inverse of the leftcorner relation. + + :param cat: the suggested leftcorner + :type cat: Nonterminal + :return: the set of all parents to the leftcorner + :rtype: set(Nonterminal) + """ + return self._leftcorner_parents.get(cat, {cat}) + + def check_coverage(self, tokens): + """ + Check whether the grammar rules cover the given list of tokens. + If not, then raise an exception. + + :type tokens: list(str) + """ + missing = [tok for tok in tokens if not self._lexical_index.get(tok)] + if missing: + missing = ", ".join(f"{w!r}" for w in missing) + raise ValueError( + "Grammar does not cover some of the " "input words: %r." % missing + ) + + def _calculate_grammar_forms(self): + """ + Pre-calculate of which form(s) the grammar is. + """ + prods = self._productions + self._is_lexical = all(p.is_lexical() for p in prods) + self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1) + self._min_len = min(len(p) for p in prods) + self._max_len = max(len(p) for p in prods) + self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1) + + def is_lexical(self): + """ + Return True if all productions are lexicalised. + """ + return self._is_lexical + + def is_nonlexical(self): + """ + Return True if all lexical rules are "preterminals", that is, + unary rules which can be separated in a preprocessing step. 
+ + This means that all productions are of the forms + A -> B1 ... Bn (n>=0), or A -> "s". + + Note: is_lexical() and is_nonlexical() are not opposites. + There are grammars which are neither, and grammars which are both. + """ + return self._is_nonlexical + + def min_len(self): + """ + Return the right-hand side length of the shortest grammar production. + """ + return self._min_len + + def max_len(self): + """ + Return the right-hand side length of the longest grammar production. + """ + return self._max_len + + def is_nonempty(self): + """ + Return True if there are no empty productions. + """ + return self._min_len > 0 + + def is_binarised(self): + """ + Return True if all productions are at most binary. + Note that there can still be empty and unary productions. + """ + return self._max_len <= 2 + + def is_flexible_chomsky_normal_form(self): + """ + Return True if all productions are of the forms + A -> B C, A -> B, or A -> "s". + """ + return self.is_nonempty() and self.is_nonlexical() and self.is_binarised() + + def is_chomsky_normal_form(self): + """ + Return True if the grammar is of Chomsky Normal Form, i.e. all productions + are of the form A -> B C, or A -> "s". + """ + return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical + + def chomsky_normal_form(self, new_token_padding="@$@", flexible=False): + """ + Returns a new Grammar that is in chomsky normal + + :param: new_token_padding + Customise new rule formation during binarisation + """ + if self.is_chomsky_normal_form(): + return self + if self.productions(empty=True): + raise ValueError( + "Grammar has Empty rules. 
" "Cannot deal with them at the moment" + ) + + # check for mixed rules + for rule in self.productions(): + if rule.is_lexical() and len(rule.rhs()) > 1: + raise ValueError( + f"Cannot handled mixed rule {rule.lhs()} => {rule.rhs()}" + ) + + step1 = CFG.eliminate_start(self) + step2 = CFG.binarize(step1, new_token_padding) + if flexible: + return step2 + step3 = CFG.remove_unitary_rules(step2) + step4 = CFG(step3.start(), list(set(step3.productions()))) + return step4 + + @classmethod + def remove_unitary_rules(cls, grammar): + """ + Remove nonlexical unitary rules and convert them to + lexical + """ + result = [] + unitary = [] + for rule in grammar.productions(): + if len(rule) == 1 and rule.is_nonlexical(): + unitary.append(rule) + else: + result.append(rule) + + while unitary: + rule = unitary.pop(0) + for item in grammar.productions(lhs=rule.rhs()[0]): + new_rule = Production(rule.lhs(), item.rhs()) + if len(new_rule) != 1 or new_rule.is_lexical(): + result.append(new_rule) + else: + unitary.append(new_rule) + + n_grammar = CFG(grammar.start(), result) + return n_grammar + + @classmethod + def binarize(cls, grammar, padding="@$@"): + """ + Convert all non-binary rules into binary by introducing + new tokens. 
+ Example:: + + Original: + A => B C D + After Conversion: + A => B A@$@B + A@$@B => C D + """ + result = [] + + for rule in grammar.productions(): + if len(rule.rhs()) > 2: + # this rule needs to be broken down + left_side = rule.lhs() + for k in range(0, len(rule.rhs()) - 2): + tsym = rule.rhs()[k] + new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol()) + new_production = Production(left_side, (tsym, new_sym)) + left_side = new_sym + result.append(new_production) + last_prd = Production(left_side, rule.rhs()[-2:]) + result.append(last_prd) + else: + result.append(rule) + + n_grammar = CFG(grammar.start(), result) + return n_grammar + + @classmethod + def eliminate_start(cls, grammar): + """ + Eliminate start rule in case it appears on RHS + Example: S -> S0 S1 and S0 -> S1 S + Then another rule S0_Sigma -> S is added + """ + start = grammar.start() + result = [] + need_to_add = None + for rule in grammar.productions(): + if start in rule.rhs(): + need_to_add = True + result.append(rule) + if need_to_add: + start = Nonterminal("S0_SIGMA") + result.append(Production(start, [grammar.start()])) + n_grammar = CFG(start, result) + return n_grammar + return grammar + + def __repr__(self): + return "" % len(self._productions) + + def __str__(self): + result = "Grammar with %d productions" % len(self._productions) + result += " (start state = %r)" % self._start + for production in self._productions: + result += "\n %s" % production + return result + + +class FeatureGrammar(CFG): + """ + A feature-based grammar. This is equivalent to a + ``CFG`` whose nonterminals are all + ``FeatStructNonterminal``. + + A grammar consists of a start state and a set of + productions. The set of terminals and nonterminals + is implicitly specified by the productions. + """ + + def __init__(self, start, productions): + """ + Create a new feature-based grammar, from the given start + state and set of ``Productions``. 
+ + :param start: The start symbol + :type start: FeatStructNonterminal + :param productions: The list of productions that defines the grammar + :type productions: list(Production) + """ + CFG.__init__(self, start, productions) + + # The difference with CFG is that the productions are + # indexed on the TYPE feature of the nonterminals. + # This is calculated by the method _get_type_if_possible(). + + def _calculate_indexes(self): + self._lhs_index = {} + self._rhs_index = {} + self._empty_index = {} + self._empty_productions = [] + self._lexical_index = {} + for prod in self._productions: + # Left hand side. + lhs = self._get_type_if_possible(prod._lhs) + if lhs not in self._lhs_index: + self._lhs_index[lhs] = [] + self._lhs_index[lhs].append(prod) + if prod._rhs: + # First item in right hand side. + rhs0 = self._get_type_if_possible(prod._rhs[0]) + if rhs0 not in self._rhs_index: + self._rhs_index[rhs0] = [] + self._rhs_index[rhs0].append(prod) + else: + # The right hand side is empty. + if lhs not in self._empty_index: + self._empty_index[lhs] = [] + self._empty_index[lhs].append(prod) + self._empty_productions.append(prod) + # Lexical tokens in the right hand side. + for token in prod._rhs: + if is_terminal(token): + self._lexical_index.setdefault(token, set()).add(prod) + + @classmethod + def fromstring( + cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None + ): + """ + Return a feature structure based grammar. + + :param input: a grammar, either in the form of a string or else + as a list of strings. 
+ :param features: a tuple of features (default: SLASH, TYPE) + :param logic_parser: a parser for lambda-expressions, + by default, ``LogicParser()`` + :param fstruct_reader: a feature structure parser + (only if features and logic_parser is None) + """ + if features is None: + features = (SLASH, TYPE) + + if fstruct_reader is None: + fstruct_reader = FeatStructReader( + features, FeatStructNonterminal, logic_parser=logic_parser + ) + elif logic_parser is not None: + raise Exception( + "'logic_parser' and 'fstruct_reader' must " "not both be set" + ) + + start, productions = read_grammar( + input, fstruct_reader.read_partial, encoding=encoding + ) + return cls(start, productions) + + def productions(self, lhs=None, rhs=None, empty=False): + """ + Return the grammar productions, filtered by the left-hand side + or the first item in the right-hand side. + + :param lhs: Only return productions with the given left-hand side. + :param rhs: Only return productions with the given first item + in the right-hand side. + :param empty: Only return productions with an empty right-hand side. + :rtype: list(Production) + """ + if rhs and empty: + raise ValueError( + "You cannot select empty and non-empty " "productions at the same time." 
+ ) + + # no constraints so return everything + if not lhs and not rhs: + if empty: + return self._empty_productions + else: + return self._productions + + # only lhs specified so look up its index + elif lhs and not rhs: + if empty: + return self._empty_index.get(self._get_type_if_possible(lhs), []) + else: + return self._lhs_index.get(self._get_type_if_possible(lhs), []) + + # only rhs specified so look up its index + elif rhs and not lhs: + return self._rhs_index.get(self._get_type_if_possible(rhs), []) + + # intersect + else: + return [ + prod + for prod in self._lhs_index.get(self._get_type_if_possible(lhs), []) + if prod in self._rhs_index.get(self._get_type_if_possible(rhs), []) + ] + + def leftcorners(self, cat): + """ + Return the set of all words that the given category can start with. + Also called the "first set" in compiler construction. + """ + raise NotImplementedError("Not implemented yet") + + def leftcorner_parents(self, cat): + """ + Return the set of all categories for which the given category + is a left corner. + """ + raise NotImplementedError("Not implemented yet") + + def _get_type_if_possible(self, item): + """ + Helper function which returns the ``TYPE`` feature of the ``item``, + if it exists, otherwise it returns the ``item`` itself + """ + if isinstance(item, dict) and TYPE in item: + return FeatureValueType(item[TYPE]) + else: + return item + + +@total_ordering +class FeatureValueType: + """ + A helper class for ``FeatureGrammars``, designed to be different + from ordinary strings. This is to stop the ``FeatStruct`` + ``FOO[]`` from being compare equal to the terminal "FOO". 
+ """ + + def __init__(self, value): + self._value = value + + def __repr__(self): + return "<%s>" % self._value + + def __eq__(self, other): + return type(self) == type(other) and self._value == other._value + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, FeatureValueType): + raise_unorderable_types("<", self, other) + return self._value < other._value + + def __hash__(self): + return hash(self._value) + + +class DependencyGrammar: + """ + A dependency grammar. A DependencyGrammar consists of a set of + productions. Each production specifies a head/modifier relationship + between a pair of words. + """ + + def __init__(self, productions): + """ + Create a new dependency grammar, from the set of ``Productions``. + + :param productions: The list of productions that defines the grammar + :type productions: list(Production) + """ + self._productions = productions + + @classmethod + def fromstring(cls, input): + productions = [] + for linenum, line in enumerate(input.split("\n")): + line = line.strip() + if line.startswith("#") or line == "": + continue + try: + productions += _read_dependency_production(line) + except ValueError as e: + raise ValueError(f"Unable to parse line {linenum}: {line}") from e + if len(productions) == 0: + raise ValueError("No productions found!") + return cls(productions) + + def contains(self, head, mod): + """ + :param head: A head word. + :type head: str + :param mod: A mod word, to test as a modifier of 'head'. + :type mod: str + + :return: true if this ``DependencyGrammar`` contains a + ``DependencyProduction`` mapping 'head' to 'mod'. + :rtype: bool + """ + for production in self._productions: + for possibleMod in production._rhs: + if production._lhs == head and possibleMod == mod: + return True + return False + + def __contains__(self, head_mod): + """ + Return True if this ``DependencyGrammar`` contains a + ``DependencyProduction`` mapping 'head' to 'mod'. 
+ + :param head_mod: A tuple of a head word and a mod word, + to test as a modifier of 'head'. + :type head: Tuple[str, str] + :rtype: bool + """ + try: + head, mod = head_mod + except ValueError as e: + raise ValueError( + "Must use a tuple of strings, e.g. `('price', 'of') in grammar`" + ) from e + return self.contains(head, mod) + + # # should be rewritten, the set comp won't work in all comparisons + # def contains_exactly(self, head, modlist): + # for production in self._productions: + # if(len(production._rhs) == len(modlist)): + # if(production._lhs == head): + # set1 = Set(production._rhs) + # set2 = Set(modlist) + # if(set1 == set2): + # return True + # return False + + def __str__(self): + """ + Return a verbose string representation of the ``DependencyGrammar`` + + :rtype: str + """ + str = "Dependency grammar with %d productions" % len(self._productions) + for production in self._productions: + str += "\n %s" % production + return str + + def __repr__(self): + """ + Return a concise string representation of the ``DependencyGrammar`` + """ + return "Dependency grammar with %d productions" % len(self._productions) + + +class ProbabilisticDependencyGrammar: + """ """ + + def __init__(self, productions, events, tags): + self._productions = productions + self._events = events + self._tags = tags + + def contains(self, head, mod): + """ + Return True if this ``DependencyGrammar`` contains a + ``DependencyProduction`` mapping 'head' to 'mod'. + + :param head: A head word. + :type head: str + :param mod: A mod word, to test as a modifier of 'head'. 
+ :type mod: str + :rtype: bool + """ + for production in self._productions: + for possibleMod in production._rhs: + if production._lhs == head and possibleMod == mod: + return True + return False + + def __str__(self): + """ + Return a verbose string representation of the ``ProbabilisticDependencyGrammar`` + + :rtype: str + """ + str = "Statistical dependency grammar with %d productions" % len( + self._productions + ) + for production in self._productions: + str += "\n %s" % production + str += "\nEvents:" + for event in self._events: + str += "\n %d:%s" % (self._events[event], event) + str += "\nTags:" + for tag_word in self._tags: + str += f"\n {tag_word}:\t({self._tags[tag_word]})" + return str + + def __repr__(self): + """ + Return a concise string representation of the ``ProbabilisticDependencyGrammar`` + """ + return "Statistical Dependency grammar with %d productions" % len( + self._productions + ) + + +class PCFG(CFG): + """ + A probabilistic context-free grammar. A PCFG consists of a + start state and a set of productions with probabilities. The set of + terminals and nonterminals is implicitly specified by the productions. + + PCFG productions use the ``ProbabilisticProduction`` class. + ``PCFGs`` impose the constraint that the set of productions with + any given left-hand-side must have probabilities that sum to 1 + (allowing for a small margin of error). + + If you need efficient key-based access to productions, you can use + a subclass to implement it. + + :type EPSILON: float + :cvar EPSILON: The acceptable margin of error for checking that + productions with a given left-hand side have probabilities + that sum to 1. + """ + + EPSILON = 0.01 + + def __init__(self, start, productions, calculate_leftcorners=True): + """ + Create a new context-free grammar, from the given start state + and set of ``ProbabilisticProductions``. 
+ + :param start: The start symbol + :type start: Nonterminal + :param productions: The list of productions that defines the grammar + :type productions: list(Production) + :raise ValueError: if the set of productions with any left-hand-side + do not have probabilities that sum to a value within + EPSILON of 1. + :param calculate_leftcorners: False if we don't want to calculate the + leftcorner relation. In that case, some optimized chart parsers won't work. + :type calculate_leftcorners: bool + """ + CFG.__init__(self, start, productions, calculate_leftcorners) + + # Make sure that the probabilities sum to one. + probs = {} + for production in productions: + probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob() + for (lhs, p) in probs.items(): + if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)): + raise ValueError("Productions for %r do not sum to 1" % lhs) + + @classmethod + def fromstring(cls, input, encoding=None): + """ + Return a probabilistic context-free grammar corresponding to the + input string(s). + + :param input: a grammar, either in the form of a string or else + as a list of strings. + """ + start, productions = read_grammar( + input, standard_nonterm_parser, probabilistic=True, encoding=encoding + ) + return cls(start, productions) + + +################################################################# +# Inducing Grammars +################################################################# + +# Contributed by Nathan Bodenstab + + +def induce_pcfg(start, productions): + r""" + Induce a PCFG grammar from a list of productions. 
+ + The probability of a production A -> B C in a PCFG is: + + | count(A -> B C) + | P(B, C | A) = --------------- where \* is any right hand side + | count(A -> \*) + + :param start: The start symbol + :type start: Nonterminal + :param productions: The list of productions that defines the grammar + :type productions: list(Production) + """ + # Production count: the number of times a given production occurs + pcount = {} + + # LHS-count: counts the number of times a given lhs occurs + lcount = {} + + for prod in productions: + lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1 + pcount[prod] = pcount.get(prod, 0) + 1 + + prods = [ + ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()]) + for p in pcount + ] + return PCFG(start, prods) + + +################################################################# +# Helper functions for reading productions +################################################################# + + +def _read_cfg_production(input): + """ + Return a list of context-free ``Productions``. + """ + return _read_production(input, standard_nonterm_parser) + + +def _read_pcfg_production(input): + """ + Return a list of PCFG ``ProbabilisticProductions``. + """ + return _read_production(input, standard_nonterm_parser, probabilistic=True) + + +def _read_fcfg_production(input, fstruct_reader): + """ + Return a list of feature-based ``Productions``. + """ + return _read_production(input, fstruct_reader) + + +# Parsing generic grammars + +_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE) +_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE) +_TERMINAL_RE = re.compile(r'( "[^"]*" | \'[^\']*\' ) \s*', re.VERBOSE) +_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE) + + +def _read_production(line, nonterm_parser, probabilistic=False): + """ + Parse a grammar rule, given as a string, and return + a list of productions. + """ + pos = 0 + + # Parse the left-hand side. 
def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
    """
    Return a pair consisting of a starting category and a list of
    ``Productions``.

    Blank lines and lines starting with ``#`` are skipped.  A line ending
    in a backslash is joined with the line that follows it.  A line
    starting with ``%`` is a directive; the only directive understood is
    ``%start <nonterminal>``.  Every other line is parsed as a rule, with
    disjunctions on the right-hand side expanded into separate productions.

    :param input: a grammar, either in the form of a string or else
        as a list of strings.
    :param nonterm_parser: a function for parsing nonterminals.
        It should take a ``(string, position)`` as argument and
        return a ``(nonterminal, position)`` as result.
    :param probabilistic: are the grammar rules probabilistic?
    :type probabilistic: bool
    :param encoding: the encoding of the grammar, if it is a binary string
    :type encoding: str
    :return: ``(start, productions)``; if no ``%start`` directive was
        given, the left-hand side of the first production is the start.
    :raise ValueError: if a line cannot be parsed (the message names the
        offending line) or if no productions were found.
    """
    if encoding is not None:
        input = input.decode(encoding)
    lines = input.split("\n") if isinstance(input, str) else input

    start = None
    productions = []
    continue_line = ""
    for linenum, line in enumerate(lines):
        line = continue_line + line.strip()
        if line.startswith("#") or line == "":
            continue
        if line.endswith("\\"):
            # Remember the partial line and glue the next one onto it.
            continue_line = line[:-1].rstrip() + " "
            continue
        continue_line = ""
        try:
            if line[0] == "%":
                # Split defensively: a directive without an argument must
                # produce a grammar error, not a tuple-unpacking error.
                parts = line[1:].split(None, 1)
                directive = parts[0] if parts else ""
                args = parts[1] if len(parts) > 1 else ""
                if directive != "start":
                    raise ValueError("Bad directive")
                if not args:
                    raise ValueError("Missing argument to start directive")
                start, pos = nonterm_parser(args, 0)
                if pos != len(args):
                    raise ValueError("Bad argument to start directive")
            else:
                # expand out the disjunctions on the RHS
                productions += _read_production(line, nonterm_parser, probabilistic)
        except ValueError as e:
            raise ValueError(f"Unable to parse line {linenum + 1}: {line}\n{e}") from e

    if not productions:
        raise ValueError("No productions found!")
    if not start:
        start = productions[0].lhs()
    return (start, productions)
def _read_dependency_production(s):
    """
    Parse one dependency rule string into ``DependencyProduction`` objects.

    The string is first validated against ``_READ_DG_RE`` and then
    tokenised with ``_SPLIT_DG_RE``.  One production is returned per
    ``|``-separated alternative on the right-hand side.

    :param s: the rule, e.g. ``'scratch' -> 'cats' | 'walls'``
    :return: a list of ``DependencyProduction``
    :raise ValueError: if ``s`` does not match ``_READ_DG_RE``
    """
    if _READ_DG_RE.match(s) is None:
        raise ValueError("Bad production string")

    # split() on a pattern with one capturing group alternates between the
    # text separating matches (even indices) and the captured tokens (odd
    # indices); only the tokens are of interest.
    tokens = _SPLIT_DG_RE.split(s)[1::2]
    head = tokens[0].strip("'\"")

    alternatives = [[]]
    # tokens[1] is the arrow, so the right-hand side starts at index 2.
    for token in tokens[2:]:
        if token != "|":
            alternatives[-1].append(token.strip("'\""))
            continue
        alternatives.append([])

    return [DependencyProduction(head, modifiers) for modifiers in alternatives]
+ """ + + from nltk import induce_pcfg, treetransforms + from nltk.corpus import treebank + from nltk.parse import pchart + + toy_pcfg1 = PCFG.fromstring( + """ + S -> NP VP [1.0] + NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] + Det -> 'the' [0.8] | 'my' [0.2] + N -> 'man' [0.5] | 'telescope' [0.5] + VP -> VP PP [0.1] | V NP [0.7] | V [0.2] + V -> 'ate' [0.35] | 'saw' [0.65] + PP -> P NP [1.0] + P -> 'with' [0.61] | 'under' [0.39] + """ + ) + + toy_pcfg2 = PCFG.fromstring( + """ + S -> NP VP [1.0] + VP -> V NP [.59] + VP -> V [.40] + VP -> VP PP [.01] + NP -> Det N [.41] + NP -> Name [.28] + NP -> NP PP [.31] + PP -> P NP [1.0] + V -> 'saw' [.21] + V -> 'ate' [.51] + V -> 'ran' [.28] + N -> 'boy' [.11] + N -> 'cookie' [.12] + N -> 'table' [.13] + N -> 'telescope' [.14] + N -> 'hill' [.5] + Name -> 'Jack' [.52] + Name -> 'Bob' [.48] + P -> 'with' [.61] + P -> 'under' [.39] + Det -> 'the' [.41] + Det -> 'a' [.31] + Det -> 'my' [.28] + """ + ) + + pcfg_prods = toy_pcfg1.productions() + + pcfg_prod = pcfg_prods[2] + print("A PCFG production:", repr(pcfg_prod)) + print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs())) + print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs())) + print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob())) + print() + + grammar = toy_pcfg2 + print("A PCFG grammar:", repr(grammar)) + print(" grammar.start() =>", repr(grammar.start())) + print(" grammar.productions() =>", end=" ") + # Use .replace(...) is to line-wrap the output. 
+ print(repr(grammar.productions()).replace(",", ",\n" + " " * 26)) + print() + + # extract productions from three trees and induce the PCFG + print("Induce PCFG grammar from treebank data:") + + productions = [] + item = treebank._fileids[0] + for tree in treebank.parsed_sents(item)[:3]: + # perform optional tree transformations, e.g.: + tree.collapse_unary(collapsePOS=False) + tree.chomsky_normal_form(horzMarkov=2) + + productions += tree.productions() + + S = Nonterminal("S") + grammar = induce_pcfg(S, productions) + print(grammar) + print() + + print("Parse sentence using induced grammar:") + + parser = pchart.InsideChartParser(grammar) + parser.trace(3) + + # doesn't work as tokens are different: + # sent = treebank.tokenized('wsj_0001.mrg')[0] + + sent = treebank.parsed_sents(item)[0].leaves() + print(sent) + for parse in parser.parse(sent): + print(parse) + + +def fcfg_demo(): + import nltk.data + + g = nltk.data.load("grammars/book_grammars/feat0.fcfg") + print(g) + print() + + +def dg_demo(): + """ + A demonstration showing the creation and inspection of a + ``DependencyGrammar``. + """ + grammar = DependencyGrammar.fromstring( + """ + 'scratch' -> 'cats' | 'walls' + 'walls' -> 'the' + 'cats' -> 'the' + """ + ) + print(grammar) + + +def sdg_demo(): + """ + A demonstration of how to read a string representation of + a CoNLL format dependency tree. + """ + from nltk.parse import DependencyGraph + + dg = DependencyGraph( + """ + 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ + 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ + 3 met met Prep Prep voor 8 mod _ _ + 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ + 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ + 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ + 7 gaan ga V V hulp|inf 6 vc _ _ + 8 winkelen winkel V V intrans|inf 11 cnj _ _ + 9 , , Punc Punc komma 8 punct _ _ + 10 zwemmen zwem V V intrans|inf 11 cnj _ _ + 11 of of Conj Conj neven 7 vc _ _ + 12 terrassen terras N N soort|mv|neut 11 cnj _ _ + 13 . . 
Punc Punc punt 12 punct _ _ + """ + ) + tree = dg.tree() + print(tree.pprint()) + + +def demo(): + cfg_demo() + pcfg_demo() + fcfg_demo() + dg_demo() + sdg_demo() + + +if __name__ == "__main__": + demo() + +__all__ = [ + "Nonterminal", + "nonterminals", + "CFG", + "Production", + "PCFG", + "ProbabilisticProduction", + "DependencyGrammar", + "DependencyProduction", + "ProbabilisticDependencyGrammar", + "induce_pcfg", + "read_grammar", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6fa225e5cd1b807a13ba23506f6d595f81c72ba2 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/__init__.py @@ -0,0 +1,24 @@ +# Natural Language Toolkit: Inference +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Dan Garrette +# Ewan Klein +# +# URL: +# For license information, see LICENSE.TXT + +""" +Classes and interfaces for theorem proving and model building. 
+""" + +from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand +from nltk.inference.discourse import ( + CfgReadingCommand, + DiscourseTester, + DrtGlueReadingCommand, + ReadingCommand, +) +from nltk.inference.mace import Mace, MaceCommand +from nltk.inference.prover9 import Prover9, Prover9Command +from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand +from nltk.inference.tableau import TableauProver, TableauProverCommand diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/api.py new file mode 100644 index 0000000000000000000000000000000000000000..12f1c099941280c1a72f40f957330dc5497a1b27 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/api.py @@ -0,0 +1,614 @@ +# Natural Language Toolkit: Classifier Interface +# +# Author: Ewan Klein +# Dan Garrette +# +# URL: +# For license information, see LICENSE.TXT + +""" +Interfaces and base classes for theorem provers and model builders. + +``Prover`` is a standard interface for a theorem prover which tries to prove a goal from a +list of assumptions. + +``ModelBuilder`` is a standard interface for a model builder. Given just a set of assumptions. +the model builder tries to build a model for the assumptions. Given a set of assumptions and a +goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy +the assumptions plus the negation of *G*. +""" + +import threading +import time +from abc import ABCMeta, abstractmethod + + +class Prover(metaclass=ABCMeta): + """ + Interface for trying to prove a goal from assumptions. Both the goal and + the assumptions are constrained to be formulas of ``logic.Expression``. + """ + + def prove(self, goal=None, assumptions=None, verbose=False): + """ + :return: Whether the proof was successful or not. 
class ModelBuilder(metaclass=ABCMeta):
    """
    Interface for trying to build a model of a set of formulas.
    Open formulas are assumed to be universally quantified.
    Both the goal and the assumptions are constrained to be formulas
    of ``logic.Expression``.

    Subclasses implement ``_build_model``; ``build_model`` is a thin
    wrapper that keeps only the success flag.
    """

    def build_model(self, goal=None, assumptions=None, verbose=False):
        """
        Perform the actual model building.

        :return: Whether a model was generated
        :rtype: bool
        """
        outcome = self._build_model(goal, assumptions, verbose)
        # The first element is the success flag; the model itself is dropped.
        return outcome[0]

    @abstractmethod
    def _build_model(self, goal=None, assumptions=None, verbose=False):
        """
        Perform the actual model building.

        :return: Whether a model was generated, and the model itself
        :rtype: tuple(bool, sem.Valuation)
        """
class BaseTheoremToolCommand(TheoremToolCommand):
    """
    This class holds a goal and a list of assumptions to be used in proving
    or model building.

    Any change to the assumption list clears the cached result so that the
    next run starts from scratch.
    """

    def __init__(self, goal=None, assumptions=None):
        """
        :param goal: Input expression to prove
        :type goal: sem.Expression
        :param assumptions: Input expressions to use as assumptions in
            the proof.
        :type assumptions: list(sem.Expression)
        """
        self._goal = goal
        # Always keep a private list so the caller's sequence is not shared.
        self._assumptions = list(assumptions) if assumptions else []
        # A holder for the result, to prevent unnecessary re-proving
        self._result = None

    def add_assumptions(self, new_assumptions):
        """
        Add new assumptions to the assumption list.

        :param new_assumptions: new assumptions
        :type new_assumptions: list(sem.Expression)
        """
        self._assumptions.extend(new_assumptions)
        self._result = None

    def retract_assumptions(self, retracted, debug=False):
        """
        Retract assumptions from the assumption list.

        :param debug: If True, give warning when ``retracted`` is not present on
            assumptions list.
        :type debug: bool
        :param retracted: assumptions to be retracted
        :type retracted: list(sem.Expression)
        """
        retracted = set(retracted)
        remaining = [a for a in self._assumptions if a not in retracted]
        if debug and remaining == self._assumptions:
            # Nothing was removed: tell the user and show the current list.
            print(Warning("Assumptions list has not been changed:"))
            self.print_assumptions()

        self._assumptions = remaining
        self._result = None

    def assumptions(self):
        """
        List the current assumptions.

        :return: list of ``Expression``
        """
        return self._assumptions

    def goal(self):
        """
        Return the goal

        :return: ``Expression``
        """
        return self._goal

    def print_assumptions(self):
        """
        Print the list of the current assumptions, one per line.
        """
        for assumption in self.assumptions():
            print(assumption)
class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand):
    """
    This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When
    build_model() is called, the ``ModelBuilder`` is executed with the goal and
    assumptions.
    """

    def __init__(self, modelbuilder, goal=None, assumptions=None):
        """
        :param modelbuilder: The theorem tool to execute with the assumptions
        :type modelbuilder: ModelBuilder
        :see: ``BaseTheoremToolCommand``
        """
        # The theorem tool to execute with the assumptions
        self._modelbuilder = modelbuilder

        BaseTheoremToolCommand.__init__(self, goal, assumptions)

        # Filled in by build_model()
        self._model = None

    def build_model(self, verbose=False):
        """
        Attempt to build a model.  Store the result to prevent unnecessary
        re-building.

        :return: the cached or freshly computed success flag
        """
        if self._result is not None:
            return self._result

        outcome = self._modelbuilder._build_model(
            self.goal(), self.assumptions(), verbose
        )
        self._result, self._model = outcome
        return self._result

    def model(self, format=None):
        """
        Return a string representation of the model

        :param format: str indicating the format for displaying
        :return: str
        :raise LookupError: if build_model() has not produced a result yet
        """
        if self._result is None:
            raise LookupError("You have to call build_model() first to get a model!")
        return self._decorate_model(self._model, format)

    def _decorate_model(self, valuation_str, format=None):
        """
        Hook for subclasses to post-process the model; the base
        implementation returns it unchanged.

        :param valuation_str: str with the model builder's output
        :param format: str indicating the format for displaying
        :return: str
        """
        return valuation_str

    def get_model_builder(self):
        """
        Return the model builder object

        :return: ``ModelBuilder``
        """
        return self._modelbuilder
class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderCommand):
    """
    A base decorator for the ``ModelBuilderCommand`` class from which other
    model builder command decorators can extend.
    """

    def __init__(self, modelBuilderCommand):
        """
        :param modelBuilderCommand: ``ModelBuilderCommand`` to decorate
        """
        TheoremToolCommandDecorator.__init__(self, modelBuilderCommand)

        # The decorator has its own versions of 'result' and 'valuation'
        # because they may be different from the underlying command
        self._model = None

    def build_model(self, verbose=False):
        """
        Attempt to build a model.  Store the result to prevent unnecessary
        re-building.

        The model builder of the decorated command is run on this
        decorator's own ``goal()`` and ``assumptions()``.

        :return: the cached or freshly computed success flag
        """
        if self._result is None:
            modelbuilder = self.get_model_builder()
            self._result, self._model = modelbuilder._build_model(
                self.goal(), self.assumptions(), verbose
            )
        return self._result

    def model(self, format=None):
        """
        Return a string representation of the model

        :param format: str indicating the format for displaying
        :return: str
        :raise LookupError: if build_model() has not produced a result yet
        """
        if self._result is None:
            raise LookupError("You have to call build_model() first to " "get a model!")
        return self._decorate_model(self._model, format)

    def _decorate_model(self, valuation_str, format=None):
        """
        Modify and return the model string by delegating to the decorated
        command.

        :param valuation_str: str with the model builder's output
        :param format: str indicating the format for displaying
        :return: str
        """
        return self._command._decorate_model(valuation_str, format)

    def get_model_builder(self):
        """
        Return the model builder of the decorated command.

        :return: ``ModelBuilder``
        """
        # Delegate to the matching accessor.  Asking the wrapped command for
        # get_prover() fails on a plain ModelBuilderCommand and would hand
        # back a prover on commands that happen to expose both.
        return self._command.get_model_builder()
def _run(self, goal, assumptions, verbose):
    """
    Run the prover and the model builder in parallel threads and report
    whichever finishes first.

    :param goal: the goal handed to both tools
    :param assumptions: the assumptions handed to both tools
    :param verbose: passed to both tools and to the worker threads
    :return: the prover's result if it has one; otherwise the negation of
        the model builder's result; ``None`` if neither thread produced a
        result by the time the first one ended.
    """
    # Set up two threads, Prover and ModelBuilder, to run in parallel
    tp_thread = TheoremToolThread(
        lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP"
    )
    mb_thread = TheoremToolThread(
        lambda: self._modelbuilder.build_model(goal, assumptions, verbose),
        verbose,
        "MB",
    )

    tp_thread.start()
    mb_thread.start()

    # Wait until either the prover or the model builder is done.  Sleep
    # between polls: an empty loop body would spin a core and compete with
    # the worker threads for the interpreter lock.
    while tp_thread.is_alive() and mb_thread.is_alive():
        time.sleep(0.01)

    if tp_thread.result is not None:
        return tp_thread.result
    elif mb_thread.result is not None:
        # A model for the negated goal means the goal is not provable.
        return not mb_thread.result
    else:
        return None
class TheoremToolThread(threading.Thread):
    """
    A thread that runs a zero-argument callable and keeps its return value.

    Exceptions raised by the callable are printed rather than propagated;
    in that case ``result`` stays ``None``.
    """

    def __init__(self, command, verbose, name=None):
        """
        :param command: zero-argument callable to execute in the thread
        :param verbose: if true, print a line when the command finishes
        :param name: optional thread name; when omitted the default name
            assigned by ``threading.Thread`` is kept
        """
        # Hand the name to the base class instead of overwriting its
        # private ``_name`` attribute, which would turn ``Thread.name``
        # into None whenever no name is supplied.
        threading.Thread.__init__(self, name=name)
        self._command = command
        self._result = None
        self._verbose = verbose

    def run(self):
        """
        Execute the command and store its return value.
        """
        try:
            self._result = self._command()
            if self._verbose:
                print(
                    "Thread %s finished with result %s at %s"
                    % (self.name, self._result, time.localtime(time.time()))
                )
        except Exception as e:
            # Best effort: report the failure and leave ``result`` as None
            # so the caller can fall back to the other tool.
            print(e)
            print("Thread %s completed abnormally" % (self.name))

    @property
    def result(self):
        """The command's return value, or None if it has not finished or failed."""
        return self._result
def get_domain(goal, assumptions):
    """
    Collect the constants mentioned in the assumptions and, if a goal is
    given, in the negated goal.

    :param goal: ``Expression`` or None
    :param assumptions: list of ``Expression``
    :return: the union of ``constants()`` over all those expressions
    """
    expressions = assumptions if goal is None else assumptions + [-goal]

    domain = set()
    for expression in expressions:
        domain = operator.or_(domain, expression.constants())
    return domain
class UniqueNamesProver(ProverCommandDecorator):
    """
    This is a prover decorator that adds unique names assumptions before
    proving.
    """

    def assumptions(self):
        """
        Return the decorated command's assumptions extended with
        inequalities between distinct names.

        - The domain is computed by ``get_domain`` from the goal and the
          assumptions.
        - if "d1 = d2" cannot be proven from the premises, then add "d1 != d2"
        """
        premises = self._command.assumptions()
        names = list(get_domain(self._command.goal(), premises))

        # Seed the equality classes with the equalities stated outright.
        known_equal = SetHolder()
        for premise in premises:
            if not isinstance(premise, EqualityExpression):
                continue
            left = premise.first.variable
            right = premise.second.variable
            # put both sides in the same set
            known_equal[left].add(right)

        inequalities = []
        for index, first in enumerate(names):
            for second in names[index + 1 :]:
                if second in known_equal[first]:
                    # already known to denote the same entity
                    continue
                equality = EqualityExpression(
                    VariableExpression(first), VariableExpression(second)
                )
                if Prover9().prove(equality, premises):
                    # we can prove that the names are the same entity.
                    # remember that they are equal so we don't re-check.
                    known_equal[first].add(second)
                else:
                    # we can't prove it, so assume unique names
                    inequalities.append(-equality)

        return premises + inequalities
disjuncts.append(reduce(lambda x, y: x & y, equality_exs)) + + # Turn the properties into disjuncts + for prop in predHolder.properties: + # replace variables from the signature with new sig variables + bindings = {} + for v1, v2 in zip(new_sig_exs, prop[0]): + bindings[v2] = v1 + disjuncts.append(prop[1].substitute_bindings(bindings)) + + # make the assumption + if disjuncts: + # disjuncts exist, so make an implication + antecedent = self._make_antecedent(p, new_sig) + consequent = reduce(lambda x, y: x | y, disjuncts) + accum = ImpExpression(antecedent, consequent) + else: + # nothing has property 'p' + accum = NegatedExpression(self._make_antecedent(p, new_sig)) + + # quantify the implication + for new_sig_var in new_sig[::-1]: + accum = AllExpression(new_sig_var, accum) + new_assumptions.append(accum) + + return assumptions + new_assumptions + + def _make_unique_signature(self, predHolder): + """ + This method figures out how many arguments the predicate takes and + returns a tuple containing that number of unique variables. + """ + return tuple(unique_variable() for i in range(predHolder.signature_len)) + + def _make_antecedent(self, predicate, signature): + """ + Return an application expression with 'predicate' as the predicate + and 'signature' as the list of arguments. + """ + antecedent = predicate + for v in signature: + antecedent = antecedent(VariableExpression(v)) + return antecedent + + def _make_predicate_dict(self, assumptions): + """ + Create a dictionary of predicates from the assumptions. 
class PredHolder:
    """
    Per-predicate bookkeeping used by the ``ClosedWorldProver``.

    ``signatures`` is a list of argument tuples for which the predicate
    is asserted to hold.  For instance, 'see(john, mary)' results in the
    signature '(john,mary)' for 'see'.

    ``properties`` is a list of pairs whose first element is a tuple of
    variables and whose second element is an expression over those
    variables that makes the predicate true.  For instance,
    'all x.all y.(see(x,y) -> know(x,y))' results in "((x,y),('see(x,y)'))"
    for 'know'.

    ``signature_len`` is the number of arguments of the predicate.  It is
    fixed by the first signature or property recorded; every later one
    must have the same length.
    """

    def __init__(self):
        self.signatures, self.properties = [], []
        # Unknown until the first signature/property has been validated.
        self.signature_len = None

    def append_sig(self, new_sig):
        """Validate the length of ``new_sig`` and then store it."""
        self.validate_sig_len(new_sig)
        self.signatures.append(new_sig)

    def append_prop(self, new_prop):
        """
        Store the ``(variables, expression)`` pair ``new_prop`` after
        validating the length of its variable tuple.
        """
        variables = new_prop[0]
        self.validate_sig_len(variables)
        self.properties.append(new_prop)

    def validate_sig_len(self, new_sig):
        """
        Remember the length of the first signature seen; raise an
        ``Exception`` if a later one has a different length.
        """
        arity = len(new_sig)
        if self.signature_len is None:
            self.signature_len = arity
            return
        if arity != self.signature_len:
            raise Exception("Signature lengths do not match")

    def __str__(self):
        return "({},{},{})".format(
            self.signatures, self.properties, self.signature_len
        )

    def __repr__(self):
        return str(self)
cdp = ClosedDomainProver(prover) + print("assumptions:") + for a in cdp.assumptions(): + print(" ", a) + print("goal:", cdp.goal()) + print(cdp.prove()) + + p1 = lexpr(r"girl(mary)") + p2 = lexpr(r"dog(rover)") + p3 = lexpr(r"all x.(girl(x) -> -dog(x))") + p4 = lexpr(r"all x.(dog(x) -> -girl(x))") + p5 = lexpr(r"chase(mary, rover)") + c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))") + prover = Prover9Command(c, [p1, p2, p3, p4, p5]) + print(prover.prove()) + cdp = ClosedDomainProver(prover) + print("assumptions:") + for a in cdp.assumptions(): + print(" ", a) + print("goal:", cdp.goal()) + print(cdp.prove()) + + +def unique_names_demo(): + lexpr = Expression.fromstring + + p1 = lexpr(r"man(Socrates)") + p2 = lexpr(r"man(Bill)") + c = lexpr(r"exists x.exists y.(x != y)") + prover = Prover9Command(c, [p1, p2]) + print(prover.prove()) + unp = UniqueNamesProver(prover) + print("assumptions:") + for a in unp.assumptions(): + print(" ", a) + print("goal:", unp.goal()) + print(unp.prove()) + + p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))") + p2 = lexpr(r"Bill = William") + p3 = lexpr(r"Bill = Billy") + c = lexpr(r"-walk(William)") + prover = Prover9Command(c, [p1, p2, p3]) + print(prover.prove()) + unp = UniqueNamesProver(prover) + print("assumptions:") + for a in unp.assumptions(): + print(" ", a) + print("goal:", unp.goal()) + print(unp.prove()) + + +def closed_world_demo(): + lexpr = Expression.fromstring + + p1 = lexpr(r"walk(Socrates)") + p2 = lexpr(r"(Socrates != Bill)") + c = lexpr(r"-walk(Bill)") + prover = Prover9Command(c, [p1, p2]) + print(prover.prove()) + cwp = ClosedWorldProver(prover) + print("assumptions:") + for a in cwp.assumptions(): + print(" ", a) + print("goal:", cwp.goal()) + print(cwp.prove()) + + p1 = lexpr(r"see(Socrates, John)") + p2 = lexpr(r"see(John, Mary)") + p3 = lexpr(r"(Socrates != John)") + p4 = lexpr(r"(John != Mary)") + c = lexpr(r"-see(Socrates, Mary)") + prover = Prover9Command(c, [p1, p2, p3, p4]) + 
def combination_prover_demo():
    """
    Demonstrate stacking all three decorators: prove '-see(Socrates, Mary)'
    first with a plain ``Prover9Command`` and then with the closed-world,
    unique-names and closed-domain decorators layered on top of it,
    printing the augmented assumptions in between.
    """
    parse = Expression.fromstring

    premises = [
        parse(text) for text in (r"see(Socrates, John)", r"see(John, Mary)")
    ]
    goal = parse(r"-see(Socrates, Mary)")

    plain = Prover9Command(goal, premises)
    print(plain.prove())

    # Innermost decorator first: closed world, then unique names, then
    # closed domain.
    stacked = ClosedWorldProver(plain)
    stacked = UniqueNamesProver(stacked)
    stacked = ClosedDomainProver(stacked)
    for assumption in stacked.assumptions():
        print(assumption)
    print(stacked.prove())
def print_proof(goal, premises):
    """
    Print ``goal`` followed by two proof results: the one from a plain
    ``Prover9Command`` and the one from the same command wrapped in
    ``ClosedWorldProver`` and ``UniqueNamesProver``.

    :param goal: the goal, as a string parsed with ``Expression.fromstring``
    :param premises: the premises handed to ``Prover9Command``
    """
    base = Prover9Command(Expression.fromstring(goal), premises)
    augmented = UniqueNamesProver(ClosedWorldProver(base))
    # Run the plain proof before the decorated one, then report both.
    plain_result = base.prove()
    augmented_result = augmented.prove()
    print(goal, plain_result, augmented_result)
+ + :param bin: The full path to the Java binary. If not specified, + then nltk will search the system for a Java binary; and if + one is not found, it will raise a ``LookupError`` exception. + :type bin: str + :param options: A list of options that should be passed to the + Java binary when it is called. A common value is + ``'-Xmx512m'``, which tells Java binary to increase + the maximum heap size to 512 megabytes. If no options are + specified, then do not modify the options list. + :type options: list(str) + """ + global _java_bin, _java_options + _java_bin = find_binary( + "java", + bin, + env_vars=["JAVAHOME", "JAVA_HOME"], + verbose=verbose, + binary_names=["java.exe"], + ) + + if options is not None: + if isinstance(options, str): + options = options.split() + _java_options = list(options) + + +def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True): + """ + Execute the given java command, by opening a subprocess that calls + Java. If java has not yet been configured, it will be configured + by calling ``config_java()`` with no arguments. + + :param cmd: The java command that should be called, formatted as + a list of strings. Typically, the first string will be the name + of the java class; and the remaining strings will be arguments + for that java class. + :type cmd: list(str) + + :param classpath: A ``':'`` separated list of directories, JAR + archives, and ZIP archives to search for class files. + :type classpath: str + + :param stdin: Specify the executed program's + standard input file handles, respectively. Valid values are ``subprocess.PIPE``, + an existing file descriptor (a positive integer), an existing + file object, 'pipe', 'stdout', 'devnull' and None. ``subprocess.PIPE`` indicates that a + new pipe to the child should be created. With None, no + redirection will occur; the child's file handles will be + inherited from the parent. 
Additionally, stderr can be + ``subprocess.STDOUT``, which indicates that the stderr data + from the applications should be captured into the same file + handle as for stdout. + + :param stdout: Specify the executed program's standard output file + handle. See ``stdin`` for valid values. + + :param stderr: Specify the executed program's standard error file + handle. See ``stdin`` for valid values. + + + :param blocking: If ``false``, then return immediately after + spawning the subprocess. In this case, the return value is + the ``Popen`` object, and not a ``(stdout, stderr)`` tuple. + + :return: If ``blocking=True``, then return a tuple ``(stdout, + stderr)``, containing the stdout and stderr outputs generated + by the java command if the ``stdout`` and ``stderr`` parameters + were set to ``subprocess.PIPE``; or None otherwise. If + ``blocking=False``, then return a ``subprocess.Popen`` object. + + :raise OSError: If the java command returns a nonzero return code. + """ + + subprocess_output_dict = { + "pipe": subprocess.PIPE, + "stdout": subprocess.STDOUT, + "devnull": subprocess.DEVNULL, + } + + stdin = subprocess_output_dict.get(stdin, stdin) + stdout = subprocess_output_dict.get(stdout, stdout) + stderr = subprocess_output_dict.get(stderr, stderr) + + if isinstance(cmd, str): + raise TypeError("cmd should be a list of strings") + + # Make sure we know where a java binary is. + if _java_bin is None: + config_java() + + # Set up the classpath. + if isinstance(classpath, str): + classpaths = [classpath] + else: + classpaths = list(classpath) + classpath = os.path.pathsep.join(classpaths) + + # Construct the full command string. + cmd = list(cmd) + cmd = ["-cp", classpath] + cmd + cmd = [_java_bin] + _java_options + cmd + + # Call java via a subprocess + p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) + if not blocking: + return p + (stdout, stderr) = p.communicate() + + # Check the return code. 
+ if p.returncode != 0: + print(_decode_stdoutdata(stderr)) + raise OSError("Java command failed : " + str(cmd)) + + return (stdout, stderr) + + +###################################################################### +# Parsing +###################################################################### + + +class ReadError(ValueError): + """ + Exception raised by read_* functions when they fail. + :param position: The index in the input string where an error occurred. + :param expected: What was expected when an error occurred. + """ + + def __init__(self, expected, position): + ValueError.__init__(self, expected, position) + self.expected = expected + self.position = position + + def __str__(self): + return f"Expected {self.expected} at {self.position}" + + +_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')") + + +def read_str(s, start_position): + """ + If a Python string literal begins at the specified position in the + given string, then return a tuple ``(val, end_position)`` + containing the value of the string literal and the position where + it ends. Otherwise, raise a ``ReadError``. + + :param s: A string that will be checked to see if within which a + Python string literal exists. + :type s: str + + :param start_position: The specified beginning position of the string ``s`` + to begin regex matching. + :type start_position: int + + :return: A tuple containing the matched string literal evaluated as a + string and the end position of the string literal. + :rtype: tuple(str, int) + + :raise ReadError: If the ``_STRING_START_RE`` regex doesn't return a + match in ``s`` at ``start_position``, i.e., open quote. If the + ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the + end of the first match, i.e., close quote. + :raise ValueError: If an invalid string (i.e., contains an invalid + escape sequence) is passed into the ``eval``. 
+ + :Example: + + >>> from nltk.internals import read_str + >>> read_str('"Hello", World!', 0) + ('Hello', 7) + + """ + # Read the open quote, and any modifiers. + m = _STRING_START_RE.match(s, start_position) + if not m: + raise ReadError("open quote", start_position) + quotemark = m.group(1) + + # Find the close quote. + _STRING_END_RE = re.compile(r"\\|%s" % quotemark) + position = m.end() + while True: + match = _STRING_END_RE.search(s, position) + if not match: + raise ReadError("close quote", position) + if match.group(0) == "\\": + position = match.end() + 1 + else: + break + + # Process it, using eval. Strings with invalid escape sequences + # might raise ValueError. + try: + return eval(s[start_position : match.end()]), match.end() + except ValueError as e: + raise ReadError("valid escape sequence", start_position) from e + + +_READ_INT_RE = re.compile(r"-?\d+") + + +def read_int(s, start_position): + """ + If an integer begins at the specified position in the given + string, then return a tuple ``(val, end_position)`` containing the + value of the integer and the position where it ends. Otherwise, + raise a ``ReadError``. + + :param s: A string that will be checked to see if within which a + Python integer exists. + :type s: str + + :param start_position: The specified beginning position of the string ``s`` + to begin regex matching. + :type start_position: int + + :return: A tuple containing the matched integer casted to an int, + and the end position of the int in ``s``. + :rtype: tuple(int, int) + + :raise ReadError: If the ``_READ_INT_RE`` regex doesn't return a + match in ``s`` at ``start_position``. 
_READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?")


def read_number(s, start_position):
    """
    If an integer or float begins at the specified position in the
    given string, then return a tuple ``(val, end_position)``
    containing the value of the number and the position where it ends.
    Otherwise, raise a ``ReadError``.

    :param s: A string that will be checked to see if within which a
        Python number exists.
    :type s: str

    :param start_position: The specified beginning position of the string ``s``
        to begin regex matching.
    :type start_position: int

    :return: A tuple containing the matched number and the end position
        of the number in ``s``.  The number is a ``float`` when the match
        contains a decimal point, and an ``int`` otherwise.
    :rtype: tuple(float or int, int)

    :raise ReadError: If no number starts at ``start_position`` in ``s``,
        i.e., the ``_READ_NUMBER_VALUE`` regex matches no digits there.

    :Example:

    >>> from nltk.internals import read_number
    >>> read_number('Pi is 3.14159', 6)
    (3.14159, 13)

    """
    m = _READ_NUMBER_VALUE.match(s, start_position)
    # Every part of the pattern is optional, so it also matches the empty
    # string, a lone "-" and a lone "." (or "-.").  None of those is a
    # number; require at least one digit so that they are reported as a
    # ReadError instead of failing later inside float().
    if not m or not any(ch.isdigit() for ch in m.group()):
        raise ReadError("number", start_position)
    if m.group(2):
        return float(m.group()), m.end()
    else:
        return int(m.group()), m.end()
def eat(self, food): + ... if overridden(self.batch_eat): + ... return self.batch_eat([food])[0] + ... else: + ... raise NotImplementedError() + ... def batch_eat(self, foods): + ... return [self.eat(food) for food in foods] + + :type method: instance method + """ + if isinstance(method, types.MethodType) and method.__self__.__class__ is not None: + name = method.__name__ + funcs = [ + cls.__dict__[name] + for cls in _mro(method.__self__.__class__) + if name in cls.__dict__ + ] + return len(funcs) > 1 + else: + raise TypeError("Expected an instance method.") + + +def _mro(cls): + """ + Return the method resolution order for ``cls`` -- i.e., a list + containing ``cls`` and all its base classes, in the order in which + they would be checked by ``getattr``. For new-style classes, this + is just cls.__mro__. For classic classes, this can be obtained by + a depth-first left-to-right traversal of ``__bases__``. + """ + if isinstance(cls, type): + return cls.__mro__ + else: + mro = [cls] + for base in cls.__bases__: + mro.extend(_mro(base)) + return mro + + +###################################################################### +# Deprecation decorator & base class +###################################################################### +# [xx] dedent msg first if it comes from a docstring. + + +def _add_epytext_field(obj, field, message): + """Add an epytext @field to a given object's docstring.""" + indent = "" + # If we already have a docstring, then add a blank line to separate + # it from the new field, and check its indentation. + if obj.__doc__: + obj.__doc__ = obj.__doc__.rstrip() + "\n\n" + indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs()) + if indents: + indent = min(indents) + # If we don't have a docstring, add an empty one. 
+ else: + obj.__doc__ = "" + + obj.__doc__ += textwrap.fill( + f"@{field}: {message}", + initial_indent=indent, + subsequent_indent=indent + " ", + ) + + +def deprecated(message): + """ + A decorator used to mark functions as deprecated. This will cause + a warning to be printed the when the function is used. Usage: + + >>> from nltk.internals import deprecated + >>> @deprecated('Use foo() instead') + ... def bar(x): + ... print(x/10) + + """ + + def decorator(func): + msg = f"Function {func.__name__}() has been deprecated. {message}" + msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ") + + def newFunc(*args, **kwargs): + warnings.warn(msg, category=DeprecationWarning, stacklevel=2) + return func(*args, **kwargs) + + # Copy the old function's name, docstring, & dict + newFunc.__dict__.update(func.__dict__) + newFunc.__name__ = func.__name__ + newFunc.__doc__ = func.__doc__ + newFunc.__deprecated__ = True + # Add a @deprecated field to the docstring. + _add_epytext_field(newFunc, "deprecated", message) + return newFunc + + return decorator + + +class Deprecated: + """ + A base class used to mark deprecated classes. A typical usage is to + alert users that the name of a class has changed: + + >>> from nltk.internals import Deprecated + >>> class NewClassName: + ... pass # All logic goes here. + ... + >>> class OldClassName(Deprecated, NewClassName): + ... "Use NewClassName instead." + + The docstring of the deprecated class will be used in the + deprecation warning message. + """ + + def __new__(cls, *args, **kwargs): + # Figure out which class is the deprecated one. + dep_cls = None + for base in _mro(cls): + if Deprecated in base.__bases__: + dep_cls = base + break + assert dep_cls, "Unable to determine which base is deprecated." + + # Construct an appropriate warning. + doc = dep_cls.__doc__ or "".strip() + # If there's a @deprecated field, strip off the field marker. 
class Counter:
    """
    A counter whose value advances by one every time it is read, so
    that successive reads never return the same number.
    """

    def __init__(self, initial_value=0):
        # The first call to get() returns initial_value + 1.
        self._value = initial_value

    def get(self):
        """Advance the counter by one and return the new value."""
        bumped = self._value + 1
        self._value = bumped
        return bumped
+ """ + file_names = [filename] + (file_names or []) + assert isinstance(filename, str) + assert not isinstance(file_names, str) + assert not isinstance(searchpath, str) + if isinstance(env_vars, str): + env_vars = env_vars.split() + yielded = False + + # File exists, no magic + for alternative in file_names: + path_to_file = os.path.join(filename, alternative) + if os.path.isfile(path_to_file): + if verbose: + print(f"[Found {filename}: {path_to_file}]") + yielded = True + yield path_to_file + # Check the bare alternatives + if os.path.isfile(alternative): + if verbose: + print(f"[Found {filename}: {alternative}]") + yielded = True + yield alternative + # Check if the alternative is inside a 'file' directory + path_to_file = os.path.join(filename, "file", alternative) + if os.path.isfile(path_to_file): + if verbose: + print(f"[Found {filename}: {path_to_file}]") + yielded = True + yield path_to_file + + # Check environment variables + for env_var in env_vars: + if env_var in os.environ: + if finding_dir: # This is to file a directory instead of file + yielded = True + yield os.environ[env_var] + + for env_dir in os.environ[env_var].split(os.pathsep): + # Check if the environment variable contains a direct path to the bin + if os.path.isfile(env_dir): + if verbose: + print(f"[Found {filename}: {env_dir}]") + yielded = True + yield env_dir + # Check if the possible bin names exist inside the environment variable directories + for alternative in file_names: + path_to_file = os.path.join(env_dir, alternative) + if os.path.isfile(path_to_file): + if verbose: + print(f"[Found {filename}: {path_to_file}]") + yielded = True + yield path_to_file + # Check if the alternative is inside a 'file' directory + # path_to_file = os.path.join(env_dir, 'file', alternative) + + # Check if the alternative is inside a 'bin' directory + path_to_file = os.path.join(env_dir, "bin", alternative) + + if os.path.isfile(path_to_file): + if verbose: + print(f"[Found {filename}: 
def find_binary_iter(
    name,
    path_to_bin=None,
    env_vars=(),
    searchpath=(),
    binary_names=None,
    url=None,
    verbose=False,
):
    """
    Search for a binary to be used by nltk.

    This is a thin wrapper around ``find_file_iter``: it searches for
    ``path_to_bin`` when one is given (and non-empty) and for ``name``
    otherwise, and yields every path that ``find_file_iter`` yields.

    :param name: The name or path of the binary.
    :param path_to_bin: The user-supplied binary location (deprecated)
    :param env_vars: A list of environment variable names to check.
    :param searchpath: List of directories to search.
    :param binary_names: A list of alternative file names to check.
    :param url: URL presented to user for download help.
    :param verbose: Whether or not to print path when a file is found.
    """
    # An explicit location takes precedence over the bare name.
    target = path_to_bin or name
    yield from find_file_iter(
        target,
        env_vars=env_vars,
        searchpath=searchpath,
        file_names=binary_names,
        url=url,
        verbose=verbose,
    )
+ if path_to_jar is not None: + if os.path.isfile(path_to_jar): + yielded = True + yield path_to_jar + else: + raise LookupError( + f"Could not find {name_pattern} jar file at {path_to_jar}" + ) + + # Check environment variables + for env_var in env_vars: + if env_var in os.environ: + if env_var == "CLASSPATH": + classpath = os.environ["CLASSPATH"] + for cp in classpath.split(os.path.pathsep): + cp = os.path.expanduser(cp) + if os.path.isfile(cp): + filename = os.path.basename(cp) + if ( + is_regex + and re.match(name_pattern, filename) + or (not is_regex and filename == name_pattern) + ): + if verbose: + print(f"[Found {name_pattern}: {cp}]") + yielded = True + yield cp + # The case where user put directory containing the jar file in the classpath + if os.path.isdir(cp): + if not is_regex: + if os.path.isfile(os.path.join(cp, name_pattern)): + if verbose: + print(f"[Found {name_pattern}: {cp}]") + yielded = True + yield os.path.join(cp, name_pattern) + else: + # Look for file using regular expression + for file_name in os.listdir(cp): + if re.match(name_pattern, file_name): + if verbose: + print( + "[Found %s: %s]" + % ( + name_pattern, + os.path.join(cp, file_name), + ) + ) + yielded = True + yield os.path.join(cp, file_name) + + else: + jar_env = os.path.expanduser(os.environ[env_var]) + jar_iter = ( + ( + os.path.join(jar_env, path_to_jar) + for path_to_jar in os.listdir(jar_env) + ) + if os.path.isdir(jar_env) + else (jar_env,) + ) + for path_to_jar in jar_iter: + if os.path.isfile(path_to_jar): + filename = os.path.basename(path_to_jar) + if ( + is_regex + and re.match(name_pattern, filename) + or (not is_regex and filename == name_pattern) + ): + if verbose: + print(f"[Found {name_pattern}: {path_to_jar}]") + yielded = True + yield path_to_jar + + # Check the path list. 
+ for directory in searchpath: + if is_regex: + for filename in os.listdir(directory): + path_to_jar = os.path.join(directory, filename) + if os.path.isfile(path_to_jar): + if re.match(name_pattern, filename): + if verbose: + print(f"[Found {filename}: {path_to_jar}]") + yielded = True + yield path_to_jar + else: + path_to_jar = os.path.join(directory, name_pattern) + if os.path.isfile(path_to_jar): + if verbose: + print(f"[Found {name_pattern}: {path_to_jar}]") + yielded = True + yield path_to_jar + + if not yielded: + # If nothing was found, raise an error + msg = "NLTK was unable to find %s!" % name_pattern + if env_vars: + msg += " Set the %s environment variable" % env_vars[0] + msg = textwrap.fill(msg + ".", initial_indent=" ", subsequent_indent=" ") + if searchpath: + msg += "\n\n Searched in:" + msg += "".join("\n - %s" % d for d in searchpath) + if url: + msg += "\n\n For more information, on {}, see:\n <{}>".format( + name_pattern, + url, + ) + div = "=" * 75 + raise LookupError(f"\n\n{div}\n{msg}\n{div}") + + +def find_jar( + name_pattern, + path_to_jar=None, + env_vars=(), + searchpath=(), + url=None, + verbose=False, + is_regex=False, +): + return next( + find_jar_iter( + name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex + ) + ) + + +def find_jars_within_path(path_to_jars): + return [ + os.path.join(root, filename) + for root, dirnames, filenames in os.walk(path_to_jars) + for filename in fnmatch.filter(filenames, "*.jar") + ] + + +def _decode_stdoutdata(stdoutdata): + """Convert data read from stdout/stderr to unicode""" + if not isinstance(stdoutdata, bytes): + return stdoutdata + + encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding()) + if encoding is None: + return stdoutdata.decode() + return stdoutdata.decode(encoding) + + +########################################################################## +# Import Stdlib Module +########################################################################## + + 
+def import_from_stdlib(module): + """ + When python is run from within the nltk/ directory tree, the + current directory is included at the beginning of the search path. + Unfortunately, that means that modules within nltk can sometimes + shadow standard library modules. As an example, the stdlib + 'inspect' module will attempt to import the stdlib 'tokenize' + module, but will instead end up importing NLTK's 'tokenize' module + instead (causing the import to fail). + """ + old_path = sys.path + sys.path = [d for d in sys.path if d not in ("", ".")] + m = __import__(module) + sys.path = old_path + return m + + +########################################################################## +# Wrapper for ElementTree Elements +########################################################################## + + +class ElementWrapper: + """ + A wrapper around ElementTree Element objects whose main purpose is + to provide nicer __repr__ and __str__ methods. In addition, any + of the wrapped Element's methods that return other Element objects + are overridden to wrap those values before returning them. + + This makes Elements more convenient to work with in + interactive sessions and doctests, at the expense of some + efficiency. + """ + + # Prevent double-wrapping: + def __new__(cls, etree): + """ + Create and return a wrapper around a given Element object. + If ``etree`` is an ``ElementWrapper``, then ``etree`` is + returned as-is. + """ + if isinstance(etree, ElementWrapper): + return etree + else: + return object.__new__(ElementWrapper) + + def __init__(self, etree): + r""" + Initialize a new Element wrapper for ``etree``. + + If ``etree`` is a string, then it will be converted to an + Element object using ``ElementTree.fromstring()`` first: + + >>> ElementWrapper("") + \n"> + + """ + if isinstance(etree, str): + etree = ElementTree.fromstring(etree) + self.__dict__["_etree"] = etree + + def unwrap(self): + """ + Return the Element object wrapped by this wrapper. 
+ """ + return self._etree + + ##//////////////////////////////////////////////////////////// + # { String Representation + ##//////////////////////////////////////////////////////////// + + def __repr__(self): + s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8") + if len(s) > 60: + e = s.rfind("<") + if (len(s) - e) > 30: + e = -20 + s = f"{s[:30]}...{s[e:]}" + return "" % s + + def __str__(self): + """ + :return: the result of applying ``ElementTree.tostring()`` to + the wrapped Element object. + """ + return ( + ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip() + ) + + ##//////////////////////////////////////////////////////////// + # { Element interface Delegation (pass-through) + ##//////////////////////////////////////////////////////////// + + def __getattr__(self, attrib): + return getattr(self._etree, attrib) + + def __setattr__(self, attr, value): + return setattr(self._etree, attr, value) + + def __delattr__(self, attr): + return delattr(self._etree, attr) + + def __setitem__(self, index, element): + self._etree[index] = element + + def __delitem__(self, index): + del self._etree[index] + + def __setslice__(self, start, stop, elements): + self._etree[start:stop] = elements + + def __delslice__(self, start, stop): + del self._etree[start:stop] + + def __len__(self): + return len(self._etree) + + ##//////////////////////////////////////////////////////////// + # { Element interface Delegation (wrap result) + ##//////////////////////////////////////////////////////////// + + def __getitem__(self, index): + return ElementWrapper(self._etree[index]) + + def __getslice__(self, start, stop): + return [ElementWrapper(elt) for elt in self._etree[start:stop]] + + def getchildren(self): + return [ElementWrapper(elt) for elt in self._etree] + + def getiterator(self, tag=None): + return (ElementWrapper(elt) for elt in self._etree.getiterator(tag)) + + def makeelement(self, tag, attrib): + return 
ElementWrapper(self._etree.makeelement(tag, attrib)) + + def find(self, path): + elt = self._etree.find(path) + if elt is None: + return elt + else: + return ElementWrapper(elt) + + def findall(self, path): + return [ElementWrapper(elt) for elt in self._etree.findall(path)] + + +###################################################################### +# Helper for Handling Slicing +###################################################################### + + +def slice_bounds(sequence, slice_obj, allow_step=False): + """ + Given a slice, return the corresponding (start, stop) bounds, + taking into account None indices and negative indices. The + following guarantees are made for the returned start and stop values: + + - 0 <= start <= len(sequence) + - 0 <= stop <= len(sequence) + - start <= stop + + :raise ValueError: If ``slice_obj.step`` is not None. + :param allow_step: If true, then the slice object may have a + non-None step. If it does, then return a tuple + (start, stop, step). + """ + start, stop = (slice_obj.start, slice_obj.stop) + + # If allow_step is true, then include the step in our return + # value tuple. + if allow_step: + step = slice_obj.step + if step is None: + step = 1 + # Use a recursive call without allow_step to find the slice + # bounds. If step is negative, then the roles of start and + # stop (in terms of default values, etc), are swapped. + if step < 0: + start, stop = slice_bounds(sequence, slice(stop, start)) + else: + start, stop = slice_bounds(sequence, slice(start, stop)) + return start, stop, step + + # Otherwise, make sure that no non-default step value is used. + elif slice_obj.step not in (None, 1): + raise ValueError( + "slices with steps are not supported by %s" % sequence.__class__.__name__ + ) + + # Supply default offsets. + if start is None: + start = 0 + if stop is None: + stop = len(sequence) + + # Handle negative indices. 
+ if start < 0: + start = max(0, len(sequence) + start) + if stop < 0: + stop = max(0, len(sequence) + stop) + + # Make sure stop doesn't go past the end of the list. Note that + # we avoid calculating len(sequence) if possible, because for lazy + # sequences, calculating the length of a sequence can be expensive. + if stop > 0: + try: + sequence[stop - 1] + except IndexError: + stop = len(sequence) + + # Make sure start isn't past stop. + start = min(start, stop) + + # That's all folks! + return start, stop + + +###################################################################### +# Permission Checking +###################################################################### + + +def is_writable(path): + # Ensure that it exists. + if not os.path.exists(path): + return False + + # If we're on a posix system, check its permissions. + if hasattr(os, "getuid"): + statdata = os.stat(path) + perm = stat.S_IMODE(statdata.st_mode) + # is it world-writable? + if perm & 0o002: + return True + # do we own it? + elif statdata.st_uid == os.getuid() and (perm & 0o200): + return True + # are we in a group that can write to it? + elif (statdata.st_gid in [os.getgid()] + os.getgroups()) and (perm & 0o020): + return True + # otherwise, we can't write to it. + else: + return False + + # Otherwise, we'll assume it's writable. + # [xx] should we do other checks on other platforms? 
+ return True + + +###################################################################### +# NLTK Error reporting +###################################################################### + + +def raise_unorderable_types(ordering, a, b): + raise TypeError( + "unorderable types: %s() %s %s()" + % (type(a).__name__, ordering, type(b).__name__) + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/langnames.py b/.eggs/nltk-3.8-py3.10.egg/nltk/langnames.py new file mode 100644 index 0000000000000000000000000000000000000000..6d35d0a16eb041a5785278bf4a7d3ca4ecff2b66 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/langnames.py @@ -0,0 +1,730 @@ +# Natural Language Toolkit: Language Codes +# +# Copyright (C) 2022 NLTK Project +# Author: Eric Kafe +# URL: +# For license information, see LICENSE.TXT +# +# iso639-3 language codes (C) https://iso639-3.sil.org/ + +""" +Translate between language names and language codes. + +The iso639-3 language codes were downloaded from the registration authority at +https://iso639-3.sil.org/ + +The iso639-3 codeset is evolving, so retired language codes are kept in the +"iso639retired" dictionary, which is used as fallback by the wrapper functions +"langname" and "langcode", in order to support the lookup of retired codes. + +The "langcode" function returns the current iso639-3 code if there is one, +and falls back to the retired code otherwise. 
As specified by BCP-47, +it returns the shortest (2-letter) code by default, but 3-letter codes +are also available: + + >>> import nltk.langnames as lgn + >>> lgn.langname('fri') #'fri' is a retired code + 'Western Frisian' + + The current code is different from the retired one: + >>> lgn.langcode('Western Frisian') + 'fy' + + >>> lgn.langcode('Western Frisian', typ = 3) + 'fry' + +""" + +import re +from warnings import warn + +from nltk.corpus import bcp47 + +codepattern = re.compile("[a-z][a-z][a-z]?") + + +def langname(tag, typ="full"): + """ + Convert a composite BCP-47 tag to a language name + + >>> from nltk.langnames import langname + >>> langname('ca-Latn-ES-valencia') + 'Catalan: Latin: Spain: Valencian' + + >>> langname('ca-Latn-ES-valencia', typ="short") + 'Catalan' + """ + tags = tag.split("-") + code = tags[0].lower() + if codepattern.fullmatch(code): + if code in iso639retired: # retired codes + return iso639retired[code] + elif code in iso639short: # 3-letter codes + code2 = iso639short[code] # convert to 2-letter code + warn(f"Shortening {code!r} to {code2!r}", stacklevel=2) + tag = "-".join([code2] + tags[1:]) + name = bcp47.name(tag) # parse according to BCP-47 + if typ == "full": + return name # include all subtags + elif name: + return name.split(":")[0] # only the language subtag + else: + warn(f"Could not find code in {code!r}", stacklevel=2) + + +def langcode(name, typ=2): + """ + Convert language name to iso639-3 language code. 
Returns the short 2-letter + code by default, if one is available, and the 3-letter code otherwise: + + >>> from nltk.langnames import langcode + >>> langcode('Modern Greek (1453-)') + 'el' + + Specify 'typ=3' to get the 3-letter code: + + >>> langcode('Modern Greek (1453-)', typ=3) + 'ell' + """ + if name in bcp47.langcode: + code = bcp47.langcode[name] + if typ == 3 and code in iso639long: + code = iso639long[code] # convert to 3-letter code + return code + elif name in iso639code_retired: + return iso639code_retired[name] + else: + warn(f"Could not find language in {name!r}", stacklevel=2) + + +# ======================================================================= +# Translate betwwen Wikidata Q-codes and BCP-47 codes or names +# ....................................................................... + + +def tag2q(tag): + """ + Convert BCP-47 tag to Wikidata Q-code + + >>> tag2q('nds-u-sd-demv') + 'Q4289225' + """ + return bcp47.wiki_q[tag] + + +def q2tag(qcode): + """ + Convert Wikidata Q-code to BCP-47 tag + + >>> q2tag('Q4289225') + 'nds-u-sd-demv' + """ + return wiki_bcp47[qcode] + + +def q2name(qcode, typ="full"): + """ + Convert Wikidata Q-code to BCP-47 (full or short) language name + + >>> q2name('Q4289225') + 'Low German: Mecklenburg-Vorpommern' + + >>> q2name('Q4289225', "short") + 'Low German' + """ + return langname(q2tag(qcode), typ) + + +def lang2q(name): + """ + Convert simple language name to Wikidata Q-code + + >>> lang2q('Low German') + 'Q25433' + """ + return tag2q(langcode(name)) + + +# ====================================================================== +# Data dictionaries +# ...................................................................... 
+ + +def inverse_dict(dic): + """Return inverse mapping, but only if it is bijective""" + if len(dic.keys()) == len(set(dic.values())): + return {val: key for (key, val) in dic.items()} + else: + warn("This dictionary has no bijective inverse mapping.") + + +bcp47.load_wiki_q() # Wikidata conversion table needs to be loaded explicitly +wiki_bcp47 = inverse_dict(bcp47.wiki_q) + +iso639short = { + "aar": "aa", + "abk": "ab", + "afr": "af", + "aka": "ak", + "amh": "am", + "ara": "ar", + "arg": "an", + "asm": "as", + "ava": "av", + "ave": "ae", + "aym": "ay", + "aze": "az", + "bak": "ba", + "bam": "bm", + "bel": "be", + "ben": "bn", + "bis": "bi", + "bod": "bo", + "bos": "bs", + "bre": "br", + "bul": "bg", + "cat": "ca", + "ces": "cs", + "cha": "ch", + "che": "ce", + "chu": "cu", + "chv": "cv", + "cor": "kw", + "cos": "co", + "cre": "cr", + "cym": "cy", + "dan": "da", + "deu": "de", + "div": "dv", + "dzo": "dz", + "ell": "el", + "eng": "en", + "epo": "eo", + "est": "et", + "eus": "eu", + "ewe": "ee", + "fao": "fo", + "fas": "fa", + "fij": "fj", + "fin": "fi", + "fra": "fr", + "fry": "fy", + "ful": "ff", + "gla": "gd", + "gle": "ga", + "glg": "gl", + "glv": "gv", + "grn": "gn", + "guj": "gu", + "hat": "ht", + "hau": "ha", + "hbs": "sh", + "heb": "he", + "her": "hz", + "hin": "hi", + "hmo": "ho", + "hrv": "hr", + "hun": "hu", + "hye": "hy", + "ibo": "ig", + "ido": "io", + "iii": "ii", + "iku": "iu", + "ile": "ie", + "ina": "ia", + "ind": "id", + "ipk": "ik", + "isl": "is", + "ita": "it", + "jav": "jv", + "jpn": "ja", + "kal": "kl", + "kan": "kn", + "kas": "ks", + "kat": "ka", + "kau": "kr", + "kaz": "kk", + "khm": "km", + "kik": "ki", + "kin": "rw", + "kir": "ky", + "kom": "kv", + "kon": "kg", + "kor": "ko", + "kua": "kj", + "kur": "ku", + "lao": "lo", + "lat": "la", + "lav": "lv", + "lim": "li", + "lin": "ln", + "lit": "lt", + "ltz": "lb", + "lub": "lu", + "lug": "lg", + "mah": "mh", + "mal": "ml", + "mar": "mr", + "mkd": "mk", + "mlg": "mg", + "mlt": "mt", + "mon": 
"mn", + "mri": "mi", + "msa": "ms", + "mya": "my", + "nau": "na", + "nav": "nv", + "nbl": "nr", + "nde": "nd", + "ndo": "ng", + "nep": "ne", + "nld": "nl", + "nno": "nn", + "nob": "nb", + "nor": "no", + "nya": "ny", + "oci": "oc", + "oji": "oj", + "ori": "or", + "orm": "om", + "oss": "os", + "pan": "pa", + "pli": "pi", + "pol": "pl", + "por": "pt", + "pus": "ps", + "que": "qu", + "roh": "rm", + "ron": "ro", + "run": "rn", + "rus": "ru", + "sag": "sg", + "san": "sa", + "sin": "si", + "slk": "sk", + "slv": "sl", + "sme": "se", + "smo": "sm", + "sna": "sn", + "snd": "sd", + "som": "so", + "sot": "st", + "spa": "es", + "sqi": "sq", + "srd": "sc", + "srp": "sr", + "ssw": "ss", + "sun": "su", + "swa": "sw", + "swe": "sv", + "tah": "ty", + "tam": "ta", + "tat": "tt", + "tel": "te", + "tgk": "tg", + "tgl": "tl", + "tha": "th", + "tir": "ti", + "ton": "to", + "tsn": "tn", + "tso": "ts", + "tuk": "tk", + "tur": "tr", + "twi": "tw", + "uig": "ug", + "ukr": "uk", + "urd": "ur", + "uzb": "uz", + "ven": "ve", + "vie": "vi", + "vol": "vo", + "wln": "wa", + "wol": "wo", + "xho": "xh", + "yid": "yi", + "yor": "yo", + "zha": "za", + "zho": "zh", + "zul": "zu", +} + + +iso639retired = { + "fri": "Western Frisian", + "auv": "Auvergnat", + "gsc": "Gascon", + "lms": "Limousin", + "lnc": "Languedocien", + "prv": "Provençal", + "amd": "Amapá Creole", + "bgh": "Bogan", + "bnh": "Banawá", + "bvs": "Belgian Sign Language", + "ccy": "Southern Zhuang", + "cit": "Chittagonian", + "flm": "Falam Chin", + "jap": "Jaruára", + "kob": "Kohoroxitari", + "mob": "Moinba", + "mzf": "Aiku", + "nhj": "Tlalitzlipa Nahuatl", + "nhs": "Southeastern Puebla Nahuatl", + "occ": "Occidental", + "tmx": "Tomyang", + "tot": "Patla-Chicontla Totonac", + "xmi": "Miarrã", + "yib": "Yinglish", + "ztc": "Lachirioag Zapotec", + "atf": "Atuence", + "bqe": "Navarro-Labourdin Basque", + "bsz": "Souletin Basque", + "aex": "Amerax", + "ahe": "Ahe", + "aiz": "Aari", + "akn": "Amikoana", + "arf": "Arafundi", + "azr": "Adzera", + 
"bcx": "Pamona", + "bii": "Bisu", + "bke": "Bengkulu", + "blu": "Hmong Njua", + "boc": "Bakung Kenyah", + "bsd": "Sarawak Bisaya", + "bwv": "Bahau River Kenyah", + "bxt": "Buxinhua", + "byu": "Buyang", + "ccx": "Northern Zhuang", + "cru": "Carútana", + "dat": "Darang Deng", + "dyk": "Land Dayak", + "eni": "Enim", + "fiz": "Izere", + "gen": "Geman Deng", + "ggh": "Garreh-Ajuran", + "itu": "Itutang", + "kds": "Lahu Shi", + "knh": "Kayan River Kenyah", + "krg": "North Korowai", + "krq": "Krui", + "kxg": "Katingan", + "lmt": "Lematang", + "lnt": "Lintang", + "lod": "Berawan", + "mbg": "Northern Nambikuára", + "mdo": "Southwest Gbaya", + "mhv": "Arakanese", + "miv": "Mimi", + "mqd": "Madang", + "nky": "Khiamniungan Naga", + "nxj": "Nyadu", + "ogn": "Ogan", + "ork": "Orokaiva", + "paj": "Ipeka-Tapuia", + "pec": "Southern Pesisir", + "pen": "Penesak", + "plm": "Palembang", + "poj": "Lower Pokomo", + "pun": "Pubian", + "rae": "Ranau", + "rjb": "Rajbanshi", + "rws": "Rawas", + "sdd": "Semendo", + "sdi": "Sindang Kelingi", + "skl": "Selako", + "slb": "Kahumamahon Saluan", + "srj": "Serawai", + "suf": "Tarpia", + "suh": "Suba", + "suu": "Sungkai", + "szk": "Sizaki", + "tle": "Southern Marakwet", + "tnj": "Tanjong", + "ttx": "Tutong 1", + "ubm": "Upper Baram Kenyah", + "vky": "Kayu Agung", + "vmo": "Muko-Muko", + "wre": "Ware", + "xah": "Kahayan", + "xkm": "Mahakam Kenyah", + "xuf": "Kunfal", + "yio": "Dayao Yi", + "ymj": "Muji Yi", + "ypl": "Pula Yi", + "ypw": "Puwa Yi", + "ywm": "Wumeng Yi", + "yym": "Yuanjiang-Mojiang Yi", + "mly": "Malay (individual language)", + "muw": "Mundari", + "xst": "Silt'e", + "ope": "Old Persian", + "scc": "Serbian", + "scr": "Croatian", + "xsk": "Sakan", + "mol": "Moldavian", + "aay": "Aariya", + "acc": "Cubulco Achí", + "cbm": "Yepocapa Southwestern Cakchiquel", + "chs": "Chumash", + "ckc": "Northern Cakchiquel", + "ckd": "South Central Cakchiquel", + "cke": "Eastern Cakchiquel", + "ckf": "Southern Cakchiquel", + "cki": "Santa María De Jesús 
Cakchiquel", + "ckj": "Santo Domingo Xenacoj Cakchiquel", + "ckk": "Acatenango Southwestern Cakchiquel", + "ckw": "Western Cakchiquel", + "cnm": "Ixtatán Chuj", + "cti": "Tila Chol", + "cun": "Cunén Quiché", + "eml": "Emiliano-Romagnolo", + "eur": "Europanto", + "gmo": "Gamo-Gofa-Dawro", + "hsf": "Southeastern Huastec", + "hva": "San Luís Potosí Huastec", + "ixi": "Nebaj Ixil", + "ixj": "Chajul Ixil", + "jai": "Western Jacalteco", + "mms": "Southern Mam", + "mpf": "Tajumulco Mam", + "mtz": "Tacanec", + "mvc": "Central Mam", + "mvj": "Todos Santos Cuchumatán Mam", + "poa": "Eastern Pokomam", + "pob": "Western Pokomchí", + "pou": "Southern Pokomam", + "ppv": "Papavô", + "quj": "Joyabaj Quiché", + "qut": "West Central Quiché", + "quu": "Eastern Quiché", + "qxi": "San Andrés Quiché", + "sic": "Malinguat", + "stc": "Santa Cruz", + "tlz": "Toala'", + "tzb": "Bachajón Tzeltal", + "tzc": "Chamula Tzotzil", + "tze": "Chenalhó Tzotzil", + "tzs": "San Andrés Larrainzar Tzotzil", + "tzt": "Western Tzutujil", + "tzu": "Huixtán Tzotzil", + "tzz": "Zinacantán Tzotzil", + "vlr": "Vatrata", + "yus": "Chan Santa Cruz Maya", + "nfg": "Nyeng", + "nfk": "Shakara", + "agp": "Paranan", + "bhk": "Albay Bicolano", + "bkb": "Finallig", + "btb": "Beti (Cameroon)", + "cjr": "Chorotega", + "cmk": "Chimakum", + "drh": "Darkhat", + "drw": "Darwazi", + "gav": "Gabutamon", + "mof": "Mohegan-Montauk-Narragansett", + "mst": "Cataelano Mandaya", + "myt": "Sangab Mandaya", + "rmr": "Caló", + "sgl": "Sanglechi-Ishkashimi", + "sul": "Surigaonon", + "sum": "Sumo-Mayangna", + "tnf": "Tangshewi", + "wgw": "Wagawaga", + "ayx": "Ayi (China)", + "bjq": "Southern Betsimisaraka Malagasy", + "dha": "Dhanwar (India)", + "dkl": "Kolum So Dogon", + "mja": "Mahei", + "nbf": "Naxi", + "noo": "Nootka", + "tie": "Tingal", + "tkk": "Takpa", + "baz": "Tunen", + "bjd": "Bandjigali", + "ccq": "Chaungtha", + "cka": "Khumi Awa Chin", + "dap": "Nisi (India)", + "dwl": "Walo Kumbe Dogon", + "elp": "Elpaputih", + "gbc": 
"Garawa", + "gio": "Gelao", + "hrr": "Horuru", + "ibi": "Ibilo", + "jar": "Jarawa (Nigeria)", + "kdv": "Kado", + "kgh": "Upper Tanudan Kalinga", + "kpp": "Paku Karen", + "kzh": "Kenuzi-Dongola", + "lcq": "Luhu", + "mgx": "Omati", + "nln": "Durango Nahuatl", + "pbz": "Palu", + "pgy": "Pongyong", + "sca": "Sansu", + "tlw": "South Wemale", + "unp": "Worora", + "wiw": "Wirangu", + "ybd": "Yangbye", + "yen": "Yendang", + "yma": "Yamphe", + "daf": "Dan", + "djl": "Djiwarli", + "ggr": "Aghu Tharnggalu", + "ilw": "Talur", + "izi": "Izi-Ezaa-Ikwo-Mgbo", + "meg": "Mea", + "mld": "Malakhel", + "mnt": "Maykulan", + "mwd": "Mudbura", + "myq": "Forest Maninka", + "nbx": "Ngura", + "nlr": "Ngarla", + "pcr": "Panang", + "ppr": "Piru", + "tgg": "Tangga", + "wit": "Wintu", + "xia": "Xiandao", + "yiy": "Yir Yoront", + "yos": "Yos", + "emo": "Emok", + "ggm": "Gugu Mini", + "leg": "Lengua", + "lmm": "Lamam", + "mhh": "Maskoy Pidgin", + "puz": "Purum Naga", + "sap": "Sanapaná", + "yuu": "Yugh", + "aam": "Aramanik", + "adp": "Adap", + "aue": "ǂKxʼauǁʼein", + "bmy": "Bemba (Democratic Republic of Congo)", + "bxx": "Borna (Democratic Republic of Congo)", + "byy": "Buya", + "dzd": "Daza", + "gfx": "Mangetti Dune ǃXung", + "gti": "Gbati-ri", + "ime": "Imeraguen", + "kbf": "Kakauhua", + "koj": "Sara Dunjo", + "kwq": "Kwak", + "kxe": "Kakihum", + "lii": "Lingkhim", + "mwj": "Maligo", + "nnx": "Ngong", + "oun": "ǃOǃung", + "pmu": "Mirpur Panjabi", + "sgo": "Songa", + "thx": "The", + "tsf": "Southwestern Tamang", + "uok": "Uokha", + "xsj": "Subi", + "yds": "Yiddish Sign Language", + "ymt": "Mator-Taygi-Karagas", + "ynh": "Yangho", + "bgm": "Baga Mboteni", + "btl": "Bhatola", + "cbe": "Chipiajes", + "cbh": "Cagua", + "coy": "Coyaima", + "cqu": "Chilean Quechua", + "cum": "Cumeral", + "duj": "Dhuwal", + "ggn": "Eastern Gurung", + "ggo": "Southern Gondi", + "guv": "Gey", + "iap": "Iapama", + "ill": "Iranun", + "kgc": "Kasseng", + "kox": "Coxima", + "ktr": "Kota Marudu Tinagas", + "kvs": "Kunggara", 
+ "kzj": "Coastal Kadazan", + "kzt": "Tambunan Dusun", + "nad": "Nijadali", + "nts": "Natagaimas", + "ome": "Omejes", + "pmc": "Palumata", + "pod": "Ponares", + "ppa": "Pao", + "pry": "Pray 3", + "rna": "Runa", + "svr": "Savara", + "tdu": "Tempasuk Dusun", + "thc": "Tai Hang Tong", + "tid": "Tidong", + "tmp": "Tai Mène", + "tne": "Tinoc Kallahan", + "toe": "Tomedes", + "xba": "Kamba (Brazil)", + "xbx": "Kabixí", + "xip": "Xipináwa", + "xkh": "Karahawyana", + "yri": "Yarí", + "jeg": "Jeng", + "kgd": "Kataang", + "krm": "Krim", + "prb": "Lua'", + "puk": "Pu Ko", + "rie": "Rien", + "rsi": "Rennellese Sign Language", + "skk": "Sok", + "snh": "Shinabo", + "lsg": "Lyons Sign Language", + "mwx": "Mediak", + "mwy": "Mosiro", + "ncp": "Ndaktup", + "ais": "Nataoran Amis", + "asd": "Asas", + "dit": "Dirari", + "dud": "Hun-Saare", + "lba": "Lui", + "llo": "Khlor", + "myd": "Maramba", + "myi": "Mina (India)", + "nns": "Ningye", + "aoh": "Arma", + "ayy": "Tayabas Ayta", + "bbz": "Babalia Creole Arabic", + "bpb": "Barbacoas", + "cca": "Cauca", + "cdg": "Chamari", + "dgu": "Degaru", + "drr": "Dororo", + "ekc": "Eastern Karnic", + "gli": "Guliguli", + "kjf": "Khalaj", + "kxl": "Nepali Kurux", + "kxu": "Kui (India)", + "lmz": "Lumbee", + "nxu": "Narau", + "plp": "Palpa", + "sdm": "Semandang", + "tbb": "Tapeba", + "xrq": "Karranga", + "xtz": "Tasmanian", + "zir": "Ziriya", + "thw": "Thudam", + "bic": "Bikaru", + "bij": "Vaghat-Ya-Bijim-Legeri", + "blg": "Balau", + "gji": "Geji", + "mvm": "Muya", + "ngo": "Ngoni", + "pat": "Papitalai", + "vki": "Ija-Zuba", + "wra": "Warapu", + "ajt": "Judeo-Tunisian Arabic", + "cug": "Chungmboko", + "lak": "Laka (Nigeria)", + "lno": "Lango (South Sudan)", + "pii": "Pini", + "smd": "Sama", + "snb": "Sebuyau", + "uun": "Kulon-Pazeh", + "wrd": "Warduji", + "wya": "Wyandot", +} + + +iso639long = inverse_dict(iso639short) + +iso639code_retired = inverse_dict(iso639retired) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lazyimport.py 
b/.eggs/nltk-3.8-py3.10.egg/nltk/lazyimport.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0c8e4451fff3a19c3608d0d08e3422a77fd8f0 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lazyimport.py @@ -0,0 +1,142 @@ +# This module is from mx/DateTime/LazyModule.py and is +# distributed under the terms of the eGenix.com Public License Agreement +# https://www.egenix.com/products/eGenix.com-Public-License-1.1.0.pdf + +""" Helper to enable simple lazy module import. + + 'Lazy' means the actual import is deferred until an attribute is + requested from the module's namespace. This has the advantage of + allowing all imports to be done at the top of a script (in a + prominent and visible place) without having a great impact + on startup time. + + Copyright (c) 1999-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com + See the documentation for further information on copyrights, + or contact the author. All Rights Reserved. +""" + +### Constants + +_debug = 0 + +### + + +class LazyModule: + + """Lazy module class. + + Lazy modules are imported into the given namespaces whenever a + non-special attribute (there are some attributes like __doc__ + that class instances handle without calling __getattr__) is + requested. The module is then registered under the given name + in locals usually replacing the import wrapper instance. The + import itself is done using globals as global namespace. 
+ + Example of creating a lazy load module: + + ISO = LazyModule('ISO',locals(),globals()) + + Later, requesting an attribute from ISO will load the module + automatically into the locals() namespace, overriding the + LazyModule instance: + + t = ISO.Week(1998,1,1) + + """ + + # Flag which indicates whether the LazyModule is initialized or not + __lazymodule_init = 0 + + # Name of the module to load + __lazymodule_name = "" + + # Flag which indicates whether the module was loaded or not + __lazymodule_loaded = 0 + + # Locals dictionary where to register the module + __lazymodule_locals = None + + # Globals dictionary to use for the module import + __lazymodule_globals = None + + def __init__(self, name, locals, globals=None): + + """Create a LazyModule instance wrapping module name. + + The module will later on be registered in locals under the + given module name. + + globals is optional and defaults to locals. + + """ + self.__lazymodule_locals = locals + if globals is None: + globals = locals + self.__lazymodule_globals = globals + mainname = globals.get("__name__", "") + if mainname: + self.__name__ = mainname + "." + name + self.__lazymodule_name = name + else: + self.__name__ = self.__lazymodule_name = name + self.__lazymodule_init = 1 + + def __lazymodule_import(self): + + """Import the module now.""" + # Load and register module + local_name = self.__lazymodule_name # e.g. "toolbox" + full_name = self.__name__ # e.g. "nltk.toolbox" + if self.__lazymodule_loaded: + return self.__lazymodule_locals[local_name] + if _debug: + print("LazyModule: Loading module %r" % full_name) + self.__lazymodule_locals[local_name] = module = __import__( + full_name, self.__lazymodule_locals, self.__lazymodule_globals, "*" + ) + + # Fill namespace with all symbols from original module to + # provide faster access. 
+ self.__dict__.update(module.__dict__) + + # Set import flag + self.__dict__["__lazymodule_loaded"] = 1 + + if _debug: + print("LazyModule: Module %r loaded" % full_name) + return module + + def __getattr__(self, name): + + """Import the module on demand and get the attribute.""" + if self.__lazymodule_loaded: + raise AttributeError(name) + if _debug: + print( + "LazyModule: " + "Module load triggered by attribute %r read access" % name + ) + module = self.__lazymodule_import() + return getattr(module, name) + + def __setattr__(self, name, value): + + """Import the module on demand and set the attribute.""" + if not self.__lazymodule_init: + self.__dict__[name] = value + return + if self.__lazymodule_loaded: + self.__lazymodule_locals[self.__lazymodule_name] = value + self.__dict__[name] = value + return + if _debug: + print( + "LazyModule: " + "Module load triggered by attribute %r write access" % name + ) + module = self.__lazymodule_import() + setattr(module, name, value) + + def __repr__(self): + return "" % self.__name__ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/api.py new file mode 100644 index 0000000000000000000000000000000000000000..d37669accdb522d314d6c9045217cdfb1026f6a3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/api.py @@ -0,0 +1,235 @@ +# Natural Language Toolkit: Language Models +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: Ilia Kurenkov +# URL: +# For license information, see LICENSE.TXT +"""Language Model Interface.""" + +import random +import warnings +from abc import ABCMeta, abstractmethod +from bisect import bisect +from itertools import accumulate + +from nltk.lm.counter import NgramCounter +from nltk.lm.util import log_base2 +from nltk.lm.vocabulary import Vocabulary + + +class Smoothing(metaclass=ABCMeta): + """Ngram Smoothing Interface + + Implements Chen & Goodman 1995's idea that all smoothing algorithms have + certain features in common. 
This should ideally allow smoothing algorithms to + work both with Backoff and Interpolation. + """ + + def __init__(self, vocabulary, counter): + """ + :param vocabulary: The Ngram vocabulary object. + :type vocabulary: nltk.lm.vocab.Vocabulary + :param counter: The counts of the vocabulary items. + :type counter: nltk.lm.counter.NgramCounter + """ + self.vocab = vocabulary + self.counts = counter + + @abstractmethod + def unigram_score(self, word): + raise NotImplementedError() + + @abstractmethod + def alpha_gamma(self, word, context): + raise NotImplementedError() + + +def _mean(items): + """Return average (aka mean) for sequence of items.""" + return sum(items) / len(items) + + +def _random_generator(seed_or_generator): + if isinstance(seed_or_generator, random.Random): + return seed_or_generator + return random.Random(seed_or_generator) + + +def _weighted_choice(population, weights, random_generator=None): + """Like random.choice, but with weights. + + Heavily inspired by python 3.6 `random.choices`. + """ + if not population: + raise ValueError("Can't choose from empty population") + if len(population) != len(weights): + raise ValueError("The number of weights does not match the population") + cum_weights = list(accumulate(weights)) + total = cum_weights[-1] + threshold = random_generator.random() + return population[bisect(cum_weights, total * threshold)] + + +class LanguageModel(metaclass=ABCMeta): + """ABC for Language Models. + + Cannot be directly instantiated itself. + + """ + + def __init__(self, order, vocabulary=None, counter=None): + """Creates new LanguageModel. + + :param vocabulary: If provided, this vocabulary will be used instead + of creating a new one when training. + :type vocabulary: `nltk.lm.Vocabulary` or None + :param counter: If provided, use this object to count ngrams. + :type counter: `nltk.lm.NgramCounter` or None + :param ngrams_fn: If given, defines how sentences in training text are turned to ngram + sequences. 
+ :type ngrams_fn: function or None + :param pad_fn: If given, defines how sentences in training text are padded. + :type pad_fn: function or None + """ + self.order = order + if vocabulary and not isinstance(vocabulary, Vocabulary): + warnings.warn( + f"The `vocabulary` argument passed to {self.__class__.__name__!r} " + "must be an instance of `nltk.lm.Vocabulary`.", + stacklevel=3, + ) + self.vocab = Vocabulary() if vocabulary is None else vocabulary + self.counts = NgramCounter() if counter is None else counter + + def fit(self, text, vocabulary_text=None): + """Trains the model on a text. + + :param text: Training text as a sequence of sentences. + + """ + if not self.vocab: + if vocabulary_text is None: + raise ValueError( + "Cannot fit without a vocabulary or text to create it from." + ) + self.vocab.update(vocabulary_text) + self.counts.update(self.vocab.lookup(sent) for sent in text) + + def score(self, word, context=None): + """Masks out of vocab (OOV) words and computes their model score. + + For model-specific logic of calculating scores, see the `unmasked_score` + method. + """ + return self.unmasked_score( + self.vocab.lookup(word), self.vocab.lookup(context) if context else None + ) + + @abstractmethod + def unmasked_score(self, word, context=None): + """Score a word given some optional context. + + Concrete models are expected to provide an implementation. + Note that this method does not mask its arguments with the OOV label. + Use the `score` method for that. + + :param str word: Word for which we want the score + :param tuple(str) context: Context the word is in. + If `None`, compute unigram score. + :param context: tuple(str) or None + :rtype: float + """ + raise NotImplementedError() + + def logscore(self, word, context=None): + """Evaluate the log score of this word in this context. + + The arguments are the same as for `score` and `unmasked_score`. 
+ + """ + return log_base2(self.score(word, context)) + + def context_counts(self, context): + """Helper method for retrieving counts for a given context. + + Assumes context has been checked and oov words in it masked. + :type context: tuple(str) or None + + """ + return ( + self.counts[len(context) + 1][context] if context else self.counts.unigrams + ) + + def entropy(self, text_ngrams): + """Calculate cross-entropy of model for given evaluation text. + + :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples. + :rtype: float + + """ + return -1 * _mean( + [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams] + ) + + def perplexity(self, text_ngrams): + """Calculates the perplexity of the given text. + + This is simply 2 ** cross-entropy for the text, so the arguments are the same. + + """ + return pow(2.0, self.entropy(text_ngrams)) + + def generate(self, num_words=1, text_seed=None, random_seed=None): + """Generate words from the model. + + :param int num_words: How many words to generate. By default 1. + :param text_seed: Generation can be conditioned on preceding context. + :param random_seed: A random seed or an instance of `random.Random`. If provided, + makes the random sampling part of generation reproducible. + :return: One (str) word or a list of words generated from model. + + Examples: + + >>> from nltk.lm import MLE + >>> lm = MLE(2) + >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c']) + >>> lm.fit([[("a",), ("b",), ("c",)]]) + >>> lm.generate(random_seed=3) + 'a' + >>> lm.generate(text_seed=['a']) + 'b' + + """ + text_seed = [] if text_seed is None else list(text_seed) + random_generator = _random_generator(random_seed) + # This is the base recursion case. 
def log_base2(score):
    """Convenience function for computing logarithms with base 2.

    :param score: Value to take the logarithm of.
    :return: ``NEG_INF`` when ``score`` equals zero, otherwise the base-2
        logarithm of ``score``.
    :rtype: float
    """
    if score == 0.0:
        return NEG_INF
    # ``math.log(x, 2)`` divides two rounded natural logarithms and can be
    # off by an ulp even for exact powers of two; ``math.log2`` is the
    # dedicated, more accurate routine. Imported locally because this module
    # only imports ``log`` from ``math`` at the top.
    from math import log2

    return log2(score)
@singledispatch
def _dispatched_lookup(words, vocab):
    """Fallback for argument types that have no registered lookup.

    :raises TypeError: always; only the registered overloads below perform
        an actual lookup.
    """
    raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}")


@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up every element of an iterable in the vocabulary.

    Each element is dispatched again, so nested iterables are handled
    recursively.

    :return: A tuple of the looked up elements.
    """
    looked_up = [_dispatched_lookup(entry, vocab) for entry in words]
    return tuple(looked_up)


@_dispatched_lookup.register(str)
def _string_lookup(word, vocab):
    """Look up a single word: itself if in ``vocab``, else ``vocab.unk_label``."""
    if word in vocab:
        return word
    return vocab.unk_label
+ + >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1) + >>> "b" in vocab2 + True + + The cutoff value influences not only membership checking but also the result of + getting the size of the vocabulary using the built-in `len`. + Note that while the number of keys in the vocabulary's counter stays the same, + the items in the vocabulary differ depending on the cutoff. + We use `sorted` to demonstrate because it keeps the order consistent. + + >>> sorted(vocab2.counts) + ['-', 'a', 'b', 'c', 'd', 'r'] + >>> sorted(vocab2) + ['-', '', 'a', 'b', 'c', 'd', 'r'] + >>> sorted(vocab.counts) + ['-', 'a', 'b', 'c', 'd', 'r'] + >>> sorted(vocab) + ['', 'a', 'c', 'd'] + + In addition to items it gets populated with, the vocabulary stores a special + token that stands in for so-called "unknown" items. By default it's "". + + >>> "" in vocab + True + + We can look up words in a vocabulary using its `lookup` method. + "Unseen" words (with counts less than cutoff) are looked up as the unknown label. + If given one word (a string) as an input, this method will return a string. + + >>> vocab.lookup("a") + 'a' + >>> vocab.lookup("aliens") + '' + + If given a sequence, it will return an tuple of the looked up words. + + >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c']) + ('', 'a', '', 'd', '', 'c') + + It's possible to update the counts after the vocabulary has been created. + In general, the interface is the same as that of `collections.Counter`. + + >>> vocab['b'] + 1 + >>> vocab.update(["b", "b", "c"]) + >>> vocab['b'] + 3 + """ + + def __init__(self, counts=None, unk_cutoff=1, unk_label=""): + """Create a new Vocabulary. + + :param counts: Optional iterable or `collections.Counter` instance to + pre-seed the Vocabulary. In case it is iterable, counts + are calculated. + :param int unk_cutoff: Words that occur less frequently than this value + are not considered part of the vocabulary. + :param unk_label: Label for marking words not part of vocabulary. 
+ + """ + self.unk_label = unk_label + if unk_cutoff < 1: + raise ValueError(f"Cutoff value cannot be less than 1. Got: {unk_cutoff}") + self._cutoff = unk_cutoff + + self.counts = Counter() + self.update(counts if counts is not None else "") + + @property + def cutoff(self): + """Cutoff value. + + Items with count below this value are not considered part of vocabulary. + + """ + return self._cutoff + + def update(self, *counter_args, **counter_kwargs): + """Update vocabulary counts. + + Wraps `collections.Counter.update` method. + + """ + self.counts.update(*counter_args, **counter_kwargs) + self._len = sum(1 for _ in self) + + def lookup(self, words): + """Look up one or more words in the vocabulary. + + If passed one word as a string will return that word or `self.unk_label`. + Otherwise will assume it was passed a sequence of words, will try to look + each of them up and return an iterator over the looked up words. + + :param words: Word(s) to look up. + :type words: Iterable(str) or str + :rtype: generator(str) or str + :raises: TypeError for types other than strings or iterables + + >>> from nltk.lm import Vocabulary + >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2) + >>> vocab.lookup("a") + 'a' + >>> vocab.lookup("aliens") + '' + >>> vocab.lookup(["a", "b", "c", ["x", "b"]]) + ('a', 'b', '', ('', 'b')) + + """ + return _dispatched_lookup(words, self) + + def __getitem__(self, item): + return self._cutoff if item == self.unk_label else self.counts[item] + + def __contains__(self, item): + """Only consider items with counts GE to cutoff as being in the + vocabulary.""" + return self[item] >= self.cutoff + + def __iter__(self): + """Building on membership check define how to iterate over + vocabulary.""" + return chain( + (item for item in self.counts if item in self), + [self.unk_label] if self.counts else [], + ) + + def __len__(self): + """Computing size of vocabulary reflects the cutoff.""" + return self._len + + def __eq__(self, other): + 
return ( + self.unk_label == other.unk_label + and self.cutoff == other.cutoff + and self.counts == other.counts + ) + + def __str__(self): + return "<{} with cutoff={} unk_label='{}' and {} items>".format( + self.__class__.__name__, self.cutoff, self.unk_label, len(self) + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/segmentation.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..50c7932e27769c2793714c2625df69413e331120 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/segmentation.py @@ -0,0 +1,222 @@ +# Natural Language Toolkit: Text Segmentation Metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# David Doukhan +# URL: +# For license information, see LICENSE.TXT + + +""" +Text Segmentation Metrics + +1. Windowdiff + +Pevzner, L., and Hearst, M., A Critique and Improvement of + an Evaluation Metric for Text Segmentation, + Computational Linguistics 28, 19-36 + + +2. Generalized Hamming Distance + +Bookstein A., Kulyukin V.A., Raita T. +Generalized Hamming Distance +Information Retrieval 5, 2002, pp 353-375 + +Baseline implementation in C++ +http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html + +Study describing benefits of Generalized Hamming Distance Versus +WindowDiff for evaluating text segmentation tasks +Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ? +TALN 2009 + + +3. Pk text segmentation metric + +Beeferman D., Berger A., Lafferty J. (1999) +Statistical Models for Text Segmentation +Machine Learning, 34, 177-210 +""" + +try: + import numpy as np +except ImportError: + pass + + +def windowdiff(seg1, seg2, k, boundary="1", weighted=False): + """ + Compute the windowdiff score for a pair of segmentations. A + segmentation is any sequence over a vocabulary of two items + (e.g. "0", "1"), where the specified boundary value is used to + mark the edge of a segmentation. 
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
    """Fill the cost matrix ``mat`` in place, one cell per boundary pair.

    For the ``r``-th value of ``rowv`` and the ``c``-th value of ``colv``,
    cell ``mat[r + 1, c + 1]`` receives the cheaper of a shift (diagonal
    cell plus ``shift_cost_coeff`` times the distance between the two
    values) and an alignment through a match, a deletion or an insertion.
    Row 0 and column 0 of ``mat`` are read but never written here.

    :param mat: Matrix indexed as ``mat[i, j]``, modified in place.
    :param rowv: Values enumerated along the rows.
    :param colv: Values enumerated along the columns.
    :param ins_cost: Cost added when the row value is smaller.
    :param del_cost: Cost added when the row value is larger.
    :param shift_cost_coeff: Factor applied to ``abs(row - col)``.
    """
    for r, row_pos in enumerate(rowv):
        for c, col_pos in enumerate(colv):
            diagonal = mat[r, c]
            if row_pos == col_pos:
                # boundaries are at the same location, no transformation required
                edit_cost = diagonal
            elif row_pos > col_pos:
                # boundary match through a deletion
                edit_cost = del_cost + mat[r, c + 1]
            else:
                # boundary match through an insertion
                edit_cost = ins_cost + mat[r + 1, c]
            shift_cost = shift_cost_coeff * abs(row_pos - col_pos) + diagonal
            mat[r + 1, c + 1] = min(edit_cost, shift_cost)
segmentation into the reference segmentation + through boundary insertion, deletion and shift operations. + + A segmentation is any sequence over a vocabulary of two items + (e.g. "0", "1"), where the specified boundary value is used to + mark the edge of a segmentation. + + Recommended parameter values are a shift_cost_coeff of 2. + Associated with a ins_cost, and del_cost equal to the mean segment + length in the reference segmentation. + + >>> # Same examples as Kulyukin C++ implementation + >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) + 0.5 + >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) + 2.0 + >>> ghd('011', '110', 1.0, 1.0, 0.5) + 1.0 + >>> ghd('1', '0', 1.0, 1.0, 0.5) + 1.0 + >>> ghd('111', '000', 1.0, 1.0, 0.5) + 3.0 + >>> ghd('000', '111', 1.0, 2.0, 0.5) + 6.0 + + :param ref: the reference segmentation + :type ref: str or list + :param hyp: the hypothetical segmentation + :type hyp: str or list + :param ins_cost: insertion cost + :type ins_cost: float + :param del_cost: deletion cost + :type del_cost: float + :param shift_cost_coeff: constant used to compute the cost of a shift. 
def pk(ref, hyp, k=None, boundary="1"):
    """
    Compute the Pk metric for a pair of segmentations. A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    A reference without any boundary is treated as a single segment when
    ``k`` has to be derived from it:

    >>> pk('0000', '0000')
    0.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type k: int or None
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    :raises ValueError: if the window is wider than the reference, which
        would leave no window to evaluate
    """

    if k is None:
        # Guard against a boundary-free reference: counting at least one
        # boundary avoids the ZeroDivisionError the bare division raised.
        n_boundaries = max(ref.count(boundary), 1)
        k = int(round(len(ref) / (n_boundaries * 2.0)))

    n_windows = len(ref) - k + 1
    if n_windows < 1:
        # Previously this surfaced as ZeroDivisionError or a negative score.
        raise ValueError(
            "Window width k should be smaller or equal than segmentation lengths"
        )

    err = 0
    for i in range(n_windows):
        # A window is "split" when it contains at least one boundary; an
        # error is counted whenever reference and hypothesis disagree.
        r = ref[i : i + k].count(boundary) > 0
        h = hyp[i : i + k].count(boundary) > 0
        if r != h:
            err += 1
    return err / float(n_windows)
def ranks_from_scores(scores, rank_gap=1e-15):
    """Yield ``(key, rank)`` pairs for a sequence of ``(key, score)`` tuples.

    The rank of an item is its position in ``scores``, unless its score lies
    within ``rank_gap`` of the previous item's score, in which case it keeps
    (ties with) the previous item's rank. Suitable for use as an argument to
    ``spearman_correlation``.

    :param scores: Iterable of ``(key, score)`` tuples.
    :param rank_gap: Score difference that must be exceeded for a new rank.
    """
    current_rank = 0
    last_score = None
    for position, pair in enumerate(scores):
        key, score = pair
        try:
            gap_exceeded = abs(score - last_score) > rank_gap
        except TypeError:
            # Raised on the first item (no previous score yet) and for any
            # scores that cannot be subtracted; the rank is left unchanged.
            gap_exceeded = False
        if gap_exceeded:
            current_rank = position
        yield key, current_rank
        last_score = score
Each edge represents a hypothesis about the tree +structure for a subsequence of the text. The chart is a +"blackboard" for composing and combining these hypotheses. + +When a chart parser begins parsing a text, it creates a new (empty) +chart, spanning the text. It then incrementally adds new edges to the +chart. A set of "chart rules" specifies the conditions under which +new edges should be added to the chart. Once the chart reaches a +stage where none of the chart rules adds any new edges, parsing is +complete. + +Charts are encoded with the ``Chart`` class, and edges are encoded with +the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module +defines three chart parsers: + + - ``ChartParser`` is a simple and flexible chart parser. Given a + set of chart rules, it will apply those rules to the chart until + no more edges are added. + + - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can + be used to step through the parsing process. +""" + +import itertools +import re +import warnings +from functools import total_ordering + +from nltk.grammar import PCFG, is_nonterminal, is_terminal +from nltk.internals import raise_unorderable_types +from nltk.parse.api import ParserI +from nltk.tree import Tree +from nltk.util import OrderedDict + +######################################################################## +## Edges +######################################################################## + + +@total_ordering +class EdgeI: + """ + A hypothesis about the structure of part of a sentence. + Each edge records the fact that a structure is (partially) + consistent with the sentence. An edge contains: + + - A span, indicating what part of the sentence is + consistent with the hypothesized structure. + - A left-hand side, specifying what kind of structure is + hypothesized. + - A right-hand side, specifying the contents of the + hypothesized structure. 
+ - A dot position, indicating how much of the hypothesized + structure is consistent with the sentence. + + Every edge is either complete or incomplete: + + - An edge is complete if its structure is fully consistent + with the sentence. + - An edge is incomplete if its structure is partially + consistent with the sentence. For every incomplete edge, the + span specifies a possible prefix for the edge's structure. + + There are two kinds of edge: + + - A ``TreeEdge`` records which trees have been found to + be (partially) consistent with the text. + - A ``LeafEdge`` records the tokens occurring in the text. + + The ``EdgeI`` interface provides a common interface to both types + of edge, allowing chart parsers to treat them in a uniform manner. + """ + + def __init__(self): + if self.__class__ == EdgeI: + raise TypeError("Edge is an abstract interface") + + # //////////////////////////////////////////////////////////// + # Span + # //////////////////////////////////////////////////////////// + + def span(self): + """ + Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the + portion of the sentence that is consistent with this + edge's structure. + + :rtype: tuple(int, int) + """ + raise NotImplementedError() + + def start(self): + """ + Return the start index of this edge's span. + + :rtype: int + """ + raise NotImplementedError() + + def end(self): + """ + Return the end index of this edge's span. + + :rtype: int + """ + raise NotImplementedError() + + def length(self): + """ + Return the length of this edge's span. + + :rtype: int + """ + raise NotImplementedError() + + # //////////////////////////////////////////////////////////// + # Left Hand Side + # //////////////////////////////////////////////////////////// + + def lhs(self): + """ + Return this edge's left-hand side, which specifies what kind + of structure is hypothesized by this edge. + + :see: ``TreeEdge`` and ``LeafEdge`` for a description of + the left-hand side values for each edge type. 
+ """ + raise NotImplementedError() + + # //////////////////////////////////////////////////////////// + # Right Hand Side + # //////////////////////////////////////////////////////////// + + def rhs(self): + """ + Return this edge's right-hand side, which specifies + the content of the structure hypothesized by this edge. + + :see: ``TreeEdge`` and ``LeafEdge`` for a description of + the right-hand side values for each edge type. + """ + raise NotImplementedError() + + def dot(self): + """ + Return this edge's dot position, which indicates how much of + the hypothesized structure is consistent with the + sentence. In particular, ``self.rhs[:dot]`` is consistent + with ``tokens[self.start():self.end()]``. + + :rtype: int + """ + raise NotImplementedError() + + def nextsym(self): + """ + Return the element of this edge's right-hand side that + immediately follows its dot. + + :rtype: Nonterminal or terminal or None + """ + raise NotImplementedError() + + def is_complete(self): + """ + Return True if this edge's structure is fully consistent + with the text. + + :rtype: bool + """ + raise NotImplementedError() + + def is_incomplete(self): + """ + Return True if this edge's structure is partially consistent + with the text. 
class TreeEdge(EdgeI):
    """
    An edge that records the fact that a tree is (partially)
    consistent with the sentence.  A tree edge consists of:

    - A span, indicating what part of the sentence is
      consistent with the hypothesized tree.
    - A left-hand side, specifying the hypothesized tree's node
      value.
    - A right-hand side, specifying the hypothesized tree's
      children.  Each element of the right-hand side is either a
      terminal, specifying a token with that terminal as its leaf
      value; or a nonterminal, specifying a subtree with that
      nonterminal's symbol as its node value.
    - A dot position, indicating which children are consistent
      with part of the sentence.  In particular, if ``dot`` is the
      dot position, ``rhs`` is the right-hand side, ``(start,end)``
      is the span, and ``sentence`` is the list of tokens in the
      sentence, then ``tokens[start:end]`` can be spanned by the
      children specified by ``rhs[:dot]``.

    For more information about edges, see the ``EdgeI`` interface.
    """

    def __init__(self, span, lhs, rhs, dot=0):
        """
        Construct a new ``TreeEdge``.

        :type span: tuple(int, int)
        :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
            portion of the sentence that is consistent with the new
            edge's structure.
        :type lhs: Nonterminal
        :param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        :type rhs: list(Nonterminal and str)
        :param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.  Stored as a tuple.
        :type dot: int
        :param dot: The position of the new edge's dot.  This position
            specifies what prefix of the production's right hand side
            is consistent with the text.  In particular, if
            ``sentence`` is the list of tokens in the sentence, then
            ``tokens[span[0]:span[1]]`` can be spanned by the
            children specified by ``rhs[:dot]``.
        """
        self._span = span
        self._lhs = lhs
        rhs = tuple(rhs)
        self._rhs = rhs
        self._dot = dot
        # Key consumed by the comparison and hashing methods of ``EdgeI``.
        self._comparison_key = (span, lhs, rhs, dot)

    @staticmethod
    def from_production(production, index):
        """
        Return a new ``TreeEdge`` formed from the given production.
        The new edge's left-hand side and right-hand side will
        be taken from ``production``; its span will be
        ``(index,index)``; and its dot position will be ``0``.

        :rtype: TreeEdge
        """
        return TreeEdge(
            span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
        )

    def move_dot_forward(self, new_end):
        """
        Return a new ``TreeEdge`` formed from this edge.
        The new edge's dot position is increased by ``1``,
        and its end index will be replaced by ``new_end``.

        :param new_end: The new end index.
        :type new_end: int
        :rtype: TreeEdge
        """
        return TreeEdge(
            span=(self._span[0], new_end),
            lhs=self._lhs,
            rhs=self._rhs,
            dot=self._dot + 1,
        )

    # Accessors
    def lhs(self):
        return self._lhs

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        return self._span[1] - self._span[0]

    def rhs(self):
        return self._rhs

    def dot(self):
        return self._dot

    def is_complete(self):
        return self._dot == len(self._rhs)

    def is_incomplete(self):
        return self._dot != len(self._rhs)

    def nextsym(self):
        if self._dot >= len(self._rhs):
            return None
        else:
            return self._rhs[self._dot]

    # String representation
    def __str__(self):
        # Collect the pieces and join once; the previous implementation
        # accumulated into a local named ``str``, shadowing the builtin.
        pieces = [
            f"[{self._span[0]}:{self._span[1]}] ",
            "%-2r ->" % (self._lhs,),
        ]
        for i, symbol in enumerate(self._rhs):
            if i == self._dot:
                pieces.append(" *")
            pieces.append(" %s" % repr(symbol))
        # A complete edge has its dot after the last right-hand side symbol.
        if len(self._rhs) == self._dot:
            pieces.append(" *")
        return "".join(pieces)

    def __repr__(self):
        return "[Edge: %s]" % self
+ """ + self._leaf = leaf + self._index = index + self._comparison_key = (leaf, index) + + # Accessors + def lhs(self): + return self._leaf + + def span(self): + return (self._index, self._index + 1) + + def start(self): + return self._index + + def end(self): + return self._index + 1 + + def length(self): + return 1 + + def rhs(self): + return () + + def dot(self): + return 0 + + def is_complete(self): + return True + + def is_incomplete(self): + return False + + def nextsym(self): + return None + + # String representations + def __str__(self): + return f"[{self._index}:{self._index + 1}] {repr(self._leaf)}" + + def __repr__(self): + return "[Edge: %s]" % (self) + + +######################################################################## +## Chart +######################################################################## + + +class Chart: + """ + A blackboard for hypotheses about the syntactic constituents of a + sentence. A chart contains a set of edges, and each edge encodes + a single hypothesis about the structure of some portion of the + sentence. + + The ``select`` method can be used to select a specific collection + of edges. For example ``chart.select(is_complete=True, start=0)`` + yields all complete edges whose start indices are 0. To ensure + the efficiency of these selection operations, ``Chart`` dynamically + creates and maintains an index for each set of attributes that + have been selected on. + + In order to reconstruct the trees that are represented by an edge, + the chart associates each edge with a set of child pointer lists. + A child pointer list is a list of the edges that license an + edge's right-hand side. + + :ivar _tokens: The sentence that the chart covers. + :ivar _num_leaves: The number of tokens. + :ivar _edges: A list of the edges in the chart + :ivar _edge_to_cpls: A dictionary mapping each edge to a set + of child pointer lists that are associated with that edge. 
    def leaves(self):
        """
        Return the leaf values of each word in the chart's sentence.

        The value returned is the chart's own token sequence, which
        the constructor stores as a tuple (``tuple(tokens)``), so the
        result is a tuple rather than a list.

        :rtype: tuple(str)
        """
        return self._tokens
It is + not guaranteed that new edges which are added to the + chart before the iterator is exhausted will also be generated. + + :rtype: iter(EdgeI) + :see: ``edges``, ``select`` + """ + return iter(self._edges) + + # Iterating over the chart yields its edges. + __iter__ = iteredges + + def num_edges(self): + """ + Return the number of edges contained in this chart. + + :rtype: int + """ + return len(self._edge_to_cpls) + + def select(self, **restrictions): + """ + Return an iterator over the edges in this chart. Any + new edges that are added to the chart before the iterator + is exahusted will also be generated. ``restrictions`` + can be used to restrict the set of edges that will be + generated. + + :param span: Only generate edges ``e`` where ``e.span()==span`` + :param start: Only generate edges ``e`` where ``e.start()==start`` + :param end: Only generate edges ``e`` where ``e.end()==end`` + :param length: Only generate edges ``e`` where ``e.length()==length`` + :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs`` + :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs`` + :param nextsym: Only generate edges ``e`` where + ``e.nextsym()==nextsym`` + :param dot: Only generate edges ``e`` where ``e.dot()==dot`` + :param is_complete: Only generate edges ``e`` where + ``e.is_complete()==is_complete`` + :param is_incomplete: Only generate edges ``e`` where + ``e.is_incomplete()==is_incomplete`` + :rtype: iter(EdgeI) + """ + # If there are no restrictions, then return all edges. + if restrictions == {}: + return iter(self._edges) + + # Find the index corresponding to the given restrictions. + restr_keys = sorted(restrictions.keys()) + restr_keys = tuple(restr_keys) + + # If it doesn't exist, then create it. 
+ if restr_keys not in self._indexes: + self._add_index(restr_keys) + + vals = tuple(restrictions[key] for key in restr_keys) + return iter(self._indexes[restr_keys].get(vals, [])) + + def _add_index(self, restr_keys): + """ + A helper function for ``select``, which creates a new index for + a given set of attributes (aka restriction keys). + """ + # Make sure it's a valid index. + for key in restr_keys: + if not hasattr(EdgeI, key): + raise ValueError("Bad restriction: %s" % key) + + # Create the index. + index = self._indexes[restr_keys] = {} + + # Add all existing edges to the index. + for edge in self._edges: + vals = tuple(getattr(edge, key)() for key in restr_keys) + index.setdefault(vals, []).append(edge) + + def _register_with_indexes(self, edge): + """ + A helper function for ``insert``, which registers the new + edge with all existing indexes. + """ + for (restr_keys, index) in self._indexes.items(): + vals = tuple(getattr(edge, key)() for key in restr_keys) + index.setdefault(vals, []).append(edge) + + # //////////////////////////////////////////////////////////// + # Edge Insertion + # //////////////////////////////////////////////////////////// + + def insert_with_backpointer(self, new_edge, previous_edge, child_edge): + """ + Add a new edge to the chart, using a pointer to the previous edge. + """ + cpls = self.child_pointer_lists(previous_edge) + new_cpls = [cpl + (child_edge,) for cpl in cpls] + return self.insert(new_edge, *new_cpls) + + def insert(self, edge, *child_pointer_lists): + """ + Add a new edge to the chart, and return True if this operation + modified the chart. In particular, return true iff the chart + did not already contain ``edge``, or if it did not already associate + ``child_pointer_lists`` with ``edge``. + + :type edge: EdgeI + :param edge: The new edge + :type child_pointer_lists: sequence of tuple(EdgeI) + :param child_pointer_lists: A sequence of lists of the edges that + were used to form this edge. 
This list is used to reconstruct + the trees (or partial trees) that are associated with ``edge``. + :rtype: bool + """ + # Is it a new edge? + if edge not in self._edge_to_cpls: + # Add it to the list of edges. + self._append_edge(edge) + # Register with indexes. + self._register_with_indexes(edge) + + # Get the set of child pointer lists for this edge. + cpls = self._edge_to_cpls.setdefault(edge, OrderedDict()) + chart_was_modified = False + for child_pointer_list in child_pointer_lists: + child_pointer_list = tuple(child_pointer_list) + if child_pointer_list not in cpls: + # It's a new CPL; register it, and return true. + cpls[child_pointer_list] = True + chart_was_modified = True + return chart_was_modified + + def _append_edge(self, edge): + self._edges.append(edge) + + # //////////////////////////////////////////////////////////// + # Tree extraction & child pointer lists + # //////////////////////////////////////////////////////////// + + def parses(self, root, tree_class=Tree): + """ + Return an iterator of the complete tree structures that span + the entire chart, and whose root node is ``root``. + """ + for edge in self.select(start=0, end=self._num_leaves, lhs=root): + yield from self.trees(edge, tree_class=tree_class, complete=True) + + def trees(self, edge, tree_class=Tree, complete=False): + """ + Return an iterator of the tree structures that are associated + with ``edge``. + + If ``edge`` is incomplete, then the unexpanded children will be + encoded as childless subtrees, whose node value is the + corresponding terminal or nonterminal. + + :rtype: list(Tree) + :note: If two trees share a common subtree, then the same + Tree may be used to encode that subtree in + both trees. If you need to eliminate this subtree + sharing, then create a deep copy of each tree. + """ + return iter(self._trees(edge, complete, memo={}, tree_class=tree_class)) + + def _trees(self, edge, complete, memo, tree_class): + """ + A helper function for ``trees``. 
+ + :param memo: A dictionary used to record the trees that we've + generated for each edge, so that when we see an edge more + than once, we can reuse the same trees. + """ + # If we've seen this edge before, then reuse our old answer. + if edge in memo: + return memo[edge] + + # when we're reading trees off the chart, don't use incomplete edges + if complete and edge.is_incomplete(): + return [] + + # Leaf edges. + if isinstance(edge, LeafEdge): + leaf = self._tokens[edge.start()] + memo[edge] = [leaf] + return [leaf] + + # Until we're done computing the trees for edge, set + # memo[edge] to be empty. This has the effect of filtering + # out any cyclic trees (i.e., trees that contain themselves as + # descendants), because if we reach this edge via a cycle, + # then it will appear that the edge doesn't generate any trees. + memo[edge] = [] + trees = [] + lhs = edge.lhs().symbol() + + # Each child pointer list can be used to form trees. + for cpl in self.child_pointer_lists(edge): + # Get the set of child choices for each child pointer. + # child_choices[i] is the set of choices for the tree's + # ith child. + child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] + + # For each combination of children, add a tree. + for children in itertools.product(*child_choices): + trees.append(tree_class(lhs, children)) + + # If the edge is incomplete, then extend it with "partial trees": + if edge.is_incomplete(): + unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]] + for tree in trees: + tree.extend(unexpanded) + + # Update the memoization dictionary. + memo[edge] = trees + + # Return the list of trees. + return trees + + def child_pointer_lists(self, edge): + """ + Return the set of child pointer lists for the given edge. + Each child pointer list is a list of edges that have + been used to form this edge. + + :rtype: list(list(EdgeI)) + """ + # Make a copy, in case they modify it. 
+ return self._edge_to_cpls.get(edge, {}).keys() + + # //////////////////////////////////////////////////////////// + # Display + # //////////////////////////////////////////////////////////// + def pretty_format_edge(self, edge, width=None): + """ + Return a pretty-printed string representation of a given edge + in this chart. + + :rtype: str + :param width: The number of characters allotted to each + index in the sentence. + """ + if width is None: + width = 50 // (self.num_leaves() + 1) + (start, end) = (edge.start(), edge.end()) + + str = "|" + ("." + " " * (width - 1)) * start + + # Zero-width edges are "#" if complete, ">" if incomplete + if start == end: + if edge.is_complete(): + str += "#" + else: + str += ">" + + # Spanning complete edges are "[===]"; Other edges are + # "[---]" if complete, "[--->" if incomplete + elif edge.is_complete() and edge.span() == (0, self._num_leaves): + str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]" + elif edge.is_complete(): + str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]" + else: + str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">" + + str += (" " * (width - 1) + ".") * (self._num_leaves - end) + return str + "| %s" % edge + + def pretty_format_leaves(self, width=None): + """ + Return a pretty-printed string representation of this + chart's leaves. This string can be used as a header + for calls to ``pretty_format_edge``. + """ + if width is None: + width = 50 // (self.num_leaves() + 1) + + if self._tokens is not None and width > 1: + header = "|." + for tok in self._tokens: + header += tok[: width - 1].center(width - 1) + "." + header += "|" + else: + header = "" + + return header + + def pretty_format(self, width=None): + """ + Return a pretty-printed string representation of this chart. + + :param width: The number of characters allotted to each + index in the sentence. 
+ :rtype: str + """ + if width is None: + width = 50 // (self.num_leaves() + 1) + # sort edges: primary key=length, secondary key=start index. + # (and filter out the token edges) + edges = sorted((e.length(), e.start(), e) for e in self) + edges = [e for (_, _, e) in edges] + + return ( + self.pretty_format_leaves(width) + + "\n" + + "\n".join(self.pretty_format_edge(edge, width) for edge in edges) + ) + + # //////////////////////////////////////////////////////////// + # Display: Dot (AT&T Graphviz) + # //////////////////////////////////////////////////////////// + + def dot_digraph(self): + # Header + s = "digraph nltk_chart {\n" + # s += ' size="5,5";\n' + s += " rankdir=LR;\n" + s += " node [height=0.1,width=0.1];\n" + s += ' node [style=filled, color="lightgray"];\n' + + # Set up the nodes + for y in range(self.num_edges(), -1, -1): + if y == 0: + s += ' node [style=filled, color="black"];\n' + for x in range(self.num_leaves() + 1): + if y == 0 or ( + x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() + ): + s += ' %04d.%04d [label=""];\n' % (x, y) + + # Add a spacer + s += " x [style=invis]; x->0000.0000 [style=invis];\n" + + # Declare ranks. 
class ChartRuleI:
    """
    A rule that specifies what new edges are licensed by any given set
    of existing edges.  Each chart rule expects a fixed number of
    edges, as indicated by the class variable ``NUM_EDGES``.  In
    particular:

    - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
      licensed, regardless of existing edges.
    - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
      licensed by a single existing edge.
    - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
      licensed by a pair of existing edges.

    This interface does not itself assign ``NUM_EDGES``, and both of
    its methods raise ``NotImplementedError``; concrete rules are
    expected to supply all three.

    :type NUM_EDGES: int
    :cvar NUM_EDGES: The number of existing edges that this rule uses
        to license new edges.  Typically, this number ranges from zero
        to two.
    """

    def apply(self, chart, grammar, *edges):
        """
        Return a generator that will add edges licensed by this rule
        and the given edges to the chart, one at a time.  Each
        time the generator is resumed, it will either add a new
        edge and yield that edge; or return.

        :param chart: The chart that licensed edges are added to.
        :param grammar: The grammar made available to the rule.
        :type edges: list(EdgeI)
        :param edges: A set of existing edges.  The number of edges
            that should be passed to ``apply()`` is specified by the
            ``NUM_EDGES`` class variable.
        :rtype: iter(EdgeI)
        :raise NotImplementedError: Always, in this interface class.
        """
        raise NotImplementedError()

    def apply_everywhere(self, chart, grammar):
        """
        Return a generator that will add all edges licensed by
        this rule, given the edges that are currently in the
        chart, one at a time.  Each time the generator is resumed,
        it will either add a new edge and yield that edge; or return.

        :param chart: The chart that licensed edges are added to.
        :param grammar: The grammar made available to the rule.
        :rtype: iter(EdgeI)
        :raise NotImplementedError: Always, in this interface class.
        """
        raise NotImplementedError()
class FundamentalRule(AbstractChartRule):
    r"""
    A rule that joins two adjacent edges to form a single combined
    edge.  In particular, this rule specifies that any pair of edges

    - ``[A -> alpha \* B beta][i:j]``
    - ``[B -> gamma \*][j:k]``

    licenses the edge:

    - ``[A -> alpha B * beta][i:k]``

    The combined edge starts where the left edge starts and ends
    where the right edge ends.
    """

    NUM_EDGES = 2

    def apply(self, chart, grammar, left_edge, right_edge):
        # The rule only fires for an incomplete left edge followed
        # immediately by a complete right edge whose left-hand side
        # is the symbol the left edge is waiting for.
        if not left_edge.is_incomplete():
            return
        if not right_edge.is_complete():
            return
        if not (left_edge.end() == right_edge.start()):
            return
        if not (left_edge.nextsym() == right_edge.lhs()):
            return

        # Advance the left edge's dot over the right edge.
        combined = left_edge.move_dot_forward(right_edge.end())

        # Report the edge only if inserting it modified the chart.
        modified = chart.insert_with_backpointer(combined, left_edge, right_edge)
        if modified:
            yield combined
class LeafInitRule(AbstractChartRule):
    """
    A rule that takes no existing edges (``NUM_EDGES = 0``) and
    inserts one ``LeafEdge`` for each token of the chart's sentence.
    """

    NUM_EDGES = 0

    def apply(self, chart, grammar):
        for position in range(chart.num_leaves()):
            leaf_edge = LeafEdge(chart.leaf(position), position)
            # Only report edges whose insertion modified the chart.
            inserted = chart.insert(leaf_edge, ())
            if not inserted:
                continue
            yield leaf_edge
class TopDownPredictRule(AbstractChartRule):
    r"""
    A rule that predicts new edges from the symbol following an
    incomplete edge's dot.  In particular, the edge
    ``[A -> alpha \* B beta][i:j]`` licenses the edge
    ``[B -> \* gamma][j:j]`` for each grammar production
    ``B -> gamma``.

    :note: This rule corresponds to the Predictor Rule in Earley parsing.
    """

    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        # A complete edge predicts nothing.
        if edge.is_complete():
            return
        for production in grammar.productions(lhs=edge.nextsym()):
            predicted = TreeEdge.from_production(production, edge.end())
            # Only report edges whose insertion modified the chart.
            if not chart.insert(predicted, ()):
                continue
            yield predicted
class BottomUpPredictRule(AbstractChartRule):
    r"""
    A rule that predicts, from a complete edge, every production
    whose right-hand side begins with that edge's left-hand side.
    In particular, ``[A -> alpha \*]`` licenses the edge
    ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
    The predicted edge is created at the complete edge's start index.
    """

    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        # Only complete edges trigger bottom-up prediction.
        if edge.is_incomplete():
            return
        for production in grammar.productions(rhs=edge.lhs()):
            predicted = TreeEdge.from_production(production, edge.start())
            # Only report edges whose insertion modified the chart.
            if not chart.insert(predicted, ()):
                continue
            yield predicted
class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
    """
    A variant of ``SingleEdgeFundamentalRule`` that combines a pair of
    edges only when ``_bottomup_filter`` accepts the token that follows
    the right-hand edge.  When the right-hand edge ends at or beyond
    the last token, ``False`` is passed to the filter in place of a
    token.
    """

    def _apply_complete(self, chart, grammar, right_edge):
        # The lookahead token depends only on the (fixed) right edge,
        # so it is computed once, before looking for left edges.
        boundary = right_edge.end()
        lookahead = chart.leaf(boundary) if boundary < chart.num_leaves() else False
        candidates = chart.select(
            end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
        )
        for left_edge in candidates:
            if not _bottomup_filter(
                grammar, lookahead, left_edge.rhs(), left_edge.dot()
            ):
                continue
            combined = left_edge.move_dot_forward(right_edge.end())
            if chart.insert_with_backpointer(combined, left_edge, right_edge):
                yield combined

    def _apply_incomplete(self, chart, grammar, left_edge):
        candidates = chart.select(
            start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
        )
        for right_edge in candidates:
            # Here the right edge varies, so the lookahead token is
            # recomputed for each candidate.
            boundary = right_edge.end()
            lookahead = (
                chart.leaf(boundary) if boundary < chart.num_leaves() else False
            )
            if not _bottomup_filter(
                grammar, lookahead, left_edge.rhs(), left_edge.dot()
            ):
                continue
            combined = left_edge.move_dot_forward(right_edge.end())
            if chart.insert_with_backpointer(combined, left_edge, right_edge):
                yield combined
def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
    """
    Decide whether ``nexttoken`` is compatible with the symbol at
    position ``dot + 1`` of ``rhs``.

    :param grammar: Consulted via ``is_leftcorner`` when the symbol
        at ``dot + 1`` is not a terminal.
    :param nexttoken: The token to test against that symbol.
    :param rhs: A right-hand side (a sequence of symbols).
    :param dot: A position within ``rhs``.
    :return: True if ``rhs`` has no symbol at ``dot + 1``; otherwise
        whether a terminal symbol equals ``nexttoken``, or, for any
        other symbol, the result of
        ``grammar.is_leftcorner(symbol, nexttoken)``.
    """
    lookahead_position = dot + 1
    # Nothing left to check once we run past the end of ``rhs``.
    if lookahead_position >= len(rhs):
        return True
    upcoming = rhs[lookahead_position]
    if not is_terminal(upcoming):
        return grammar.is_leftcorner(upcoming, nexttoken)
    return nexttoken == upcoming
+ :type strategy: list(ChartRuleI) + :param strategy: A list of rules that should be used to decide + what edges to add to the chart (top-down strategy by default). + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + :type trace_chart_width: int + :param trace_chart_width: The default total width reserved for + the chart in trace output. The remainder of each line will + be used to display edges. + :type use_agenda: bool + :param use_agenda: Use an optimized agenda-based algorithm, + if possible. + :param chart_class: The class that should be used to create + the parse charts. + """ + self._grammar = grammar + self._strategy = strategy + self._trace = trace + self._trace_chart_width = trace_chart_width + # If the strategy only consists of axioms (NUM_EDGES==0) and + # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: + self._use_agenda = use_agenda + self._chart_class = chart_class + + self._axioms = [] + self._inference_rules = [] + for rule in strategy: + if rule.NUM_EDGES == 0: + self._axioms.append(rule) + elif rule.NUM_EDGES == 1: + self._inference_rules.append(rule) + else: + self._use_agenda = False + + def grammar(self): + return self._grammar + + def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width): + if not trace: + return + print_rule_header = trace > 1 + for edge in new_edges: + if print_rule_header: + print("%s:" % rule) + print_rule_header = False + print(chart.pretty_format_edge(edge, edge_width)) + + def chart_parse(self, tokens, trace=None): + """ + Return the final parse ``Chart`` from which all possible + parse trees can be extracted. 
+ + :param tokens: The sentence to be parsed + :type tokens: list(str) + :rtype: Chart + """ + if trace is None: + trace = self._trace + trace_new_edges = self._trace_new_edges + + tokens = list(tokens) + self._grammar.check_coverage(tokens) + chart = self._chart_class(tokens) + grammar = self._grammar + + # Width, for printing trace edges. + trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) + if trace: + print(chart.pretty_format_leaves(trace_edge_width)) + + if self._use_agenda: + # Use an agenda-based algorithm. + for axiom in self._axioms: + new_edges = list(axiom.apply(chart, grammar)) + trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) + + inference_rules = self._inference_rules + agenda = chart.edges() + # We reverse the initial agenda, since it is a stack + # but chart.edges() functions as a queue. + agenda.reverse() + while agenda: + edge = agenda.pop() + for rule in inference_rules: + new_edges = list(rule.apply(chart, grammar, edge)) + if trace: + trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) + agenda += new_edges + + else: + # Do not use an agenda-based algorithm. + edges_added = True + while edges_added: + edges_added = False + for rule in self._strategy: + new_edges = list(rule.apply_everywhere(chart, grammar)) + edges_added = len(new_edges) + trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) + + # Return the final chart. + return chart + + def parse(self, tokens, tree_class=Tree): + chart = self.chart_parse(tokens) + return iter(chart.parses(self._grammar.start(), tree_class=tree_class)) + + +class TopDownChartParser(ChartParser): + """ + A ``ChartParser`` using a top-down parsing strategy. + See ``ChartParser`` for more information. + """ + + def __init__(self, grammar, **parser_args): + ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args) + + +class BottomUpChartParser(ChartParser): + """ + A ``ChartParser`` using a bottom-up parsing strategy. 
class LeftCornerChartParser(ChartParser):
    """
    A ``ChartParser`` that uses the ``LC_STRATEGY`` rule list.
    Construction is refused for grammars whose ``is_nonempty()``
    check fails, which the error message describes as grammars
    with empty productions.
    See ``ChartParser`` for more information.
    """

    def __init__(self, grammar, **parser_args):
        has_empty_productions = not grammar.is_nonempty()
        if has_empty_productions:
            message = (
                "LeftCornerParser only works for grammars "
                "without empty productions."
            )
            raise ValueError(message)
        ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
+ """ + + def __init__(self, grammar, strategy=[], trace=0): + self._chart = None + self._current_chartrule = None + self._restart = False + ChartParser.__init__(self, grammar, strategy, trace) + + # //////////////////////////////////////////////////////////// + # Initialization + # //////////////////////////////////////////////////////////// + + def initialize(self, tokens): + "Begin parsing the given tokens." + self._chart = Chart(list(tokens)) + self._restart = True + + # //////////////////////////////////////////////////////////// + # Stepping + # //////////////////////////////////////////////////////////// + + def step(self): + """ + Return a generator that adds edges to the chart, one at a + time. Each time the generator is resumed, it adds a single + edge and yields that edge. If no more edges can be added, + then it yields None. + + If the parser's strategy, grammar, or chart is changed, then + the generator will continue adding edges using the new + strategy, grammar, or chart. + + Note that this generator never terminates, since the grammar + or strategy might be changed to values that would add new + edges. Instead, it yields None when no more edges can be + added with the current strategy and grammar. + """ + if self._chart is None: + raise ValueError("Parser must be initialized first") + while True: + self._restart = False + w = 50 // (self._chart.num_leaves() + 1) + + for e in self._parse(): + if self._trace > 1: + print(self._current_chartrule) + if self._trace > 0: + print(self._chart.pretty_format_edge(e, w)) + yield e + if self._restart: + break + else: + yield None # No more edges. + + def _parse(self): + """ + A generator that implements the actual parsing algorithm. + ``step`` iterates through this generator, and restarts it + whenever the parser's strategy, grammar, or chart is modified. 
+ """ + chart = self._chart + grammar = self._grammar + edges_added = 1 + while edges_added > 0: + edges_added = 0 + for rule in self._strategy: + self._current_chartrule = rule + for e in rule.apply_everywhere(chart, grammar): + edges_added += 1 + yield e + + # //////////////////////////////////////////////////////////// + # Accessors + # //////////////////////////////////////////////////////////// + + def strategy(self): + "Return the strategy used by this parser." + return self._strategy + + def grammar(self): + "Return the grammar used by this parser." + return self._grammar + + def chart(self): + "Return the chart that is used by this parser." + return self._chart + + def current_chartrule(self): + "Return the chart rule used to generate the most recent edge." + return self._current_chartrule + + def parses(self, tree_class=Tree): + "Return the parse trees currently contained in the chart." + return self._chart.parses(self._grammar.start(), tree_class) + + # //////////////////////////////////////////////////////////// + # Parser modification + # //////////////////////////////////////////////////////////// + + def set_strategy(self, strategy): + """ + Change the strategy that the parser uses to decide which edges + to add to the chart. + + :type strategy: list(ChartRuleI) + :param strategy: A list of rules that should be used to decide + what edges to add to the chart. + """ + if strategy == self._strategy: + return + self._strategy = strategy[:] # Make a copy. + self._restart = True + + def set_grammar(self, grammar): + "Change the grammar used by the parser." + if grammar is self._grammar: + return + self._grammar = grammar + self._restart = True + + def set_chart(self, chart): + "Load a given chart into the chart parser." 
+ if chart is self._chart: + return + self._chart = chart + self._restart = True + + # //////////////////////////////////////////////////////////// + # Standard parser methods + # //////////////////////////////////////////////////////////// + + def parse(self, tokens, tree_class=Tree): + tokens = list(tokens) + self._grammar.check_coverage(tokens) + + # Initialize ourselves. + self.initialize(tokens) + + # Step until no more edges are generated. + for e in self.step(): + if e is None: + break + + # Return an iterator of complete parses. + return self.parses(tree_class=tree_class) + + +######################################################################## +## Demo Code +######################################################################## + + +def demo_grammar(): + from nltk.grammar import CFG + + return CFG.fromstring( + """ +S -> NP VP +PP -> "with" NP +NP -> NP PP +VP -> VP PP +VP -> Verb NP +VP -> Verb +NP -> Det Noun +NP -> "John" +NP -> "I" +Det -> "the" +Det -> "my" +Det -> "a" +Noun -> "dog" +Noun -> "cookie" +Verb -> "ate" +Verb -> "saw" +Prep -> "with" +Prep -> "under" +""" + ) + + +def demo( + choice=None, + print_times=True, + print_grammar=False, + print_trees=True, + trace=2, + sent="I saw John with a dog with my cookie", + numparses=5, +): + """ + A demonstration of the chart parsers. + """ + import sys + import time + + from nltk import CFG, Production, nonterminals + + # The grammar for ChartParser and SteppingChartParser: + grammar = demo_grammar() + if print_grammar: + print("* Grammar") + print(grammar) + + # Tokenize the sample sentence. 
+ print("* Sentence:") + print(sent) + tokens = sent.split() + print(tokens) + print() + + # Ask the user which parser to test, + # if the parser wasn't provided as an argument + if choice is None: + print(" 1: Top-down chart parser") + print(" 2: Bottom-up chart parser") + print(" 3: Bottom-up left-corner chart parser") + print(" 4: Left-corner chart parser with bottom-up filter") + print(" 5: Stepping chart parser (alternating top-down & bottom-up)") + print(" 6: All parsers") + print("\nWhich parser (1-6)? ", end=" ") + choice = sys.stdin.readline().strip() + print() + + choice = str(choice) + if choice not in "123456": + print("Bad parser number") + return + + # Keep track of how long each parser takes. + times = {} + + strategies = { + "1": ("Top-down", TD_STRATEGY), + "2": ("Bottom-up", BU_STRATEGY), + "3": ("Bottom-up left-corner", BU_LC_STRATEGY), + "4": ("Filtered left-corner", LC_STRATEGY), + } + choices = [] + if choice in strategies: + choices = [choice] + if choice == "6": + choices = "1234" + + # Run the requested chart parser(s), except the stepping parser. + for strategy in choices: + print("* Strategy: " + strategies[strategy][0]) + print() + cp = ChartParser(grammar, strategies[strategy][1], trace=trace) + t = time.time() + chart = cp.chart_parse(tokens) + parses = list(chart.parses(grammar.start())) + + times[strategies[strategy][0]] = time.time() - t + print("Nr edges in chart:", len(chart.edges())) + if numparses: + assert len(parses) == numparses, "Not all parses found" + if print_trees: + for tree in parses: + print(tree) + else: + print("Nr trees:", len(parses)) + print() + + # Run the stepping parser, if requested. 
+ if choice in "56": + print("* Strategy: Stepping (top-down vs bottom-up)") + print() + t = time.time() + cp = SteppingChartParser(grammar, trace=trace) + cp.initialize(tokens) + for i in range(5): + print("*** SWITCH TO TOP DOWN") + cp.set_strategy(TD_STRATEGY) + for j, e in enumerate(cp.step()): + if j > 20 or e is None: + break + print("*** SWITCH TO BOTTOM UP") + cp.set_strategy(BU_STRATEGY) + for j, e in enumerate(cp.step()): + if j > 20 or e is None: + break + times["Stepping"] = time.time() - t + print("Nr edges in chart:", len(cp.chart().edges())) + if numparses: + assert len(list(cp.parses())) == numparses, "Not all parses found" + if print_trees: + for tree in cp.parses(): + print(tree) + else: + print("Nr trees:", len(list(cp.parses()))) + print() + + # Print the times of all parsers: + if not (print_times and times): + return + print("* Parsing times") + print() + maxlen = max(len(key) for key in times) + format = "%" + repr(maxlen) + "s parser: %6.3fsec" + times_items = times.items() + for (parser, t) in sorted(times_items, key=lambda a: a[1]): + print(format % (parser, t)) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/dependencygraph.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/dependencygraph.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a7c030bee2d4f24974ac0a12decef8d20e3044 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/dependencygraph.py @@ -0,0 +1,799 @@ +# Natural Language Toolkit: Dependency Grammars +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Jason Narad +# Steven Bird (modifications) +# +# URL: +# For license information, see LICENSE.TXT +# + +""" +Tools for reading and writing dependency trees. +The input is assumed to be in Malt-TAB format +(https://stp.lingfil.uu.se/~nivre/research/MaltXML.html). 
+""" + +import subprocess +import warnings +from collections import defaultdict +from itertools import chain +from pprint import pformat + +from nltk.internals import find_binary +from nltk.tree import Tree + +################################################################# +# DependencyGraph Class +################################################################# + + +class DependencyGraph: + """ + A container for the nodes and labelled edges of a dependency structure. + """ + + def __init__( + self, + tree_str=None, + cell_extractor=None, + zero_based=False, + cell_separator=None, + top_relation_label="ROOT", + ): + """Dependency graph. + + We place a dummy `TOP` node with the index 0, since the root node is + often assigned 0 as its head. This also means that the indexing of the + nodes corresponds directly to the Malt-TAB format, which starts at 1. + + If zero-based is True, then Malt-TAB-like input with node numbers + starting at 0 and the root node assigned -1 (as produced by, e.g., + zpar). + + :param str cell_separator: the cell separator. If not provided, cells + are split by whitespace. + + :param str top_relation_label: the label by which the top relation is + identified, for examlple, `ROOT`, `null` or `TOP`. + """ + self.nodes = defaultdict( + lambda: { + "address": None, + "word": None, + "lemma": None, + "ctag": None, + "tag": None, + "feats": None, + "head": None, + "deps": defaultdict(list), + "rel": None, + } + ) + + self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0}) + + self.root = None + + if tree_str: + self._parse( + tree_str, + cell_extractor=cell_extractor, + zero_based=zero_based, + cell_separator=cell_separator, + top_relation_label=top_relation_label, + ) + + def remove_by_address(self, address): + """ + Removes the node with the given address. References + to this node in others will still exist. 
+ """ + del self.nodes[address] + + def redirect_arcs(self, originals, redirect): + """ + Redirects arcs to any of the nodes in the originals list + to the redirect node address. + """ + for node in self.nodes.values(): + new_deps = [] + for dep in node["deps"]: + if dep in originals: + new_deps.append(redirect) + else: + new_deps.append(dep) + node["deps"] = new_deps + + def add_arc(self, head_address, mod_address): + """ + Adds an arc from the node specified by head_address to the + node specified by the mod address. + """ + relation = self.nodes[mod_address]["rel"] + self.nodes[head_address]["deps"].setdefault(relation, []) + self.nodes[head_address]["deps"][relation].append(mod_address) + # self.nodes[head_address]['deps'].append(mod_address) + + def connect_graph(self): + """ + Fully connects all non-root nodes. All nodes are set to be dependents + of the root node. + """ + for node1 in self.nodes.values(): + for node2 in self.nodes.values(): + if node1["address"] != node2["address"] and node2["rel"] != "TOP": + relation = node2["rel"] + node1["deps"].setdefault(relation, []) + node1["deps"][relation].append(node2["address"]) + # node1['deps'].append(node2['address']) + + def get_by_address(self, node_address): + """Return the node with the given address.""" + return self.nodes[node_address] + + def contains_address(self, node_address): + """ + Returns true if the graph contains a node with the given node + address, false otherwise. + """ + return node_address in self.nodes + + def to_dot(self): + """Return a dot representation suitable for using with Graphviz. + + >>> dg = DependencyGraph( + ... 'John N 2\\n' + ... 'loves V 0\\n' + ... 'Mary N 2' + ... 
) + >>> print(dg.to_dot()) + digraph G{ + edge [dir=forward] + node [shape=plaintext] + + 0 [label="0 (None)"] + 0 -> 2 [label="ROOT"] + 1 [label="1 (John)"] + 2 [label="2 (loves)"] + 2 -> 1 [label=""] + 2 -> 3 [label=""] + 3 [label="3 (Mary)"] + } + + """ + # Start the digraph specification + s = "digraph G{\n" + s += "edge [dir=forward]\n" + s += "node [shape=plaintext]\n" + + # Draw the remaining nodes + for node in sorted(self.nodes.values(), key=lambda v: v["address"]): + s += '\n{} [label="{} ({})"]'.format( + node["address"], + node["address"], + node["word"], + ) + for rel, deps in node["deps"].items(): + for dep in deps: + if rel is not None: + s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel) + else: + s += "\n{} -> {} ".format(node["address"], dep) + s += "\n}" + + return s + + def _repr_svg_(self): + """Show SVG representation of the transducer (IPython magic). + >>> from nltk.test.setup_fixt import check_binary + >>> check_binary('dot') + >>> dg = DependencyGraph( + ... 'John N 2\\n' + ... 'loves V 0\\n' + ... 'Mary N 2' + ... ) + >>> dg._repr_svg_().split('\\n')[0] + '' + + """ + dot_string = self.to_dot() + return dot2img(dot_string) + + def __str__(self): + return pformat(self.nodes) + + def __repr__(self): + return f"" + + @staticmethod + def load( + filename, zero_based=False, cell_separator=None, top_relation_label="ROOT" + ): + """ + :param filename: a name of a file in Malt-TAB format + :param zero_based: nodes in the input file are numbered starting from 0 + rather than 1 (as produced by, e.g., zpar) + :param str cell_separator: the cell separator. If not provided, cells + are split by whitespace. + :param str top_relation_label: the label by which the top relation is + identified, for examlple, `ROOT`, `null` or `TOP`. 
+ + :return: a list of DependencyGraphs + + """ + with open(filename) as infile: + return [ + DependencyGraph( + tree_str, + zero_based=zero_based, + cell_separator=cell_separator, + top_relation_label=top_relation_label, + ) + for tree_str in infile.read().split("\n\n") + ] + + def left_children(self, node_index): + """ + Returns the number of left children under the node specified + by the given address. + """ + children = chain.from_iterable(self.nodes[node_index]["deps"].values()) + index = self.nodes[node_index]["address"] + return sum(1 for c in children if c < index) + + def right_children(self, node_index): + """ + Returns the number of right children under the node specified + by the given address. + """ + children = chain.from_iterable(self.nodes[node_index]["deps"].values()) + index = self.nodes[node_index]["address"] + return sum(1 for c in children if c > index) + + def add_node(self, node): + if not self.contains_address(node["address"]): + self.nodes[node["address"]].update(node) + + def _parse( + self, + input_, + cell_extractor=None, + zero_based=False, + cell_separator=None, + top_relation_label="ROOT", + ): + """Parse a sentence. + + :param extractor: a function that given a tuple of cells returns a + 7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, + rel``. + + :param str cell_separator: the cell separator. If not provided, cells + are split by whitespace. + + :param str top_relation_label: the label by which the top relation is + identified, for examlple, `ROOT`, `null` or `TOP`. 
+ + """ + + def extract_3_cells(cells, index): + word, tag, head = cells + return index, word, word, tag, tag, "", head, "" + + def extract_4_cells(cells, index): + word, tag, head, rel = cells + return index, word, word, tag, tag, "", head, rel + + def extract_7_cells(cells, index): + line_index, word, lemma, tag, _, head, rel = cells + try: + index = int(line_index) + except ValueError: + # index can't be parsed as an integer, use default + pass + return index, word, lemma, tag, tag, "", head, rel + + def extract_10_cells(cells, index): + line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells + try: + index = int(line_index) + except ValueError: + # index can't be parsed as an integer, use default + pass + return index, word, lemma, ctag, tag, feats, head, rel + + extractors = { + 3: extract_3_cells, + 4: extract_4_cells, + 7: extract_7_cells, + 10: extract_10_cells, + } + + if isinstance(input_, str): + input_ = (line for line in input_.split("\n")) + + lines = (l.rstrip() for l in input_) + lines = (l for l in lines if l) + + cell_number = None + for index, line in enumerate(lines, start=1): + cells = line.split(cell_separator) + if cell_number is None: + cell_number = len(cells) + else: + assert cell_number == len(cells) + + if cell_extractor is None: + try: + cell_extractor = extractors[cell_number] + except KeyError as e: + raise ValueError( + "Number of tab-delimited fields ({}) not supported by " + "CoNLL(10) or Malt-Tab(4) format".format(cell_number) + ) from e + + try: + index, word, lemma, ctag, tag, feats, head, rel = cell_extractor( + cells, index + ) + except (TypeError, ValueError): + # cell_extractor doesn't take 2 arguments or doesn't return 8 + # values; assume the cell_extractor is an older external + # extractor and doesn't accept or return an index. 
+ word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) + + if head == "_": + continue + + head = int(head) + if zero_based: + head += 1 + + self.nodes[index].update( + { + "address": index, + "word": word, + "lemma": lemma, + "ctag": ctag, + "tag": tag, + "feats": feats, + "head": head, + "rel": rel, + } + ) + + # Make sure that the fake root node has labeled dependencies. + if (cell_number == 3) and (head == 0): + rel = top_relation_label + self.nodes[head]["deps"][rel].append(index) + + if self.nodes[0]["deps"][top_relation_label]: + root_address = self.nodes[0]["deps"][top_relation_label][0] + self.root = self.nodes[root_address] + self.top_relation_label = top_relation_label + else: + warnings.warn( + "The graph doesn't contain a node " "that depends on the root element." + ) + + def _word(self, node, filter=True): + w = node["word"] + if filter: + if w != ",": + return w + return w + + def _tree(self, i): + """Turn dependency graphs into NLTK trees. + + :param int i: index of a node + :return: either a word (if the indexed node is a leaf) or a ``Tree``. + """ + node = self.get_by_address(i) + word = node["word"] + deps = sorted(chain.from_iterable(node["deps"].values())) + + if deps: + return Tree(word, [self._tree(dep) for dep in deps]) + else: + return word + + def tree(self): + """ + Starting with the ``root`` node, build a dependency tree using the NLTK + ``Tree`` constructor. Dependency labels are omitted. 
+ """ + node = self.root + + word = node["word"] + deps = sorted(chain.from_iterable(node["deps"].values())) + return Tree(word, [self._tree(dep) for dep in deps]) + + def triples(self, node=None): + """ + Extract dependency triples of the form: + ((head word, head tag), rel, (dep word, dep tag)) + """ + + if not node: + node = self.root + + head = (node["word"], node["ctag"]) + for i in sorted(chain.from_iterable(node["deps"].values())): + dep = self.get_by_address(i) + yield (head, dep["rel"], (dep["word"], dep["ctag"])) + yield from self.triples(node=dep) + + def _hd(self, i): + try: + return self.nodes[i]["head"] + except IndexError: + return None + + def _rel(self, i): + try: + return self.nodes[i]["rel"] + except IndexError: + return None + + # what's the return type? Boolean or list? + def contains_cycle(self): + """Check whether there are cycles. + + >>> dg = DependencyGraph(treebank_data) + >>> dg.contains_cycle() + False + + >>> cyclic_dg = DependencyGraph() + >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} + >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} + >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} + >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} + >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} + >>> cyclic_dg.nodes = { + ... 0: top, + ... 1: child1, + ... 2: child2, + ... 3: child3, + ... 4: child4, + ... 
} + >>> cyclic_dg.root = top + + >>> cyclic_dg.contains_cycle() + [1, 2, 4, 3] + + """ + distances = {} + + for node in self.nodes.values(): + for dep in node["deps"]: + key = tuple([node["address"], dep]) + distances[key] = 1 + + for _ in self.nodes: + new_entries = {} + + for pair1 in distances: + for pair2 in distances: + if pair1[1] == pair2[0]: + key = tuple([pair1[0], pair2[1]]) + new_entries[key] = distances[pair1] + distances[pair2] + + for pair in new_entries: + distances[pair] = new_entries[pair] + if pair[0] == pair[1]: + path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) + return path + + return False # return []? + + def get_cycle_path(self, curr_node, goal_node_index): + for dep in curr_node["deps"]: + if dep == goal_node_index: + return [curr_node["address"]] + for dep in curr_node["deps"]: + path = self.get_cycle_path(self.get_by_address(dep), goal_node_index) + if len(path) > 0: + path.insert(0, curr_node["address"]) + return path + return [] + + def to_conll(self, style): + """ + The dependency graph in CoNLL format. 
+ + :param style: the style to use for the format (3, 4, 10 columns) + :type style: int + :rtype: str + """ + + if style == 3: + template = "{word}\t{tag}\t{head}\n" + elif style == 4: + template = "{word}\t{tag}\t{head}\t{rel}\n" + elif style == 10: + template = ( + "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n" + ) + else: + raise ValueError( + "Number of tab-delimited fields ({}) not supported by " + "CoNLL(10) or Malt-Tab(4) format".format(style) + ) + + return "".join( + template.format(i=i, **node) + for i, node in sorted(self.nodes.items()) + if node["tag"] != "TOP" + ) + + def nx_graph(self): + """Convert the data in a ``nodelist`` into a networkx labeled directed graph.""" + import networkx + + nx_nodelist = list(range(1, len(self.nodes))) + nx_edgelist = [ + (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) + ] + self.nx_labels = {} + for n in nx_nodelist: + self.nx_labels[n] = self.nodes[n]["word"] + + g = networkx.MultiDiGraph() + g.add_nodes_from(nx_nodelist) + g.add_edges_from(nx_edgelist) + + return g + + +def dot2img(dot_string, t="svg"): + """ + Create image representation fom dot_string, using the 'dot' program + from the Graphviz package. + + Use the 't' argument to specify the image file format, for ex. 'jpeg', 'eps', + 'json', 'png' or 'webp' (Running 'dot -T:' lists all available formats). + + Note that the "capture_output" option of subprocess.run() is only available + with text formats (like svg), but not with binary image formats (like png). 
+ """ + + try: + find_binary("dot") + try: + if t in ["dot", "dot_json", "json", "svg"]: + proc = subprocess.run( + ["dot", "-T%s" % t], + capture_output=True, + input=dot_string, + text=True, + ) + else: + proc = subprocess.run( + ["dot", "-T%s" % t], + input=bytes(dot_string, encoding="utf8"), + ) + return proc.stdout + except: + raise Exception( + "Cannot create image representation by running dot from string: {}" + "".format(dot_string) + ) + except OSError as e: + raise Exception("Cannot find the dot binary from Graphviz package") from e + + +class DependencyGraphError(Exception): + """Dependency graph exception.""" + + +def demo(): + malt_demo() + conll_demo() + conll_file_demo() + cycle_finding_demo() + + +def malt_demo(nx=False): + """ + A demonstration of the result of reading a dependency + version of the first sentence of the Penn Treebank. + """ + dg = DependencyGraph( + """Pierre NNP 2 NMOD +Vinken NNP 8 SUB +, , 2 P +61 CD 5 NMOD +years NNS 6 AMOD +old JJ 2 NMOD +, , 2 P +will MD 0 ROOT +join VB 8 VC +the DT 11 NMOD +board NN 9 OBJ +as IN 9 VMOD +a DT 15 NMOD +nonexecutive JJ 15 NMOD +director NN 12 PMOD +Nov. NNP 9 VMOD +29 CD 16 NMOD +. . 9 VMOD +""" + ) + tree = dg.tree() + tree.pprint() + if nx: + # currently doesn't work + import networkx + from matplotlib import pylab + + g = dg.nx_graph() + g.info() + pos = networkx.spring_layout(g, dim=1) + networkx.draw_networkx_nodes(g, pos, node_size=50) + # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8) + networkx.draw_networkx_labels(g, pos, dg.nx_labels) + pylab.xticks([]) + pylab.yticks([]) + pylab.savefig("tree.png") + pylab.show() + + +def conll_demo(): + """ + A demonstration of how to read a string representation of + a CoNLL format dependency tree. 
+ """ + dg = DependencyGraph(conll_data1) + tree = dg.tree() + tree.pprint() + print(dg) + print(dg.to_conll(4)) + + +def conll_file_demo(): + print("Mass conll_read demo...") + graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] + for graph in graphs: + tree = graph.tree() + print("\n") + tree.pprint() + + +def cycle_finding_demo(): + dg = DependencyGraph(treebank_data) + print(dg.contains_cycle()) + cyclic_dg = DependencyGraph() + cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0}) + cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1}) + cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2}) + cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3}) + cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4}) + print(cyclic_dg.contains_cycle()) + + +treebank_data = """Pierre NNP 2 NMOD +Vinken NNP 8 SUB +, , 2 P +61 CD 5 NMOD +years NNS 6 AMOD +old JJ 2 NMOD +, , 2 P +will MD 0 ROOT +join VB 8 VC +the DT 11 NMOD +board NN 9 OBJ +as IN 9 VMOD +a DT 15 NMOD +nonexecutive JJ 15 NMOD +director NN 12 PMOD +Nov. NNP 9 VMOD +29 CD 16 NMOD +. . 9 VMOD +""" + +conll_data1 = """ +1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ +2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ +3 met met Prep Prep voor 8 mod _ _ +4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ +5 moeder moeder N N soort|ev|neut 3 obj1 _ _ +6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ +7 gaan ga V V hulp|inf 6 vc _ _ +8 winkelen winkel V V intrans|inf 11 cnj _ _ +9 , , Punc Punc komma 8 punct _ _ +10 zwemmen zwem V V intrans|inf 11 cnj _ _ +11 of of Conj Conj neven 7 vc _ _ +12 terrassen terras N N soort|mv|neut 11 cnj _ _ +13 . . 
Punc Punc punt 12 punct _ _ +""" + +conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _ +2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _ +3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _ +4 wild wild Adj Adj attr|stell|onverv 5 mod _ _ +5 zwaaien zwaai N N soort|mv|neut 2 vc _ _ +6 . . Punc Punc punt 5 punct _ _ + +1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ +2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ +3 met met Prep Prep voor 8 mod _ _ +4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ +5 moeder moeder N N soort|ev|neut 3 obj1 _ _ +6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ +7 gaan ga V V hulp|inf 6 vc _ _ +8 winkelen winkel V V intrans|inf 11 cnj _ _ +9 , , Punc Punc komma 8 punct _ _ +10 zwemmen zwem V V intrans|inf 11 cnj _ _ +11 of of Conj Conj neven 7 vc _ _ +12 terrassen terras N N soort|mv|neut 11 cnj _ _ +13 . . Punc Punc punt 12 punct _ _ + +1 Dat dat Pron Pron aanw|neut|attr 2 det _ _ +2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _ +3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ +4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _ +5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _ +6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _ +7 . . Punc Punc punt 6 punct _ _ + +1 Het het Pron Pron onbep|neut|zelfst 2 su _ _ +2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _ +3 bij bij Prep Prep voor 2 ld _ _ +4 de de Art Art bep|zijdofmv|neut 6 det _ _ +5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _ +6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _ +7 die die Pron Pron betr|neut|zelfst 6 mod _ _ +8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _ +9 ginds ginds Adv Adv gew|aanw 12 mod _ _ +10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _ +11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _ +12 gelaten laat V V trans|verldw|onverv 11 vc _ _ +13 . . 
Punc Punc punt 12 punct _ _ + +1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ +2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _ +3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _ +4 naast naast Prep Prep voor 11 mod _ _ +5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _ +6 op op Prep Prep voor 11 ld _ _ +7 de de Art Art bep|zijdofmv|neut 8 det _ _ +8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _ +9 kunnen kan V V hulp|inf 2 vc _ _ +10 gaan ga V V hulp|inf 9 vc _ _ +11 liggen lig V V intrans|inf 10 vc _ _ +12 . . Punc Punc punt 11 punct _ _ + +1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _ +2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _ +3 mams mams N N soort|ev|neut 4 det _ _ +4 rug rug N N soort|ev|neut 5 obj1 _ _ +5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _ +6 hebben heb V V hulp|inf 2 vc _ _ +7 en en Conj Conj neven 0 ROOT _ _ +8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _ +9 de de Art Art bep|zijdofmv|neut 10 det _ _ +10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _ +11 . . Punc Punc punt 10 punct _ _ + +1 Of of Conj Conj onder|metfin 0 ROOT _ _ +2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _ +3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ +4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _ +5 met met Prep Prep voor 10 mod _ _ +6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _ +7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _ +8 rond rond Adv Adv deelv 10 svp _ _ +9 kunnen kan V V hulp|inf 3 vc _ _ +10 slenteren slenter V V intrans|inf 9 vc _ _ +11 in in Prep Prep voor 10 mod _ _ +12 de de Art Art bep|zijdofmv|neut 13 det _ _ +13 buurt buurt N N soort|ev|neut 11 obj1 _ _ +14 van van Prep Prep voor 13 mod _ _ +15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _ +16 . . 
Punc Punc punt 15 punct _ _ +""" + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/earleychart.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/earleychart.py new file mode 100644 index 0000000000000000000000000000000000000000..26d46e635871efbafc40e73f4aab888e89770d6b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/earleychart.py @@ -0,0 +1,552 @@ +# Natural Language Toolkit: An Incremental Earley Chart Parser +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Peter Ljunglöf +# Rob Speer +# Edward Loper +# Steven Bird +# Jean Mark Gawron +# URL: +# For license information, see LICENSE.TXT + +""" +Data classes and parser implementations for *incremental* chart +parsers, which use dynamic programming to efficiently parse a text. +A "chart parser" derives parse trees for a text by iteratively adding +\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree +structure for a subsequence of the text. The "chart" is a +\"blackboard\" for composing and combining these hypotheses. + +A parser is "incremental", if it guarantees that for all i, j where i < j, +all edges ending at i are built before any edges ending at j. +This is appealing for, say, speech recognizer hypothesis filtering. + +The main parser class is ``EarleyChartParser``, which is a top-down +algorithm, originally formulated by Jay Earley (1970). 
+""" + +from time import perf_counter + +from nltk.parse.chart import ( + BottomUpPredictCombineRule, + BottomUpPredictRule, + CachedTopDownPredictRule, + Chart, + ChartParser, + EdgeI, + EmptyPredictRule, + FilteredBottomUpPredictCombineRule, + FilteredSingleEdgeFundamentalRule, + LeafEdge, + LeafInitRule, + SingleEdgeFundamentalRule, + TopDownInitRule, +) +from nltk.parse.featurechart import ( + FeatureBottomUpPredictCombineRule, + FeatureBottomUpPredictRule, + FeatureChart, + FeatureChartParser, + FeatureEmptyPredictRule, + FeatureSingleEdgeFundamentalRule, + FeatureTopDownInitRule, + FeatureTopDownPredictRule, +) + +# //////////////////////////////////////////////////////////// +# Incremental Chart +# //////////////////////////////////////////////////////////// + + +class IncrementalChart(Chart): + def initialize(self): + # A sequence of edge lists contained in this chart. + self._edgelists = tuple([] for x in self._positions()) + + # The set of child pointer lists associated with each edge. + self._edge_to_cpls = {} + + # Indexes mapping attribute values to lists of edges + # (used by select()). + self._indexes = {} + + def edges(self): + return list(self.iteredges()) + + def iteredges(self): + return (edge for edgelist in self._edgelists for edge in edgelist) + + def select(self, end, **restrictions): + edgelist = self._edgelists[end] + + # If there are no restrictions, then return all edges. + if restrictions == {}: + return iter(edgelist) + + # Find the index corresponding to the given restrictions. + restr_keys = sorted(restrictions.keys()) + restr_keys = tuple(restr_keys) + + # If it doesn't exist, then create it. + if restr_keys not in self._indexes: + self._add_index(restr_keys) + + vals = tuple(restrictions[key] for key in restr_keys) + return iter(self._indexes[restr_keys][end].get(vals, [])) + + def _add_index(self, restr_keys): + # Make sure it's a valid index. 
+ for key in restr_keys: + if not hasattr(EdgeI, key): + raise ValueError("Bad restriction: %s" % key) + + # Create the index. + index = self._indexes[restr_keys] = tuple({} for x in self._positions()) + + # Add all existing edges to the index. + for end, edgelist in enumerate(self._edgelists): + this_index = index[end] + for edge in edgelist: + vals = tuple(getattr(edge, key)() for key in restr_keys) + this_index.setdefault(vals, []).append(edge) + + def _register_with_indexes(self, edge): + end = edge.end() + for (restr_keys, index) in self._indexes.items(): + vals = tuple(getattr(edge, key)() for key in restr_keys) + index[end].setdefault(vals, []).append(edge) + + def _append_edge(self, edge): + self._edgelists[edge.end()].append(edge) + + def _positions(self): + return range(self.num_leaves() + 1) + + +class FeatureIncrementalChart(IncrementalChart, FeatureChart): + def select(self, end, **restrictions): + edgelist = self._edgelists[end] + + # If there are no restrictions, then return all edges. + if restrictions == {}: + return iter(edgelist) + + # Find the index corresponding to the given restrictions. + restr_keys = sorted(restrictions.keys()) + restr_keys = tuple(restr_keys) + + # If it doesn't exist, then create it. + if restr_keys not in self._indexes: + self._add_index(restr_keys) + + vals = tuple( + self._get_type_if_possible(restrictions[key]) for key in restr_keys + ) + return iter(self._indexes[restr_keys][end].get(vals, [])) + + def _add_index(self, restr_keys): + # Make sure it's a valid index. + for key in restr_keys: + if not hasattr(EdgeI, key): + raise ValueError("Bad restriction: %s" % key) + + # Create the index. + index = self._indexes[restr_keys] = tuple({} for x in self._positions()) + + # Add all existing edges to the index. 
+ for end, edgelist in enumerate(self._edgelists): + this_index = index[end] + for edge in edgelist: + vals = tuple( + self._get_type_if_possible(getattr(edge, key)()) + for key in restr_keys + ) + this_index.setdefault(vals, []).append(edge) + + def _register_with_indexes(self, edge): + end = edge.end() + for (restr_keys, index) in self._indexes.items(): + vals = tuple( + self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys + ) + index[end].setdefault(vals, []).append(edge) + + +# //////////////////////////////////////////////////////////// +# Incremental CFG Rules +# //////////////////////////////////////////////////////////// + + +class CompleteFundamentalRule(SingleEdgeFundamentalRule): + def _apply_incomplete(self, chart, grammar, left_edge): + end = left_edge.end() + # When the chart is incremental, we only have to look for + # empty complete edges here. + for right_edge in chart.select( + start=end, end=end, is_complete=True, lhs=left_edge.nextsym() + ): + new_edge = left_edge.move_dot_forward(right_edge.end()) + if chart.insert_with_backpointer(new_edge, left_edge, right_edge): + yield new_edge + + +class CompleterRule(CompleteFundamentalRule): + _fundamental_rule = CompleteFundamentalRule() + + def apply(self, chart, grammar, edge): + if not isinstance(edge, LeafEdge): + yield from self._fundamental_rule.apply(chart, grammar, edge) + + +class ScannerRule(CompleteFundamentalRule): + _fundamental_rule = CompleteFundamentalRule() + + def apply(self, chart, grammar, edge): + if isinstance(edge, LeafEdge): + yield from self._fundamental_rule.apply(chart, grammar, edge) + + +class PredictorRule(CachedTopDownPredictRule): + pass + + +class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule): + def apply(self, chart, grammar, edge): + # Since the Filtered rule only works for grammars without empty productions, + # we only have to bother with complete edges here. 
+ if edge.is_complete(): + yield from self._apply_complete(chart, grammar, edge) + + +# //////////////////////////////////////////////////////////// +# Incremental FCFG Rules +# //////////////////////////////////////////////////////////// + + +class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule): + def _apply_incomplete(self, chart, grammar, left_edge): + fr = self._fundamental_rule + end = left_edge.end() + # When the chart is incremental, we only have to look for + # empty complete edges here. + for right_edge in chart.select( + start=end, end=end, is_complete=True, lhs=left_edge.nextsym() + ): + yield from fr.apply(chart, grammar, left_edge, right_edge) + + +class FeatureCompleterRule(CompleterRule): + _fundamental_rule = FeatureCompleteFundamentalRule() + + +class FeatureScannerRule(ScannerRule): + _fundamental_rule = FeatureCompleteFundamentalRule() + + +class FeaturePredictorRule(FeatureTopDownPredictRule): + pass + + +# //////////////////////////////////////////////////////////// +# Incremental CFG Chart Parsers +# //////////////////////////////////////////////////////////// + +EARLEY_STRATEGY = [ + LeafInitRule(), + TopDownInitRule(), + CompleterRule(), + ScannerRule(), + PredictorRule(), +] +TD_INCREMENTAL_STRATEGY = [ + LeafInitRule(), + TopDownInitRule(), + CachedTopDownPredictRule(), + CompleteFundamentalRule(), +] +BU_INCREMENTAL_STRATEGY = [ + LeafInitRule(), + EmptyPredictRule(), + BottomUpPredictRule(), + CompleteFundamentalRule(), +] +BU_LC_INCREMENTAL_STRATEGY = [ + LeafInitRule(), + EmptyPredictRule(), + BottomUpPredictCombineRule(), + CompleteFundamentalRule(), +] + +LC_INCREMENTAL_STRATEGY = [ + LeafInitRule(), + FilteredBottomUpPredictCombineRule(), + FilteredCompleteFundamentalRule(), +] + + +class IncrementalChartParser(ChartParser): + """ + An *incremental* chart parser implementing Jay Earley's + parsing algorithm: + + | For each index end in [0, 1, ..., N]: + | For each edge such that edge.end = end: + | If edge is 
incomplete and edge.next is not a part of speech: + | Apply PredictorRule to edge + | If edge is incomplete and edge.next is a part of speech: + | Apply ScannerRule to edge + | If edge is complete: + | Apply CompleterRule to edge + | Return any complete parses in the chart + """ + + def __init__( + self, + grammar, + strategy=BU_LC_INCREMENTAL_STRATEGY, + trace=0, + trace_chart_width=50, + chart_class=IncrementalChart, + ): + """ + Create a new Earley chart parser, that uses ``grammar`` to + parse texts. + + :type grammar: CFG + :param grammar: The grammar used to parse texts. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + :type trace_chart_width: int + :param trace_chart_width: The default total width reserved for + the chart in trace output. The remainder of each line will + be used to display edges. + :param chart_class: The class that should be used to create + the charts used by this parser. + """ + self._grammar = grammar + self._trace = trace + self._trace_chart_width = trace_chart_width + self._chart_class = chart_class + + self._axioms = [] + self._inference_rules = [] + for rule in strategy: + if rule.NUM_EDGES == 0: + self._axioms.append(rule) + elif rule.NUM_EDGES == 1: + self._inference_rules.append(rule) + else: + raise ValueError( + "Incremental inference rules must have " "NUM_EDGES == 0 or 1" + ) + + def chart_parse(self, tokens, trace=None): + if trace is None: + trace = self._trace + trace_new_edges = self._trace_new_edges + + tokens = list(tokens) + self._grammar.check_coverage(tokens) + chart = self._chart_class(tokens) + grammar = self._grammar + + # Width, for printing trace edges. 
+ trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) + if trace: + print(chart.pretty_format_leaves(trace_edge_width)) + + for axiom in self._axioms: + new_edges = list(axiom.apply(chart, grammar)) + trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) + + inference_rules = self._inference_rules + for end in range(chart.num_leaves() + 1): + if trace > 1: + print("\n* Processing queue:", end, "\n") + agenda = list(chart.select(end=end)) + while agenda: + edge = agenda.pop() + for rule in inference_rules: + new_edges = list(rule.apply(chart, grammar, edge)) + trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) + for new_edge in new_edges: + if new_edge.end() == end: + agenda.append(new_edge) + + return chart + + +class EarleyChartParser(IncrementalChartParser): + def __init__(self, grammar, **parser_args): + IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args) + + +class IncrementalTopDownChartParser(IncrementalChartParser): + def __init__(self, grammar, **parser_args): + IncrementalChartParser.__init__( + self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args + ) + + +class IncrementalBottomUpChartParser(IncrementalChartParser): + def __init__(self, grammar, **parser_args): + IncrementalChartParser.__init__( + self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args + ) + + +class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser): + def __init__(self, grammar, **parser_args): + IncrementalChartParser.__init__( + self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args + ) + + +class IncrementalLeftCornerChartParser(IncrementalChartParser): + def __init__(self, grammar, **parser_args): + if not grammar.is_nonempty(): + raise ValueError( + "IncrementalLeftCornerParser only works for grammars " + "without empty productions." 
+ ) + IncrementalChartParser.__init__( + self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args + ) + + +# //////////////////////////////////////////////////////////// +# Incremental FCFG Chart Parsers +# //////////////////////////////////////////////////////////// + +EARLEY_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureTopDownInitRule(), + FeatureCompleterRule(), + FeatureScannerRule(), + FeaturePredictorRule(), +] +TD_INCREMENTAL_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureTopDownInitRule(), + FeatureTopDownPredictRule(), + FeatureCompleteFundamentalRule(), +] +BU_INCREMENTAL_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureEmptyPredictRule(), + FeatureBottomUpPredictRule(), + FeatureCompleteFundamentalRule(), +] +BU_LC_INCREMENTAL_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureEmptyPredictRule(), + FeatureBottomUpPredictCombineRule(), + FeatureCompleteFundamentalRule(), +] + + +class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser): + def __init__( + self, + grammar, + strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY, + trace_chart_width=20, + chart_class=FeatureIncrementalChart, + **parser_args + ): + IncrementalChartParser.__init__( + self, + grammar, + strategy=strategy, + trace_chart_width=trace_chart_width, + chart_class=chart_class, + **parser_args + ) + + +class FeatureEarleyChartParser(FeatureIncrementalChartParser): + def __init__(self, grammar, **parser_args): + FeatureIncrementalChartParser.__init__( + self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args + ) + + +class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser): + def __init__(self, grammar, **parser_args): + FeatureIncrementalChartParser.__init__( + self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args + ) + + +class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser): + def __init__(self, grammar, **parser_args): + FeatureIncrementalChartParser.__init__( + self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args + ) + + 
+class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser): + def __init__(self, grammar, **parser_args): + FeatureIncrementalChartParser.__init__( + self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args + ) + + +# //////////////////////////////////////////////////////////// +# Demonstration +# //////////////////////////////////////////////////////////// + + +def demo( + print_times=True, + print_grammar=False, + print_trees=True, + trace=2, + sent="I saw John with a dog with my cookie", + numparses=5, +): + """ + A demonstration of the Earley parsers. + """ + import sys + import time + + from nltk.parse.chart import demo_grammar + + # The grammar for ChartParser and SteppingChartParser: + grammar = demo_grammar() + if print_grammar: + print("* Grammar") + print(grammar) + + # Tokenize the sample sentence. + print("* Sentence:") + print(sent) + tokens = sent.split() + print(tokens) + print() + + # Do the parsing. + earley = EarleyChartParser(grammar, trace=trace) + t = perf_counter() + chart = earley.chart_parse(tokens) + parses = list(chart.parses(grammar.start())) + t = perf_counter() - t + + # Print results. 
+ if numparses: + assert len(parses) == numparses, "Not all parses found" + if print_trees: + for tree in parses: + print(tree) + else: + print("Nr trees:", len(parses)) + if print_times: + print("Time:", t) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/evaluate.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..28df99c1419b86fce1e98bc59462c88404569405 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/evaluate.py @@ -0,0 +1,129 @@ +# Natural Language Toolkit: evaluation of dependency parser +# +# Author: Long Duong +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +import unicodedata + + +class DependencyEvaluator: + """ + Class for measuring labelled and unlabelled attachment score for + dependency parsing. Note that the evaluation ignores punctuation. + + >>> from nltk.parse import DependencyGraph, DependencyEvaluator + + >>> gold_sent = DependencyGraph(\""" + ... Pierre NNP 2 NMOD + ... Vinken NNP 8 SUB + ... , , 2 P + ... 61 CD 5 NMOD + ... years NNS 6 AMOD + ... old JJ 2 NMOD + ... , , 2 P + ... will MD 0 ROOT + ... join VB 8 VC + ... the DT 11 NMOD + ... board NN 9 OBJ + ... as IN 9 VMOD + ... a DT 15 NMOD + ... nonexecutive JJ 15 NMOD + ... director NN 12 PMOD + ... Nov. NNP 9 VMOD + ... 29 CD 16 NMOD + ... . . 9 VMOD + ... \""") + + >>> parsed_sent = DependencyGraph(\""" + ... Pierre NNP 8 NMOD + ... Vinken NNP 1 SUB + ... , , 3 P + ... 61 CD 6 NMOD + ... years NNS 6 AMOD + ... old JJ 2 NMOD + ... , , 3 AMOD + ... will MD 0 ROOT + ... join VB 8 VC + ... the DT 11 AMOD + ... board NN 9 OBJECT + ... as IN 9 NMOD + ... a DT 15 NMOD + ... nonexecutive JJ 15 NMOD + ... director NN 12 PMOD + ... Nov. NNP 9 VMOD + ... 29 CD 16 NMOD + ... . . 9 VMOD + ... 
\""") + + >>> de = DependencyEvaluator([parsed_sent],[gold_sent]) + >>> las, uas = de.eval() + >>> las + 0.6 + >>> uas + 0.8 + >>> abs(uas - 0.8) < 0.00001 + True + """ + + def __init__(self, parsed_sents, gold_sents): + """ + :param parsed_sents: the list of parsed_sents as the output of parser + :type parsed_sents: list(DependencyGraph) + """ + self._parsed_sents = parsed_sents + self._gold_sents = gold_sents + + def _remove_punct(self, inStr): + """ + Function to remove punctuation from Unicode string. + :param input: the input string + :return: Unicode string after remove all punctuation + """ + punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} + return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat) + + def eval(self): + """ + Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS) + + :return : tuple(float,float) + """ + if len(self._parsed_sents) != len(self._gold_sents): + raise ValueError( + " Number of parsed sentence is different with number of gold sentence." 
+ ) + + corr = 0 + corrL = 0 + total = 0 + + for i in range(len(self._parsed_sents)): + parsed_sent_nodes = self._parsed_sents[i].nodes + gold_sent_nodes = self._gold_sents[i].nodes + + if len(parsed_sent_nodes) != len(gold_sent_nodes): + raise ValueError("Sentences must have equal length.") + + for parsed_node_address, parsed_node in parsed_sent_nodes.items(): + gold_node = gold_sent_nodes[parsed_node_address] + + if parsed_node["word"] is None: + continue + if parsed_node["word"] != gold_node["word"]: + raise ValueError("Sentence sequence is not matched.") + + # Ignore if word is punctuation by default + # if (parsed_sent[j]["word"] in string.punctuation): + if self._remove_punct(parsed_node["word"]) == "": + continue + + total += 1 + if parsed_node["head"] == gold_node["head"]: + corr += 1 + if parsed_node["rel"] == gold_node["rel"]: + corrL += 1 + + return corrL / total, corr / total diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/featurechart.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/featurechart.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4f9c84d1c493b77779c8f171cd904570baad5d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/featurechart.py @@ -0,0 +1,674 @@ +# Natural Language Toolkit: Chart Parser for Feature-Based Grammars +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Rob Speer +# Peter Ljunglöf +# URL: +# For license information, see LICENSE.TXT + +""" +Extension of chart parsing implementation to handle grammars with +feature structures as nodes. 
+""" +from time import perf_counter + +from nltk.featstruct import TYPE, FeatStruct, find_variables, unify +from nltk.grammar import ( + CFG, + FeatStructNonterminal, + Nonterminal, + Production, + is_nonterminal, + is_terminal, +) +from nltk.parse.chart import ( + BottomUpPredictCombineRule, + BottomUpPredictRule, + CachedTopDownPredictRule, + Chart, + ChartParser, + EdgeI, + EmptyPredictRule, + FundamentalRule, + LeafInitRule, + SingleEdgeFundamentalRule, + TopDownInitRule, + TreeEdge, +) +from nltk.sem import logic +from nltk.tree import Tree + +# //////////////////////////////////////////////////////////// +# Tree Edge +# //////////////////////////////////////////////////////////// + + +class FeatureTreeEdge(TreeEdge): + """ + A specialized tree edge that allows shared variable bindings + between nonterminals on the left-hand side and right-hand side. + + Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a + dictionary mapping from variables to values. If the edge is not + complete, then these bindings are simply stored. However, if the + edge is complete, then the constructor applies these bindings to + every nonterminal in the edge whose symbol implements the + interface ``SubstituteBindingsI``. + """ + + def __init__(self, span, lhs, rhs, dot=0, bindings=None): + """ + Construct a new edge. If the edge is incomplete (i.e., if + ``dot alpha \* B1 beta][i:j]`` + - ``[B2 -> gamma \*][j:k]`` + + licenses the edge: + + - ``[A -> alpha B3 \* beta][i:j]`` + + assuming that B1 and B2 can be unified to generate B3. + """ + + def apply(self, chart, grammar, left_edge, right_edge): + # Make sure the rule is applicable. 
+ if not ( + left_edge.end() == right_edge.start() + and left_edge.is_incomplete() + and right_edge.is_complete() + and isinstance(left_edge, FeatureTreeEdge) + ): + return + found = right_edge.lhs() + nextsym = left_edge.nextsym() + if isinstance(right_edge, FeatureTreeEdge): + if not is_nonterminal(nextsym): + return + if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: + return + # Create a copy of the bindings. + bindings = left_edge.bindings() + # We rename vars here, because we don't want variables + # from the two different productions to match. + found = found.rename_variables(used_vars=left_edge.variables()) + # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to + # generate B3 (result). + result = unify(nextsym, found, bindings, rename_vars=False) + if result is None: + return + else: + if nextsym != found: + return + # Create a copy of the bindings. + bindings = left_edge.bindings() + + # Construct the new edge. + new_edge = left_edge.move_dot_forward(right_edge.end(), bindings) + + # Add it to the chart, with appropriate child pointers. + if chart.insert_with_backpointer(new_edge, left_edge, right_edge): + yield new_edge + + +class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): + """ + A specialized version of the completer / single edge fundamental rule + that operates on nonterminals whose symbols are ``FeatStructNonterminal``. + Rather than simply comparing the nonterminals for equality, they are + unified. 
+ """ + + _fundamental_rule = FeatureFundamentalRule() + + def _apply_complete(self, chart, grammar, right_edge): + fr = self._fundamental_rule + for left_edge in chart.select( + end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() + ): + yield from fr.apply(chart, grammar, left_edge, right_edge) + + def _apply_incomplete(self, chart, grammar, left_edge): + fr = self._fundamental_rule + for right_edge in chart.select( + start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() + ): + yield from fr.apply(chart, grammar, left_edge, right_edge) + + +# //////////////////////////////////////////////////////////// +# Top-Down Prediction +# //////////////////////////////////////////////////////////// + + +class FeatureTopDownInitRule(TopDownInitRule): + def apply(self, chart, grammar): + for prod in grammar.productions(lhs=grammar.start()): + new_edge = FeatureTreeEdge.from_production(prod, 0) + if chart.insert(new_edge, ()): + yield new_edge + + +class FeatureTopDownPredictRule(CachedTopDownPredictRule): + r""" + A specialized version of the (cached) top down predict rule that operates + on nonterminals whose symbols are ``FeatStructNonterminal``. Rather + than simply comparing the nonterminals for equality, they are + unified. + + The top down expand rule states that: + + - ``[A -> alpha \* B1 beta][i:j]`` + + licenses the edge: + + - ``[B2 -> \* gamma][j:j]`` + + for each grammar production ``B2 -> gamma``, assuming that B1 + and B2 can be unified. + """ + + def apply(self, chart, grammar, edge): + if edge.is_complete(): + return + nextsym, index = edge.nextsym(), edge.end() + if not is_nonterminal(nextsym): + return + + # If we've already applied this rule to an edge with the same + # next & end, and the chart & grammar have not changed, then + # just return (no new edges to add). 
+ nextsym_with_bindings = edge.next_with_bindings() + done = self._done.get((nextsym_with_bindings, index), (None, None)) + if done[0] is chart and done[1] is grammar: + return + + for prod in grammar.productions(lhs=nextsym): + # If the left corner in the predicted production is + # leaf, it must match with the input. + if prod.rhs(): + first = prod.rhs()[0] + if is_terminal(first): + if index >= chart.num_leaves(): + continue + if first != chart.leaf(index): + continue + + # We rename vars here, because we don't want variables + # from the two different productions to match. + if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True): + new_edge = FeatureTreeEdge.from_production(prod, edge.end()) + if chart.insert(new_edge, ()): + yield new_edge + + # Record the fact that we've applied this rule. + self._done[nextsym_with_bindings, index] = (chart, grammar) + + +# //////////////////////////////////////////////////////////// +# Bottom-Up Prediction +# //////////////////////////////////////////////////////////// + + +class FeatureBottomUpPredictRule(BottomUpPredictRule): + def apply(self, chart, grammar, edge): + if edge.is_incomplete(): + return + for prod in grammar.productions(rhs=edge.lhs()): + if isinstance(edge, FeatureTreeEdge): + _next = prod.rhs()[0] + if not is_nonterminal(_next): + continue + + new_edge = FeatureTreeEdge.from_production(prod, edge.start()) + if chart.insert(new_edge, ()): + yield new_edge + + +class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule): + def apply(self, chart, grammar, edge): + if edge.is_incomplete(): + return + found = edge.lhs() + for prod in grammar.productions(rhs=found): + bindings = {} + if isinstance(edge, FeatureTreeEdge): + _next = prod.rhs()[0] + if not is_nonterminal(_next): + continue + + # We rename vars here, because we don't want variables + # from the two different productions to match. 
+ used_vars = find_variables( + (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct + ) + found = found.rename_variables(used_vars=used_vars) + + result = unify(_next, found, bindings, rename_vars=False) + if result is None: + continue + + new_edge = FeatureTreeEdge.from_production( + prod, edge.start() + ).move_dot_forward(edge.end(), bindings) + if chart.insert(new_edge, (edge,)): + yield new_edge + + +class FeatureEmptyPredictRule(EmptyPredictRule): + def apply(self, chart, grammar): + for prod in grammar.productions(empty=True): + for index in range(chart.num_leaves() + 1): + new_edge = FeatureTreeEdge.from_production(prod, index) + if chart.insert(new_edge, ()): + yield new_edge + + +# //////////////////////////////////////////////////////////// +# Feature Chart Parser +# //////////////////////////////////////////////////////////// + +TD_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureTopDownInitRule(), + FeatureTopDownPredictRule(), + FeatureSingleEdgeFundamentalRule(), +] +BU_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureEmptyPredictRule(), + FeatureBottomUpPredictRule(), + FeatureSingleEdgeFundamentalRule(), +] +BU_LC_FEATURE_STRATEGY = [ + LeafInitRule(), + FeatureEmptyPredictRule(), + FeatureBottomUpPredictCombineRule(), + FeatureSingleEdgeFundamentalRule(), +] + + +class FeatureChartParser(ChartParser): + def __init__( + self, + grammar, + strategy=BU_LC_FEATURE_STRATEGY, + trace_chart_width=20, + chart_class=FeatureChart, + **parser_args, + ): + ChartParser.__init__( + self, + grammar, + strategy=strategy, + trace_chart_width=trace_chart_width, + chart_class=chart_class, + **parser_args, + ) + + +class FeatureTopDownChartParser(FeatureChartParser): + def __init__(self, grammar, **parser_args): + FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args) + + +class FeatureBottomUpChartParser(FeatureChartParser): + def __init__(self, grammar, **parser_args): + FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args) 
+ + +class FeatureBottomUpLeftCornerChartParser(FeatureChartParser): + def __init__(self, grammar, **parser_args): + FeatureChartParser.__init__( + self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args + ) + + +# //////////////////////////////////////////////////////////// +# Instantiate Variable Chart +# //////////////////////////////////////////////////////////// + + +class InstantiateVarsChart(FeatureChart): + """ + A specialized chart that 'instantiates' variables whose names + start with '@', by replacing them with unique new variables. + In particular, whenever a complete edge is added to the chart, any + variables in the edge's ``lhs`` whose names start with '@' will be + replaced by unique new ``Variable``. + """ + + def __init__(self, tokens): + FeatureChart.__init__(self, tokens) + + def initialize(self): + self._instantiated = set() + FeatureChart.initialize(self) + + def insert(self, edge, child_pointer_list): + if edge in self._instantiated: + return False + self.instantiate_edge(edge) + return FeatureChart.insert(self, edge, child_pointer_list) + + def instantiate_edge(self, edge): + """ + If the edge is a ``FeatureTreeEdge``, and it is complete, + then instantiate all variables whose names start with '@', + by replacing them with unique new variables. + + Note that instantiation is done in-place, since the + parsing algorithms might already hold a reference to + the edge for future use. + """ + # If the edge is a leaf, or is not complete, or is + # already in the chart, then just return it as-is. + if not isinstance(edge, FeatureTreeEdge): + return + if not edge.is_complete(): + return + if edge in self._edge_to_cpls: + return + + # Get a list of variables that need to be instantiated. + # If there are none, then return as-is. + inst_vars = self.inst_vars(edge) + if not inst_vars: + return + + # Instantiate the edge! 
+ self._instantiated.add(edge) + edge._lhs = edge.lhs().substitute_bindings(inst_vars) + + def inst_vars(self, edge): + return { + var: logic.unique_variable() + for var in edge.lhs().variables() + if var.name.startswith("@") + } + + +# //////////////////////////////////////////////////////////// +# Demo +# //////////////////////////////////////////////////////////// + + +def demo_grammar(): + from nltk.grammar import FeatureGrammar + + return FeatureGrammar.fromstring( + """ +S -> NP VP +PP -> Prep NP +NP -> NP PP +VP -> VP PP +VP -> Verb NP +VP -> Verb +NP -> Det[pl=?x] Noun[pl=?x] +NP -> "John" +NP -> "I" +Det -> "the" +Det -> "my" +Det[-pl] -> "a" +Noun[-pl] -> "dog" +Noun[-pl] -> "cookie" +Verb -> "ate" +Verb -> "saw" +Prep -> "with" +Prep -> "under" +""" + ) + + +def demo( + print_times=True, + print_grammar=True, + print_trees=True, + print_sentence=True, + trace=1, + parser=FeatureChartParser, + sent="I saw John with a dog with my cookie", +): + import sys + import time + + print() + grammar = demo_grammar() + if print_grammar: + print(grammar) + print() + print("*", parser.__name__) + if print_sentence: + print("Sentence:", sent) + tokens = sent.split() + t = perf_counter() + cp = parser(grammar, trace=trace) + chart = cp.chart_parse(tokens) + trees = list(chart.parses(grammar.start())) + if print_times: + print("Time: %s" % (perf_counter() - t)) + if print_trees: + for tree in trees: + print(tree) + else: + print("Nr trees:", len(trees)) + + +def run_profile(): + import profile + + profile.run("for i in range(1): demo()", "/tmp/profile.out") + import pstats + + p = pstats.Stats("/tmp/profile.out") + p.strip_dirs().sort_stats("time", "cum").print_stats(60) + p.strip_dirs().sort_stats("cum", "time").print_stats(60) + + +if __name__ == "__main__": + from nltk.data import load + + demo() + print() + grammar = load("grammars/book_grammars/feat0.fcfg") + cp = FeatureChartParser(grammar, trace=2) + sent = "Kim likes children" + tokens = sent.split() + trees = 
cp.parse(tokens) + for tree in trees: + print(tree) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/generate.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..7c84de9c0879e9d24436e64cd04cfa69794b86fb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/generate.py @@ -0,0 +1,85 @@ +# Natural Language Toolkit: Generating from a CFG +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Peter Ljunglöf +# URL: +# For license information, see LICENSE.TXT +# + +import itertools +import sys + +from nltk.grammar import Nonterminal + + +def generate(grammar, start=None, depth=None, n=None): + """ + Generates an iterator of all sentences from a CFG. + + :param grammar: The Grammar used to generate sentences. + :param start: The Nonterminal from which to start generate sentences. + :param depth: The maximal depth of the generated tree. + :param n: The maximum number of sentences to return. + :return: An iterator of lists of terminal tokens. + """ + if not start: + start = grammar.start() + if depth is None: + depth = sys.maxsize + + iter = _generate_all(grammar, [start], depth) + + if n: + iter = itertools.islice(iter, n) + + return iter + + +def _generate_all(grammar, items, depth): + if items: + try: + for frag1 in _generate_one(grammar, items[0], depth): + for frag2 in _generate_all(grammar, items[1:], depth): + yield frag1 + frag2 + except RecursionError as error: + # Helpful error message while still showing the recursion stack. + raise RuntimeError( + "The grammar has rule(s) that yield infinite recursion!" 
+ ) from error + else: + yield [] + + +def _generate_one(grammar, item, depth): + if depth > 0: + if isinstance(item, Nonterminal): + for prod in grammar.productions(lhs=item): + yield from _generate_all(grammar, prod.rhs(), depth - 1) + else: + yield [item] + + +demo_grammar = """ + S -> NP VP + NP -> Det N + PP -> P NP + VP -> 'slept' | 'saw' NP | 'walked' PP + Det -> 'the' | 'a' + N -> 'man' | 'park' | 'dog' + P -> 'in' | 'with' +""" + + +def demo(N=23): + from nltk.grammar import CFG + + print("Generating the first %d sentences for demo grammar:" % (N,)) + print(demo_grammar) + grammar = CFG.fromstring(demo_grammar) + for n, sent in enumerate(generate(grammar, n=N), 1): + print("%3d. %s" % (n, " ".join(sent))) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/pchart.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/pchart.py new file mode 100644 index 0000000000000000000000000000000000000000..33b19b638ad57578f7a1c03351d1d4b9ec1e4150 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/pchart.py @@ -0,0 +1,579 @@ +# Natural Language Toolkit: Probabilistic Chart Parsers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Classes and interfaces for associating probabilities with tree +structures that represent the internal organization of a text. The +probabilistic parser module defines ``BottomUpProbabilisticChartParser``. + +``BottomUpProbabilisticChartParser`` is an abstract class that implements +a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges, +and adds them to the chart one at a time. The ordering of this queue +is based on the probabilities associated with the edges, allowing the +parser to expand more likely edges before less likely ones. Each +subclass implements a different queue ordering, producing different +search strategies. 
Currently the following subclasses are defined: + + - ``InsideChartParser`` searches edges in decreasing order of + their trees' inside probabilities. + - ``RandomChartParser`` searches edges in random order. + - ``LongestChartParser`` searches edges in decreasing order of their + location's length. + +The ``BottomUpProbabilisticChartParser`` constructor has an optional +argument beam_size. If non-zero, this controls the size of the beam +(aka the edge queue). This option is most useful with InsideChartParser. +""" + +##////////////////////////////////////////////////////// +## Bottom-Up PCFG Chart Parser +##////////////////////////////////////////////////////// + +# [XX] This might not be implemented quite right -- it would be better +# to associate probabilities with child pointer lists. + +import random +from functools import reduce + +from nltk.grammar import PCFG, Nonterminal +from nltk.parse.api import ParserI +from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge +from nltk.tree import ProbabilisticTree, Tree + + +# Probabilistic edges +class ProbabilisticLeafEdge(LeafEdge): + def prob(self): + return 1.0 + + +class ProbabilisticTreeEdge(TreeEdge): + def __init__(self, prob, *args, **kwargs): + TreeEdge.__init__(self, *args, **kwargs) + self._prob = prob + # two edges with different probabilities are not equal. 
+ self._comparison_key = (self._comparison_key, prob) + + def prob(self): + return self._prob + + @staticmethod + def from_production(production, index, p): + return ProbabilisticTreeEdge( + p, (index, index), production.lhs(), production.rhs(), 0 + ) + + +# Rules using probabilistic edges +class ProbabilisticBottomUpInitRule(AbstractChartRule): + NUM_EDGES = 0 + + def apply(self, chart, grammar): + for index in range(chart.num_leaves()): + new_edge = ProbabilisticLeafEdge(chart.leaf(index), index) + if chart.insert(new_edge, ()): + yield new_edge + + +class ProbabilisticBottomUpPredictRule(AbstractChartRule): + NUM_EDGES = 1 + + def apply(self, chart, grammar, edge): + if edge.is_incomplete(): + return + for prod in grammar.productions(): + if edge.lhs() == prod.rhs()[0]: + new_edge = ProbabilisticTreeEdge.from_production( + prod, edge.start(), prod.prob() + ) + if chart.insert(new_edge, ()): + yield new_edge + + +class ProbabilisticFundamentalRule(AbstractChartRule): + NUM_EDGES = 2 + + def apply(self, chart, grammar, left_edge, right_edge): + # Make sure the rule is applicable. + if not ( + left_edge.end() == right_edge.start() + and left_edge.nextsym() == right_edge.lhs() + and left_edge.is_incomplete() + and right_edge.is_complete() + ): + return + + # Construct the new edge. + p = left_edge.prob() * right_edge.prob() + new_edge = ProbabilisticTreeEdge( + p, + span=(left_edge.start(), right_edge.end()), + lhs=left_edge.lhs(), + rhs=left_edge.rhs(), + dot=left_edge.dot() + 1, + ) + + # Add it to the chart, with appropriate child pointers. + changed_chart = False + for cpl1 in chart.child_pointer_lists(left_edge): + if chart.insert(new_edge, cpl1 + (right_edge,)): + changed_chart = True + + # If we changed the chart, then generate the edge. 
+ if changed_chart: + yield new_edge + + +class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule): + NUM_EDGES = 1 + + _fundamental_rule = ProbabilisticFundamentalRule() + + def apply(self, chart, grammar, edge1): + fr = self._fundamental_rule + if edge1.is_incomplete(): + # edge1 = left_edge; edge2 = right_edge + for edge2 in chart.select( + start=edge1.end(), is_complete=True, lhs=edge1.nextsym() + ): + yield from fr.apply(chart, grammar, edge1, edge2) + else: + # edge2 = left_edge; edge1 = right_edge + for edge2 in chart.select( + end=edge1.start(), is_complete=False, nextsym=edge1.lhs() + ): + yield from fr.apply(chart, grammar, edge2, edge1) + + def __str__(self): + return "Fundamental Rule" + + +class BottomUpProbabilisticChartParser(ParserI): + """ + An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to + record partial results. ``BottomUpProbabilisticChartParser`` maintains + a queue of edges that can be added to the chart. This queue is + initialized with edges for each token in the text that is being + parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into + the chart one at a time, starting with the most likely edges, and + proceeding to less likely edges. For each edge that is added to + the chart, it may become possible to insert additional edges into + the chart; these are added to the queue. This process continues + until enough complete parses have been generated, or until the + queue is empty. + + The sorting order for the queue is not specified by + ``BottomUpProbabilisticChartParser``. Different sorting orders will + result in different search strategies. The sorting order for the + queue is defined by the method ``sort_queue``; subclasses are required + to provide a definition for this method. + + :type _grammar: PCFG + :ivar _grammar: The grammar used to parse sentences. + :type _trace: int + :ivar _trace: The level of tracing output that should be generated + when parsing a text. 
+ """ + + def __init__(self, grammar, beam_size=0, trace=0): + """ + Create a new ``BottomUpProbabilisticChartParser``, that uses + ``grammar`` to parse texts. + + :type grammar: PCFG + :param grammar: The grammar used to parse texts. + :type beam_size: int + :param beam_size: The maximum length for the parser's edge queue. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + """ + if not isinstance(grammar, PCFG): + raise ValueError("The grammar must be probabilistic PCFG") + self._grammar = grammar + self.beam_size = beam_size + self._trace = trace + + def grammar(self): + return self._grammar + + def trace(self, trace=2): + """ + Set the level of tracing output that should be generated when + parsing a text. + + :type trace: int + :param trace: The trace level. A trace level of ``0`` will + generate no tracing output; and higher trace levels will + produce more verbose tracing output. + :rtype: None + """ + self._trace = trace + + # TODO: change this to conform more with the standard ChartParser + def parse(self, tokens): + self._grammar.check_coverage(tokens) + chart = Chart(list(tokens)) + grammar = self._grammar + + # Chart parser rules. + bu_init = ProbabilisticBottomUpInitRule() + bu = ProbabilisticBottomUpPredictRule() + fr = SingleEdgeProbabilisticFundamentalRule() + + # Our queue + queue = [] + + # Initialize the chart. + for edge in bu_init.apply(chart, grammar): + if self._trace > 1: + print( + " %-50s [%s]" + % (chart.pretty_format_edge(edge, width=2), edge.prob()) + ) + queue.append(edge) + + while len(queue) > 0: + # Re-sort the queue. + self.sort_queue(queue, chart) + + # Prune the queue to the correct size if a beam was defined + if self.beam_size: + self._prune(queue, chart) + + # Get the best edge. 
+ edge = queue.pop() + if self._trace > 0: + print( + " %-50s [%s]" + % (chart.pretty_format_edge(edge, width=2), edge.prob()) + ) + + # Apply BU & FR to it. + queue.extend(bu.apply(chart, grammar, edge)) + queue.extend(fr.apply(chart, grammar, edge)) + + # Get a list of complete parses. + parses = list(chart.parses(grammar.start(), ProbabilisticTree)) + + # Assign probabilities to the trees. + prod_probs = {} + for prod in grammar.productions(): + prod_probs[prod.lhs(), prod.rhs()] = prod.prob() + for parse in parses: + self._setprob(parse, prod_probs) + + # Sort by probability + parses.sort(reverse=True, key=lambda tree: tree.prob()) + + return iter(parses) + + def _setprob(self, tree, prod_probs): + if tree.prob() is not None: + return + + # Get the prob of the CFG production. + lhs = Nonterminal(tree.label()) + rhs = [] + for child in tree: + if isinstance(child, Tree): + rhs.append(Nonterminal(child.label())) + else: + rhs.append(child) + prob = prod_probs[lhs, tuple(rhs)] + + # Get the probs of children. + for child in tree: + if isinstance(child, Tree): + self._setprob(child, prod_probs) + prob *= child.prob() + + tree.set_prob(prob) + + def sort_queue(self, queue, chart): + """ + Sort the given queue of ``Edge`` objects, placing the edge that should + be tried first at the beginning of the queue. This method + will be called after each ``Edge`` is added to the queue. + + :param queue: The queue of ``Edge`` objects to sort. Each edge in + this queue is an edge that could be added to the chart by + the fundamental rule; but that has not yet been added. + :type queue: list(Edge) + :param chart: The chart being used to parse the text. This + chart can be used to provide extra information for sorting + the queue. 
+ :type chart: Chart + :rtype: None + """ + raise NotImplementedError() + + def _prune(self, queue, chart): + """Discard items in the queue if the queue is longer than the beam.""" + if len(queue) > self.beam_size: + split = len(queue) - self.beam_size + if self._trace > 2: + for edge in queue[:split]: + print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2)) + del queue[:split] + + +class InsideChartParser(BottomUpProbabilisticChartParser): + """ + A bottom-up parser for ``PCFG`` grammars that tries edges in descending + order of the inside probabilities of their trees. The "inside + probability" of a tree is simply the + probability of the entire tree, ignoring its context. In + particular, the inside probability of a tree generated by + production *p* with children *c[1], c[2], ..., c[n]* is + *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside + probability of a token is 1 if it is present in the text, and 0 if + it is absent. + + This sorting order results in a type of lowest-cost-first search + strategy. + """ + + # Inherit constructor. + def sort_queue(self, queue, chart): + """ + Sort the given queue of edges, in descending order of the + inside probabilities of the edges' trees. + + :param queue: The queue of ``Edge`` objects to sort. Each edge in + this queue is an edge that could be added to the chart by + the fundamental rule; but that has not yet been added. + :type queue: list(Edge) + :param chart: The chart being used to parse the text. This + chart can be used to provide extra information for sorting + the queue. + :type chart: Chart + :rtype: None + """ + queue.sort(key=lambda edge: edge.prob()) + + +# Eventually, this will become some sort of inside-outside parser: +# class InsideOutsideParser(BottomUpProbabilisticChartParser): +# def __init__(self, grammar, trace=0): +# # Inherit docs. 
+# BottomUpProbabilisticChartParser.__init__(self, grammar, trace) +# +# # Find the best path from S to each nonterminal +# bestp = {} +# for production in grammar.productions(): bestp[production.lhs()]=0 +# bestp[grammar.start()] = 1.0 +# +# for i in range(len(grammar.productions())): +# for production in grammar.productions(): +# lhs = production.lhs() +# for elt in production.rhs(): +# bestp[elt] = max(bestp[lhs]*production.prob(), +# bestp.get(elt,0)) +# +# self._bestp = bestp +# for (k,v) in self._bestp.items(): print(k,v) +# +# def _sortkey(self, edge): +# return edge.structure()[PROB] * self._bestp[edge.lhs()] +# +# def sort_queue(self, queue, chart): +# queue.sort(key=self._sortkey) + + +class RandomChartParser(BottomUpProbabilisticChartParser): + """ + A bottom-up parser for ``PCFG`` grammars that tries edges in random order. + This sorting order results in a random search strategy. + """ + + # Inherit constructor + def sort_queue(self, queue, chart): + i = random.randint(0, len(queue) - 1) + (queue[-1], queue[i]) = (queue[i], queue[-1]) + + +class UnsortedChartParser(BottomUpProbabilisticChartParser): + """ + A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order. + """ + + # Inherit constructor + def sort_queue(self, queue, chart): + return + + +class LongestChartParser(BottomUpProbabilisticChartParser): + """ + A bottom-up parser for ``PCFG`` grammars that tries longer edges before + shorter ones. This sorting order results in a type of best-first + search strategy. + """ + + # Inherit constructor + def sort_queue(self, queue, chart): + queue.sort(key=lambda edge: edge.length()) + + +##////////////////////////////////////////////////////// +## Test Code +##////////////////////////////////////////////////////// + + +def demo(choice=None, draw_parses=None, print_parses=None): + """ + A demonstration of the probabilistic parsers. 
The user is + prompted to select which demo to run, and how many parses should + be found; and then each parser is run on the same demo, and a + summary of the results are displayed. + """ + import sys + import time + + from nltk import tokenize + from nltk.parse import pchart + + # Define two demos. Each demo has a sentence and a grammar. + toy_pcfg1 = PCFG.fromstring( + """ + S -> NP VP [1.0] + NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] + Det -> 'the' [0.8] | 'my' [0.2] + N -> 'man' [0.5] | 'telescope' [0.5] + VP -> VP PP [0.1] | V NP [0.7] | V [0.2] + V -> 'ate' [0.35] | 'saw' [0.65] + PP -> P NP [1.0] + P -> 'with' [0.61] | 'under' [0.39] + """ + ) + + toy_pcfg2 = PCFG.fromstring( + """ + S -> NP VP [1.0] + VP -> V NP [.59] + VP -> V [.40] + VP -> VP PP [.01] + NP -> Det N [.41] + NP -> Name [.28] + NP -> NP PP [.31] + PP -> P NP [1.0] + V -> 'saw' [.21] + V -> 'ate' [.51] + V -> 'ran' [.28] + N -> 'boy' [.11] + N -> 'cookie' [.12] + N -> 'table' [.13] + N -> 'telescope' [.14] + N -> 'hill' [.5] + Name -> 'Jack' [.52] + Name -> 'Bob' [.48] + P -> 'with' [.61] + P -> 'under' [.39] + Det -> 'the' [.41] + Det -> 'a' [.31] + Det -> 'my' [.28] + """ + ) + + demos = [ + ("I saw John with my telescope", toy_pcfg1), + ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), + ] + + if choice is None: + # Ask the user which demo they want to use. + print() + for i in range(len(demos)): + print(f"{i + 1:>3}: {demos[i][0]}") + print(" %r" % demos[i][1]) + print() + print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") + choice = int(sys.stdin.readline().strip()) - 1 + try: + sent, grammar = demos[choice] + except: + print("Bad sentence number") + return + + # Tokenize the sentence. + tokens = sent.split() + + # Define a list of parsers. We'll use all parsers. 
+ parsers = [ + pchart.InsideChartParser(grammar), + pchart.RandomChartParser(grammar), + pchart.UnsortedChartParser(grammar), + pchart.LongestChartParser(grammar), + pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser + ] + + # Run the parsers on the tokenized sentence. + times = [] + average_p = [] + num_parses = [] + all_parses = {} + for parser in parsers: + print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}") + parser.trace(3) + t = time.time() + parses = list(parser.parse(tokens)) + times.append(time.time() - t) + p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 + average_p.append(p) + num_parses.append(len(parses)) + for p in parses: + all_parses[p.freeze()] = 1 + + # Print some summary statistics + print() + print(" Parser Beam | Time (secs) # Parses Average P(parse)") + print("------------------------+------------------------------------------") + for i in range(len(parsers)): + print( + "%18s %4d |%11.4f%11d%19.14f" + % ( + parsers[i].__class__.__name__, + parsers[i].beam_size, + times[i], + num_parses[i], + average_p[i], + ) + ) + parses = all_parses.keys() + if parses: + p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) + else: + p = 0 + print("------------------------+------------------------------------------") + print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p)) + + if draw_parses is None: + # Ask the user if we should draw the parses. + print() + print("Draw parses (y/n)? ", end=" ") + draw_parses = sys.stdin.readline().strip().lower().startswith("y") + if draw_parses: + from nltk.draw.tree import draw_trees + + print(" please wait...") + draw_trees(*parses) + + if print_parses is None: + # Ask the user if we should print the parses. + print() + print("Print parses (y/n)? 
", end=" ") + print_parses = sys.stdin.readline().strip().lower().startswith("y") + if print_parses: + for parse in parses: + print(parse) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/projectivedependencyparser.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/projectivedependencyparser.py new file mode 100644 index 0000000000000000000000000000000000000000..6139d130f71370cb6ce4e81e07a10ec337b6afc1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/projectivedependencyparser.py @@ -0,0 +1,716 @@ +# Natural Language Toolkit: Dependency Grammars +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Jason Narad +# +# URL: +# For license information, see LICENSE.TXT +# + +from collections import defaultdict +from functools import total_ordering +from itertools import chain + +from nltk.grammar import ( + DependencyGrammar, + DependencyProduction, + ProbabilisticDependencyGrammar, +) +from nltk.internals import raise_unorderable_types +from nltk.parse.dependencygraph import DependencyGraph + +################################################################# +# Dependency Span +################################################################# + + +@total_ordering +class DependencySpan: + """ + A contiguous span over some part of the input string representing + dependency (head -> modifier) relationships amongst words. An atomic + span corresponds to only one word so it isn't a 'span' in the conventional + sense, as its _start_index = _end_index = _head_index for concatenation + purposes. All other spans are assumed to have arcs between all nodes + within the start and end indexes of the span, and one head index corresponding + to the head word for the entire span. This is the same as the root node if + the dependency structure were depicted as a graph. 
+ """ + + def __init__(self, start_index, end_index, head_index, arcs, tags): + self._start_index = start_index + self._end_index = end_index + self._head_index = head_index + self._arcs = arcs + self._tags = tags + self._comparison_key = (start_index, end_index, head_index, tuple(arcs)) + self._hash = hash(self._comparison_key) + + def head_index(self): + """ + :return: An value indexing the head of the entire ``DependencySpan``. + :rtype: int + """ + return self._head_index + + def __repr__(self): + """ + :return: A concise string representatino of the ``DependencySpan``. + :rtype: str. + """ + return "Span %d-%d; Head Index: %d" % ( + self._start_index, + self._end_index, + self._head_index, + ) + + def __str__(self): + """ + :return: A verbose string representation of the ``DependencySpan``. + :rtype: str + """ + str = "Span %d-%d; Head Index: %d" % ( + self._start_index, + self._end_index, + self._head_index, + ) + for i in range(len(self._arcs)): + str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i]) + return str + + def __eq__(self, other): + return ( + type(self) == type(other) and self._comparison_key == other._comparison_key + ) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, DependencySpan): + raise_unorderable_types("<", self, other) + return self._comparison_key < other._comparison_key + + def __hash__(self): + """ + :return: The hash value of this ``DependencySpan``. + """ + return self._hash + + +################################################################# +# Chart Cell +################################################################# + + +class ChartCell: + """ + A cell from the parse chart formed when performing the CYK algorithm. + Each cell keeps track of its x and y coordinates (though this will probably + be discarded), and a list of spans serving as the cell's entries. + """ + + def __init__(self, x, y): + """ + :param x: This cell's x coordinate. + :type x: int. 
+ :param y: This cell's y coordinate. + :type y: int. + """ + self._x = x + self._y = y + self._entries = set() + + def add(self, span): + """ + Appends the given span to the list of spans + representing the chart cell's entries. + + :param span: The span to add. + :type span: DependencySpan + """ + self._entries.add(span) + + def __str__(self): + """ + :return: A verbose string representation of this ``ChartCell``. + :rtype: str. + """ + return "CC[%d,%d]: %s" % (self._x, self._y, self._entries) + + def __repr__(self): + """ + :return: A concise string representation of this ``ChartCell``. + :rtype: str. + """ + return "%s" % self + + +################################################################# +# Parsing with Dependency Grammars +################################################################# + + +class ProjectiveDependencyParser: + """ + A projective, rule-based, dependency parser. A ProjectiveDependencyParser + is created with a DependencyGrammar, a set of productions specifying + word-to-word dependency relations. The parse() method will then + return the set of all parses, in tree representation, for a given input + sequence of tokens. Each parse must meet the requirements of the both + the grammar and the projectivity constraint which specifies that the + branches of the dependency tree are not allowed to cross. Alternatively, + this can be understood as stating that each parent node and its children + in the parse tree form a continuous substring of the input sequence. + """ + + def __init__(self, dependency_grammar): + """ + Create a new ProjectiveDependencyParser, from a word-to-word + dependency grammar ``DependencyGrammar``. + + :param dependency_grammar: A word-to-word relation dependencygrammar. 
+ :type dependency_grammar: DependencyGrammar + """ + self._grammar = dependency_grammar + + def parse(self, tokens): + """ + Performs a projective dependency parse on the list of tokens using + a chart-based, span-concatenation algorithm similar to Eisner (1996). + + :param tokens: The list of input tokens. + :type tokens: list(str) + :return: An iterator over parse trees. + :rtype: iter(Tree) + """ + self._tokens = list(tokens) + chart = [] + for i in range(0, len(self._tokens) + 1): + chart.append([]) + for j in range(0, len(self._tokens) + 1): + chart[i].append(ChartCell(i, j)) + if i == j + 1: + chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"])) + + for i in range(1, len(self._tokens) + 1): + for j in range(i - 2, -1, -1): + for k in range(i - 1, j, -1): + for span1 in chart[k][j]._entries: + for span2 in chart[i][k]._entries: + for newspan in self.concatenate(span1, span2): + chart[i][j].add(newspan) + + for parse in chart[len(self._tokens)][0]._entries: + conll_format = "" + # malt_format = "" + for i in range(len(tokens)): + # malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null') + # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-') + # Modify to comply with the new Dependency Graph requirement (at least must have an root elements) + conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( + i + 1, + tokens[i], + tokens[i], + "null", + "null", + "null", + parse._arcs[i] + 1, + "ROOT", + "-", + "-", + ) + dg = DependencyGraph(conll_format) + # if self.meets_arity(dg): + yield dg.tree() + + def concatenate(self, span1, span2): + """ + Concatenates the two spans in whichever way possible. This + includes rightward concatenation (from the leftmost word of the + leftmost span to the rightmost word of the rightmost span) and + leftward concatenation (vice-versa) between adjacent spans. 
Unlike + Eisner's presentation of span concatenation, these spans do not + share or pivot on a particular word/word-index. + + :return: A list of new spans formed through concatenation. + :rtype: list(DependencySpan) + """ + spans = [] + if span1._start_index == span2._start_index: + print("Error: Mismatched spans - replace this with thrown error") + if span1._start_index > span2._start_index: + temp_span = span1 + span1 = span2 + span2 = temp_span + # adjacent rightward covered concatenation + new_arcs = span1._arcs + span2._arcs + new_tags = span1._tags + span2._tags + if self._grammar.contains( + self._tokens[span1._head_index], self._tokens[span2._head_index] + ): + # print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)) + new_arcs[span2._head_index - span1._start_index] = span1._head_index + spans.append( + DependencySpan( + span1._start_index, + span2._end_index, + span1._head_index, + new_arcs, + new_tags, + ) + ) + # adjacent leftward covered concatenation + new_arcs = span1._arcs + span2._arcs + if self._grammar.contains( + self._tokens[span2._head_index], self._tokens[span1._head_index] + ): + # print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index)) + new_arcs[span1._head_index - span1._start_index] = span2._head_index + spans.append( + DependencySpan( + span1._start_index, + span2._end_index, + span2._head_index, + new_arcs, + new_tags, + ) + ) + return spans + + +################################################################# +# Parsing with Probabilistic Dependency Grammars +################################################################# + + +class ProbabilisticProjectiveDependencyParser: + """A probabilistic, projective dependency parser. + + This parser returns the most probable projective parse derived from the + probabilistic dependency grammar derived from the train() method. 
The + probabilistic model is an implementation of Eisner's (1996) Model C, which + conditions on head-word, head-tag, child-word, and child-tag. The decoding + uses a bottom-up chart-based span concatenation algorithm that's identical + to the one utilized by the rule-based projective parser. + + Usage example + + >>> from nltk.parse.dependencygraph import conll_data2 + + >>> graphs = [ + ... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry + ... ] + + >>> ppdp = ProbabilisticProjectiveDependencyParser() + >>> ppdp.train(graphs) + + >>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'] + >>> list(ppdp.parse(sent)) + [Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])] + + """ + + def __init__(self): + """ + Create a new probabilistic dependency parser. No additional + operations are necessary. + """ + + def parse(self, tokens): + """ + Parses the list of tokens subject to the projectivity constraint + and the productions in the parser's grammar. This uses a method + similar to the span-concatenation algorithm defined in Eisner (1996). + It returns the most probable parse derived from the parser's + probabilistic dependency grammar. + """ + self._tokens = list(tokens) + chart = [] + for i in range(0, len(self._tokens) + 1): + chart.append([]) + for j in range(0, len(self._tokens) + 1): + chart[i].append(ChartCell(i, j)) + if i == j + 1: + if tokens[i - 1] in self._grammar._tags: + for tag in self._grammar._tags[tokens[i - 1]]: + chart[i][j].add( + DependencySpan(i - 1, i, i - 1, [-1], [tag]) + ) + else: + print( + "No tag found for input token '%s', parse is impossible." 
+ % tokens[i - 1] + ) + return [] + for i in range(1, len(self._tokens) + 1): + for j in range(i - 2, -1, -1): + for k in range(i - 1, j, -1): + for span1 in chart[k][j]._entries: + for span2 in chart[i][k]._entries: + for newspan in self.concatenate(span1, span2): + chart[i][j].add(newspan) + trees = [] + max_parse = None + max_score = 0 + for parse in chart[len(self._tokens)][0]._entries: + conll_format = "" + malt_format = "" + for i in range(len(tokens)): + malt_format += "%s\t%s\t%d\t%s\n" % ( + tokens[i], + "null", + parse._arcs[i] + 1, + "null", + ) + # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-') + # Modify to comply with recent change in dependency graph such that there must be a ROOT element. + conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( + i + 1, + tokens[i], + tokens[i], + parse._tags[i], + parse._tags[i], + "null", + parse._arcs[i] + 1, + "ROOT", + "-", + "-", + ) + dg = DependencyGraph(conll_format) + score = self.compute_prob(dg) + trees.append((score, dg.tree())) + trees.sort() + return (tree for (score, tree) in trees) + + def concatenate(self, span1, span2): + """ + Concatenates the two spans in whichever way possible. This + includes rightward concatenation (from the leftmost word of the + leftmost span to the rightmost word of the rightmost span) and + leftward concatenation (vice-versa) between adjacent spans. Unlike + Eisner's presentation of span concatenation, these spans do not + share or pivot on a particular word/word-index. + + :return: A list of new spans formed through concatenation. 
+ :rtype: list(DependencySpan) + """ + spans = [] + if span1._start_index == span2._start_index: + print("Error: Mismatched spans - replace this with thrown error") + if span1._start_index > span2._start_index: + temp_span = span1 + span1 = span2 + span2 = temp_span + # adjacent rightward covered concatenation + new_arcs = span1._arcs + span2._arcs + new_tags = span1._tags + span2._tags + if self._grammar.contains( + self._tokens[span1._head_index], self._tokens[span2._head_index] + ): + new_arcs[span2._head_index - span1._start_index] = span1._head_index + spans.append( + DependencySpan( + span1._start_index, + span2._end_index, + span1._head_index, + new_arcs, + new_tags, + ) + ) + # adjacent leftward covered concatenation + new_arcs = span1._arcs + span2._arcs + new_tags = span1._tags + span2._tags + if self._grammar.contains( + self._tokens[span2._head_index], self._tokens[span1._head_index] + ): + new_arcs[span1._head_index - span1._start_index] = span2._head_index + spans.append( + DependencySpan( + span1._start_index, + span2._end_index, + span2._head_index, + new_arcs, + new_tags, + ) + ) + return spans + + def train(self, graphs): + """ + Trains a ProbabilisticDependencyGrammar based on the list of input + DependencyGraphs. This model is an implementation of Eisner's (1996) + Model C, which derives its statistics from head-word, head-tag, + child-word, and child-tag relationships. + + :param graphs: A list of dependency graphs to train from. 
+ :type: list(DependencyGraph) + """ + productions = [] + events = defaultdict(int) + tags = {} + for dg in graphs: + for node_index in range(1, len(dg.nodes)): + # children = dg.nodes[node_index]['deps'] + children = list( + chain.from_iterable(dg.nodes[node_index]["deps"].values()) + ) + + nr_left_children = dg.left_children(node_index) + nr_right_children = dg.right_children(node_index) + nr_children = nr_left_children + nr_right_children + for child_index in range( + 0 - (nr_left_children + 1), nr_right_children + 2 + ): + head_word = dg.nodes[node_index]["word"] + head_tag = dg.nodes[node_index]["tag"] + if head_word in tags: + tags[head_word].add(head_tag) + else: + tags[head_word] = {head_tag} + child = "STOP" + child_tag = "STOP" + prev_word = "START" + prev_tag = "START" + if child_index < 0: + array_index = child_index + nr_left_children + if array_index >= 0: + child = dg.nodes[children[array_index]]["word"] + child_tag = dg.nodes[children[array_index]]["tag"] + if child_index != -1: + prev_word = dg.nodes[children[array_index + 1]]["word"] + prev_tag = dg.nodes[children[array_index + 1]]["tag"] + if child != "STOP": + productions.append(DependencyProduction(head_word, [child])) + head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format( + child, + child_tag, + prev_tag, + head_word, + head_tag, + ) + mod_event = "(mods ({}, {}, {}) left))".format( + prev_tag, + head_word, + head_tag, + ) + events[head_event] += 1 + events[mod_event] += 1 + elif child_index > 0: + array_index = child_index + nr_left_children - 1 + if array_index < nr_children: + child = dg.nodes[children[array_index]]["word"] + child_tag = dg.nodes[children[array_index]]["tag"] + if child_index != 1: + prev_word = dg.nodes[children[array_index - 1]]["word"] + prev_tag = dg.nodes[children[array_index - 1]]["tag"] + if child != "STOP": + productions.append(DependencyProduction(head_word, [child])) + head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format( + child, + child_tag, 
def compute_prob(self, dg):
    """
    Computes the probability of a dependency graph based
    on the parser's probability model (defined by the parser's
    statistical dependency grammar).

    For every head node, the modifiers on each side are walked outward
    from the head, finishing with a ``STOP`` pseudo-child.  Each step
    contributes ``count(head_event) / count(mod_event)``, where both
    counts are read from ``self._grammar._events``.

    :param dg: A dependency graph to score.
    :type dg: DependencyGraph
    :return: The probability of the dependency graph.
    :rtype: float
    """
    prob = 1.0
    for node_index in range(1, len(dg.nodes)):
        head = dg.nodes[node_index]
        # children = dg.nodes[node_index]['deps']
        children = list(chain.from_iterable(head["deps"].values()))

        nr_left_children = dg.left_children(node_index)
        nr_right_children = dg.right_children(node_index)
        nr_children = nr_left_children + nr_right_children

        # The head does not change while its modifiers are enumerated.
        head_word = head["word"]
        head_tag = head["tag"]

        for child_index in range(-(nr_left_children + 1), nr_right_children + 2):
            if child_index == 0:
                # Position 0 is the head itself; it generates no event.
                continue

            if child_index < 0:
                side = "left"
                array_index = child_index + nr_left_children
                has_child = array_index >= 0
                # The previously generated modifier lies one step closer
                # to the head; the innermost modifier has none (START).
                prev_index = array_index + 1 if child_index != -1 else None
            else:
                side = "right"
                array_index = child_index + nr_left_children - 1
                has_child = array_index < nr_children
                prev_index = array_index - 1 if child_index != 1 else None

            child = "STOP"
            child_tag = "STOP"
            prev_tag = "START"
            if has_child:
                child_node = dg.nodes[children[array_index]]
                child = child_node["word"]
                child_tag = child_node["tag"]
            if prev_index is not None:
                prev_tag = dg.nodes[children[prev_index]]["tag"]

            head_event = "(head ({} {}) (mods ({}, {}, {}) {}))".format(
                child,
                child_tag,
                prev_tag,
                head_word,
                head_tag,
                side,
            )
            mod_event = "(mods ({}, {}, {}) {}))".format(
                prev_tag,
                head_word,
                head_tag,
                side,
            )
            h_count = self._grammar._events[head_event]
            m_count = self._grammar._events[mod_event]

            if m_count != 0:
                prob *= h_count / m_count
            else:
                # The grammar does not cover this event.
                # NOTE(review): this *replaces* the running product rather
                # than scaling it; kept as-is to preserve existing scores.
                prob = 0.00000001  # Very small number

    return prob
Each DependencyProduction") + print("specifies a relationship between one head word and only one") + print("modifier word.") + grammar = DependencyGrammar.fromstring( + """ + 'fell' -> 'price' | 'stock' + 'price' -> 'of' | 'the' + 'of' -> 'stock' + 'stock' -> 'the' + """ + ) + print(grammar) + + print() + print("For the sentence 'The price of the stock fell', this grammar") + print("will produce the following three parses:") + pdp = ProjectiveDependencyParser(grammar) + trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) + for tree in trees: + print(tree) + + print() + print("By contrast, the following grammar contains a ") + print("DependencyProduction that specifies a relationship") + print("between a single head word, 'price', and two modifier") + print("words, 'of' and 'the'.") + grammar = DependencyGrammar.fromstring( + """ + 'fell' -> 'price' | 'stock' + 'price' -> 'of' 'the' + 'of' -> 'stock' + 'stock' -> 'the' + """ + ) + print(grammar) + + print() + print( + "This constrains the number of possible parses to just one:" + ) # unimplemented, soon to replace + pdp = ProjectiveDependencyParser(grammar) + trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) + for tree in trees: + print(tree) + + +def projective_prob_parse_demo(): + """ + A demo showing the training and use of a projective + dependency parser. 
+ """ + from nltk.parse.dependencygraph import conll_data2 + + graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] + ppdp = ProbabilisticProjectiveDependencyParser() + print("Training Probabilistic Projective Dependency Parser...") + ppdp.train(graphs) + + sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."] + print("Parsing '", " ".join(sent), "'...") + print("Parse:") + for tree in ppdp.parse(sent): + print(tree) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/probability.py b/.eggs/nltk-3.8-py3.10.egg/nltk/probability.py new file mode 100644 index 0000000000000000000000000000000000000000..0823cd8c7d2c5208412a99aba81bb908a100418b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/probability.py @@ -0,0 +1,2578 @@ +# Natural Language Toolkit: Probability and Statistics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird (additions) +# Trevor Cohn (additions) +# Peter Ljunglöf (additions) +# Liang Dong (additions) +# Geoffrey Sampson (additions) +# Ilia Kurenkov (additions) +# +# URL: +# For license information, see LICENSE.TXT + +""" +Classes for representing and processing probabilistic information. + +The ``FreqDist`` class is used to encode "frequency distributions", +which count the number of times that each outcome of an experiment +occurs. + +The ``ProbDistI`` class defines a standard interface for "probability +distributions", which encode the probability of each outcome for an +experiment. There are two types of probability distribution: + + - "derived probability distributions" are created from frequency + distributions. They attempt to model the probability distribution + that generated the frequency distribution. + - "analytic probability distributions" are created directly from + parameters (such as variance). + +The ``ConditionalFreqDist`` class and ``ConditionalProbDistI`` interface +are used to encode conditional distributions. 
class FreqDist(Counter):
    """
    A frequency distribution for the outcomes of an experiment.  A
    frequency distribution records the number of times each outcome of
    an experiment has occurred.  For example, a frequency distribution
    could be used to record the frequency of each word type in a
    document.  Formally, a frequency distribution can be defined as a
    function mapping from each sample to the number of times that
    sample occurred as an outcome.

    Frequency distributions are generally constructed by running a
    number of experiments, and incrementing the count for a sample
    every time it is an outcome of an experiment.  For example, the
    following code will produce a frequency distribution that encodes
    how often each word occurs in a text:

        >>> from nltk.tokenize import word_tokenize
        >>> from nltk.probability import FreqDist
        >>> sent = 'This is an example sentence'
        >>> fdist = FreqDist()
        >>> for word in word_tokenize(sent):
        ...    fdist[word.lower()] += 1

    An equivalent way to do this is with the initializer:

        >>> fdist = FreqDist(word.lower() for word in word_tokenize(sent))

    """

    def __init__(self, samples=None):
        """
        Construct a new frequency distribution.  If ``samples`` is
        given, then the frequency distribution will be initialized
        with the count of each object in ``samples``; otherwise, it
        will be initialized to be empty.

        In particular, ``FreqDist()`` returns an empty frequency
        distribution; and ``FreqDist(samples)`` first creates an empty
        frequency distribution, and then calls ``update`` with the
        list ``samples``.

        :param samples: The samples to initialize the frequency
            distribution with.
        :type samples: Sequence
        """
        Counter.__init__(self, samples)

        # Cached number of samples in this FreqDist
        self._N = None

    def N(self):
        """
        Return the total number of sample outcomes that have been
        recorded by this FreqDist.  For the number of unique
        sample values (or bins) with counts greater than zero, use
        ``FreqDist.B()``.

        :rtype: int
        """
        if self._N is None:
            # Not already cached, or cache has been invalidated
            self._N = sum(self.values())
        return self._N

    def __setitem__(self, key, val):
        """
        Override ``Counter.__setitem__()`` to invalidate the cached N
        """
        self._N = None
        super().__setitem__(key, val)

    def __delitem__(self, key):
        """
        Override ``Counter.__delitem__()`` to invalidate the cached N
        """
        self._N = None
        super().__delitem__(key)

    def update(self, *args, **kwargs):
        """
        Override ``Counter.update()`` to invalidate the cached N
        """
        self._N = None
        super().update(*args, **kwargs)

    def setdefault(self, key, val):
        """
        Override ``Counter.setdefault()`` to invalidate the cached N

        :return: The count stored for ``key`` after the call, exactly as
            ``dict.setdefault`` returns it.
        """
        self._N = None
        # Propagate the mapping's return value; callers of ``setdefault``
        # rely on getting the stored value back.
        return super().setdefault(key, val)

    def B(self):
        """
        Return the total number of sample values (or "bins") that
        have counts greater than zero.  For the total
        number of sample outcomes recorded, use ``FreqDist.N()``.
        (FreqDist.B() is the same as len(FreqDist).)

        :rtype: int
        """
        return len(self)

    def hapaxes(self):
        """
        Return a list of all samples that occur once (hapax legomena)

        :rtype: list
        """
        return [item for item in self if self[item] == 1]

    def Nr(self, r, bins=None):
        """
        Return Nr, the number of samples with frequency ``r``.

        :param r: The frequency to look up.
        :type r: int
        :param bins: Forwarded to ``r_Nr``; only affects ``Nr(0)``.
        :type bins: int
        :rtype: int
        """
        return self.r_Nr(bins)[r]

    def r_Nr(self, bins=None):
        """
        Return the dictionary mapping r to Nr, the number of samples with
        frequency r, where Nr > 0.

        :type bins: int
        :param bins: The number of possible sample outcomes.  ``bins``
            is used to calculate Nr(0).  In particular, Nr(0) is
            ``bins-self.B()``.  If ``bins`` is not specified, it
            defaults to ``self.B()`` (so Nr(0) will be 0).
        :rtype: defaultdict(int)
        """

        _r_Nr = defaultdict(int)
        for count in self.values():
            _r_Nr[count] += 1

        # Special case for Nr[0]:
        _r_Nr[0] = bins - self.B() if bins is not None else 0

        return _r_Nr

    def _cumulative_frequencies(self, samples):
        """
        Yield the running total of the counts of the specified samples,
        in the order the samples are given.

        :param samples: the samples whose frequencies should be returned.
        :type samples: any
        :rtype: iter(float)
        """
        cf = 0.0
        for sample in samples:
            cf += self[sample]
            yield cf

    # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs,
    # here, freq() does probs
    def freq(self, sample):
        """
        Return the frequency of a given sample.  The frequency of a
        sample is defined as the count of that sample divided by the
        total number of sample outcomes that have been recorded by
        this FreqDist.  The count of a sample is defined as the
        number of times that sample outcome was recorded by this
        FreqDist.  Frequencies are always real numbers in the range
        [0, 1].

        :param sample: the sample whose frequency
               should be returned.
        :type sample: any
        :rtype: float
        """
        n = self.N()
        if n == 0:
            # An empty distribution has no outcomes to divide by.
            return 0
        return self[sample] / n

    def max(self):
        """
        Return the sample with the greatest number of outcomes in this
        frequency distribution.  If two or more samples have the same
        number of outcomes, return one of them; which sample is
        returned is undefined.

        :return: The sample with the maximum number of outcomes in this
                frequency distribution.
        :rtype: any
        :raise ValueError: If no outcomes have occurred in this
            frequency distribution.
        """
        if len(self) == 0:
            raise ValueError(
                "A FreqDist must have at least one sample before max is defined."
            )
        return self.most_common(1)[0][0]

    def plot(
        self, *args, title="", cumulative=False, percents=False, show=True, **kwargs
    ):
        """
        Plot samples from the frequency distribution
        displaying the most frequent sample first.  If an integer
        parameter is supplied, stop after this many samples have been
        plotted.  For a cumulative plot, specify cumulative=True. Additional
        ``**kwargs`` are passed to matplotlib's plot function.
        (Requires Matplotlib to be installed.)

        :param title: The title for the graph.
        :type title: str
        :param cumulative: Whether the plot is cumulative. (default = False)
        :type cumulative: bool
        :param percents: Whether the plot uses percents instead of counts. (default = False)
        :type percents: bool
        :param show: Whether to show the plot, or only return the ax.
        :type show: bool
        :return: The matplotlib axes the data was drawn on.
        :raise ValueError: If matplotlib cannot be imported.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as e:
            raise ValueError(
                "The plot function requires matplotlib to be installed. "
                "See https://matplotlib.org/"
            ) from e

        if len(args) == 0:
            args = [len(self)]
        samples = [item for item, _ in self.most_common(*args)]

        if cumulative:
            freqs = list(self._cumulative_frequencies(samples))
            ylabel = "Cumulative "
        else:
            freqs = [self[sample] for sample in samples]
            ylabel = ""

        if percents:
            freqs = [f / self.N() * 100 for f in freqs]
            ylabel += "Percents"
        else:
            ylabel += "Counts"

        ax = plt.gca()
        ax.grid(True, color="silver")

        if "linewidth" not in kwargs:
            kwargs["linewidth"] = 2
        if title:
            ax.set_title(title)

        ax.plot(freqs, **kwargs)
        ax.set_xticks(range(len(samples)))
        ax.set_xticklabels([str(s) for s in samples], rotation=90)
        ax.set_xlabel("Samples")
        ax.set_ylabel(ylabel)

        if show:
            plt.show()

        return ax

    def tabulate(self, *args, **kwargs):
        """
        Tabulate the given samples from the frequency distribution (cumulative),
        displaying the most frequent sample first.  If an integer
        parameter is supplied, stop after this many samples have been
        plotted.

        :param samples: The samples to plot (default is all samples)
        :type samples: list
        :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
        :type cumulative: bool
        """
        if len(args) == 0:
            args = [len(self)]
        samples = _get_kwarg(
            kwargs, "samples", [item for item, _ in self.most_common(*args)]
        )

        cumulative = _get_kwarg(kwargs, "cumulative", False)
        if cumulative:
            freqs = list(self._cumulative_frequencies(samples))
        else:
            freqs = [self[sample] for sample in samples]
        # percents = [f * 100 for f in freqs]  only in ProbDist?

        # ``default=0`` keeps an empty distribution (or an empty ``samples``
        # list) from failing with "max() arg is an empty sequence".
        width = max((len(f"{s}") for s in samples), default=0)
        width = max(width, max((len("%d" % f) for f in freqs), default=0))

        for sample in samples:
            print("%*s" % (width, sample), end=" ")
        print()
        for freq in freqs:
            print("%*d" % (width, freq), end=" ")
        print()

    def copy(self):
        """
        Create a copy of this frequency distribution.

        :rtype: FreqDist
        """
        return self.__class__(self)

    # Mathematical operators

    def __add__(self, other):
        """
        Add counts from two counters.

        >>> FreqDist('abbb') + FreqDist('bcc')
        FreqDist({'b': 4, 'c': 2, 'a': 1})

        """
        return self.__class__(super().__add__(other))

    def __sub__(self, other):
        """
        Subtract count, but keep only results with positive counts.

        >>> FreqDist('abbbc') - FreqDist('bccd')
        FreqDist({'b': 2, 'a': 1})

        """
        return self.__class__(super().__sub__(other))

    def __or__(self, other):
        """
        Union is the maximum of value in either of the input counters.

        >>> FreqDist('abbb') | FreqDist('bcc')
        FreqDist({'b': 3, 'c': 2, 'a': 1})

        """
        return self.__class__(super().__or__(other))

    def __and__(self, other):
        """
        Intersection is the minimum of corresponding counts.

        >>> FreqDist('abbb') & FreqDist('bcc')
        FreqDist({'b': 1})

        """
        return self.__class__(super().__and__(other))

    def __le__(self, other):
        """
        Returns True if this frequency distribution is a subset of the other
        and for no key the value exceeds the value of the same key from
        the other frequency distribution.

        The <= operator forms partial order and satisfying the axioms
        reflexivity, antisymmetry and transitivity.

        >>> FreqDist('a') <= FreqDist('a')
        True
        >>> a = FreqDist('abc')
        >>> b = FreqDist('aabc')
        >>> (a <= b, b <= a)
        (True, False)
        >>> FreqDist('a') <= FreqDist('abcd')
        True
        >>> FreqDist('abc') <= FreqDist('xyz')
        False
        >>> FreqDist('xyz') <= FreqDist('abc')
        False
        >>> c = FreqDist('a')
        >>> d = FreqDist('aa')
        >>> e = FreqDist('aaa')
        >>> c <= d and d <= e and c <= e
        True
        """
        if not isinstance(other, FreqDist):
            raise_unorderable_types("<=", self, other)
        return set(self).issubset(other) and all(
            self[key] <= other[key] for key in self
        )

    def __ge__(self, other):
        """
        Return True if every sample of ``other`` occurs in this
        distribution with a count at least as large.
        """
        if not isinstance(other, FreqDist):
            raise_unorderable_types(">=", self, other)
        return set(self).issuperset(other) and all(
            self[key] >= other[key] for key in other
        )

    def __lt__(self, other):
        """Strict version of ``<=``: a subset that is not equal."""
        return self <= other and not self == other

    def __gt__(self, other):
        """Strict version of ``>=``: a superset that is not equal."""
        return self >= other and not self == other

    def __repr__(self):
        """
        Return a string representation of this FreqDist.

        :rtype: string
        """
        return self.pformat()

    def pprint(self, maxlen=10, stream=None):
        """
        Print a string representation of this FreqDist to 'stream'

        :param maxlen: The maximum number of items to print
        :type maxlen: int
        :param stream: The stream to print to. stdout by default
        """
        print(self.pformat(maxlen=maxlen), file=stream)

    def pformat(self, maxlen=10):
        """
        Return a string representation of this FreqDist.

        :param maxlen: The maximum number of items to display
        :type maxlen: int
        :rtype: string
        """
        items = ["{!r}: {!r}".format(*item) for item in self.most_common(maxlen)]
        if len(self) > maxlen:
            items.append("...")
        return "FreqDist({{{0}}})".format(", ".join(items))

    def __str__(self):
        """
        Return a string representation of this FreqDist.

        :rtype: string
        """
        # The format string must carry one ``%d`` per argument; an empty
        # template makes ``%`` raise TypeError.
        return "<FreqDist with %d samples and %d outcomes>" % (len(self), self.N())

    def __iter__(self):
        """
        Return an iterator which yields tokens ordered by frequency.

        :rtype: iterator
        """
        for token, _ in self.most_common(self.B()):
            yield token
+ + :rtype: any + """ + + @abstractmethod + def samples(self): + """ + Return a list of all samples that have nonzero probabilities. + Use ``prob`` to find the probability of each sample. + + :rtype: list + """ + + # cf self.SUM_TO_ONE + def discount(self): + """ + Return the ratio by which counts are discounted on average: c*/c + + :rtype: float + """ + return 0.0 + + # Subclasses should define more efficient implementations of this, + # where possible. + def generate(self): + """ + Return a randomly selected sample from this probability distribution. + The probability of returning each sample ``samp`` is equal to + ``self.prob(samp)``. + """ + p = random.random() + p_init = p + for sample in self.samples(): + p -= self.prob(sample) + if p <= 0: + return sample + # allow for some rounding error: + if p < 0.0001: + return sample + # we *should* never get here + if self.SUM_TO_ONE: + warnings.warn( + "Probability distribution %r sums to %r; generate()" + " is returning an arbitrary sample." % (self, p_init - p) + ) + return random.choice(list(self.samples())) + + +class UniformProbDist(ProbDistI): + """ + A probability distribution that assigns equal probability to each + sample in a given set; and a zero probability to all other + samples. + """ + + def __init__(self, samples): + """ + Construct a new uniform probability distribution, that assigns + equal probability to each sample in ``samples``. + + :param samples: The samples that should be given uniform + probability. + :type samples: list + :raise ValueError: If ``samples`` is empty. + """ + if len(samples) == 0: + raise ValueError( + "A Uniform probability distribution must " + "have at least one sample." 
class DictionaryProbDist(ProbDistI):
    """
    A probability distribution whose probabilities are directly
    specified by a given dictionary.  The given dictionary maps
    samples to probabilities.
    """

    def __init__(self, prob_dict=None, log=False, normalize=False):
        """
        Construct a new probability distribution from the given
        dictionary, which maps values to probabilities (or to log
        probabilities, if ``log`` is true).  If ``normalize`` is
        true, then the probability values are scaled by a constant
        factor such that they sum to 1.

        If called without arguments, the resulting probability
        distribution assigns zero probability to all values.

        :param prob_dict: Mapping from samples to (log) probabilities.
            It is copied, so the caller's dictionary is never modified.
        :type prob_dict: dict
        :param log: True if the values of ``prob_dict`` are base 2 log
            probabilities.
        :type log: bool
        :param normalize: True to rescale the values so they sum to 1.
        :type normalize: bool
        :raise ValueError: If ``normalize`` is true and there are no
            samples (including when ``prob_dict`` is omitted).
        """

        self._prob_dict = prob_dict.copy() if prob_dict is not None else {}
        self._log = log

        # Normalize the distribution, if requested.
        if normalize:
            # Work on the private copy throughout: ``prob_dict`` may be
            # None, in which case ``len(prob_dict)`` would raise TypeError
            # instead of the documented ValueError.
            num_samples = len(self._prob_dict)
            if num_samples == 0:
                raise ValueError(
                    "A DictionaryProbDist must have at least one sample "
                    + "before it can be normalized."
                )
            if log:
                value_sum = sum_logs(list(self._prob_dict.values()))
                if value_sum <= _NINF:
                    # All mass is (effectively) zero: fall back to uniform.
                    logp = math.log(1.0 / num_samples, 2)
                    for x in self._prob_dict:
                        self._prob_dict[x] = logp
                else:
                    # Dividing by the total is a subtraction in log space.
                    for x in self._prob_dict:
                        self._prob_dict[x] -= value_sum
            else:
                value_sum = sum(self._prob_dict.values())
                if value_sum == 0:
                    # All mass is zero: fall back to uniform.
                    p = 1.0 / num_samples
                    for x in self._prob_dict:
                        self._prob_dict[x] = p
                else:
                    norm_factor = 1.0 / value_sum
                    for x in self._prob_dict:
                        self._prob_dict[x] *= norm_factor

    def prob(self, sample):
        """
        Return the probability of ``sample``; samples that are not in
        the dictionary have probability 0.

        :rtype: float
        """
        if self._log:
            return 2 ** (self._prob_dict[sample]) if sample in self._prob_dict else 0
        else:
            return self._prob_dict.get(sample, 0)

    def logprob(self, sample):
        """
        Return the base 2 logarithm of the probability of ``sample``;
        unknown or zero-probability samples map to ``_NINF``.

        :rtype: float
        """
        if self._log:
            return self._prob_dict.get(sample, _NINF)
        else:
            if sample not in self._prob_dict:
                return _NINF
            elif self._prob_dict[sample] == 0:
                return _NINF
            else:
                return math.log(self._prob_dict[sample], 2)

    def max(self):
        """
        Return the sample with the greatest stored value.  The result is
        computed once and cached on the instance.
        """
        if not hasattr(self, "_max"):
            self._max = max((p, v) for (v, p) in self._prob_dict.items())[1]
        return self._max

    def samples(self):
        """
        Return the samples (dictionary keys) of this distribution.
        """
        return self._prob_dict.keys()

    def __repr__(self):
        """
        Return a string representation of this ``ProbDist``.

        :rtype: str
        """
        # An empty template makes ``%`` raise TypeError; the template
        # needs a ``%d`` for the sample count.
        return "<ProbDist with %d samples>" % len(self._prob_dict)
+ + :type freqdist: FreqDist + :param freqdist: The frequency distribution that the + probability estimates should be based on. + :type gamma: float + :param gamma: A real number used to parameterize the + estimate. The Lidstone estimate is equivalent to adding + *gamma* to the count for each bin, and taking the + maximum likelihood estimate of the resulting frequency + distribution. + :type bins: int + :param bins: The number of sample values that can be generated + by the experiment that is described by the probability + distribution. This value must be correctly set for the + probabilities of the sample values to sum to one. If + ``bins`` is not specified, it defaults to ``freqdist.B()``. + """ + if (bins == 0) or (bins is None and freqdist.N() == 0): + name = self.__class__.__name__[:-8] + raise ValueError( + "A %s probability distribution " % name + "must have at least one bin." + ) + if (bins is not None) and (bins < freqdist.B()): + name = self.__class__.__name__[:-8] + raise ValueError( + "\nThe number of bins in a %s distribution " % name + + "(%d) must be greater than or equal to\n" % bins + + "the number of bins in the FreqDist used " + + "to create it (%d)." % freqdist.B() + ) + + self._freqdist = freqdist + self._gamma = float(gamma) + self._N = self._freqdist.N() + + if bins is None: + bins = freqdist.B() + self._bins = bins + + self._divisor = self._N + bins * gamma + if self._divisor == 0.0: + # In extreme cases we force the probability to be 0, + # which it will be, since the count will be 0: + self._gamma = 0 + self._divisor = 1 + + def freqdist(self): + """ + Return the frequency distribution that this probability + distribution is based on. 
+ + :rtype: FreqDist + """ + return self._freqdist + + def prob(self, sample): + c = self._freqdist[sample] + return (c + self._gamma) / self._divisor + + def max(self): + # For Lidstone distributions, probability is monotonic with + # frequency, so the most probable sample is the one that + # occurs most frequently. + return self._freqdist.max() + + def samples(self): + return self._freqdist.keys() + + def discount(self): + gb = self._gamma * self._bins + return gb / (self._N + gb) + + def __repr__(self): + """ + Return a string representation of this ``ProbDist``. + + :rtype: str + """ + return "" % self._freqdist.N() + + +class LaplaceProbDist(LidstoneProbDist): + """ + The Laplace estimate for the probability distribution of the + experiment used to generate a frequency distribution. The + "Laplace estimate" approximates the probability of a sample with + count *c* from an experiment with *N* outcomes and *B* bins as + *(c+1)/(N+B)*. This is equivalent to adding one to the count for + each bin, and taking the maximum likelihood estimate of the + resulting frequency distribution. + """ + + def __init__(self, freqdist, bins=None): + """ + Use the Laplace estimate to create a probability distribution + for the experiment used to generate ``freqdist``. + + :type freqdist: FreqDist + :param freqdist: The frequency distribution that the + probability estimates should be based on. + :type bins: int + :param bins: The number of sample values that can be generated + by the experiment that is described by the probability + distribution. This value must be correctly set for the + probabilities of the sample values to sum to one. If + ``bins`` is not specified, it defaults to ``freqdist.B()``. + """ + LidstoneProbDist.__init__(self, freqdist, 1, bins) + + def __repr__(self): + """ + :rtype: str + :return: A string representation of this ``ProbDist``. 
class HeldoutProbDist(ProbDistI):
    """
    The heldout estimate for the probability distribution of the
    experiment used to generate two frequency distributions.  These
    two frequency distributions are called the "heldout frequency
    distribution" and the "base frequency distribution."  The
    "heldout estimate" uses the "heldout frequency distribution" to
    predict the probability of each sample, given its frequency in
    the "base frequency distribution".

    In particular, the heldout estimate approximates the probability
    for a sample that occurs *r* times in the base distribution as
    the average frequency in the heldout distribution of all samples
    that occur *r* times in the base distribution.

    This average frequency is *Tr[r]/(Nr[r].N)*, where:

    - *Tr[r]* is the total count in the heldout distribution for
      all samples that occur *r* times in the base distribution.
    - *Nr[r]* is the number of samples that occur *r* times in
      the base distribution.
    - *N* is the number of outcomes recorded by the heldout
      frequency distribution.

    In order to increase the efficiency of the ``prob`` member
    function, *Tr[r]/(Nr[r].N)* is precomputed for each value of *r*
    when the ``HeldoutProbDist`` is created.

    :type _estimate: list(float or None)
    :ivar _estimate: A list mapping from *r*, the number of
        times that a sample occurs in the base distribution, to the
        probability estimate for that sample.  ``_estimate[r]`` is
        calculated by finding the average frequency in the heldout
        distribution of all samples that occur *r* times in the base
        distribution.  In particular, ``_estimate[r]`` =
        *Tr[r]/(Nr[r].N)*; it is ``None`` when *Nr[r]* is zero.
    :type _max_r: int
    :ivar _max_r: The maximum number of times that any sample occurs
        in the base distribution.  ``_max_r`` is used to decide how
        large ``_estimate`` must be.
    """

    SUM_TO_ONE = False

    def __init__(self, base_fdist, heldout_fdist, bins=None):
        """
        Use the heldout estimate to create a probability distribution
        for the experiment used to generate ``base_fdist`` and
        ``heldout_fdist``.

        :type base_fdist: FreqDist
        :param base_fdist: The base frequency distribution.
        :type heldout_fdist: FreqDist
        :param heldout_fdist: The heldout frequency distribution.
        :type bins: int
        :param bins: The number of sample values that can be generated
            by the experiment that is described by the probability
            distribution.  It is passed unchanged to
            ``base_fdist.r_Nr()``, which uses it to work out *Nr[0]*.
        """

        self._base_fdist = base_fdist
        self._heldout_fdist = heldout_fdist

        # The max number of times any sample occurs in base_fdist.
        self._max_r = base_fdist[base_fdist.max()]

        # Calculate Tr, Nr, and N.
        Tr = self._calculate_Tr()
        r_Nr = base_fdist.r_Nr(bins)
        Nr = [r_Nr[r] for r in range(self._max_r + 1)]
        N = heldout_fdist.N()

        # Use Tr, Nr, and N to compute the probability estimate for
        # each value of r.
        self._estimate = self._calculate_estimate(Tr, Nr, N)

    def _calculate_Tr(self):
        """
        Return the list *Tr*, where *Tr[r]* is the total count in
        ``heldout_fdist`` for all samples that occur *r*
        times in ``base_fdist``.

        :rtype: list(float)
        """
        Tr = [0.0] * (self._max_r + 1)
        for sample in self._heldout_fdist:
            r = self._base_fdist[sample]
            Tr[r] += self._heldout_fdist[sample]
        return Tr

    def _calculate_estimate(self, Tr, Nr, N):
        """
        Return the list *estimate*, where *estimate[r]* is the probability
        estimate for any sample that occurs *r* times in the base frequency
        distribution.  In particular, *estimate[r]* is *Tr[r]/(Nr[r].N)*.
        When *Nr[r]=0* there is no sample with that count, so
        *estimate[r]* is ``None``.  When the heldout distribution is
        empty (*N=0*) it offers no evidence, so the estimate is 0.0
        rather than a division by zero.

        :rtype: list(float or None)
        :type Tr: list(float)
        :param Tr: the list *Tr*, where *Tr[r]* is the total count in
            the heldout distribution for all samples that occur *r*
            times in base distribution.
        :type Nr: list(float)
        :param Nr: The list *Nr*, where *Nr[r]* is the number of
            samples that occur *r* times in the base distribution.
        :type N: int
        :param N: The total number of outcomes recorded by the heldout
            frequency distribution.
        """
        estimate = []
        for r in range(self._max_r + 1):
            if Nr[r] == 0:
                estimate.append(None)
            elif N == 0:
                estimate.append(0.0)
            else:
                estimate.append(Tr[r] / (Nr[r] * N))
        return estimate

    def base_fdist(self):
        """
        Return the base frequency distribution that this probability
        distribution is based on.

        :rtype: FreqDist
        """
        return self._base_fdist

    def heldout_fdist(self):
        """
        Return the heldout frequency distribution that this
        probability distribution is based on.

        :rtype: FreqDist
        """
        return self._heldout_fdist

    def samples(self):
        return self._base_fdist.keys()

    def prob(self, sample):
        """
        Return the precomputed heldout estimate for ``sample``.

        :rtype: float
        """
        r = self._base_fdist[sample]
        estimate = self._estimate[r]
        # The slot is None when Nr[r] was zero (presumably the r == 0 case
        # when r_Nr() reports no unseen bins -- confirm against
        # FreqDist.r_Nr).  Report that as zero probability so callers that
        # sum or average probabilities never receive None.
        return 0.0 if estimate is None else estimate

    def max(self):
        # Note: the Heldout estimation is *not* necessarily monotonic;
        # so this implementation is currently broken.  However, it
        # should give the right answer *most* of the time.  :)
        return self._base_fdist.max()

    def discount(self):
        raise NotImplementedError()

    def __repr__(self):
        """
        :rtype: str
        :return: A string representation of this ``ProbDist``.
        """
        s = "<HeldoutProbDist: %d base samples; %d heldout samples>"
        return s % (self._base_fdist.N(), self._heldout_fdist.N())
class WittenBellProbDist(ProbDistI):
    """
    The Witten-Bell estimate of a probability distribution.  This distribution
    allocates uniform probability mass to as yet unseen events by using the
    number of distinct event types that have been observed.  The probability
    mass reserved for unseen events is equal to *T / (N + T)*
    where *T* is the number of observed event types and *N* is the total
    number of observed events.  This equates to the maximum likelihood estimate
    of a new type event occurring.  The remaining probability mass is discounted
    such that all probability estimates sum to one, yielding:

        - *p = T / Z (N + T)*, if count = 0
        - *p = c / (N + T)*, otherwise
    """

    def __init__(self, freqdist, bins=None):
        """
        Creates a distribution of Witten-Bell probability estimates.  This
        distribution allocates uniform probability mass to as yet unseen
        events by using the number of distinct event types that have been
        observed.  The probability mass reserved for unseen events is equal
        to *T / (N + T)* where *T* is the number of observed event types and
        *N* is the total number of observed events.  This equates to the
        maximum likelihood estimate of a new type event occurring.  The
        remaining probability mass is discounted such that all probability
        estimates sum to one, yielding:

            - *p = T / Z (N + T)*, if count = 0
            - *p = c / (N + T)*, otherwise

        The parameters *T* and *N* are taken from the ``freqdist`` parameter
        (the ``B()`` and ``N()`` values).  The normalizing factor *Z* is
        the number of unseen event types, ``bins - freqdist.B()``.  When
        *Z* is zero there are no unseen types to receive the reserved mass,
        so unseen samples get probability 0.0 (and the seen estimates then
        sum to *N / (N + T)* rather than one).

        :param freqdist: The frequency counts upon which to base the
            estimation.
        :type freqdist: FreqDist
        :param bins: The number of possible event types.  This must be at least
            as large as the number of bins in the ``freqdist``.  If None, then
            it's assumed to be equal to that of the ``freqdist``
        :type bins: int
        """
        assert bins is None or bins >= freqdist.B(), (
            "bins parameter must not be less than %d=freqdist.B()" % freqdist.B()
        )
        if bins is None:
            bins = freqdist.B()
        self._freqdist = freqdist
        self._T = self._freqdist.B()
        self._Z = bins - self._freqdist.B()
        self._N = self._freqdist.N()
        # self._P0 is P(0), precalculated for efficiency:
        if self._Z == 0:
            # No unseen event types exist (this is the case for the default
            # ``bins=None``); dividing by Z would raise ZeroDivisionError.
            self._P0 = 0.0
        elif self._N == 0:
            # if freqdist is empty, we approximate P(0) by a UniformProbDist:
            self._P0 = 1.0 / self._Z
        else:
            self._P0 = self._T / (self._Z * (self._N + self._T))

    def prob(self, sample):
        # inherit docs from ProbDistI
        c = self._freqdist[sample]
        return c / (self._N + self._T) if c != 0 else self._P0

    def max(self):
        return self._freqdist.max()

    def samples(self):
        return self._freqdist.keys()

    def freqdist(self):
        return self._freqdist

    def discount(self):
        raise NotImplementedError()

    def __repr__(self):
        """
        Return a string representation of this ``ProbDist``.

        :rtype: str
        """
        return "<WittenBellProbDist based on %d samples>" % self._freqdist.N()
class SimpleGoodTuringProbDist(ProbDistI):
    """
    SimpleGoodTuring ProbDist approximates from frequency to frequency of
    frequency into a linear line under log space by linear regression.
    Details of Simple Good-Turing algorithm can be found in:

    - "Good Turing smoothing without tears" (Gale & Sampson 1995),
      Journal of Quantitative Linguistics, vol. 2 pp. 217-237.
    - "Speech and Language Processing" (Jurafsky & Martin),
      2nd Edition, Chapter 4.5 p103 (log(Nc) = a + b*log(c))
    - https://www.grsampson.net/RGoodTur.html

    Given a set of pair (xi, yi),  where the xi denotes the frequency and
    yi denotes the frequency of frequency, we want to minimize their
    square variation.  E(x) and E(y) represent the mean of xi and yi.

    - slope: b = sigma((xi-E(x))(yi-E(y))) / sigma((xi-E(x))(xi-E(x)))
    - intercept: a = E(y) - b.E(x)
    """

    SUM_TO_ONE = False

    def __init__(self, freqdist, bins=None):
        """
        :param freqdist: The frequency counts upon which to base the
            estimation.
        :type freqdist: FreqDist
        :param bins: The number of possible event types.  This must be
            larger than the number of bins in the ``freqdist``.  If None,
            then it's assumed to be equal to ``freqdist``.B() + 1
        :type bins: int
        """
        assert (
            bins is None or bins > freqdist.B()
        ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1)
        if bins is None:
            bins = freqdist.B() + 1
        self._freqdist = freqdist
        self._bins = bins
        # Neutral factor, so prob() and check() still work if
        # _renormalize() finds no seen mass and leaves it untouched.
        self._renormal = 1.0
        r, nr = self._r_Nr()
        self.find_best_fit(r, nr)
        self._switch(r, nr)
        self._renormalize(r, nr)

    def _r_Nr_non_zero(self):
        """
        Return the mapping from count *r* to *Nr* with the *r = 0*
        entry removed.
        """
        r_Nr = self._freqdist.r_Nr()
        del r_Nr[0]
        return r_Nr

    def _r_Nr(self):
        """
        Split the frequency distribution in two list (r, Nr), where Nr(r) > 0
        """
        nonzero = self._r_Nr_non_zero()

        if not nonzero:
            return [], []
        return zip(*sorted(nonzero.items()))

    def find_best_fit(self, r, nr):
        """
        Use simple linear regression to tune parameters self._slope and
        self._intercept in the log-log space based on count and Nr(count)
        (Work in log space to avoid floating point underflow.)
        """
        # For higher sample frequencies the data points becomes horizontal
        # along line Nr=1. To create a more evident linear model in log-log
        # space, we average positive Nr values with the surrounding zero
        # values. (Church and Gale, 1991)

        if not r or not nr:
            # Empty r or nr?
            return

        zr = []
        last = len(r) - 1
        for j, r_j in enumerate(r):
            i = r[j - 1] if j > 0 else 0
            k = 2 * r_j - i if j == last else r[j + 1]
            zr.append(2.0 * nr[j] / (k - i))

        log_r = [math.log(i) for i in r]
        log_zr = [math.log(i) for i in zr]

        xy_cov = x_var = 0.0
        x_mean = sum(log_r) / len(log_r)
        y_mean = sum(log_zr) / len(log_zr)
        for (x, y) in zip(log_r, log_zr):
            xy_cov += (x - x_mean) * (y - y_mean)
            x_var += (x - x_mean) ** 2
        self._slope = xy_cov / x_var if x_var != 0 else 0.0
        if self._slope >= -1:
            warnings.warn(
                "SimpleGoodTuring did not find a proper best fit "
                "line for smoothing probabilities of occurrences. "
                "The probability estimates are likely to be "
                "unreliable."
            )
        self._intercept = y_mean - self._slope * x_mean

    def _switch(self, r, nr):
        """
        Calculate the r frontier where we must switch from Nr to Sr
        when estimating E[Nr].
        """
        for i, r_ in enumerate(r):
            if len(r) == i + 1 or r[i + 1] != r_ + 1:
                # We are at the end of r, or there is a gap in r
                self._switch_at = r_
                break

            Sr = self.smoothedNr
            smooth_r_star = (r_ + 1) * Sr(r_ + 1) / Sr(r_)
            unsmooth_r_star = (r_ + 1) * nr[i + 1] / nr[i]

            std = math.sqrt(self._variance(r_, nr[i], nr[i + 1]))
            if abs(unsmooth_r_star - smooth_r_star) <= 1.96 * std:
                self._switch_at = r_
                break

    def _variance(self, r, nr, nr_1):
        """
        Return the variance term used by ``_switch``:
        *(r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr)*.
        """
        r = float(r)
        nr = float(nr)
        nr_1 = float(nr_1)
        return (r + 1.0) ** 2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr)

    def _renormalize(self, r, nr):
        """
        It is necessary to renormalize all the probability estimates to
        ensure a proper probability distribution results. This can be done
        by keeping the estimate of the probability mass for unseen items as
        N(1)/N and renormalizing all the estimates for previously seen items
        (as Gale and Sampson (1995) propose). (See M&S P.213, 1999)
        """
        prob_cov = 0.0
        for r_, nr_ in zip(r, nr):
            prob_cov += nr_ * self._prob_measure(r_)
        if prob_cov:
            self._renormal = (1 - self._prob_measure(0)) / prob_cov

    def smoothedNr(self, r):
        """
        Return the number of samples with count r.

        :param r: The amount of frequency.
        :type r: int
        :rtype: float
        """

        # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic
        # relationship)
        # Estimate a and b by simple linear regression technique on
        # the logarithmic form of the equation: log Nr = a + b*log(r)

        return math.exp(self._intercept + self._slope * math.log(r))

    def prob(self, sample):
        """
        Return the sample's probability.

        :param sample: sample of the event
        :type sample: str
        :rtype: float
        """
        count = self._freqdist[sample]
        p = self._prob_measure(count)
        if count == 0:
            if self._bins == self._freqdist.B():
                p = 0.0
            else:
                # Share the unseen mass evenly between the unseen bins.
                p = p / (self._bins - self._freqdist.B())
        else:
            p = p * self._renormal
        return p

    def _prob_measure(self, count):
        """
        Return the unnormalized Good-Turing estimate for one sample
        observed ``count`` times; for ``count == 0`` return the total
        mass reserved for all unseen samples.
        """
        if count == 0:
            if self._freqdist.N() == 0:
                return 1.0
            return self._freqdist.Nr(1) / self._freqdist.N()

        if self._switch_at > count:
            # Below the switch point the measured Nr values are used.
            Er_1 = self._freqdist.Nr(count + 1)
            Er = self._freqdist.Nr(count)
        else:
            Er_1 = self.smoothedNr(count + 1)
            Er = self.smoothedNr(count)

        r_star = (count + 1) * Er_1 / Er
        return r_star / self._freqdist.N()

    def check(self):
        """
        Print the total probability mass assigned by ``prob``: the
        mass reserved for unseen samples plus the renormalized mass of
        every seen count.  The printed value should be close to one.
        """
        r, nr = self._r_Nr()
        prob_sum = 0.0
        if self._bins != self._freqdist.B():
            # prob() only hands out unseen mass when unseen bins exist.
            prob_sum += self._prob_measure(0)
        for r_, nr_ in zip(r, nr):
            prob_sum += nr_ * self._prob_measure(r_) * self._renormal
        print("Probability Sum:", prob_sum)

    def discount(self):
        """
        This function returns the total mass of probability transfers from the
        seen samples to the unseen samples.
        """
        return self.smoothedNr(1) / self._freqdist.N()

    def max(self):
        return self._freqdist.max()

    def samples(self):
        return self._freqdist.keys()

    def freqdist(self):
        return self._freqdist

    def __repr__(self):
        """
        Return a string representation of this ``ProbDist``.

        :rtype: str
        """
        return "<SimpleGoodTuringProbDist based on %d samples>" % self._freqdist.N()
def update(self, sample, prob, log=True):
    """
    Overwrite the stored probability of ``sample``.  Nothing is
    renormalized, so the caller is responsible for keeping every
    probability between 0 and 1 and the total equal to one.

    :param sample: the sample for which to update the probability
    :type sample: any
    :param prob: the new probability
    :type prob: float
    :param log: whether ``prob`` is already a base-2 logarithm
    :type log: bool
    """
    index = self._sample_dict.get(sample)
    assert index is not None
    # Convert the incoming value to whichever representation
    # (log or plain) this distribution stores.
    if self._logs:
        if log:
            value = prob
        else:
            value = math.log(prob, 2)
    elif log:
        value = 2 ** (prob)
    else:
        value = prob
    self._data[index] = value
class KneserNeyProbDist(ProbDistI):
    """
    Kneser-Ney estimate of a probability distribution. This is a version of
    back-off that counts how likely an n-gram is provided the n-1-gram had
    been seen in training. Extends the ProbDistI interface, requires a trigram
    FreqDist instance to train on. Optionally, a different from default discount
    value can be specified. The default discount is set to 0.75.
    """

    def __init__(self, freqdist, bins=None, discount=0.75):
        """
        :param freqdist: The trigram frequency distribution upon which to base
            the estimation
        :type freqdist: FreqDist
        :param bins: Included for compatibility with nltk.tag.hmm
        :type bins: int or float
        :param discount: The discount applied when retrieving counts of
            trigrams
        :type discount: float (preferred, but can be set to int)
        """

        if not bins:
            self._bins = freqdist.B()
        else:
            self._bins = bins
        self._D = discount

        # cache for probability calculation
        self._cache = {}

        # internal bigram and trigram frequency distributions
        self._bigrams = defaultdict(int)
        self._trigrams = freqdist

        # helper dictionaries used to calculate probabilities
        self._wordtypes_after = defaultdict(float)
        self._trigrams_contain = defaultdict(float)
        self._wordtypes_before = defaultdict(float)
        for w0, w1, w2 in freqdist:
            self._bigrams[(w0, w1)] += freqdist[(w0, w1, w2)]
            self._wordtypes_after[(w0, w1)] += 1
            self._trigrams_contain[w1] += 1
            self._wordtypes_before[(w1, w2)] += 1

    def prob(self, trigram):
        """
        Return the Kneser-Ney probability estimate of ``trigram``.

        :param trigram: the sample, an iterable with exactly three members
        :rtype: float
        :raise ValueError: if ``trigram`` does not have three members
        """
        # sample must be a triple
        if len(trigram) != 3:
            raise ValueError("Expected an iterable with 3 members.")
        trigram = tuple(trigram)
        w0, w1, w2 = trigram

        if trigram in self._cache:
            return self._cache[trigram]

        # if the sample trigram was seen during training
        if trigram in self._trigrams:
            prob = (self._trigrams[trigram] - self.discount()) / self._bigrams[
                (w0, w1)
            ]

        # else if the 'rougher' environment was seen during training
        elif (w0, w1) in self._bigrams and (w1, w2) in self._wordtypes_before:
            aftr = self._wordtypes_after[(w0, w1)]
            bfr = self._wordtypes_before[(w1, w2)]

            # the probability left over from alphas
            leftover_prob = (aftr * self.discount()) / self._bigrams[(w0, w1)]

            # the beta (including normalization)
            beta = bfr / (self._trigrams_contain[w1] - aftr)

            prob = leftover_prob * beta

        # else the sample was completely unseen during training
        else:
            prob = 0.0

        self._cache[trigram] = prob
        return prob

    def discount(self):
        """
        Return the value by which counts are discounted. By default set to 0.75.

        :rtype: float
        """
        return self._D

    def set_discount(self, discount):
        """
        Set the value by which counts are discounted to the value of discount.

        :param discount: the new value to discount counts by
        :type discount: float (preferred, but int possible)
        :rtype: None
        """
        self._D = discount
        # Cached probabilities were computed with the previous discount;
        # drop them so prob() does not keep returning stale values.
        self._cache.clear()

    def samples(self):
        return self._trigrams.keys()

    def max(self):
        return self._trigrams.max()

    def __repr__(self):
        """
        Return a string representation of this ProbDist

        :rtype: str
        """
        return f"<KneserNeyProbDist based on {self._trigrams.N()} trigrams>"
def N(self):
    """
    Return the number of sample outcomes recorded across all
    conditions, i.e. the sum of ``N()`` over every frequency
    distribution held by this ``ConditionalFreqDist``.

    :rtype: int
    """
    total = 0
    for fdist in self.values():
        total += fdist.N()
    return total
def tabulate(self, *args, **kwargs):
    """
    Tabulate the given samples from the conditional frequency distribution.

    One row is printed per condition and one column per sample.  Empty
    ``samples`` or ``conditions`` produce an empty table instead of an
    error, and tabulating a condition that is not present prints zeros
    without adding that condition to the distribution.

    :param samples: The samples to tabulate (default is every sample
        seen under the selected conditions)
    :type samples: list
    :param conditions: The conditions to tabulate (default is all)
    :type conditions: list
    :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
    :type cumulative: bool
    """

    cumulative = _get_kwarg(kwargs, "cumulative", False)
    conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions()))
    samples = _get_kwarg(
        kwargs,
        "samples",
        sorted({v for c in conditions if c in self for v in self[c]}),
    )  # this computation could be wasted

    # ``default=0`` keeps max() from raising ValueError on an empty table.
    width = max((len("%s" % s) for s in samples), default=0)
    freqs = dict()
    for c in conditions:
        # Indexing a missing condition would insert it (defaultdict
        # behaviour); use a throwaway empty FreqDist instead so that
        # printing has no side effect on the distribution.
        fdist = self[c] if c in self else FreqDist()
        if cumulative:
            freqs[c] = list(fdist._cumulative_frequencies(samples))
        else:
            freqs[c] = [fdist[sample] for sample in samples]
        width = max(width, max((len("%d" % f) for f in freqs[c]), default=0))

    condition_size = max((len("%s" % c) for c in conditions), default=0)
    # Header row: blank corner cell followed by the sample labels.
    print(" " * condition_size, end=" ")
    for s in samples:
        print("%*s" % (width, s), end=" ")
    print()
    for c in conditions:
        print("%*s" % (condition_size, c), end=" ")
        for f in freqs[c]:
            print("%*d" % (width, f), end=" ")
        print()
+ """ + if not isinstance(other, ConditionalFreqDist): + return NotImplemented + result = self.copy() + for cond in other.conditions(): + result[cond] |= other[cond] + return result + + def __and__(self, other): + """ + Intersection is the minimum of corresponding counts. + """ + if not isinstance(other, ConditionalFreqDist): + return NotImplemented + result = ConditionalFreqDist() + for cond in self.conditions(): + newfreqdist = self[cond] & other[cond] + if newfreqdist: + result[cond] = newfreqdist + return result + + # @total_ordering doesn't work here, since the class inherits from a builtin class + def __le__(self, other): + if not isinstance(other, ConditionalFreqDist): + raise_unorderable_types("<=", self, other) + return set(self.conditions()).issubset(other.conditions()) and all( + self[c] <= other[c] for c in self.conditions() + ) + + def __lt__(self, other): + if not isinstance(other, ConditionalFreqDist): + raise_unorderable_types("<", self, other) + return self <= other and self != other + + def __ge__(self, other): + if not isinstance(other, ConditionalFreqDist): + raise_unorderable_types(">=", self, other) + return other <= self + + def __gt__(self, other): + if not isinstance(other, ConditionalFreqDist): + raise_unorderable_types(">", self, other) + return other < self + + def deepcopy(self): + from copy import deepcopy + + return deepcopy(self) + + copy = deepcopy + + def __repr__(self): + """ + Return a string representation of this ``ConditionalFreqDist``. + + :rtype: str + """ + return "" % len(self) + + +class ConditionalProbDistI(dict, metaclass=ABCMeta): + """ + A collection of probability distributions for a single experiment + run under different conditions. Conditional probability + distributions are used to estimate the likelihood of each sample, + given the condition under which the experiment was run. 
For + example, a conditional probability distribution could be used to + estimate the probability of each word type in a document, given + the length of the word type. Formally, a conditional probability + distribution can be defined as a function that maps from each + condition to the ``ProbDist`` for the experiment under that + condition. + """ + + @abstractmethod + def __init__(self): + """ + Classes inheriting from ConditionalProbDistI should implement __init__. + """ + + def conditions(self): + """ + Return a list of the conditions that are represented by + this ``ConditionalProbDist``. Use the indexing operator to + access the probability distribution for a given condition. + + :rtype: list + """ + return list(self.keys()) + + def __repr__(self): + """ + Return a string representation of this ``ConditionalProbDist``. + + :rtype: str + """ + return "<%s with %d conditions>" % (type(self).__name__, len(self)) + + +class ConditionalProbDist(ConditionalProbDistI): + """ + A conditional probability distribution modeling the experiments + that were used to generate a conditional frequency distribution. + A ConditionalProbDist is constructed from a + ``ConditionalFreqDist`` and a ``ProbDist`` factory: + + - The ``ConditionalFreqDist`` specifies the frequency + distribution for each condition. + - The ``ProbDist`` factory is a function that takes a + condition's frequency distribution, and returns its + probability distribution. A ``ProbDist`` class's name (such as + ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify + that class's constructor. + + The first argument to the ``ProbDist`` factory is the frequency + distribution that it should model; and the remaining arguments are + specified by the ``factory_args`` parameter to the + ``ConditionalProbDist`` constructor. 
For example, the following + code constructs a ``ConditionalProbDist``, where the probability + distribution for each condition is an ``ELEProbDist`` with 10 bins: + + >>> from nltk.corpus import brown + >>> from nltk.probability import ConditionalFreqDist + >>> from nltk.probability import ConditionalProbDist, ELEProbDist + >>> cfdist = ConditionalFreqDist(brown.tagged_words()[:5000]) + >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) + >>> cpdist['passed'].max() + 'VBD' + >>> cpdist['passed'].prob('VBD') #doctest: +ELLIPSIS + 0.423... + + """ + + def __init__(self, cfdist, probdist_factory, *factory_args, **factory_kw_args): + """ + Construct a new conditional probability distribution, based on + the given conditional frequency distribution and ``ProbDist`` + factory. + + :type cfdist: ConditionalFreqDist + :param cfdist: The ``ConditionalFreqDist`` specifying the + frequency distribution for each condition. + :type probdist_factory: class or function + :param probdist_factory: The function or class that maps + a condition's frequency distribution to its probability + distribution. The function is called with the frequency + distribution as its first argument, + ``factory_args`` as its remaining arguments, and + ``factory_kw_args`` as keyword arguments. + :type factory_args: (any) + :param factory_args: Extra arguments for ``probdist_factory``. + These arguments are usually used to specify extra + properties for the probability distributions of individual + conditions, such as the number of bins they contain. + :type factory_kw_args: (any) + :param factory_kw_args: Extra keyword arguments for ``probdist_factory``. 
+ """ + self._probdist_factory = probdist_factory + self._factory_args = factory_args + self._factory_kw_args = factory_kw_args + + for condition in cfdist: + self[condition] = probdist_factory( + cfdist[condition], *factory_args, **factory_kw_args + ) + + def __missing__(self, key): + self[key] = self._probdist_factory( + FreqDist(), *self._factory_args, **self._factory_kw_args + ) + return self[key] + + +class DictionaryConditionalProbDist(ConditionalProbDistI): + """ + An alternative ConditionalProbDist that simply wraps a dictionary of + ProbDists rather than creating these from FreqDists. + """ + + def __init__(self, probdist_dict): + """ + :param probdist_dict: a dictionary containing the probdists indexed + by the conditions + :type probdist_dict: dict any -> probdist + """ + self.update(probdist_dict) + + def __missing__(self, key): + self[key] = DictionaryProbDist() + return self[key] + + +##////////////////////////////////////////////////////// +## Adding in log-space. +##////////////////////////////////////////////////////// + +# If the difference is bigger than this, then just take the bigger one: +_ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) + + +def add_logs(logx, logy): + """ + Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return + *log(x+y)*. Conceptually, this is the same as returning + ``log(2**(logx)+2**(logy))``, but the actual implementation + avoids overflow errors that could result from direct computation. 
+ """ + if logx < logy + _ADD_LOGS_MAX_DIFF: + return logy + if logy < logx + _ADD_LOGS_MAX_DIFF: + return logx + base = min(logx, logy) + return base + math.log(2 ** (logx - base) + 2 ** (logy - base), 2) + + +def sum_logs(logs): + return reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF + + +##////////////////////////////////////////////////////// +## Probabilistic Mix-in +##////////////////////////////////////////////////////// + + +class ProbabilisticMixIn: + """ + A mix-in class to associate probabilities with other classes + (trees, rules, etc.). To use the ``ProbabilisticMixIn`` class, + define a new class that derives from an existing class and from + ProbabilisticMixIn. You will need to define a new constructor for + the new class, which explicitly calls the constructors of both its + parent classes. For example: + + >>> from nltk.probability import ProbabilisticMixIn + >>> class A: + ... def __init__(self, x, y): self.data = (x,y) + ... + >>> class ProbabilisticA(A, ProbabilisticMixIn): + ... def __init__(self, x, y, **prob_kwarg): + ... A.__init__(self, x, y) + ... ProbabilisticMixIn.__init__(self, **prob_kwarg) + + See the documentation for the ProbabilisticMixIn + ``constructor<__init__>`` for information about the arguments it + expects. + + You should generally also redefine the string representation + methods, the comparison methods, and the hashing method. + """ + + def __init__(self, **kwargs): + """ + Initialize this object's probability. This initializer should + be called by subclass constructors. ``prob`` should generally be + the first argument for those constructors. + + :param prob: The probability associated with the object. + :type prob: float + :param logprob: The log of the probability associated with + the object. 
+ :type logprob: float + """ + if "prob" in kwargs: + if "logprob" in kwargs: + raise TypeError("Must specify either prob or logprob " "(not both)") + else: + ProbabilisticMixIn.set_prob(self, kwargs["prob"]) + elif "logprob" in kwargs: + ProbabilisticMixIn.set_logprob(self, kwargs["logprob"]) + else: + self.__prob = self.__logprob = None + + def set_prob(self, prob): + """ + Set the probability associated with this object to ``prob``. + + :param prob: The new probability + :type prob: float + """ + self.__prob = prob + self.__logprob = None + + def set_logprob(self, logprob): + """ + Set the log probability associated with this object to + ``logprob``. I.e., set the probability associated with this + object to ``2**(logprob)``. + + :param logprob: The new log probability + :type logprob: float + """ + self.__logprob = logprob + self.__prob = None + + def prob(self): + """ + Return the probability associated with this object. + + :rtype: float + """ + if self.__prob is None: + if self.__logprob is None: + return None + self.__prob = 2 ** (self.__logprob) + return self.__prob + + def logprob(self): + """ + Return ``log(p)``, where ``p`` is the probability associated + with this object. 
+ + :rtype: float + """ + if self.__logprob is None: + if self.__prob is None: + return None + self.__logprob = math.log(self.__prob, 2) + return self.__logprob + + +class ImmutableProbabilisticMixIn(ProbabilisticMixIn): + def set_prob(self, prob): + raise ValueError("%s is immutable" % self.__class__.__name__) + + def set_logprob(self, prob): + raise ValueError("%s is immutable" % self.__class__.__name__) + + +## Helper function for processing keyword arguments + + +def _get_kwarg(kwargs, key, default): + if key in kwargs: + arg = kwargs[key] + del kwargs[key] + else: + arg = default + return arg + + +##////////////////////////////////////////////////////// +## Demonstration +##////////////////////////////////////////////////////// + + +def _create_rand_fdist(numsamples, numoutcomes): + """ + Create a new frequency distribution, with random samples. The + samples are numbers from 1 to ``numsamples``, and are generated by + summing two numbers, each of which has a uniform distribution. + """ + + fdist = FreqDist() + for x in range(numoutcomes): + y = random.randint(1, (1 + numsamples) // 2) + random.randint( + 0, numsamples // 2 + ) + fdist[y] += 1 + return fdist + + +def _create_sum_pdist(numsamples): + """ + Return the true probability distribution for the experiment + ``_create_rand_fdist(numsamples, x)``. + """ + fdist = FreqDist() + for x in range(1, (1 + numsamples) // 2 + 1): + for y in range(0, numsamples // 2 + 1): + fdist[x + y] += 1 + return MLEProbDist(fdist) + + +def demo(numsamples=6, numoutcomes=500): + """ + A demonstration of frequency distributions and probability + distributions. This demonstration creates three frequency + distributions with, and uses them to sample a random process with + ``numsamples`` samples. Each frequency distribution is sampled + ``numoutcomes`` times. These three frequency distributions are + then used to build six probability distributions. 
Finally, the + probability estimates of these distributions are compared to the + actual probability of each sample. + + :type numsamples: int + :param numsamples: The number of samples to use in each demo + frequency distributions. + :type numoutcomes: int + :param numoutcomes: The total number of outcomes for each + demo frequency distribution. These outcomes are divided into + ``numsamples`` bins. + :rtype: None + """ + + # Randomly sample a stochastic process three times. + fdist1 = _create_rand_fdist(numsamples, numoutcomes) + fdist2 = _create_rand_fdist(numsamples, numoutcomes) + fdist3 = _create_rand_fdist(numsamples, numoutcomes) + + # Use our samples to create probability distributions. + pdists = [ + MLEProbDist(fdist1), + LidstoneProbDist(fdist1, 0.5, numsamples), + HeldoutProbDist(fdist1, fdist2, numsamples), + HeldoutProbDist(fdist2, fdist1, numsamples), + CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), + SimpleGoodTuringProbDist(fdist1), + SimpleGoodTuringProbDist(fdist1, 7), + _create_sum_pdist(numsamples), + ] + + # Find the probability of each sample. + vals = [] + for n in range(1, numsamples + 1): + vals.append(tuple([n, fdist1.freq(n)] + [pdist.prob(n) for pdist in pdists])) + + # Print the results in a formatted table. 
+ print( + "%d samples (1-%d); %d outcomes were sampled for each FreqDist" + % (numsamples, numsamples, numoutcomes) + ) + print("=" * 9 * (len(pdists) + 2)) + FORMATSTR = " FreqDist " + "%8s " * (len(pdists) - 1) + "| Actual" + print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1])) + print("-" * 9 * (len(pdists) + 2)) + FORMATSTR = "%3d %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f" + for val in vals: + print(FORMATSTR % val) + + # Print the totals for each column (should all be 1.0) + zvals = list(zip(*vals)) + sums = [sum(val) for val in zvals[1:]] + print("-" * 9 * (len(pdists) + 2)) + FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f" + print(FORMATSTR % tuple(sums)) + print("=" * 9 * (len(pdists) + 2)) + + # Display the distributions themselves, if they're short enough. + if len("%s" % fdist1) < 70: + print(" fdist1: %s" % fdist1) + print(" fdist2: %s" % fdist2) + print(" fdist3: %s" % fdist3) + print() + + print("Generating:") + for pdist in pdists: + fdist = FreqDist(pdist.generate() for i in range(5000)) + print("{:>20} {}".format(pdist.__class__.__name__[:20], ("%s" % fdist)[:55])) + print() + + +def gt_demo(): + from nltk import corpus + + emma_words = corpus.gutenberg.words("austen-emma.txt") + fd = FreqDist(emma_words) + sgt = SimpleGoodTuringProbDist(fd) + print("{:>18} {:>8} {:>14}".format("word", "frequency", "SimpleGoodTuring")) + fd_keys_sorted = ( + key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True) + ) + for key in fd_keys_sorted: + print("%18s %8d %14e" % (key, fd[key], sgt.prob(key))) + + +if __name__ == "__main__": + demo(6, 10) + demo(5, 5000) + gt_demo() + +__all__ = [ + "ConditionalFreqDist", + "ConditionalProbDist", + "ConditionalProbDistI", + "CrossValidationProbDist", + "DictionaryConditionalProbDist", + "DictionaryProbDist", + "ELEProbDist", + "FreqDist", + "SimpleGoodTuringProbDist", + "HeldoutProbDist", + "ImmutableProbabilisticMixIn", + "LaplaceProbDist", + "LidstoneProbDist", 
+ "MLEProbDist", + "MutableProbDist", + "KneserNeyProbDist", + "ProbDistI", + "ProbabilisticMixIn", + "UniformProbDist", + "WittenBellProbDist", + "add_logs", + "log_likelihood", + "sum_logs", + "entropy", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c6b29baf8ed419eace6f5c35f409e1abba6c362e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/__init__.py @@ -0,0 +1,75 @@ +# Natural Language Toolkit: Semantic Interpretation +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +NLTK Semantic Interpretation Package + +This package contains classes for representing semantic structure in +formulas of first-order logic and for evaluating such formulas in +set-theoretic models. + + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + +The package has two main components: + + - ``logic`` provides support for analyzing expressions of First + Order Logic (FOL). + - ``evaluate`` allows users to recursively determine truth in a + model for formulas of FOL. + +A model consists of a domain of discourse and a valuation function, +which assigns values to non-logical constants. We assume that entities +in the domain are represented as strings such as ``'b1'``, ``'g1'``, +etc. A ``Valuation`` is initialized with a list of (symbol, value) +pairs, where values are entities, sets of entities or sets of tuples +of entities. +The domain of discourse can be inferred from the valuation, and model +is then created with domain and valuation as parameters. + + >>> from nltk.sem import Valuation, Model + >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), + ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), + ... ('dog', set(['d1'])), + ... 
('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] + >>> val = Valuation(v) + >>> dom = val.domain + >>> m = Model(dom, val) +""" + +from nltk.sem.boxer import Boxer +from nltk.sem.drt import DRS, DrtExpression +from nltk.sem.evaluate import ( + Assignment, + Model, + Undefined, + Valuation, + arity, + is_rel, + read_valuation, + set2rel, +) +from nltk.sem.lfg import FStructure +from nltk.sem.logic import ( + ApplicationExpression, + Expression, + LogicalExpressionException, + Variable, + binding_ops, + boolean_ops, + equality_preds, + read_logic, +) +from nltk.sem.relextract import clause, extract_rels, rtuple +from nltk.sem.skolemize import skolemize +from nltk.sem.util import evaluate_sents, interpret_sents, parse_sents, root_semrep + +# from nltk.sem.glue import Glue +# from nltk.sem.hole import HoleSemantics +# from nltk.sem.cooper_storage import CooperStore + +# don't import chat80 as its names are too generic diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/chat80.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/chat80.py new file mode 100644 index 0000000000000000000000000000000000000000..d71d1badd1df641c211d5ab4153573b8c75beca2 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/chat80.py @@ -0,0 +1,857 @@ +# Natural Language Toolkit: Chat-80 KB Reader +# See https://www.w3.org/TR/swbp-skos-core-guide/ +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein , +# URL: +# For license information, see LICENSE.TXT + +r""" +Overview +======== + +Chat-80 was a natural language system which allowed the user to +interrogate a Prolog knowledge base in the domain of world +geography. It was developed in the early '80s by Warren and Pereira; see +``https://www.aclweb.org/anthology/J82-3002.pdf`` for a description and +``http://www.cis.upenn.edu/~pereira/oldies.html`` for the source +files. 
+ +This module contains functions to extract data from the Chat-80 +relation files ('the world database'), and convert them into a format +that can be incorporated in the FOL models of +``nltk.sem.evaluate``. The code assumes that the Prolog +input files are available in the NLTK corpora directory. + +The Chat-80 World Database consists of the following files:: + + world0.pl + rivers.pl + cities.pl + countries.pl + contain.pl + borders.pl + +This module uses a slightly modified version of ``world0.pl``, in which +a set of Prolog rules have been omitted. The modified file is named +``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since +it uses a list rather than a string in the second field. + +Reading Chat-80 Files +===================== + +Chat-80 relations are like tables in a relational database. The +relation acts as the name of the table; the first argument acts as the +'primary key'; and subsequent arguments are further fields in the +table. In general, the name of the table provides a label for a unary +predicate whose extension is all the primary keys. For example, +relations in ``cities.pl`` are of the following form:: + + 'city(athens,greece,1368).' + +Here, ``'athens'`` is the key, and will be mapped to a member of the +unary predicate *city*. + +The fields in the table are mapped to binary predicates. The first +argument of the predicate is the primary key, while the second +argument is the data in the relevant field. Thus, in the above +example, the third field is mapped to the binary predicate +*population_of*, whose extension is a set of pairs such as +``'(athens, 1368)'``. + +An exception to this general framework is required by the relations in +the files ``borders.pl`` and ``contain.pl``. These contain facts of the +following form:: + + 'borders(albania,greece).' + + 'contains0(africa,central_africa).' 
+ +We do not want to form a unary concept out of the element in +the first field of these records, and we want the label of the binary +relation just to be ``'border'``/``'contain'`` respectively. + +In order to drive the extraction process, we use 'relation metadata bundles' +which are Python dictionaries such as the following:: + + city = {'rel_name': 'city', + 'closures': [], + 'schema': ['city', 'country', 'population'], + 'filename': 'cities.pl'} + +According to this, the file ``city['filename']`` contains a list of +relational tuples (or more accurately, the corresponding strings in +Prolog form) whose predicate symbol is ``city['rel_name']`` and whose +relational schema is ``city['schema']``. The notion of a ``closure`` is +discussed in the next section. + +Concepts +======== +In order to encapsulate the results of the extraction, a class of +``Concept`` objects is introduced. A ``Concept`` object has a number of +attributes, in particular a ``prefLabel`` and ``extension``, which make +it easier to inspect the output of the extraction. In addition, the +``extension`` can be further processed: in the case of the ``'border'`` +relation, we make the relation symmetric, and in the case +of the ``'contain'`` relation, we carry out the transitive +closure. The closure properties associated with a concept are +indicated in the relation metadata, as described earlier. + +The ``extension`` of a ``Concept`` object is then incorporated into a +``Valuation`` object. + +Persistence +=========== +The functions ``val_dump`` and ``val_load`` are provided to allow a +valuation to be stored in a persistent database and re-loaded, rather +than having to be re-computed each time. + +Individuals and Lexical Items +============================= +As well as deriving relations from the Chat-80 data, we also create a +set of individual constants, one for each entity in the domain. The +individual constants are string-identical to the entities. 
For +example, given a data item such as ``'zloty'``, we add to the valuation +a pair ``('zloty', 'zloty')``. In order to parse English sentences that +refer to these entities, we also create a lexical item such as the +following for each individual constant:: + + PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' + +The set of rules is written to the file ``chat_pnames.cfg`` in the +current directory. + +""" + +import os +import re +import shelve +import sys + +import nltk.data + +########################################################################### +# Chat-80 relation metadata bundles needed to build the valuation +########################################################################### + +borders = { + "rel_name": "borders", + "closures": ["symmetric"], + "schema": ["region", "border"], + "filename": "borders.pl", +} + +contains = { + "rel_name": "contains0", + "closures": ["transitive"], + "schema": ["region", "contain"], + "filename": "contain.pl", +} + +city = { + "rel_name": "city", + "closures": [], + "schema": ["city", "country", "population"], + "filename": "cities.pl", +} + +country = { + "rel_name": "country", + "closures": [], + "schema": [ + "country", + "region", + "latitude", + "longitude", + "area", + "population", + "capital", + "currency", + ], + "filename": "countries.pl", +} + +circle_of_lat = { + "rel_name": "circle_of_latitude", + "closures": [], + "schema": ["circle_of_latitude", "degrees"], + "filename": "world1.pl", +} + +circle_of_long = { + "rel_name": "circle_of_longitude", + "closures": [], + "schema": ["circle_of_longitude", "degrees"], + "filename": "world1.pl", +} + +continent = { + "rel_name": "continent", + "closures": [], + "schema": ["continent"], + "filename": "world1.pl", +} + +region = { + "rel_name": "in_continent", + "closures": [], + "schema": ["region", "continent"], + "filename": "world1.pl", +} + +ocean = { + "rel_name": "ocean", + "closures": [], + "schema": ["ocean"], + "filename": "world1.pl", +} + +sea = 
{"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"} + + +items = [ + "borders", + "contains", + "city", + "country", + "circle_of_lat", + "circle_of_long", + "continent", + "region", + "ocean", + "sea", +] +items = tuple(sorted(items)) + +item_metadata = { + "borders": borders, + "contains": contains, + "city": city, + "country": country, + "circle_of_lat": circle_of_lat, + "circle_of_long": circle_of_long, + "continent": continent, + "region": region, + "ocean": ocean, + "sea": sea, +} + +rels = item_metadata.values() + +not_unary = ["borders.pl", "contain.pl"] + +########################################################################### + + +class Concept: + """ + A Concept class, loosely based on SKOS + (https://www.w3.org/TR/swbp-skos-core-guide/). + """ + + def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()): + """ + :param prefLabel: the preferred label for the concept + :type prefLabel: str + :param arity: the arity of the concept + :type arity: int + :param altLabels: other (related) labels + :type altLabels: list + :param closures: closure properties of the extension + (list items can be ``symmetric``, ``reflexive``, ``transitive``) + :type closures: list + :param extension: the extensional value of the concept + :type extension: set + """ + self.prefLabel = prefLabel + self.arity = arity + self.altLabels = altLabels + self.closures = closures + # keep _extension internally as a set + self._extension = extension + # public access is via a list (for slicing) + self.extension = sorted(list(extension)) + + def __str__(self): + # _extension = '' + # for element in sorted(self.extension): + # if isinstance(element, tuple): + # element = '(%s, %s)' % (element) + # _extension += element + ', ' + # _extension = _extension[:-1] + + return "Label = '{}'\nArity = {}\nExtension = {}".format( + self.prefLabel, + self.arity, + self.extension, + ) + + def __repr__(self): + return "Concept('%s')" % self.prefLabel + 
+ def augment(self, data): + """ + Add more data to the ``Concept``'s extension set. + + :param data: a new semantic value + :type data: string or pair of strings + :rtype: set + + """ + self._extension.add(data) + self.extension = sorted(list(self._extension)) + return self._extension + + def _make_graph(self, s): + """ + Convert a set of pairs into an adjacency linked list encoding of a graph. + """ + g = {} + for (x, y) in s: + if x in g: + g[x].append(y) + else: + g[x] = [y] + return g + + def _transclose(self, g): + """ + Compute the transitive closure of a graph represented as a linked list. + """ + for x in g: + for adjacent in g[x]: + # check that adjacent is a key + if adjacent in g: + for y in g[adjacent]: + if y not in g[x]: + g[x].append(y) + return g + + def _make_pairs(self, g): + """ + Convert an adjacency linked list back into a set of pairs. + """ + pairs = [] + for node in g: + for adjacent in g[node]: + pairs.append((node, adjacent)) + return set(pairs) + + def close(self): + """ + Close a binary relation in the ``Concept``'s extension set. + + :return: a new extension for the ``Concept`` in which the + relation is closed under a given property + """ + from nltk.sem import is_rel + + assert is_rel(self._extension) + if "symmetric" in self.closures: + pairs = [] + for (x, y) in self._extension: + pairs.append((y, x)) + sym = set(pairs) + self._extension = self._extension.union(sym) + if "transitive" in self.closures: + all = self._make_graph(self._extension) + closed = self._transclose(all) + trans = self._make_pairs(closed) + self._extension = self._extension.union(trans) + self.extension = sorted(list(self._extension)) + + +def clause2concepts(filename, rel_name, schema, closures=[]): + """ + Convert a file of Prolog clauses into a list of ``Concept`` objects. 
+ + :param filename: filename containing the relations + :type filename: str + :param rel_name: name of the relation + :type rel_name: str + :param schema: the schema used in a set of relational tuples + :type schema: list + :param closures: closure properties for the extension of the concept + :type closures: list + :return: a list of ``Concept`` objects + :rtype: list + """ + concepts = [] + # position of the subject of a binary relation + subj = 0 + # label of the 'primary key' + pkey = schema[0] + # fields other than the primary key + fields = schema[1:] + + # convert a file into a list of lists + records = _str2records(filename, rel_name) + + # add a unary concept corresponding to the set of entities + # in the primary key position + # relations in 'not_unary' are more like ordinary binary relations + if not filename in not_unary: + concepts.append(unary_concept(pkey, subj, records)) + + # add a binary concept for each non-key field + for field in fields: + obj = schema.index(field) + concepts.append(binary_concept(field, closures, subj, obj, records)) + + return concepts + + +def cities2table(filename, rel_name, dbname, verbose=False, setup=False): + """ + Convert a file of Prolog clauses into a database table. + + This is not generic, since it doesn't allow arbitrary + schemas to be set as a parameter. 
+ + Intended usage:: + + cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True) + + :param filename: filename containing the relations + :type filename: str + :param rel_name: name of the relation + :type rel_name: str + :param dbname: filename of persistent store + :type schema: str + """ + import sqlite3 + + records = _str2records(filename, rel_name) + connection = sqlite3.connect(dbname) + cur = connection.cursor() + if setup: + cur.execute( + """CREATE TABLE city_table + (City text, Country text, Population int)""" + ) + + table_name = "city_table" + for t in records: + cur.execute("insert into %s values (?,?,?)" % table_name, t) + if verbose: + print("inserting values into %s: " % table_name, t) + connection.commit() + if verbose: + print("Committing update to %s" % dbname) + cur.close() + + +def sql_query(dbname, query): + """ + Execute an SQL query over a database. + :param dbname: filename of persistent store + :type schema: str + :param query: SQL query + :type rel_name: str + """ + import sqlite3 + + try: + path = nltk.data.find(dbname) + connection = sqlite3.connect(str(path)) + cur = connection.cursor() + return cur.execute(query) + except (ValueError, sqlite3.OperationalError): + import warnings + + warnings.warn( + "Make sure the database file %s is installed and uncompressed." % dbname + ) + raise + + +def _str2records(filename, rel): + """ + Read a file into memory and convert each relation clause into a list. + """ + recs = [] + contents = nltk.data.load("corpora/chat80/%s" % filename, format="text") + for line in contents.splitlines(): + if line.startswith(rel): + line = re.sub(rel + r"\(", "", line) + line = re.sub(r"\)\.$", "", line) + record = line.split(",") + recs.append(record) + return recs + + +def unary_concept(label, subj, records): + """ + Make a unary concept out of the primary key in a record. 
def binary_concept(label, closures, subj, obj, records):
    """
    Build a binary concept from the primary key and one other field of
    each record.

    A record is a list of entities in some relation, such as
    ``['france', 'paris']``, where ``'france'`` is acting as the primary
    key, and ``'paris'`` stands in the ``'capital_of'`` relation to
    ``'france'``.

    More generally, given a record such as ``['a', 'b', 'c']``, where
    label is bound to ``'B'``, and ``obj`` bound to 1, the derived
    binary concept will have label ``'B_of'``, and its extension will
    be a set of pairs such as ``('a', 'b')``.

    The labels ``'border'`` and ``'contain'`` are used as they are; every
    other label gets the suffix ``'_of'``.

    :param label: the base part of the preferred label for the concept
    :type label: str
    :param closures: closure properties for the extension of the concept
    :type closures: list
    :param subj: position in the record of the subject of the predicate
    :type subj: int
    :param obj: position in the record of the object of the predicate
    :type obj: int
    :param records: a list of records
    :type records: list of lists
    :return: ``Concept`` of arity 2
    :rtype: Concept
    """
    if label == "border" or label == "contain":
        concept_label = label
    else:
        concept_label = label + "_of"

    concept = Concept(concept_label, arity=2, closures=closures, extension=set())
    for rec in records:
        pair = (rec[subj], rec[obj])
        concept.augment(pair)
    # close the concept's extension according to the properties in closures
    concept.close()
    return concept
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
        NOTE(review): any suffix (such as '.db') is added by the underlying
        ``dbm`` backend, not by this function -- confirm for your platform.
    :type db: str
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    # flag "n": always create a new, empty database.  The context manager
    # guarantees the shelf is closed even if the update fails part-way.
    with shelve.open(db, "n") as db_out:
        db_out.update(valuation)
def make_lex(symbols):
    """
    Create lexical CFG rules for each individual symbol.

    Given a valuation with an entry of the form ``{'zloty': 'zloty'}``,
    create a lexical rule for the proper name 'Zloty'.

    The returned list starts with a comment header; every following entry
    is one rule terminated by a real newline, so the list can be handed
    directly to ``file.writelines``.

    :param symbols: a list of individual constants in the semantic representation
    :type symbols: sequence -- set(str)
    :return: the header followed by one rule string per symbol
    :rtype: list(str)
    """
    header = """
##################################################################
# Lexical rules automatically generated by running 'chat80.py -x'.
##################################################################

"""
    # Not a raw string: the trailing "\n" must be an actual line break
    # (a raw string would emit a backslash followed by "n" and glue all
    # rules onto one line).  The backslash before P is escaped explicitly.
    template = "PropN[num=sg, sem=<\\P.(P %s)>] -> '%s'\n"

    lex = [header]
    for s in symbols:
        # 'south_africa' -> 'South_Africa'
        pname = "_".join(part.capitalize() for part in s.split("_"))
        lex.append(template % (s, pname))
    return lex
def main():
    """
    Command-line entry point.

    Parses the options below and then either stores a valuation (``-s``),
    loads one (``-l``), or builds concepts from scratch and prints the
    vocabulary (``-v``), the concepts (``-c``), a single concept (``-r``),
    or the resulting valuation.  ``-x`` requests lexical rules instead of
    printing a valuation.

    NOTE(review): relies on the module-level names ``rels`` and ``os``,
    which are defined outside this function -- confirm they are in scope.
    """
    import sys
    from optparse import OptionParser

    description = """
Extract data from the Chat-80 Prolog files and convert them into a
Valuation object for use in the NLTK semantics package.
    """

    opts = OptionParser(description=description)
    opts.set_defaults(verbose=True, lex=False, vocab=False)
    opts.add_option(
        "-s", "--store", dest="outdb", help="store a valuation in DB", metavar="DB"
    )
    opts.add_option(
        "-l",
        "--load",
        dest="indb",
        help="load a stored valuation from DB",
        metavar="DB",
    )
    opts.add_option(
        "-c",
        "--concepts",
        action="store_true",
        help="print concepts instead of a valuation",
    )
    opts.add_option(
        "-r",
        "--relation",
        dest="label",
        help="print concept with label REL (check possible labels with '-v' option)",
        metavar="REL",
    )
    opts.add_option(
        "-q",
        "--quiet",
        action="store_false",
        dest="verbose",
        help="don't print out progress info",
    )
    opts.add_option(
        "-x",
        "--lex",
        action="store_true",
        dest="lex",
        help="write a file of lexical entries for country names, then exit",
    )
    opts.add_option(
        "-v",
        "--vocab",
        action="store_true",
        dest="vocab",
        help="print out the vocabulary of concept labels and their arity, then exit",
    )

    (options, args) = opts.parse_args()
    if options.outdb and options.indb:
        opts.error("Options --store and --load are mutually exclusive")

    if options.outdb:
        # write the valuation to a persistent database
        if options.verbose:
            outdb = options.outdb + ".db"
            print("Dumping a valuation to %s" % outdb)
        val_dump(rels, options.outdb)
        sys.exit(0)
    else:
        # try to read in a valuation from a database
        if options.indb is not None:
            dbname = options.indb + ".db"
            if not os.access(dbname, os.R_OK):
                sys.exit("Cannot read file: %s" % dbname)
            else:
                # the loaded valuation is not used further in this function
                valuation = val_load(options.indb)
        # we need to create the valuation from scratch
        else:
            # build some concepts
            # (the local names ``concepts`` and ``items`` below are plain
            # locals of this function)
            concept_map = process_bundle(rels)
            concepts = concept_map.values()
            # just print out the vocabulary
            if options.vocab:
                # sorted by arity first, then by label
                items = sorted((c.arity, c.prefLabel) for c in concepts)
                for (arity, label) in items:
                    print(label, arity)
                sys.exit(0)
            # show all the concepts
            if options.concepts:
                for c in concepts:
                    print(c)
                    print()
            if options.label:
                # raises KeyError if no concept carries the requested label
                print(concept_map[options.label])
                sys.exit(0)
            else:
                # turn the concepts into a Valuation
                if options.lex:
                    if options.verbose:
                        print("Writing out lexical rules")
                    # return value intentionally discarded; called for the
                    # lexicon side effect only
                    make_valuation(concepts, lexicon=True)
                else:
                    valuation = make_valuation(concepts, read=True)
                    print(valuation)
class CooperStore:
    """
    A container for handling quantifier ambiguity via Cooper storage.

    After construction, ``core`` holds the core semantic representation and
    ``store`` the list of binding operators; ``readings`` is filled in by
    ``s_retrieve()``.
    """

    def __init__(self, featstruct):
        """
        :param featstruct: The value of the ``sem`` node in a tree from
            ``parse_with_bindops()``
        :type featstruct: FeatStruct (with features ``core`` and ``store``)

        If ``featstruct`` lacks a ``CORE`` or ``STORE`` entry a message is
        printed and the corresponding attributes are left unset.
        """
        self.featstruct = featstruct
        self.readings = []
        try:
            self.core = featstruct["CORE"]
            self.store = featstruct["STORE"]
        except KeyError:
            print("%s is not a Cooper storage structure" % featstruct)

    def _permute(self, lst):
        """
        :return: An iterator over the permutations of the input list, each
            permutation being a tuple; an empty input yields one empty tuple
        :type lst: list
        :rtype: iter
        """
        # local import keeps the module's import block untouched
        from itertools import permutations

        # ``permutations`` emits tuples in the same (index-lexicographic)
        # order as the former hand-written recursion.  ``lst or ()`` keeps
        # the old behaviour of treating any falsy input as an empty store.
        yield from permutations(lst or ())

    def s_retrieve(self, trace=False):
        r"""
        Carry out S-Retrieval of binding operators in store.

        Each permutation of the store (i.e. list of binding operators) is
        taken to be a possible scoping of quantifiers. We iterate through the
        binding operators in each permutation, and successively apply them to
        the current term, starting with the core semantic representation,
        working from the inside out.

        Binding operators are of the form::

             bo(\P.all x.(man(x) -> P(x)),z1)

        The resulting terms are appended to ``self.readings``; calling this
        method again appends a further copy of every reading.

        :param trace: if ``True``, print each permutation number and every
            intermediate term
        :type trace: bool
        """
        for perm, store_perm in enumerate(self._permute(self.store)):
            if trace:
                print("Permutation %s" % (perm + 1))
            term = self.core
            for bindop in store_perm:
                # we just want the arguments that are wrapped by the 'bo' predicate
                quant, varex = tuple(bindop.args)
                # use var to make an abstraction over the current term and then
                # apply the quantifier to it
                term = ApplicationExpression(
                    quant, LambdaExpression(varex.variable, term)
                )
                if trace:
                    print(" ", term)
                term = term.simplify()
            self.readings.append(term)
def demo():
    """
    Parse a sample sentence with binding operators and print, for every
    parse tree, its store, its core, and the readings produced by
    S-Retrieval.
    """
    from nltk.sem import cooper_storage as cs

    sentence = "every girl chases a dog"
    # sentence = "a man gives a bone to every dog"
    print()
    print("Analysis of sentence '%s'" % sentence)
    print("=" * 50)

    underline = "-" * 15
    for tree in cs.parse_with_bindops(sentence, trace=0):
        semrep = cs.CooperStore(tree.label()["SEM"])

        print()
        print("Binding operators:")
        print(underline)
        for bindop in semrep.store:
            print(bindop)

        print()
        print("Core:")
        print(underline)
        print(semrep.core)

        print()
        print("S-Retrieval:")
        print(underline)
        semrep.s_retrieve(trace=True)

        print("Readings:")
        print(underline)
        for number, reading in enumerate(semrep.readings, start=1):
            print(f"{number}: {reading}")
class DrtParser(LogicParser):
    """A parser for DRT expressions, built on top of ``LogicParser``."""

    def __init__(self):
        LogicParser.__init__(self)

        # (tokens, precedence level) groups, in the order in which they are
        # entered into the table; a token listed twice keeps the later level.
        precedence_groups = [
            (DrtTokens.LAMBDA_LIST, 1),
            (DrtTokens.NOT_LIST, 2),
            ([APP], 3),
            (DrtTokens.EQ_LIST + Tokens.NEQ_LIST, 4),
            ([DrtTokens.COLON], 5),
            ([DrtTokens.DRS_CONC], 6),
            (DrtTokens.OR_LIST, 7),
            (DrtTokens.IMP_LIST, 8),
            ([None], 9),
        ]
        self.operator_precedence = {
            tok: level for toks, level in precedence_groups for tok in toks
        }

    def get_all_symbols(self):
        """This method exists to be overridden"""
        return DrtTokens.SYMBOLS

    def isvariable(self, tok):
        """Return True if ``tok`` is not one of the reserved DRT tokens."""
        return tok not in DrtTokens.TOKENS

    def handle(self, tok, context):
        """This method is intended to be overridden for logics that
        use different operators or expressions.

        Dispatches on ``tok``; returns ``None`` when no case applies."""
        if tok in DrtTokens.NOT_LIST:
            return self.handle_negation(tok, context)

        if tok in DrtTokens.LAMBDA_LIST:
            return self.handle_lambda(tok, context)

        if tok == DrtTokens.OPEN:
            # "([" introduces a DRS; any other "(" is an ordinary group
            if self.inRange(0) and self.token(0) == DrtTokens.OPEN_BRACKET:
                return self.handle_DRS(tok, context)
            return self.handle_open(tok, context)

        if tok.upper() == DrtTokens.DRS:
            self.assertNextToken(DrtTokens.OPEN)
            return self.handle_DRS(tok, context)

        if self.isvariable(tok):
            # "name:" introduces a labelled proposition
            if self.inRange(0) and self.token(0) == DrtTokens.COLON:
                return self.handle_prop(tok, context)
            return self.handle_variable(tok, context)

        return None

    def make_NegatedExpression(self, expression):
        return DrtNegatedExpression(expression)

    def handle_DRS(self, tok, context):
        """Parse the referent list and condition list of a DRS."""
        referents = self.handle_refs()
        # the comma between the two lists is optional
        comma_follows = self.inRange(0) and self.token(0) == DrtTokens.COMMA
        if comma_follows:
            self.token()  # swallow the comma
        conditions = self.handle_conds(context)
        self.assertNextToken(DrtTokens.CLOSE)
        return DRS(referents, conditions, None)

    def handle_refs(self):
        """Parse a bracketed list of discourse referents."""
        self.assertNextToken(DrtTokens.OPEN_BRACKET)
        referents = []
        while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
            # Support expressions like: DRS([x y],C) == DRS([x,y],C)
            if referents and self.token(0) == DrtTokens.COMMA:
                self.token()  # swallow the comma
            referents.append(self.get_next_token_variable("quantified"))
        self.assertNextToken(DrtTokens.CLOSE_BRACKET)
        return referents

    def handle_conds(self, context):
        """Parse a bracketed list of conditions."""
        self.assertNextToken(DrtTokens.OPEN_BRACKET)
        conditions = []
        while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
            # Support expressions like: DRS([x y],C) == DRS([x, y],C)
            if conditions and self.token(0) == DrtTokens.COMMA:
                self.token()  # swallow the comma
            conditions.append(self.process_next_expression(context))
        self.assertNextToken(DrtTokens.CLOSE_BRACKET)
        return conditions

    def handle_prop(self, tok, context):
        """Parse ``tok`` followed by a colon and an expression into a
        ``DrtProposition``."""
        label = self.make_VariableExpression(tok)
        self.assertNextToken(":")
        body = self.process_next_expression(DrtTokens.COLON)
        return DrtProposition(label, body)

    def make_EqualityExpression(self, first, second):
        """This method serves as a hook for other logic parsers that
        have different equality expression classes"""
        return DrtEqualityExpression(first, second)

    def get_BooleanExpression_factory(self, tok):
        """This method serves as a hook for other logic parsers that
        have different boolean operators"""
        if tok == DrtTokens.DRS_CONC:
            return lambda first, second: DrtConcatenation(first, second, None)

        if tok in DrtTokens.OR_LIST:
            return DrtOrExpression

        if tok in DrtTokens.IMP_LIST:

            def make_imp_expression(first, second):
                # an implication is stored as the antecedent plus a consequent
                if isinstance(first, DRS):
                    return DRS(first.refs, first.conds, second)
                if isinstance(first, DrtConcatenation):
                    return DrtConcatenation(first.first, first.second, second)
                raise Exception("Antecedent of implication must be a DRS")

            return make_imp_expression

        return None

    def make_BooleanExpression(self, factory, first, second):
        return factory(first, second)

    def make_ApplicationExpression(self, function, argument):
        return DrtApplicationExpression(function, argument)

    def make_VariableExpression(self, name):
        return DrtVariableExpression(Variable(name))

    def make_LambdaExpression(self, variables, term):
        return DrtLambdaExpression(variables, term)
def get_refs(self, recursive=False):
    """
    Return the discourse referents in this DRS.

    This base implementation only raises ``NotImplementedError``.

    :param recursive: bool Also find discourse referents in subterms?
    :return: list of ``Variable`` objects
    :raise NotImplementedError: always
    """
    raise NotImplementedError()
def free(self):
    """
    Return the free variables of this DRS: everything free in a condition
    or in the consequent (when there is one), minus this DRS's own
    referents.

    :see: Expression.free()
    """
    unbound = set()
    for cond in self.conds:
        unbound = unbound | cond.free()
    if self.consequent:
        unbound.update(self.consequent.free())
    return unbound - set(self.refs)
def eliminate_equality(self):
    """
    Return a new DRS in which variable-to-variable equality conditions
    have been removed and, where the two sides differ, the second variable
    has been replaced by the first.  Remaining conditions and the
    consequent are processed recursively.
    """
    drs = self
    i = 0
    while i < len(drs.conds):
        cond = drs.conds[i]
        if (
            isinstance(cond, EqualityExpression)
            and isinstance(cond.first, AbstractVariableExpression)
            and isinstance(cond.second, AbstractVariableExpression)
        ):
            # drop condition i and the referent named by its right-hand side
            # NOTE(review): going through a set means the resulting referent
            # order need not follow drs.refs -- confirm no caller depends on it
            drs = DRS(
                list(set(drs.refs) - {cond.second.variable}),
                drs.conds[:i] + drs.conds[i + 1 :],
                drs.consequent,
            )
            if cond.second.variable != cond.first.variable:
                drs = drs.replace(cond.second.variable, cond.first, False, False)
                # conditions were rewritten: rescan from the beginning
                i = 0
            # offset the increment below, since condition i was removed
            i -= 1
        i += 1

    conds = []
    for cond in drs.conds:
        new_cond = cond.eliminate_equality()
        new_cond_simp = new_cond.simplify()
        # keep the condition unless it simplifies to a DRS with no
        # referents, no conditions and no consequent
        if (
            not isinstance(new_cond_simp, DRS)
            or new_cond_simp.refs
            or new_cond_simp.conds
            or new_cond_simp.consequent
        ):
            conds.append(new_cond)

    consequent = drs.consequent.eliminate_equality() if drs.consequent else None
    return DRS(drs.refs, conds, consequent)
def _order_ref_strings(self, refs):
    """
    Render ``refs`` as strings and return them grouped and sorted:
    other names first, then event variables, then function variables,
    then individual variables.

    Event variables are ordered by the number after their first two
    characters; function and individual variables by first character and
    then by the number after it.  A missing number counts as -1.
    """

    def number_after(name, prefix_len):
        digits = name[prefix_len:]
        return int(digits) if len(digits) != 0 else -1

    def letter_and_number(name):
        return (name[0], number_after(name, 1))

    names = ["%s" % ref for ref in refs]
    individuals, functions, events, others = [], [], [], []
    for name in names:
        if is_indvar(name):
            bucket = individuals
        elif is_funcvar(name):
            bucket = functions
        elif is_eventvar(name):
            bucket = events
        else:
            bucket = others
        bucket.append(name)

    ordered = sorted(others)
    ordered = ordered + sorted(events, key=lambda v: number_after(v, 2))
    ordered = ordered + sorted(functions, key=letter_and_number)
    ordered = ordered + sorted(individuals, key=letter_and_number)
    return ordered
def __str__(self):
    """
    Render the DRS as ``([refs],[conds])``; when a consequent is present,
    wrap it as an implication between the DRS and the consequent.
    """
    ref_text = ",".join(self._order_ref_strings(self.refs))
    cond_text = ", ".join(["%s" % cond for cond in self.conds])
    drs = "([{}],[{}])".format(ref_text, cond_text)
    if not self.consequent:
        return drs
    pieces = [
        DrtTokens.OPEN,
        drs,
        " ",
        DrtTokens.IMP,
        " ",
        "%s" % self.consequent,
        DrtTokens.CLOSE,
    ]
    text = pieces[0]
    for piece in pieces[1:]:
        text = text + piece
    return text
class DrtProposition(DrtExpression, Expression):
    """A DRS paired with a label, rendered as ``prop(label, drs)``."""

    def __init__(self, variable, drs):
        self.variable = variable
        self.drs = drs

    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
        """Replace ``variable`` by ``expression`` in the wrapped DRS; when
        ``variable`` is this proposition's own label, the label is replaced
        as well (``expression`` must then be a variable expression)."""
        label = self.variable
        if label == variable:
            assert isinstance(
                expression, DrtAbstractVariableExpression
            ), "Can only replace a proposition label with a variable"
            label = expression.variable
        new_drs = self.drs.replace(variable, expression, replace_bound, alpha_convert)
        return DrtProposition(label, new_drs)

    def eliminate_equality(self):
        reduced = self.drs.eliminate_equality()
        return DrtProposition(self.variable, reduced)

    def get_refs(self, recursive=False):
        """Return the wrapped DRS's referents when ``recursive`` is true,
        otherwise an empty list."""
        if recursive:
            return self.drs.get_refs(True)
        return []

    def __eq__(self, other):
        same_class = self.__class__ == other.__class__
        return same_class and self.variable == other.variable and self.drs == other.drs

    def __ne__(self, other):
        return not self == other

    __hash__ = Expression.__hash__

    def fol(self):
        return self.drs.fol()

    def _pretty(self):
        """Prefix the wrapped DRS's pretty lines: the second line carries
        ``label:``; every other line is padded to the same width."""
        label = "%s" % self.variable
        blank = " " * len(label)
        rendered = []
        for row, line in enumerate(self.drs._pretty()):
            if row == 1:
                rendered.append(label + ":" + line)
            else:
                rendered.append(blank + " " + line)
        return rendered

    def visit(self, function, combinator):
        """:see: Expression.visit()"""
        visited = function(self.drs)
        return combinator([visited])

    def visit_structured(self, function, combinator):
        """:see: Expression.visit_structured()"""
        return combinator(self.variable, function(self.drs))

    def __str__(self):
        return f"prop({self.variable}, {self.drs})"
def get_refs(self, recursive=False):
    """:see: AbstractExpression.get_refs()

    The bound variable always comes first; with ``recursive`` the referents
    reported by the body (``self.term``) follow it.
    """
    refs = [self.variable]
    if recursive:
        refs = refs + self.term.get_refs(True)
    return refs
class DrtOrExpression(DrtBooleanExpression, OrExpression):
    """Disjunction of two DRT expressions."""

    def fol(self):
        """Return the first-order disjunction of both translated operands."""
        left = self.first.fol()
        right = self.second.fol()
        return OrExpression(left, right)

    def _pretty_subex(self, subex):
        """Render an operand, trimming the outer column pair of a nested disjunction."""
        if not isinstance(subex, DrtOrExpression):
            return DrtBooleanExpression._pretty_subex(self, subex)
        # A nested disjunction is drawn without its first and last character
        # on every row, so chained '|' operands share one pair of parentheses.
        return [line[1:-1] for line in subex._pretty()]
def simplify(self):
    """Simplify both operands (and the consequent) and merge them if possible.

    When both simplified operands are ``DRS`` instances they are fused into a
    single ``DRS``; referents occurring in both are first renamed in the second
    operand to avoid collisions. Otherwise a new instance of this class is
    built from the simplified parts.
    """
    left = self.first.simplify()
    right = self.second.simplify()
    consequent = None
    if self.consequent:
        consequent = self.consequent.simplify()

    if not (isinstance(left, DRS) and isinstance(right, DRS)):
        return self.__class__(left, right, consequent)

    # Referents present on both sides would clash after the merge, so each
    # one is alpha-converted to a fresh variable in the right operand.
    shared = set(left.get_refs(True)) & set(right.get_refs(True))
    for ref in shared:
        fresh = DrtVariableExpression(unique_variable(ref))
        right = right.replace(ref, fresh, True)

    return DRS(left.refs + right.refs, left.conds + right.conds, consequent)
def visit(self, function, combinator):
    """:see: Expression.visit()

    Applies ``function`` to the first operand, the second operand and, when
    present, the consequent, then hands the list of results to ``combinator``.
    """
    parts = [self.first, self.second]
    if self.consequent:
        parts.append(self.consequent)
    return combinator([function(part) for part in parts])
class PossibleAntecedents(list, DrtExpression, Expression):
    """A list of candidate antecedent expressions for an unresolved pronoun."""

    def free(self):
        """Set of free variables."""
        return set(self)

    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
        """Replace all instances of variable v with expression E in self,
        where v is free in self.

        :return: a new ``PossibleAntecedents``; ``self`` is left untouched.
        """
        result = PossibleAntecedents()
        for item in self:
            # Build the substituted list in ``result``. Appending to ``self``
            # here would grow the list being iterated (the loop would never
            # finish) and would leave the returned list empty.
            if item == variable:
                result.append(expression)
            else:
                result.append(item)
        return result

    def _pretty(self):
        s = "%s" % self
        blank = " " * len(s)
        return [blank, blank, s]

    def __str__(self):
        return "[" + ",".join("%s" % it for it in self) + "]"
resolve_anaphora(expression, trail=[]): + if isinstance(expression, ApplicationExpression): + if expression.is_pronoun_function(): + possible_antecedents = PossibleAntecedents() + for ancestor in trail: + for ref in ancestor.get_refs(): + refex = expression.make_VariableExpression(ref) + + # ========================================================== + # Don't allow resolution to itself or other types + # ========================================================== + if refex.__class__ == expression.argument.__class__ and not ( + refex == expression.argument + ): + possible_antecedents.append(refex) + + if len(possible_antecedents) == 1: + resolution = possible_antecedents[0] + else: + resolution = possible_antecedents + return expression.make_EqualityExpression(expression.argument, resolution) + else: + r_function = resolve_anaphora(expression.function, trail + [expression]) + r_argument = resolve_anaphora(expression.argument, trail + [expression]) + return expression.__class__(r_function, r_argument) + + elif isinstance(expression, DRS): + r_conds = [] + for cond in expression.conds: + r_cond = resolve_anaphora(cond, trail + [expression]) + + # if the condition is of the form '(x = [])' then raise exception + if isinstance(r_cond, EqualityExpression): + if isinstance(r_cond.first, PossibleAntecedents): + # Reverse the order so that the variable is on the left + temp = r_cond.first + r_cond.first = r_cond.second + r_cond.second = temp + if isinstance(r_cond.second, PossibleAntecedents): + if not r_cond.second: + raise AnaphoraResolutionException( + "Variable '%s' does not " + "resolve to anything." 
class DrsDrawer:
    """Lay out and draw a DRT expression on a Tkinter canvas.

    Throughout this class ``x`` is the left edge and ``y`` is the top edge of
    the area an item is placed in, and every handler returns the
    ``(right, bottom)`` corner of what it placed.
    """

    BUFFER = 3  # Space between elements
    TOPSPACE = 10  # Space above whole DRS
    OUTERSPACE = 6  # Space to the left, right, and bottom of the whole DRS

    def __init__(self, drs, size_canvas=True, canvas=None):
        """
        :param drs: ``DrtExpression``, The DRS to be drawn
        :param size_canvas: bool, True if the canvas size should be the exact size of the DRS
        :param canvas: ``Canvas`` The canvas on which to draw the DRS.  If none is given, create a new canvas.
        """
        master = None
        if not canvas:
            master = Tk()
            master.title("DRT")

            font = Font(family="helvetica", size=12)

            if size_canvas:
                # Measure on a throwaway zero-size canvas first, then create
                # the real canvas with the measured dimensions.
                canvas = Canvas(master, width=0, height=0)
                canvas.font = font
                self.canvas = canvas
                (right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE)

                width = max(right + self.OUTERSPACE, 100)
                height = bottom + self.OUTERSPACE
                canvas = Canvas(master, width=width, height=height)  # , bg='white')
            else:
                canvas = Canvas(master, width=300, height=300)

            canvas.pack()
            canvas.font = font

        self.canvas = canvas
        self.drs = drs
        self.master = master

    def _get_text_height(self):
        """Get the height of a line of text"""
        return self.canvas.font.metrics("linespace")

    def draw(self, x=OUTERSPACE, y=TOPSPACE):
        """Draw the DRS.

        If this drawer created its own Tk root and is not running inside
        IDLE, enter the Tk main loop; otherwise return the bottom-rightmost
        point of the drawing.
        """
        self._handle(self.drs, self._draw_command, x, y)

        if self.master and not in_idle():
            self.master.mainloop()
        else:
            return self._visit(self.drs, x, y)

    def _visit(self, expression, x, y):
        """
        Return the bottom-rightmost point without actually drawing the item

        :param expression: the item to visit
        :param x: the left side of the current drawing area
        :param y: the top of the current drawing area
        :return: the bottom-rightmost point
        """
        return self._handle(expression, self._visit_command, x, y)

    def _draw_command(self, item, x, y):
        """
        Draw the given item at the given location

        :param item: the item to draw: a string, or the ``(right, bottom)``
            corner of a box whose top-left corner is ``(x, y)``
        :param x: the left side of the current drawing area
        :param y: the top of the current drawing area
        :return: the bottom-rightmost point
        """
        if isinstance(item, str):
            self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item)
        elif isinstance(item, tuple):
            # item is the lower-right of a box
            (right, bottom) = item
            self.canvas.create_rectangle(x, y, right, bottom)
            horiz_line_y = (
                y + self._get_text_height() + (self.BUFFER * 2)
            )  # the line separating refs from conds
            self.canvas.create_line(x, horiz_line_y, right, horiz_line_y)

        return self._visit_command(item, x, y)

    def _visit_command(self, item, x, y):
        """
        Return the bottom-rightmost point without actually drawing the item

        :param item: the item to visit: a string to measure, or an already
            computed ``(right, bottom)`` corner, which is returned unchanged
        :param x: the left side of the current drawing area
        :param y: the top of the current drawing area
        :return: the bottom-rightmost point
        """
        if isinstance(item, str):
            return (x + self.canvas.font.measure(item), y + self._get_text_height())
        elif isinstance(item, tuple):
            return item

    def _handle(self, expression, command, x=0, y=0):
        """
        Dispatch ``expression`` to the handler for its type.

        :param expression: the expression to handle
        :param command: the function to apply, either _draw_command or _visit_command
        :param x: the left side of the current drawing area
        :param y: the top of the current drawing area
        :return: the bottom-rightmost point
        :raises Exception: if no handler exists for the expression's class
        """
        if command == self._visit_command:
            # if we don't need to draw the item, then we can use the cached values
            try:
                # attempt to retrieve cached values
                right = expression._drawing_width + x
                bottom = expression._drawing_height + y
                return (right, bottom)
            except AttributeError:
                # the values have not been cached yet, so compute them
                pass

        if isinstance(expression, DrtAbstractVariableExpression):
            factory = self._handle_VariableExpression
        elif isinstance(expression, DRS):
            factory = self._handle_DRS
        elif isinstance(expression, DrtNegatedExpression):
            factory = self._handle_NegatedExpression
        elif isinstance(expression, DrtLambdaExpression):
            factory = self._handle_LambdaExpression
        elif isinstance(expression, BinaryExpression):
            factory = self._handle_BinaryExpression
        elif isinstance(expression, DrtApplicationExpression):
            factory = self._handle_ApplicationExpression
        elif isinstance(expression, PossibleAntecedents):
            factory = self._handle_VariableExpression
        elif isinstance(expression, DrtProposition):
            factory = self._handle_DrtProposition
        else:
            raise Exception(expression.__class__.__name__)

        (right, bottom) = factory(expression, command, x, y)

        # cache the values
        expression._drawing_width = right - x
        expression._drawing_height = bottom - y

        return (right, bottom)

    def _handle_VariableExpression(self, expression, command, x, y):
        return command("%s" % expression, x, y)

    def _handle_NegatedExpression(self, expression, command, x, y):
        # Find the width of the negation symbol
        right = self._visit_command(DrtTokens.NOT, x, y)[0]

        # Handle term
        (right, bottom) = self._handle(expression.term, command, right, y)

        # Handle the negation symbol now that we know the y-coordinate
        command(
            DrtTokens.NOT,
            x,
            self._get_centered_top(y, bottom - y, self._get_text_height()),
        )

        return (right, bottom)

    def _handle_DRS(self, expression, command, x, y):
        left = x + self.BUFFER  # indent the left side
        bottom = y + self.BUFFER  # indent the top

        # Handle Discourse Referents
        if expression.refs:
            refs = " ".join("%s" % r for r in expression.refs)
        else:
            # NOTE(review): whitespace runs in this copy of the file appear to
            # have been collapsed; confirm the width of this placeholder
            # against the upstream source.
            refs = " "
        (max_right, bottom) = command(refs, left, bottom)
        bottom += self.BUFFER * 2

        # Handle Conditions
        if expression.conds:
            for cond in expression.conds:
                (right, bottom) = self._handle(cond, command, left, bottom)
                max_right = max(max_right, right)
                bottom += self.BUFFER
        else:
            bottom += self._get_text_height() + self.BUFFER

        # Handle Box
        max_right += self.BUFFER
        return command((max_right, bottom), x, y)

    def _handle_ApplicationExpression(self, expression, command, x, y):
        function, args = expression.uncurry()
        if not isinstance(function, DrtAbstractVariableExpression):
            # It's not a predicate expression ("P(x,y)"), so leave arguments curried
            function = expression.function
            args = [expression.argument]

        # Get the max bottom of any element on the line
        function_bottom = self._visit(function, x, y)[1]
        max_bottom = max(
            [function_bottom] + [self._visit(arg, x, y)[1] for arg in args]
        )

        line_height = max_bottom - y

        # Handle 'function'
        function_drawing_top = self._get_centered_top(
            y, line_height, function._drawing_height
        )
        right = self._handle(function, command, x, function_drawing_top)[0]

        # Handle open paren
        centred_string_top = self._get_centered_top(
            y, line_height, self._get_text_height()
        )
        right = command(DrtTokens.OPEN, right, centred_string_top)[0]

        # Handle each arg
        for (i, arg) in enumerate(args):
            arg_drawing_top = self._get_centered_top(
                y, line_height, arg._drawing_height
            )
            right = self._handle(arg, command, right, arg_drawing_top)[0]

            if i + 1 < len(args):
                # since it's not the last arg, add a comma
                right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0]

        # Handle close paren
        right = command(DrtTokens.CLOSE, right, centred_string_top)[0]

        return (right, max_bottom)

    def _handle_LambdaExpression(self, expression, command, x, y):
        # Find the width of the lambda symbol and abstracted variables
        variables = DrtTokens.LAMBDA + "%s" % expression.variable + DrtTokens.DOT
        right = self._visit_command(variables, x, y)[0]

        # Handle term
        (right, bottom) = self._handle(expression.term, command, right, y)

        # Handle variables now that we know the y-coordinate
        command(
            variables, x, self._get_centered_top(y, bottom - y, self._get_text_height())
        )

        return (right, bottom)

    def _handle_BinaryExpression(self, expression, command, x, y):
        # Get the full height of the line, based on the operands
        first_height = self._visit(expression.first, 0, 0)[1]
        second_height = self._visit(expression.second, 0, 0)[1]
        line_height = max(first_height, second_height)

        # Handle open paren
        centred_string_top = self._get_centered_top(
            y, line_height, self._get_text_height()
        )
        right = command(DrtTokens.OPEN, x, centred_string_top)[0]

        # Handle the first operand
        first_height = expression.first._drawing_height
        (right, first_bottom) = self._handle(
            expression.first,
            command,
            right,
            self._get_centered_top(y, line_height, first_height),
        )

        # Handle the operator
        right = command(" %s " % expression.getOp(), right, centred_string_top)[0]

        # Handle the second operand
        second_height = expression.second._drawing_height
        (right, second_bottom) = self._handle(
            expression.second,
            command,
            right,
            self._get_centered_top(y, line_height, second_height),
        )

        # Handle close paren
        right = command(DrtTokens.CLOSE, right, centred_string_top)[0]

        return (right, max(first_bottom, second_bottom))

    def _handle_DrtProposition(self, expression, command, x, y):
        # Place the proposition label. Commands only understand strings and
        # (right, bottom) tuples, so the label is rendered to text first.
        right = command("%s" % expression.variable, x, y)[0]

        # Handle the labelled DRS, which a proposition stores as ``drs``
        # (it has no ``term`` attribute).
        (right, bottom) = self._handle(expression.drs, command, right, y)

        return (right, bottom)

    def _get_centered_top(self, top, full_height, item_height):
        """Get the y-coordinate of the point that a figure should start at if
        its height is 'item_height' and it needs to be centered in an area that
        starts at 'top' and is 'full_height' tall."""
        return top + (full_height - item_height) / 2
def test_draw():
    """Parse a fixed list of sample DRT expressions and draw each one in turn.

    :raises ValueError: if ``tkinter`` cannot be imported; the underlying
        ``ImportError`` is chained as the cause.
    """
    try:
        # Imported only to check that tkinter is available.
        from tkinter import Tk  # noqa: F401
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ValueError("tkinter is required, but it's not available.") from err

    expressions = [
        r"x",
        r"([],[])",
        r"([x],[])",
        r"([x],[man(x)])",
        r"([x,y],[sees(x,y)])",
        r"([x],[man(x), walks(x)])",
        r"\x.([],[man(x), walks(x)])",
        r"\x y.([],[sees(x,y)])",
        r"([],[(([],[walks(x)]) + ([],[runs(x)]))])",
        r"([x],[man(x), -([],[walks(x)])])",
        r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])",
    ]

    # Use a loop name distinct from the exception name bound above.
    for source in expressions:
        d = DrtExpression.fromstring(source)
        d.draw()
def set2rel(s):
    """
    Convert a set containing individuals (strings or numbers) into a set of
    unary tuples. Any tuples of strings already in the set are passed through
    unchanged.

    For example:
      - set(['a', 'b']) => set([('a',), ('b',)])
      - set([3, 27]) => set([('3',), ('27',)])

    :type s: set
    :rtype: set of tuple of str
    """
    new = set()
    for elem in s:
        if isinstance(elem, str):
            new.add((elem,))
        elif isinstance(elem, int):
            # Wrap the stringified number in a 1-tuple like any other
            # individual; a bare string would not be a unary tuple.
            new.add((str(elem),))
        else:
            new.add(elem)
    return new
class Valuation(dict):
    """
    A model-theoretic valuation of non-logical constants, stored as a dict.

    Keys are the symbols being interpreted. Values are individuals (strings),
    truth values (bools) or relations (sets, normalised through ``set2rel``).

    Looking up a symbol that is not present raises ``Undefined``.
    """

    def __init__(self, xs):
        """
        :param xs: a list of (symbol, value) pairs.
        :raises ValueError: if a value is neither a str, a bool nor a set.
        """
        super().__init__()
        for symbol, value in xs:
            if isinstance(value, (str, bool)):
                self[symbol] = value
                continue
            if isinstance(value, set):
                self[symbol] = set2rel(value)
                continue
            msg = textwrap.fill(
                "Error in initializing Valuation. "
                "Unrecognized value for symbol '%s':\n%s" % (symbol, value),
                width=66,
            )
            raise ValueError(msg)

    def __getitem__(self, key):
        if key not in self:
            raise Undefined("Unknown expression: '%s'" % key)
        return dict.__getitem__(self, key)

    def __str__(self):
        return pformat(self)

    @property
    def domain(self):
        """Set-theoretic domain of the value-space of a Valuation."""
        individuals = set()
        for value in self.values():
            if isinstance(value, str):
                individuals.add(value)
            elif not isinstance(value, bool):
                # A relation: collect every non-None member of every tuple.
                for tuple_ in value:
                    for member in tuple_:
                        if member is not None:
                            individuals.add(member)
        return individuals

    @property
    def symbols(self):
        """The non-logical constants which the Valuation recognizes."""
        return sorted(self)

    @classmethod
    def fromstring(cls, s):
        """Build a valuation from its string form via ``read_valuation``."""
        return read_valuation(s)
re.compile( + r"""\s* + (\([^)]+\)) # tuple-expression + \s*""", + re.VERBOSE, +) + + +def _read_valuation_line(s): + """ + Read a line in a valuation file. + + Lines are expected to be of the form:: + + noosa => n + girl => {g1, g2} + chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} + + :param s: input line + :type s: str + :return: a pair (symbol, value) + :rtype: tuple + """ + pieces = _VAL_SPLIT_RE.split(s) + symbol = pieces[0] + value = pieces[1] + # check whether the value is meant to be a set + if value.startswith("{"): + value = value[1:-1] + tuple_strings = _TUPLES_RE.findall(value) + # are the set elements tuples? + if tuple_strings: + set_elements = [] + for ts in tuple_strings: + ts = ts[1:-1] + element = tuple(_ELEMENT_SPLIT_RE.split(ts)) + set_elements.append(element) + else: + set_elements = _ELEMENT_SPLIT_RE.split(value) + value = set(set_elements) + return symbol, value + + +def read_valuation(s, encoding=None): + """ + Convert a valuation string into a valuation. + + :param s: a valuation string + :type s: str + :param encoding: the encoding of the input string, if it is binary + :type encoding: str + :return: a ``nltk.sem`` valuation + :rtype: Valuation + """ + if encoding is not None: + s = s.decode(encoding) + statements = [] + for linenum, line in enumerate(s.splitlines()): + line = line.strip() + if line.startswith("#") or line == "": + continue + try: + statements.append(_read_valuation_line(line)) + except ValueError as e: + raise ValueError(f"Unable to parse line {linenum}: {line}") from e + return Valuation(statements) + + +class Assignment(dict): + r""" + A dictionary which represents an assignment of values to variables. + + An assignment can only assign values from its domain. 
+ + If an unknown expression *a* is passed to a model *M*\ 's + interpretation function *i*, *i* will first check whether *M*\ 's + valuation assigns an interpretation to *a* as a constant, and if + this fails, *i* will delegate the interpretation of *a* to + *g*. *g* only assigns values to individual variables (i.e., + members of the class ``IndividualVariableExpression`` in the ``logic`` + module. If a variable is not assigned a value by *g*, it will raise + an ``Undefined`` exception. + + A variable *Assignment* is a mapping from individual variables to + entities in the domain. Individual variables are usually indicated + with the letters ``'x'``, ``'y'``, ``'w'`` and ``'z'``, optionally + followed by an integer (e.g., ``'x0'``, ``'y332'``). Assignments are + created using the ``Assignment`` constructor, which also takes the + domain as a parameter. + + >>> from nltk.sem.evaluate import Assignment + >>> dom = set(['u1', 'u2', 'u3', 'u4']) + >>> g3 = Assignment(dom, [('x', 'u1'), ('y', 'u2')]) + >>> g3 == {'x': 'u1', 'y': 'u2'} + True + + There is also a ``print`` format for assignments which uses a notation + closer to that in logic textbooks: + + >>> print(g3) + g[u1/x][u2/y] + + It is also possible to update an assignment using the ``add`` method: + + >>> dom = set(['u1', 'u2', 'u3', 'u4']) + >>> g4 = Assignment(dom) + >>> g4.add('x', 'u1') + {'x': 'u1'} + + With no arguments, ``purge()`` is equivalent to ``clear()`` on a dictionary: + + >>> g4.purge() + >>> g4 + {} + + :param domain: the domain of discourse + :type domain: set + :param assign: a list of (varname, value) associations + :type assign: list + """ + + def __init__(self, domain, assign=None): + super().__init__() + self.domain = domain + if assign: + for (var, val) in assign: + assert val in self.domain, "'{}' is not in the domain: {}".format( + val, + self.domain, + ) + assert is_indvar(var), ( + "Wrong format for an Individual Variable: '%s'" % var + ) + self[var] = val + self.variant = None + 
def purge(self, var=None):
    """
    Drop bindings from the assignment and rebuild ``self.variant``.

    :param var: the variable (key) whose binding is removed. When it is
        omitted or falsy, every binding is removed instead.
    """
    if not var:
        self.clear()
    else:
        del self[var]
    # Keep the pretty-printing list in step with the dictionary contents.
    self._addvariant()
def satisfy(self, parsed, g, trace=None):
    """
    Recursive interpretation function for a formula of first-order logic.

    Complex expressions are evaluated by recursing on their parts; any
    other expression is passed to the interpretation function ``i``,
    which raises ``Undefined`` when it cannot find a value.

    The assignment ``g`` is never modified: every branch that binds a
    variable (quantifiers, iota, lambda) works on a copy.

    :param parsed: An expression of ``logic``.
    :type g: Assignment
    :param g: an assignment to individual variables.
    :param trace: only forwarded to ``i`` for atomic expressions.
    :return: a truth value, a dict (for a lambda expression), or whatever
        ``i`` returns for an atomic expression.
    """

    if isinstance(parsed, ApplicationExpression):
        function, arguments = parsed.uncurry()
        if isinstance(function, AbstractVariableExpression):
            # It's a predicate expression ("P(x,y)"), so use uncurried arguments
            funval = self.satisfy(function, g)
            argvals = tuple(self.satisfy(arg, g) for arg in arguments)
            return argvals in funval
        else:
            # It must be a lambda expression, so use curried form
            funval = self.satisfy(parsed.function, g)
            argval = self.satisfy(parsed.argument, g)
            return funval[argval]
    elif isinstance(parsed, NegatedExpression):
        return not self.satisfy(parsed.term, g)
    elif isinstance(parsed, AndExpression):
        return self.satisfy(parsed.first, g) and self.satisfy(parsed.second, g)
    elif isinstance(parsed, OrExpression):
        return self.satisfy(parsed.first, g) or self.satisfy(parsed.second, g)
    elif isinstance(parsed, ImpExpression):
        return (not self.satisfy(parsed.first, g)) or self.satisfy(parsed.second, g)
    elif isinstance(parsed, IffExpression):
        return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g)
    elif isinstance(parsed, EqualityExpression):
        return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g)
    elif isinstance(parsed, AllExpression):
        new_g = g.copy()
        for u in self.domain:
            new_g.add(parsed.variable.name, u)
            if not self.satisfy(parsed.term, new_g):
                return False
        return True
    elif isinstance(parsed, ExistsExpression):
        new_g = g.copy()
        for u in self.domain:
            new_g.add(parsed.variable.name, u)
            if self.satisfy(parsed.term, new_g):
                return True
        return False
    elif isinstance(parsed, IotaExpression):
        new_g = g.copy()
        for u in self.domain:
            new_g.add(parsed.variable.name, u)
            if self.satisfy(parsed.term, new_g):
                return True
        return False
    elif isinstance(parsed, LambdaExpression):
        cf = {}
        var = parsed.variable.name
        # Bind the lambda variable on a copy, as the quantifier branches do,
        # so the binding does not leak into the caller's assignment.
        new_g = g.copy()
        for u in self.domain:
            new_g.add(var, u)
            val = self.satisfy(parsed.term, new_g)
            # NB the dict would be a lot smaller if we do this:
            # if val: cf[u] = val
            # But then need to deal with cases where f(a) should yield
            # a function rather than just False.
            cf[u] = val
        return cf
    else:
        return self.i(parsed, g, trace)
+ """ + + spacer = " " + indent = spacer + (spacer * nesting) + candidates = [] + + if isinstance(varex, str): + var = Variable(varex) + else: + var = varex + + if var in parsed.free(): + if trace: + print() + print( + (spacer * nesting) + + f"Open formula is '{parsed}' with assignment {g}" + ) + for u in self.domain: + new_g = g.copy() + new_g.add(var.name, u) + if trace and trace > 1: + lowtrace = trace - 1 + else: + lowtrace = 0 + value = self.satisfy(parsed, new_g, lowtrace) + + if trace: + print(indent + "(trying assignment %s)" % new_g) + + # parsed == False under g[u/var]? + if value == False: + if trace: + print(indent + f"value of '{parsed}' under {new_g} is False") + + # so g[u/var] is a satisfying assignment + else: + candidates.append(u) + if trace: + print(indent + f"value of '{parsed}' under {new_g} is {value}") + + result = {c for c in candidates} + # var isn't free in parsed + else: + raise Undefined(f"{var.name} is not free in {parsed}") + + return result + + +# ////////////////////////////////////////////////////////////////////// +# Demo.. 
def folmodel(quiet=False, trace=None):
    """
    Example of a first-order model.

    Builds the valuation, domain, model and assignment and publishes them
    as the module globals ``val2``, ``v2``, ``dom2``, ``m2`` and ``g2``.

    :param quiet: if true, only build the globals and print nothing
    :param trace: accepted so the function can be called like the other
        demos; not used here
    """

    global val2, v2, dom2, m2, g2

    v2 = [
        ("adam", "b1"),
        ("betty", "g1"),
        ("fido", "d1"),
        ("girl", {"g1", "g2"}),
        ("boy", {"b1", "b2"}),
        ("dog", {"d1"}),
        ("love", {("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")}),
    ]
    val2 = Valuation(v2)
    dom2 = val2.domain
    m2 = Model(dom2, val2)
    g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")])

    if not quiet:
        print()
        print("*" * mult)
        print("Models Demo")
        print("*" * mult)
        print("Model m2:\n", "-" * 14, "\n", m2)
        print("Variable assignment = ", g2)

        exprs = ["adam", "boy", "love", "walks", "x", "y", "z"]
        parsed_exprs = [Expression.fromstring(e) for e in exprs]

        print()
        for parsed in parsed_exprs:
            try:
                print(
                    "The interpretation of '%s' in m2 is %s"
                    % (parsed, m2.i(parsed, g2))
                )
            except Undefined:
                print("The interpretation of '%s' in m2 is Undefined" % parsed)

        # Every argument list must be a tuple: a bare ("adam") is just the
        # string "adam" and would be iterated character by character below.
        applications = [
            ("boy", ("adam",)),
            ("walks", ("adam",)),
            ("love", ("adam", "y")),
            ("love", ("y", "adam")),
        ]

        for (fun, args) in applications:
            try:
                funval = m2.i(Expression.fromstring(fun), g2)
                argsval = tuple(m2.i(Expression.fromstring(arg), g2) for arg in args)
                print(f"{fun}({args}) evaluates to {argsval in funval}")
            except Undefined:
                print(f"{fun}({args}) evaluates to Undefined")
def demo(num=0, trace=None):
    """
    Run some demos.

     - num = 1: propositional logic demo
     - num = 2: first order model demo (only if trace is set)
     - num = 3: first order sentences demo
     - num = 4: satisfaction of open formulas demo
     - any other value: run all the demos

    :param trace: trace = 1, or trace = 2 for more verbose tracing
    """
    demos = {1: propdemo, 2: folmodel, 3: foldemo, 4: satdemo}

    # Look the demo up before calling it, so that a KeyError raised inside
    # a demo propagates instead of being mistaken for an unknown ``num``.
    selected = demos.get(num)
    if selected is not None:
        selected(trace=trace)
    else:
        for run in demos.values():
            run(trace=trace)
class LinearLogicParser(LogicParser):
    """A linear logic expression parser."""

    def __init__(self):
        LogicParser.__init__(self)

        # Application binds tightest, then the linear implication.
        self.operator_precedence = {APP: 1, Tokens.IMP: 2, None: 3}
        self.right_associated_operations += [Tokens.IMP]

    def get_all_symbols(self):
        """Return the symbols the tokenizer should split on."""
        return Tokens.TOKENS

    def handle(self, tok, context):
        """
        Dispatch on the token: an open paren starts a parenthesised
        expression, anything that is not a known token is a variable or
        constant name. Other tokens yield ``None``.
        """
        if tok == Tokens.OPEN:
            return self.handle_open(tok, context)
        if tok not in Tokens.TOKENS:
            return self.handle_variable(tok, context)
        return None

    def get_BooleanExpression_factory(self, tok):
        """Return the expression class for a binary operator token, if any."""
        return ImpExpression if tok == Tokens.IMP else None

    def make_BooleanExpression(self, factory, first, second):
        """Build the binary expression from its two operands."""
        return factory(first, second)

    def attempt_ApplicationExpression(self, expression, context):
        """Attempt to make an application expression. If the next token is
        an open paren and application has priority in this context, parse
        the parenthesised argument and apply ``expression`` to it.
        Otherwise, return ``expression`` unchanged."""
        if not self.has_priority(APP, context):
            return expression
        if not (self.inRange(0) and self.token(0) == Tokens.OPEN):
            return expression

        self.token()  # swallow the open paren
        argument = self.process_next_expression(APP)
        self.assertNextToken(Tokens.CLOSE)
        return ApplicationExpression(expression, argument, None)

    def make_VariableExpression(self, name):
        """Names starting with an upper-case letter are variables; all
        others are constants."""
        factory = VariableExpression if name[0].isupper() else ConstantExpression
        return factory(name)
def simplify(self, bindings=None):
    """
    Resolve this atom against ``bindings``.

    :param bindings: ``BindingDict`` A dictionary of bindings used to simplify
    :return: the expression ``self`` is bound to in ``bindings``, or
        ``self`` when no (truthy) bindings are given or ``self`` is not
        bound in them
    """
    if not bindings or self not in bindings:
        return self
    return bindings[self]
class VariableExpression(AtomicExpression):
    def unify(self, other, bindings):
        """
        Unify this variable with ``other``: 'self' must not already be
        bound to anything other than 'other'.

        :param other: ``Expression``
        :param bindings: ``BindingDict`` A dictionary of all current bindings
        :return: ``bindings`` itself when 'self' equals 'other', otherwise a
            new ``BindingDict`` combining 'bindings' with the new binding
        :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings'
        """
        assert isinstance(other, Expression)
        try:
            return (
                bindings
                if self == other
                else bindings + BindingDict([(self, other)])
            )
        except VariableBindingException as e:
            raise UnificationException(self, other, bindings) from e
def compile_neg(self, index_counter, glueFormulaFactory):
    """
    From Iddo Lev's PhD Dissertation p108-109

    The antecedent is compiled positively and the consequent negatively.
    A fresh index is then drawn from ``index_counter``, recorded as a
    dependency of the compiled consequent, and used to create one new
    glue formula for the compiled antecedent.

    :param index_counter: ``Counter`` for unique indices
    :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas
    :return: (``Expression``,list of ``GlueFormula``) for the compiled linear logic and any newly created glue formulas
    """
    ante, ante_formulas = self.antecedent.compile_pos(
        index_counter, glueFormulaFactory
    )
    cons, cons_formulas = self.consequent.compile_neg(
        index_counter, glueFormulaFactory
    )

    index = index_counter.get()
    cons.dependencies.append(index)
    hypothesis = glueFormulaFactory(f"v{index}", ante, {index})

    return (cons, ante_formulas + cons_formulas + [hypothesis])
def __str__(self):
    """Render as the parenthesised antecedent, implication token and consequent."""
    return (
        f"{Tokens.OPEN}{self.antecedent} {Tokens.IMP} "
        f"{self.consequent}{Tokens.CLOSE}"
    )
def __hash__(self):
    """
    Hash on the function and the argument, the same two fields that
    ``__eq__`` compares.

    An application stores ``function`` and ``argument``; it has no
    ``antecedent`` or ``consequent`` attribute of its own.
    """
    return hash(f"{hash(self.function)}{Tokens.OPEN}{hash(self.argument)}")
def __add__(self, other):
    """
    Merge two binding dictionaries into a fresh one.

    :param other: ``BindingDict`` The dict with which to combine self
    :return: ``BindingDict`` A new dict containing all the elements of both parameters
    :raise VariableBindingException: If the parameter dictionaries are not consistent with each other
    """
    try:
        merged = BindingDict()
        # Item assignment performs the consistency check, so a clash
        # between the two sources raises while copying.
        for source in (self.d, other.d):
            for variable, binding in source.items():
                merged[variable] = binding
        return merged
    except VariableBindingException as e:
        raise VariableBindingException(
            "Attempting to add two contradicting VariableBindingsLists: %s, %s"
            % (self, other)
        ) from e
class UnificationException(Exception):
    """Raised when two expressions cannot be unified under some bindings."""

    def __init__(self, a, b, bindings):
        message = "Cannot unify {} with {} given {}".format(a, b, bindings)
        super().__init__(message)
def binding_ops():
    """
    Binding operators

    Print one line per binding operator: its name followed by its token.
    """
    # One name per token below; zip() silently drops unmatched tokens.
    names = ["existential", "universal", "lambda", "iota"]
    for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA, Tokens.IOTA]):
        print("%-15s\t%s" % pair)
def parse(self, data, signature=None):
    """
    Parse the expression.

    Trailing whitespace is stripped, the text is tokenized into the
    parser's buffer, and one complete expression is read from it. Any
    token left over afterwards is an error.

    :param data: str for the input to be parsed
    :param signature: ``dict`` that maps variable names to type
        strings
    :returns: a parsed Expression
    :raise LogicalExpressionException: on any parse error; the message
        shows the input with a caret under the offending position
    """
    data = data.rstrip()

    self._currentIndex = 0
    tokens, offsets = self.process(data)
    self._buffer = tokens

    try:
        expression = self.process_next_expression(None)
        if self.inRange(0):
            raise UnexpectedTokenException(self._currentIndex + 1, self.token(0))
    except LogicalExpressionException as e:
        # Map the token index carried by the error back to a column.
        caret_pad = " " * offsets[e.index - 1]
        msg = f"{e}\n{data}\n{caret_pad}^"
        raise LogicalExpressionException(None, msg) from e

    if self.type_check:
        expression.typecheck(signature)

    return expression
def process_quoted_token(self, data_idx, data):
    """
    Read a quoted token that starts at ``data[data_idx]``, if there is one.

    The character at ``data_idx`` is compared with the start character of
    each ``(start, end, escape, incl_quotes)`` tuple in
    ``self.quote_chars``; the first tuple whose start character matches is
    used and the rest are ignored.  Characters are then copied up to the
    matching end character.  An escape character causes the following
    character to be copied verbatim.  When ``incl_quotes`` is true the
    quote and escape characters are kept in the token.

    :param data_idx: index in ``data`` at which a quote may open
    :param data: the string being tokenized
    :return: ``(token, next_idx)`` where ``next_idx`` is the index just
        past the closing quote, or ``("", data_idx)`` if no quote opens
        at ``data_idx``
    :raises LogicalExpressionException: if the input ends before the
        closing quote, ends right after an escape character, or the
        quoted token is empty
    """
    token = ""
    i = data_idx
    opener = data[data_idx]
    for start, end, escape, incl_quotes in self.quote_chars:
        if opener != start:
            continue
        if incl_quotes:
            token += opener
        i += 1
        while True:
            # Test for exhaustion *before* indexing: a quote that opens on
            # the very last character used to escape as a bare IndexError
            # rather than the parser's own exception.
            if i >= len(data):
                raise LogicalExpressionException(
                    None, "End of input reached. " "Expected: [%s]" % end
                )
            if data[i] == end:
                break
            if data[i] == escape:
                if incl_quotes:
                    token += data[i]
                i += 1
                if i >= len(data):  # if there are no more chars
                    raise LogicalExpressionException(
                        None,
                        "End of input reached. "
                        "Escape character [%s] found at end." % escape,
                    )
            # Either an ordinary character or the one being escaped.
            token += data[i]
            i += 1
        if incl_quotes:
            token += data[i]
        i += 1
        if not token:
            raise LogicalExpressionException(None, "Empty quoted token found")
        break
    return token, i
def attempt_adjuncts(self, expression, context):
    """
    Repeatedly try to extend ``expression`` with whatever follows it.

    Each pass offers the expression to the equality, application and
    boolean handlers, in that order.  Passes continue until one completes
    without the read position (``self._currentIndex``) moving.

    :param expression: the expression parsed so far
    :param context: the operator context handed on to each handler
    :return: the (possibly extended) expression
    """
    last_index = None
    while last_index != self._currentIndex:  # while adjuncts are added
        last_index = self._currentIndex
        for attempt in (
            self.attempt_EqualityExpression,
            self.attempt_ApplicationExpression,
            self.attempt_BooleanExpression,
        ):
            expression = attempt(expression, context)
    return expression
def handle_variable(self, tok, context):
    """
    Build the expression for a variable token and any argument list
    that directly follows it.

    The result is one of:
      1) a predicate expression: sees(x,y)
      2) an application expression: P(x)
      3) a solo variable: john OR x

    :param tok: the variable token that was just read
    :param context: the operator context (not used here)
    :return: the variable expression, or the curried application built
        from it and its arguments
    :raises LogicalExpressionException: if an argument list follows a
        token that is neither a function variable nor a constant
    """
    expr = self.make_VariableExpression(tok)

    # No opening paren next: this is a solo variable.
    if not (self.inRange(0) and self.token(0) == Tokens.OPEN):
        return expr

    # The predicate has arguments
    if not isinstance(expr, (FunctionVariableExpression, ConstantExpression)):
        raise LogicalExpressionException(
            self._currentIndex,
            "'%s' is an illegal predicate name. "
            "Individual variables may not be used as "
            "predicates." % tok,
        )
    self.token()  # swallow the Open Paren

    # curry the arguments: one application per comma-separated argument
    while True:
        expr = self.make_ApplicationExpression(
            expr, self.process_next_expression(APP)
        )
        if not (self.inRange(0) and self.token(0) == Tokens.COMMA):
            break
        self.token()  # swallow the comma
    self.assertNextToken(Tokens.CLOSE)
    return expr
def handle_lambda(self, tok, context):
    """
    Parse the rest of a lambda expression after its operator token.

    One or more variables are read, then an optional dot, then the body.
    The body is parsed with ``tok`` as its operator context.  Several
    variables produce nested lambda expressions, the first variable
    outermost.

    :param tok: the lambda operator token that was just read
    :param context: the enclosing operator context (not used here)
    :return: the lambda expression
    :raises ExpectedMoreTokensException: if the input ends before a
        variable or before the body
    """
    if not self.inRange(0):
        raise ExpectedMoreTokensException(
            self._currentIndex + 2,
            message="Variable and Expression expected following lambda operator.",
        )

    bound = [self.get_next_token_variable("abstracted")]
    while True:
        # There must be a next token, and a dot must not be the last one.
        body_follows = self.inRange(0) and not (
            self.token(0) == Tokens.DOT and not self.inRange(1)
        )
        if not body_follows:
            raise ExpectedMoreTokensException(
                self._currentIndex + 2, message="Expression expected."
            )
        if not self.isvariable(self.token(0)):
            break
        # Support expressions like: \x y.M == \x.\y.M
        bound.append(self.get_next_token_variable("abstracted"))

    if self.inRange(0) and self.token(0) == Tokens.DOT:
        self.token()  # swallow the dot

    body = self.process_next_expression(tok)
    # Wrap from the innermost (last-read) variable outwards.
    for variable in reversed(bound):
        body = self.make_LambdaExpression(variable, body)
    return body
def attempt_EqualityExpression(self, expression, context):
    """
    Attempt to make an equality expression.

    If the next token is an equality or inequality operator that has
    priority over ``context``, the operator and its right-hand side are
    read and an equality expression is returned; for an inequality
    operator that expression is additionally negated.  Otherwise the
    parameter is returned unchanged.

    :param expression: the left-hand side parsed so far
    :param context: the enclosing operator context
    :return: the equality (or negated equality) expression, or
        ``expression`` itself
    """
    if not self.inRange(0):
        return expression

    tok = self.token(0)
    is_equality_op = tok in Tokens.EQ_LIST + Tokens.NEQ_LIST
    if not is_equality_op or not self.has_priority(tok, context):
        return expression

    self.token()  # swallow the "=" or "!="
    equality = self.make_EqualityExpression(
        expression, self.process_next_expression(tok)
    )
    if tok in Tokens.NEQ_LIST:
        return self.make_NegatedExpression(equality)
    return equality
def get_BooleanExpression_factory(self, tok):
    """
    Map a boolean operator token to the expression class that represents it.

    This method serves as a hook for other logic parsers that
    have different boolean operators.

    :param tok: the token to look up
    :return: ``AndExpression``, ``OrExpression``, ``ImpExpression`` or
        ``IffExpression`` according to which token list contains
        ``tok``; ``None`` if it is in none of them
    """
    operator_table = (
        (Tokens.AND_LIST, AndExpression),
        (Tokens.OR_LIST, OrExpression),
        (Tokens.IMP_LIST, ImpExpression),
        (Tokens.IFF_LIST, IffExpression),
    )
    for spellings, factory in operator_table:
        if tok in spellings:
            return factory
    return None
def has_priority(self, operation, context):
    """
    Decide whether ``operation`` may be applied inside ``context``.

    Both arguments are looked up in ``self.operator_precedence``.  The
    operation has priority when its precedence number is strictly lower
    than the context's.  It also has priority on a tie if it is listed in
    ``self.right_associated_operations``.

    :param operation: key of the operator about to be applied
    :param context: key of the enclosing operator
    :return: True if the operation has priority, False otherwise
    """
    op_rank = self.operator_precedence[operation]
    ctx_rank = self.operator_precedence[context]
    if op_rank < ctx_rank:
        return True
    return operation in self.right_associated_operations and op_rank == ctx_rank
def assertToken(self, tok, expected):
    """
    Check an already-read token against what the grammar expects.

    :param tok: the token to check
    :param expected: either a single token, compared for equality, or a
        ``list`` of acceptable tokens
    :raises UnexpectedTokenException: if ``tok`` does not match; the
        exception is given the current read position, the token and
        ``expected``
    """
    if isinstance(expected, list):
        mismatch = tok not in expected
    else:
        mismatch = tok != expected
    if mismatch:
        raise UnexpectedTokenException(self._currentIndex, tok, expected)
@total_ordering
class Variable:
    """
    A named variable.

    Two variables are equal when their names are equal; they hash and
    order by name.  Comparing the order of a variable with anything that
    is not a ``Variable`` raises ``TypeError``.
    """

    def __init__(self, name):
        """
        :param name: the name of the variable
        """
        assert isinstance(name, str), "%s is not a string" % name
        self.name = name

    def __str__(self):
        return self.name

    def __repr__(self):
        return "Variable('%s')" % self.name

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        if not isinstance(other, Variable):
            return False
        return self.name == other.name

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        if isinstance(other, Variable):
            return self.name < other.name
        raise TypeError

    def substitute_bindings(self, bindings):
        """
        :param bindings: mapping that is queried with this variable
        :return: the value bound to this variable, or the variable
            itself when it has no binding
        """
        return bindings.get(self, self)
class Type:
    """
    Base class for the types of logical expressions.

    ``repr`` and ``hash`` are both derived from the string form of the
    instance, so they rely on ``__str__`` as provided by a subclass.
    """

    def __hash__(self):
        return hash(str(self))

    def __repr__(self):
        return str(self)

    @classmethod
    def fromstring(cls, s):
        """
        :param s: the string form of a type
        :return: the result of ``read_type(s)``
        """
        return read_type(s)
class BasicType(Type):
    """
    A non-complex type.

    Two basic types are equal when both are ``BasicType`` instances and
    their string forms are the same.
    """

    def __eq__(self, other):
        if not isinstance(other, BasicType):
            return False
        return str(self) == str(other)

    def __ne__(self, other):
        return not (self == other)

    # Defining __eq__ would otherwise disable hashing for this class.
    __hash__ = Type.__hash__

    def matches(self, other):
        """
        :param other: the type to compare with
        :return: True if ``other`` is ``ANY_TYPE`` or equal to this type
        """
        return other == ANY_TYPE or self == other

    def resolve(self, other):
        """
        :param other: the type to resolve against
        :return: this type if it matches ``other``, otherwise None
        """
        return self if self.matches(other) else None
def read_type(type_string):
    """
    Parse the string form of a type into a type object.

    Spaces are ignored.  A string starting with ``<`` is read as a
    complex type ``<first,second>``, split at the first comma that is
    nested exactly one bracket deep; both halves are parsed recursively.
    Otherwise the first character selects a basic type by matching the
    string form of ``ENTITY_TYPE``, ``TRUTH_TYPE``, ``EVENT_TYPE`` or
    ``ANY_TYPE``.

    :param type_string: the string to parse
    :type type_string: str
    :return: a ``ComplexType`` or one of the basic type constants
    :raises LogicalExpressionException: if the string (or a component of
        a complex type) is empty, or starts with an unrecognised character
    :raises AssertionError: if ``type_string`` is not a str, or a complex
        type is not closed by ``>`` or has unbalanced brackets
    """
    assert isinstance(type_string, str)
    type_string = type_string.replace(" ", "")  # remove spaces

    if not type_string:
        # Every branch below indexes ``type_string[0]``; without this guard
        # an empty string (or an empty half such as in "<,t>") surfaced as
        # a bare IndexError instead of the parser's own exception.
        raise LogicalExpressionException(None, "Unexpected end of type string.")

    if type_string[0] == "<":
        assert type_string[-1] == ">"
        paren_count = 0
        for i, char in enumerate(type_string):
            if char == "<":
                paren_count += 1
            elif char == ">":
                paren_count -= 1
                assert paren_count > 0
            elif char == ",":
                # Only a comma at depth one separates the two halves.
                if paren_count == 1:
                    break
        return ComplexType(
            read_type(type_string[1:i]), read_type(type_string[i + 1 : -1])
        )

    # EVENT_TYPE is included so that the string form of every basic type
    # defined in this module can be read back.
    for basic in (ENTITY_TYPE, TRUTH_TYPE, EVENT_TYPE, ANY_TYPE):
        if type_string[0] == "%s" % basic:
            return basic

    raise LogicalExpressionException(
        None, "Unexpected character: '%s'." % type_string[0]
    )
def typecheck(expressions, signature=None):
    """
    Ensure correct typing across a collection of ``Expression`` objects.

    Each expression is type-checked in turn, threading the signature
    returned by one call into the next, so that the last call yields the
    master signature.  Every expression except the last is then checked
    once more against that master signature.

    :param expressions: any iterable of expressions (list, tuple, set,
        generator, ...)
    :param signature: dict that maps variable names to types (or string
        representations of types)
    :return: the master signature; ``signature`` itself when
        ``expressions`` is empty
    """
    # The input is walked twice and sliced, which sets and one-shot
    # iterators do not support, so take a list copy first.
    expressions = list(expressions)

    # typecheck and create master signature
    for expression in expressions:
        signature = expression.typecheck(signature)
    # apply master signature to all expressions; the last one was already
    # checked against the complete signature in the loop above
    for expression in expressions[:-1]:
        expression.typecheck(signature)
    return signature
def equiv(self, other, prover=None):
    """
    Check for logical equivalence.

    The simplified forms of ``self`` and ``other`` are joined into a
    biconditional (self <-> other), which is handed to the theorem
    prover; the prover's verdict is returned.

    :param other: an ``Expression`` to check equality against
    :param prover: a ``nltk.inference.api.Prover``; when omitted, a
        ``Prover9`` instance is created
    :return: whatever ``prover.prove`` returns for the biconditional
    """
    assert isinstance(other, Expression), "%s is not an Expression" % other

    if prover is None:
        # Imported here rather than at module level.
        from nltk.inference import Prover9 as default_prover_class

        prover = default_prover_class()

    lhs = self.simplify()
    rhs = other.simplify()
    return prover.prove(IffExpression(lhs, rhs))
def typecheck(self, signature=None):
    """
    Infer and check types.  Raise exceptions if necessary.

    The supplied signature is turned into a working table that maps each
    variable name to a list of variable expressions carrying its type.
    ``_set_type`` then fills that table in for this expression.

    :param signature: dict that maps variable names to types (or string
        representations of types)
    :return: the signature, plus any additional type mappings
    """
    table = defaultdict(list)
    if signature:
        for name in signature:
            declared = signature[name]
            seed = VariableExpression(Variable(name))
            # String declarations are parsed; Type objects are used as is.
            seed.type = declared if isinstance(declared, Type) else read_type(declared)
            table[name].append(seed)

    self._set_type(signature=table)

    # The first entry recorded for a name supplies its reported type.
    return {name: entries[0].type for name, entries in table.items()}
def normalize(self, newvars=None):
    """
    Rename auto-generated unique variables.

    The individual variable expressions found in this expression are
    sorted by their variable and numbered from 1.  Event variables are
    renamed to ``e0<n>`` and other individual variables to ``z<n>``.
    Each renaming is applied with bound variables included.

    :param newvars: not used by this implementation
    :return: the expression with its individual variables renamed
    """

    def collect_individuals(expr):
        # Individual variables are the leaves we want; any other variable
        # expression contributes nothing; everything else is searched.
        if isinstance(expr, IndividualVariableExpression):
            return {expr}
        if isinstance(expr, AbstractVariableExpression):
            return set()
        return expr.visit(
            collect_individuals, lambda parts: reduce(operator.or_, parts, set())
        )

    ordered = sorted(collect_individuals(self), key=lambda e: e.variable)

    renamed = self
    for number, var_expr in enumerate(ordered, start=1):
        # The event check comes first, as in the isinstance chain it mirrors.
        if isinstance(var_expr, EventVariableExpression):
            substitute = var_expr.__class__(Variable("e0%s" % number))
        elif isinstance(var_expr, IndividualVariableExpression):
            substitute = var_expr.__class__(Variable("z%s" % number))
        else:
            substitute = var_expr
        renamed = renamed.replace(var_expr.variable, substitute, True)
    return renamed
def variables(self):
    """
    Return a set of all the variables for binding substitution.

    The result is the union of the free variables with every predicate
    or constant whose name starts with '?' or '@'.

    :return: set of ``Variable`` objects
    """
    unbound = self.free()
    named = self.predicates() | self.constants()
    placeholders = {sym for sym in named if re.match("^[?@]", sym.name)}
    return unbound | placeholders
A predicate + expression is comprised of a ``FunctionVariableExpression`` or + ``ConstantExpression`` as the predicate and a list of Expressions as the + arguments. + + The second is an application of one expression to another, such as + "(\x.dog(x))(fido)". + + The reason Predicate Expressions are treated as Application Expressions is + that the Variable Expression predicate of the expression may be replaced + with another Expression, such as a LambdaExpression, which would mean that + the Predicate should be thought of as being applied to the arguments. + + The logical expression reader will always curry arguments in an application expression. + So, "\x y.see(x,y)(john,mary)" will be represented internally as + "((\x y.(see(x))(y))(john))(mary)". This simplifies the internals since + there will always be exactly one argument in an application. + + The str() method will usually print the curried forms of application + expressions. The one exception is when the application expression is + really a predicate expression (i.e., the underlying function is an + ``AbstractVariableExpression``). This means that the example from above + will be returned as "(\x y.see(x,y)(john))(mary)". 
def simplify(self):
    """
    Simplify this application.

    The function and the argument are simplified first.  If the
    simplified function is a ``LambdaExpression``, the argument is
    substituted for the bound variable in its term and the result is
    simplified again.  Otherwise a new application of the simplified
    function to the simplified argument is returned.

    :return: the simplified expression
    """
    simple_function = self.function.simplify()
    simple_argument = self.argument.simplify()

    if not isinstance(simple_function, LambdaExpression):
        return self.__class__(simple_function, simple_argument)

    reduced = simple_function.term.replace(simple_function.variable, simple_argument)
    return reduced.simplify()
def visit(self, function, combinator):
    """
    :see: Expression.visit()

    Applies ``function`` to the function part and then to the argument
    part of this application, and passes the two results, in that order,
    to ``combinator`` as a list.
    """
    on_function = function(self.function)
    on_argument = function(self.argument)
    return combinator([on_function, on_argument])
parenthesize_function = False + if isinstance(function, LambdaExpression): + if isinstance(function.term, ApplicationExpression): + if not isinstance(function.term.function, AbstractVariableExpression): + parenthesize_function = True + elif not isinstance(function.term, BooleanExpression): + parenthesize_function = True + elif isinstance(function, ApplicationExpression): + parenthesize_function = True + + if parenthesize_function: + function_str = Tokens.OPEN + function_str + Tokens.CLOSE + + return function_str + Tokens.OPEN + arg_str + Tokens.CLOSE + + def uncurry(self): + """ + Uncurry this application expression + + return: A tuple (base-function, arg-list) + """ + function = self.function + args = [self.argument] + while isinstance(function, ApplicationExpression): + # (\x.\y.sees(x,y)(john))(mary) + args.insert(0, function.argument) + function = function.function + return (function, args) + + @property + def pred(self): + """ + Return uncurried base-function. + If this is an atom, then the result will be a variable expression. + Otherwise, it will be a lambda expression. + """ + return self.uncurry()[0] + + @property + def args(self): + """ + Return uncurried arg-list + """ + return self.uncurry()[1] + + def is_atom(self): + """ + Is this expression an atom (as opposed to a lambda expression applied + to a term)? 
+ """ + return isinstance(self.pred, AbstractVariableExpression) + + +@total_ordering +class AbstractVariableExpression(Expression): + """This class represents a variable to be used as a predicate or entity""" + + def __init__(self, variable): + """ + :param variable: ``Variable``, for the variable + """ + assert isinstance(variable, Variable), "%s is not a Variable" % variable + self.variable = variable + + def simplify(self): + return self + + def replace(self, variable, expression, replace_bound=False, alpha_convert=True): + """:see: Expression.replace()""" + assert isinstance(variable, Variable), "%s is not an Variable" % variable + assert isinstance(expression, Expression), ( + "%s is not an Expression" % expression + ) + if self.variable == variable: + return expression + else: + return self + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + resolution = other_type + for varEx in signature[self.variable.name]: + resolution = varEx.type.resolve(resolution) + if not resolution: + raise InconsistentTypeHierarchyException(self) + + signature[self.variable.name].append(self) + for varEx in signature[self.variable.name]: + varEx.type = resolution + + def findtype(self, variable): + """:see Expression.findtype()""" + assert isinstance(variable, Variable), "%s is not a Variable" % variable + if self.variable == variable: + return self.type + else: + return ANY_TYPE + + def predicates(self): + """:see: Expression.predicates()""" + return set() + + def __eq__(self, other): + """Allow equality between instances of ``AbstractVariableExpression`` + subtypes.""" + return ( + isinstance(other, AbstractVariableExpression) + and self.variable == other.variable + ) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if not isinstance(other, AbstractVariableExpression): + raise TypeError + return 
self.variable < other.variable + + __hash__ = Expression.__hash__ + + def __str__(self): + return "%s" % self.variable + + +class IndividualVariableExpression(AbstractVariableExpression): + """This class represents variables that take the form of a single lowercase + character (other than 'e') followed by zero or more digits.""" + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if not other_type.matches(ENTITY_TYPE): + raise IllegalTypeException(self, other_type, ENTITY_TYPE) + + signature[self.variable.name].append(self) + + def _get_type(self): + return ENTITY_TYPE + + type = property(_get_type, _set_type) + + def free(self): + """:see: Expression.free()""" + return {self.variable} + + def constants(self): + """:see: Expression.constants()""" + return set() + + +class FunctionVariableExpression(AbstractVariableExpression): + """This class represents variables that take the form of a single uppercase + character followed by zero or more digits.""" + + type = ANY_TYPE + + def free(self): + """:see: Expression.free()""" + return {self.variable} + + def constants(self): + """:see: Expression.constants()""" + return set() + + +class EventVariableExpression(IndividualVariableExpression): + """This class represents variables that take the form of a single lowercase + 'e' character followed by zero or more digits.""" + + type = EVENT_TYPE + + +class ConstantExpression(AbstractVariableExpression): + """This class represents variables that do not take the form of a single + character followed by zero or more digits.""" + + type = ENTITY_TYPE + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if other_type == ANY_TYPE: + # entity type by default, for individuals + resolution = 
ENTITY_TYPE + else: + resolution = other_type + if self.type != ENTITY_TYPE: + resolution = resolution.resolve(self.type) + + for varEx in signature[self.variable.name]: + resolution = varEx.type.resolve(resolution) + if not resolution: + raise InconsistentTypeHierarchyException(self) + + signature[self.variable.name].append(self) + for varEx in signature[self.variable.name]: + varEx.type = resolution + + def free(self): + """:see: Expression.free()""" + return set() + + def constants(self): + """:see: Expression.constants()""" + return {self.variable} + + +def VariableExpression(variable): + """ + This is a factory method that instantiates and returns a subtype of + ``AbstractVariableExpression`` appropriate for the given variable. + """ + assert isinstance(variable, Variable), "%s is not a Variable" % variable + if is_indvar(variable.name): + return IndividualVariableExpression(variable) + elif is_funcvar(variable.name): + return FunctionVariableExpression(variable) + elif is_eventvar(variable.name): + return EventVariableExpression(variable) + else: + return ConstantExpression(variable) + + +class VariableBinderExpression(Expression): + """This an abstract class for any Expression that binds a variable in an + Expression. 
This includes LambdaExpressions and Quantified Expressions""" + + def __init__(self, variable, term): + """ + :param variable: ``Variable``, for the variable + :param term: ``Expression``, for the term + """ + assert isinstance(variable, Variable), "%s is not a Variable" % variable + assert isinstance(term, Expression), "%s is not an Expression" % term + self.variable = variable + self.term = term + + def replace(self, variable, expression, replace_bound=False, alpha_convert=True): + """:see: Expression.replace()""" + assert isinstance(variable, Variable), "%s is not a Variable" % variable + assert isinstance(expression, Expression), ( + "%s is not an Expression" % expression + ) + # if the bound variable is the thing being replaced + if self.variable == variable: + if replace_bound: + assert isinstance(expression, AbstractVariableExpression), ( + "%s is not a AbstractVariableExpression" % expression + ) + return self.__class__( + expression.variable, + self.term.replace(variable, expression, True, alpha_convert), + ) + else: + return self + else: + # if the bound variable appears in the expression, then it must + # be alpha converted to avoid a conflict + if alpha_convert and self.variable in expression.free(): + self = self.alpha_convert(unique_variable(pattern=self.variable)) + + # replace in the term + return self.__class__( + self.variable, + self.term.replace(variable, expression, replace_bound, alpha_convert), + ) + + def alpha_convert(self, newvar): + """Rename all occurrences of the variable introduced by this variable + binder in the expression to ``newvar``. 
+ :param newvar: ``Variable``, for the new variable + """ + assert isinstance(newvar, Variable), "%s is not a Variable" % newvar + return self.__class__( + newvar, self.term.replace(self.variable, VariableExpression(newvar), True) + ) + + def free(self): + """:see: Expression.free()""" + return self.term.free() - {self.variable} + + def findtype(self, variable): + """:see Expression.findtype()""" + assert isinstance(variable, Variable), "%s is not a Variable" % variable + if variable == self.variable: + return ANY_TYPE + else: + return self.term.findtype(variable) + + def visit(self, function, combinator): + """:see: Expression.visit()""" + return combinator([function(self.term)]) + + def visit_structured(self, function, combinator): + """:see: Expression.visit_structured()""" + return combinator(self.variable, function(self.term)) + + def __eq__(self, other): + r"""Defines equality modulo alphabetic variance. If we are comparing + \x.M and \y.N, then check equality of M and N[x/y].""" + if isinstance(self, other.__class__) or isinstance(other, self.__class__): + if self.variable == other.variable: + return self.term == other.term + else: + # Comparing \x.M and \y.N. Relabel y in N with x and continue. 
+ varex = VariableExpression(self.variable) + return self.term == other.term.replace(other.variable, varex) + else: + return False + + def __ne__(self, other): + return not self == other + + __hash__ = Expression.__hash__ + + +class LambdaExpression(VariableBinderExpression): + @property + def type(self): + return ComplexType(self.term.findtype(self.variable), self.term.type) + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + self.term._set_type(other_type.second, signature) + if not self.type.resolve(other_type): + raise TypeResolutionException(self, other_type) + + def __str__(self): + variables = [self.variable] + term = self.term + while term.__class__ == self.__class__: + variables.append(term.variable) + term = term.term + return ( + Tokens.LAMBDA + + " ".join("%s" % v for v in variables) + + Tokens.DOT + + "%s" % term + ) + + +class QuantifiedExpression(VariableBinderExpression): + @property + def type(self): + return TRUTH_TYPE + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if not other_type.matches(TRUTH_TYPE): + raise IllegalTypeException(self, other_type, TRUTH_TYPE) + self.term._set_type(TRUTH_TYPE, signature) + + def __str__(self): + variables = [self.variable] + term = self.term + while term.__class__ == self.__class__: + variables.append(term.variable) + term = term.term + return ( + self.getQuantifier() + + " " + + " ".join("%s" % v for v in variables) + + Tokens.DOT + + "%s" % term + ) + + +class ExistsExpression(QuantifiedExpression): + def getQuantifier(self): + return Tokens.EXISTS + + +class AllExpression(QuantifiedExpression): + def getQuantifier(self): + return Tokens.ALL + + +class IotaExpression(QuantifiedExpression): + def getQuantifier(self): + 
return Tokens.IOTA + + +class NegatedExpression(Expression): + def __init__(self, term): + assert isinstance(term, Expression), "%s is not an Expression" % term + self.term = term + + @property + def type(self): + return TRUTH_TYPE + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if not other_type.matches(TRUTH_TYPE): + raise IllegalTypeException(self, other_type, TRUTH_TYPE) + self.term._set_type(TRUTH_TYPE, signature) + + def findtype(self, variable): + assert isinstance(variable, Variable), "%s is not a Variable" % variable + return self.term.findtype(variable) + + def visit(self, function, combinator): + """:see: Expression.visit()""" + return combinator([function(self.term)]) + + def negate(self): + """:see: Expression.negate()""" + return self.term + + def __eq__(self, other): + return isinstance(other, NegatedExpression) and self.term == other.term + + def __ne__(self, other): + return not self == other + + __hash__ = Expression.__hash__ + + def __str__(self): + return Tokens.NOT + "%s" % self.term + + +class BinaryExpression(Expression): + def __init__(self, first, second): + assert isinstance(first, Expression), "%s is not an Expression" % first + assert isinstance(second, Expression), "%s is not an Expression" % second + self.first = first + self.second = second + + @property + def type(self): + return TRUTH_TYPE + + def findtype(self, variable): + """:see Expression.findtype()""" + assert isinstance(variable, Variable), "%s is not a Variable" % variable + f = self.first.findtype(variable) + s = self.second.findtype(variable) + if f == s or s == ANY_TYPE: + return f + elif f == ANY_TYPE: + return s + else: + return ANY_TYPE + + def visit(self, function, combinator): + """:see: Expression.visit()""" + return combinator([function(self.first), function(self.second)]) + + def __eq__(self, other): + return ( + 
(isinstance(self, other.__class__) or isinstance(other, self.__class__)) + and self.first == other.first + and self.second == other.second + ) + + def __ne__(self, other): + return not self == other + + __hash__ = Expression.__hash__ + + def __str__(self): + first = self._str_subex(self.first) + second = self._str_subex(self.second) + return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE + + def _str_subex(self, subex): + return "%s" % subex + + +class BooleanExpression(BinaryExpression): + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if not other_type.matches(TRUTH_TYPE): + raise IllegalTypeException(self, other_type, TRUTH_TYPE) + self.first._set_type(TRUTH_TYPE, signature) + self.second._set_type(TRUTH_TYPE, signature) + + +class AndExpression(BooleanExpression): + """This class represents conjunctions""" + + def getOp(self): + return Tokens.AND + + def _str_subex(self, subex): + s = "%s" % subex + if isinstance(subex, AndExpression): + return s[1:-1] + return s + + +class OrExpression(BooleanExpression): + """This class represents disjunctions""" + + def getOp(self): + return Tokens.OR + + def _str_subex(self, subex): + s = "%s" % subex + if isinstance(subex, OrExpression): + return s[1:-1] + return s + + +class ImpExpression(BooleanExpression): + """This class represents implications""" + + def getOp(self): + return Tokens.IMP + + +class IffExpression(BooleanExpression): + """This class represents biconditionals""" + + def getOp(self): + return Tokens.IFF + + +class EqualityExpression(BinaryExpression): + """This class represents equality expressions like "(x = y)".""" + + def _set_type(self, other_type=ANY_TYPE, signature=None): + """:see Expression._set_type()""" + assert isinstance(other_type, Type) + + if signature is None: + signature = defaultdict(list) + + if not 
other_type.matches(TRUTH_TYPE): + raise IllegalTypeException(self, other_type, TRUTH_TYPE) + self.first._set_type(ENTITY_TYPE, signature) + self.second._set_type(ENTITY_TYPE, signature) + + def getOp(self): + return Tokens.EQ + + +### Utilities + + +class LogicalExpressionException(Exception): + def __init__(self, index, message): + self.index = index + Exception.__init__(self, message) + + +class UnexpectedTokenException(LogicalExpressionException): + def __init__(self, index, unexpected=None, expected=None, message=None): + if unexpected and expected: + msg = "Unexpected token: '%s'. " "Expected token '%s'." % ( + unexpected, + expected, + ) + elif unexpected: + msg = "Unexpected token: '%s'." % unexpected + if message: + msg += " " + message + else: + msg = "Expected token '%s'." % expected + LogicalExpressionException.__init__(self, index, msg) + + +class ExpectedMoreTokensException(LogicalExpressionException): + def __init__(self, index, message=None): + if not message: + message = "More tokens expected." + LogicalExpressionException.__init__( + self, index, "End of input found. " + message + ) + + +def is_indvar(expr): + """ + An individual variable must be a single lowercase character other than 'e', + followed by zero or more digits. + + :param expr: str + :return: bool True if expr is of the correct form + """ + assert isinstance(expr, str), "%s is not a string" % expr + return re.match(r"^[a-df-z]\d*$", expr) is not None + + +def is_funcvar(expr): + """ + A function variable must be a single uppercase character followed by + zero or more digits. + + :param expr: str + :return: bool True if expr is of the correct form + """ + assert isinstance(expr, str), "%s is not a string" % expr + return re.match(r"^[A-Z]\d*$", expr) is not None + + +def is_eventvar(expr): + """ + An event variable must be a single lowercase 'e' character followed by + zero or more digits. 
+ + :param expr: str + :return: bool True if expr is of the correct form + """ + assert isinstance(expr, str), "%s is not a string" % expr + return re.match(r"^e\d*$", expr) is not None + + +def demo(): + lexpr = Expression.fromstring + print("=" * 20 + "Test reader" + "=" * 20) + print(lexpr(r"john")) + print(lexpr(r"man(x)")) + print(lexpr(r"-man(x)")) + print(lexpr(r"(man(x) & tall(x) & walks(x))")) + print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))")) + print(lexpr(r"\x.man(x)")) + print(lexpr(r"\x.man(x)(john)")) + print(lexpr(r"\x y.sees(x,y)")) + print(lexpr(r"\x y.sees(x,y)(a,b)")) + print(lexpr(r"(\x.exists y.walks(x,y))(x)")) + print(lexpr(r"exists x.x = y")) + print(lexpr(r"exists x.(x = y)")) + print(lexpr("P(x) & x=y & P(y)")) + print(lexpr(r"\P Q.exists x.(P(x) & Q(x))")) + print(lexpr(r"man(x) <-> tall(x)")) + + print("=" * 20 + "Test simplify" + "=" * 20) + print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify()) + print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify()) + print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify()) + print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify()) + + print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20) + e1 = lexpr("exists x.P(x)") + print(e1) + e2 = e1.alpha_convert(Variable("z")) + print(e2) + print(e1 == e2) + + +def demo_errors(): + print("=" * 20 + "Test reader errors" + "=" * 20) + demoException("(P(x) & Q(x)") + demoException("((P(x) &) & Q(x))") + demoException("P(x) -> ") + demoException("P(x") + demoException("P(x,") + demoException("P(x,)") + demoException("exists") + demoException("exists x.") + demoException("\\") + demoException("\\ x y.") + demoException("P(x)Q(x)") + demoException("(P(x)Q(x)") + demoException("exists x -> y") + + +def demoException(s): + try: + Expression.fromstring(s) + except LogicalExpressionException as e: + print(f"{e.__class__.__name__}: {e}") + + +def printtype(ex): + print(f"{ex.str()} : 
{ex.type}") + + +if __name__ == "__main__": + demo() +# demo_errors() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/relextract.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/relextract.py new file mode 100644 index 0000000000000000000000000000000000000000..afc4f8c6d152816b4f7b34cbe26e07abdff29ddd --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/relextract.py @@ -0,0 +1,539 @@ +# Natural Language Toolkit: Relation Extraction +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +Code for extracting relational triples from the ieer and conll2002 corpora. + +Relations are stored internally as dictionaries ('reldicts'). + +The two serialization outputs are "rtuple" and "clause". + +- An rtuple is a tuple of the form ``(subj, filler, obj)``, + where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words + occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to + circumvent locale variations in rendering utf-8 encoded strings. +- A clause is an atom of the form ``relsym(subjsym, objsym)``, + where the relation, subject and object have been canonicalized to single strings. +""" + +# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? 
+ +import html +import re +from collections import defaultdict + +# Dictionary that associates corpora with NE classes +NE_CLASSES = { + "ieer": [ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DURATION", + "DATE", + "CARDINAL", + "PERCENT", + "MONEY", + "MEASURE", + ], + "conll2002": ["LOC", "PER", "ORG"], + "ace": [ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DURATION", + "DATE", + "CARDINAL", + "PERCENT", + "MONEY", + "MEASURE", + "FACILITY", + "GPE", + ], +} + +# Allow abbreviated class labels +short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON") +long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER") + + +def _expand(type): + """ + Expand an NE class name. + :type type: str + :rtype: str + """ + try: + return short2long[type] + except KeyError: + return type + + +def class_abbrev(type): + """ + Abbreviate an NE class name. + :type type: str + :rtype: str + """ + try: + return long2short[type] + except KeyError: + return type + + +def _join(lst, sep=" ", untag=False): + """ + Join a list into a string, turning tags tuples into tag strings or just words. + :param untag: if ``True``, omit the tag from tagged input strings. + :type lst: list + :rtype: str + """ + try: + return sep.join(lst) + except TypeError: + if untag: + return sep.join(tup[0] for tup in lst) + from nltk.tag import tuple2str + + return sep.join(tuple2str(tup) for tup in lst) + + +def descape_entity(m, defs=html.entities.entitydefs): + """ + Translate one entity to its ISO Latin value. + Inspired by example from effbot.org + + + """ + try: + return defs[m.group(1)] + + except KeyError: + return m.group(0) # use as is + + +def list2sym(lst): + """ + Convert a list of strings into a canonical symbol. 
+ :type lst: list + :return: a Unicode string without whitespace + :rtype: unicode + """ + sym = _join(lst, "_", untag=True) + sym = sym.lower() + ENT = re.compile(r"&(\w+?);") + sym = ENT.sub(descape_entity, sym) + sym = sym.replace(".", "") + return sym + + +def tree2semi_rel(tree): + """ + Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). + + In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this + identifies pairs whose first member is a list (possibly empty) of terminal + strings, and whose second member is a ``Tree`` of the form (NE_label, terminals). + + :param tree: a chunk tree + :return: a list of pairs (list(str), ``Tree``) + :rtype: list of tuple + """ + + from nltk.tree import Tree + + semi_rels = [] + semi_rel = [[], None] + + for dtr in tree: + if not isinstance(dtr, Tree): + semi_rel[0].append(dtr) + else: + # dtr is a Tree + semi_rel[1] = dtr + semi_rels.append(semi_rel) + semi_rel = [[], None] + return semi_rels + + +def semi_rel2reldict(pairs, window=5, trace=False): + """ + Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which + stores information about the subject and object NEs plus the filler between them. + Additionally, a left and right context of length =< window are captured (within + a given input sentence). 
+ + :param pairs: a pair of list(str) and ``Tree``, as generated by + :param window: a threshold for the number of items to include in the left and right context + :type window: int + :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', objclass', objtext', 'objsym' and 'rcon' + :rtype: list(defaultdict) + """ + result = [] + while len(pairs) > 2: + reldict = defaultdict(str) + reldict["lcon"] = _join(pairs[0][0][-window:]) + reldict["subjclass"] = pairs[0][1].label() + reldict["subjtext"] = _join(pairs[0][1].leaves()) + reldict["subjsym"] = list2sym(pairs[0][1].leaves()) + reldict["filler"] = _join(pairs[1][0]) + reldict["untagged_filler"] = _join(pairs[1][0], untag=True) + reldict["objclass"] = pairs[1][1].label() + reldict["objtext"] = _join(pairs[1][1].leaves()) + reldict["objsym"] = list2sym(pairs[1][1].leaves()) + reldict["rcon"] = _join(pairs[2][0][:window]) + if trace: + print( + "(%s(%s, %s)" + % ( + reldict["untagged_filler"], + reldict["subjclass"], + reldict["objclass"], + ) + ) + result.append(reldict) + pairs = pairs[1:] + return result + + +def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10): + """ + Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern. + + The parameters ``subjclass`` and ``objclass`` can be used to restrict the + Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', + 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'). + + :param subjclass: the class of the subject Named Entity. + :type subjclass: str + :param objclass: the class of the object Named Entity. + :type objclass: str + :param doc: input document + :type doc: ieer document or a list of chunk trees + :param corpus: name of the corpus to take as input; possible values are + 'ieer' and 'conll2002' + :type corpus: str + :param pattern: a regular expression for filtering the fillers of + retrieved triples. 
+ :type pattern: SRE_Pattern + :param window: filters out fillers which exceed this threshold + :type window: int + :return: see ``mk_reldicts`` + :rtype: list(defaultdict) + """ + + if subjclass and subjclass not in NE_CLASSES[corpus]: + if _expand(subjclass) in NE_CLASSES[corpus]: + subjclass = _expand(subjclass) + else: + raise ValueError( + "your value for the subject type has not been recognized: %s" + % subjclass + ) + if objclass and objclass not in NE_CLASSES[corpus]: + if _expand(objclass) in NE_CLASSES[corpus]: + objclass = _expand(objclass) + else: + raise ValueError( + "your value for the object type has not been recognized: %s" % objclass + ) + + if corpus == "ace" or corpus == "conll2002": + pairs = tree2semi_rel(doc) + elif corpus == "ieer": + pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline) + else: + raise ValueError("corpus type not recognized") + + reldicts = semi_rel2reldict(pairs) + + relfilter = lambda x: ( + x["subjclass"] == subjclass + and len(x["filler"].split()) <= window + and pattern.match(x["filler"]) + and x["objclass"] == objclass + ) + + return list(filter(relfilter, reldicts)) + + +def rtuple(reldict, lcon=False, rcon=False): + """ + Pretty print the reldict as an rtuple. + :param reldict: a relation dictionary + :type reldict: defaultdict + """ + items = [ + class_abbrev(reldict["subjclass"]), + reldict["subjtext"], + reldict["filler"], + class_abbrev(reldict["objclass"]), + reldict["objtext"], + ] + format = "[%s: %r] %r [%s: %r]" + if lcon: + items = [reldict["lcon"]] + items + format = "...%r)" + format + if rcon: + items.append(reldict["rcon"]) + format = format + "(%r..." + printargs = tuple(items) + return format % printargs + + +def clause(reldict, relsym): + """ + Print the relation in clausal form. 
+ :param reldict: a relation dictionary + :type reldict: defaultdict + :param relsym: a label for the relation + :type relsym: str + """ + items = (relsym, reldict["subjsym"], reldict["objsym"]) + return "%s(%r, %r)" % items + + +####################################################### +# Demos of relation extraction with regular expressions +####################################################### + +############################################ +# Example of in(ORG, LOC) +############################################ +def in_demo(trace=0, sql=True): + """ + Select pairs of organizations and locations whose mentions occur with an + intervening occurrence of the preposition "in". + + If the sql parameter is set to True, then the entity pairs are loaded into + an in-memory database, and subsequently pulled out using an SQL "SELECT" + query. + """ + from nltk.corpus import ieer + + if sql: + try: + import sqlite3 + + connection = sqlite3.connect(":memory:") + cur = connection.cursor() + cur.execute( + """create table Locations + (OrgName text, LocationName text, DocID text)""" + ) + except ImportError: + import warnings + + warnings.warn("Cannot import sqlite; sql flag will be ignored.") + + IN = re.compile(r".*\bin\b(?!\b.+ing)") + + print() + print("IEER: in(ORG, LOC) -- just the clauses:") + print("=" * 45) + + for file in ieer.fileids(): + for doc in ieer.parsed_docs(file): + if trace: + print(doc.docno) + print("=" * 15) + for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN): + print(clause(rel, relsym="IN")) + if sql: + try: + rtuple = (rel["subjtext"], rel["objtext"], doc.docno) + cur.execute( + """insert into Locations + values (?, ?, ?)""", + rtuple, + ) + connection.commit() + except NameError: + pass + + if sql: + try: + cur.execute( + """select OrgName from Locations + where LocationName = 'Atlanta'""" + ) + print() + print("Extract data from SQL table: ORGs in Atlanta") + print("-" * 15) + for row in cur: + print(row) + except NameError: + 
pass + + +############################################ +# Example of has_role(PER, LOC) +############################################ + + +def roles_demo(trace=0): + from nltk.corpus import ieer + + roles = r""" + (.*( # assorted roles + analyst| + chair(wo)?man| + commissioner| + counsel| + director| + economist| + editor| + executive| + foreman| + governor| + head| + lawyer| + leader| + librarian).*)| + manager| + partner| + president| + producer| + professor| + researcher| + spokes(wo)?man| + writer| + ,\sof\sthe?\s* # "X, of (the) Y" + """ + ROLES = re.compile(roles, re.VERBOSE) + + print() + print("IEER: has_role(PER, ORG) -- raw rtuples:") + print("=" * 45) + + for file in ieer.fileids(): + for doc in ieer.parsed_docs(file): + lcon = rcon = False + if trace: + print(doc.docno) + print("=" * 15) + lcon = rcon = True + for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES): + print(rtuple(rel, lcon=lcon, rcon=rcon)) + + +############################################## +### Show what's in the IEER Headlines +############################################## + + +def ieer_headlines(): + + from nltk.corpus import ieer + from nltk.tree import Tree + + print("IEER: First 20 Headlines") + print("=" * 45) + + trees = [ + (doc.docno, doc.headline) + for file in ieer.fileids() + for doc in ieer.parsed_docs(file) + ] + for tree in trees[:20]: + print() + print("%s:\n%s" % tree) + + +############################################# +## Dutch CONLL2002: take_on_role(PER, ORG +############################################# + + +def conllned(trace=1): + """ + Find the copula+'van' relation ('of') in the Dutch tagged training corpus + from CoNLL 2002. 
+ """ + + from nltk.corpus import conll2002 + + vnv = """ + ( + is/V| # 3rd sing present and + was/V| # past forms of the verb zijn ('be') + werd/V| # and also present + wordt/V # past of worden ('become) + ) + .* # followed by anything + van/Prep # followed by van ('of') + """ + VAN = re.compile(vnv, re.VERBOSE) + + print() + print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:") + print("=" * 45) + + for doc in conll2002.chunked_sents("ned.train"): + lcon = rcon = False + if trace: + lcon = rcon = True + for rel in extract_rels( + "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10 + ): + print(rtuple(rel, lcon=lcon, rcon=rcon)) + + +############################################# +## Spanish CONLL2002: (PER, ORG) +############################################# + + +def conllesp(): + from nltk.corpus import conll2002 + + de = """ + .* + ( + de/SP| + del/SP + ) + """ + DE = re.compile(de, re.VERBOSE) + + print() + print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:") + print("=" * 45) + rels = [ + rel + for doc in conll2002.chunked_sents("esp.train") + for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE) + ] + for r in rels[:10]: + print(clause(r, relsym="DE")) + print() + + +def ne_chunked(): + print() + print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker") + print("=" * 45) + ROLE = re.compile( + r".*(chairman|president|trader|scientist|economist|analyst|partner).*" + ) + rels = [] + for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]): + sent = nltk.ne_chunk(sent) + rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7) + for rel in rels: + print(f"{i:<5}{rtuple(rel)}") + + +if __name__ == "__main__": + import nltk + from nltk.sem import relextract + + in_demo(trace=0) + roles_demo(trace=0) + conllned() + conllesp() + ieer_headlines() + ne_chunked() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/skolemize.py 
def skolemize(expression, univ_scope=None, used_variables=None):
    """
    Skolemize the expression and convert to conjunctive normal form (CNF)

    The function recurses over the structure of ``expression``.  Quantifiers
    are removed, negations are pushed inwards, and disjunctions are
    distributed over conjunctions through ``to_cnf``.

    :param expression: the logic expression to transform
    :param univ_scope: the set of universally quantified variables whose scope
        encloses ``expression``; defaults to an empty set
    :param used_variables: the set of variables passed as ``ignore`` to
        ``unique_variable`` when a fresh variable is requested; defaults to an
        empty set
    :return: the transformed expression
    :raise Exception: if ``expression`` (or the term under a negation) is not
        one of the expression types handled below
    """
    if univ_scope is None:
        univ_scope = set()
    if used_variables is None:
        used_variables = set()

    if isinstance(expression, AllExpression):
        # Universal quantifier: skolemize the body with the bound variable
        # added to the universal scope, then rename it to a fresh variable.
        term = skolemize(
            expression.term,
            univ_scope | {expression.variable},
            used_variables | {expression.variable},
        )
        return term.replace(
            expression.variable,
            VariableExpression(unique_variable(ignore=used_variables)),
        )
    elif isinstance(expression, AndExpression):
        # A conjunction of CNF formulas is already CNF.
        return skolemize(expression.first, univ_scope, used_variables) & skolemize(
            expression.second, univ_scope, used_variables
        )
    elif isinstance(expression, OrExpression):
        # Distribute the disjunction over any conjunctions in the operands.
        return to_cnf(
            skolemize(expression.first, univ_scope, used_variables),
            skolemize(expression.second, univ_scope, used_variables),
        )
    elif isinstance(expression, ImpExpression):
        # A -> B is treated as (-A | B).
        return to_cnf(
            skolemize(-expression.first, univ_scope, used_variables),
            skolemize(expression.second, univ_scope, used_variables),
        )
    elif isinstance(expression, IffExpression):
        # A <-> B is treated as (-A | B) & (A | -B).
        return to_cnf(
            skolemize(-expression.first, univ_scope, used_variables),
            skolemize(expression.second, univ_scope, used_variables),
        ) & to_cnf(
            skolemize(expression.first, univ_scope, used_variables),
            skolemize(-expression.second, univ_scope, used_variables),
        )
    elif isinstance(expression, EqualityExpression):
        return expression
    elif isinstance(expression, NegatedExpression):
        # Push the negation inwards according to the type of the negated term.
        negated = expression.term
        if isinstance(negated, AllExpression):
            # -all x.P is handled like exists x.-P: the variable is replaced
            # by a skolem function of the enclosing universal variables, or by
            # a fresh variable when there are none.
            term = skolemize(
                -negated.term, univ_scope, used_variables | {negated.variable}
            )
            if univ_scope:
                return term.replace(negated.variable, skolem_function(univ_scope))
            else:
                skolem_constant = VariableExpression(
                    unique_variable(ignore=used_variables)
                )
                return term.replace(negated.variable, skolem_constant)
        elif isinstance(negated, AndExpression):
            # -(A & B) becomes (-A | -B).
            return to_cnf(
                skolemize(-negated.first, univ_scope, used_variables),
                skolemize(-negated.second, univ_scope, used_variables),
            )
        elif isinstance(negated, OrExpression):
            # -(A | B) becomes (-A & -B).
            return skolemize(-negated.first, univ_scope, used_variables) & skolemize(
                -negated.second, univ_scope, used_variables
            )
        elif isinstance(negated, ImpExpression):
            # -(A -> B) becomes (A & -B).
            return skolemize(negated.first, univ_scope, used_variables) & skolemize(
                -negated.second, univ_scope, used_variables
            )
        elif isinstance(negated, IffExpression):
            # -(A <-> B) becomes (-A | -B) & (A | B).
            return to_cnf(
                skolemize(-negated.first, univ_scope, used_variables),
                skolemize(-negated.second, univ_scope, used_variables),
            ) & to_cnf(
                skolemize(negated.first, univ_scope, used_variables),
                skolemize(negated.second, univ_scope, used_variables),
            )
        elif isinstance(negated, EqualityExpression):
            return expression
        elif isinstance(negated, NegatedExpression):
            # Double negation: skolemize the inner term directly.
            return skolemize(negated.term, univ_scope, used_variables)
        elif isinstance(negated, ExistsExpression):
            # -exists x.P is handled like all x.-P: the variable joins the
            # universal scope and is renamed to a fresh variable.
            term = skolemize(
                -negated.term,
                univ_scope | {negated.variable},
                used_variables | {negated.variable},
            )
            return term.replace(
                negated.variable,
                VariableExpression(unique_variable(ignore=used_variables)),
            )
        elif isinstance(negated, ApplicationExpression):
            # A negated application is returned unchanged.
            return expression
        else:
            raise Exception("'%s' cannot be skolemized" % expression)
    elif isinstance(expression, ExistsExpression):
        # Existential quantifier: replace the bound variable by a skolem
        # function of the enclosing universal variables, or by a fresh
        # variable when the universal scope is empty.
        term = skolemize(
            expression.term, univ_scope, used_variables | {expression.variable}
        )
        if univ_scope:
            return term.replace(expression.variable, skolem_function(univ_scope))
        else:
            skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
            return term.replace(expression.variable, skolem_constant)
    elif isinstance(expression, ApplicationExpression):
        return expression
    else:
        raise Exception("'%s' cannot be skolemized" % expression)
def parse_sents(inputs, grammar, trace=0):
    """
    Parse each input sentence and collect its syntax trees.

    Sentences are tokenized by splitting on whitespace.

    :param inputs: sentences to be parsed
    :type inputs: list(str)
    :param grammar: ``FeatureGrammar`` or name of feature-based grammar
    :type grammar: nltk.grammar.FeatureGrammar
    :param trace: trace level handed to ``load_parser`` when ``grammar`` is
        not already a ``FeatureGrammar``
    :rtype: list(list(nltk.tree.Tree))
    :return: one list of ``Tree`` instances per input sentence, in input order
    """
    # Imported here rather than at module level to avoid circular dependencies.
    from nltk.grammar import FeatureGrammar
    from nltk.parse import FeatureChartParser, load_parser

    parser = (
        FeatureChartParser(grammar)
        if isinstance(grammar, FeatureGrammar)
        else load_parser(grammar, trace=trace)
    )
    # Whitespace tokenization only; a real tokenizer is not used here.
    return [list(parser.parse(sentence.split())) for sentence in inputs]
def evaluate_sents(inputs, grammar, model, assignment, trace=0):
    """
    Evaluate every semantic reading of every input sentence in ``model``.

    :param inputs: a list of sentences
    :type inputs: list(str)
    :param grammar: ``FeatureGrammar`` or name of feature-based grammar
    :type grammar: nltk.grammar.FeatureGrammar
    :param model: object whose ``evaluate(expr_string, assignment, trace=...)``
        method is called once per reading
    :param assignment: variable assignment passed through to ``model.evaluate``
    :param trace: trace level passed to ``model.evaluate``
    :return: for each sentence, a list of triples
        (parse-tree, semantic-representation, evaluation-in-model)
    :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression, bool or dict(str): bool)))
    """
    results = []
    for readings in interpret_sents(inputs, grammar):
        evaluated = []
        for tree, semrep in readings:
            # The model is given the string form of the semantic representation.
            value = model.evaluate("%s" % semrep, assignment, trace=trace)
            evaluated.append((tree, semrep, value))
        results.append(evaluated)
    return results
def demo():
    """
    Command-line demo: interpret (and optionally evaluate) some sentences.

    Options are read from ``sys.argv`` via ``optparse``.  By default a built-in
    list of sentences is parsed with ``grammars/sample_grammars/sem2.fcfg`` and
    each reading is evaluated in the model built by ``demo_model0()``.  With
    ``-s FILE`` the sentences are read from FILE through ``read_sents``; with
    ``-g G`` grammar G is used; with ``-e`` only the semantic representations
    are printed.  Results go to stdout and nothing is returned.
    """
    import importlib
    from optparse import OptionParser

    description = """
    Parse and evaluate some sentences.
    """

    opts = OptionParser(description=description)

    opts.set_defaults(
        evaluate=True,
        beta=True,
        syntrace=0,
        semtrace=0,
        demo="default",
        grammar="",
        sentences="",
    )

    opts.add_option(
        "-d",
        "--demo",
        dest="demo",
        help="choose demo D; omit this for the default demo, or specify 'chat80'",
        metavar="D",
    )
    opts.add_option(
        "-g", "--gram", dest="grammar", help="read in grammar G", metavar="G"
    )
    opts.add_option(
        "-m",
        "--model",
        dest="model",
        help="import model M (omit '.py' suffix)",
        metavar="M",
    )
    opts.add_option(
        "-s",
        "--sentences",
        dest="sentences",
        help="read in a file of test sentences S",
        metavar="S",
    )
    opts.add_option(
        "-e",
        "--no-eval",
        action="store_false",
        dest="evaluate",
        help="just do a syntactic analysis",
    )
    opts.add_option(
        "-b",
        "--no-beta-reduction",
        action="store_false",
        dest="beta",
        help="don't carry out beta-reduction",
    )
    opts.add_option(
        "-t",
        "--syntrace",
        action="count",
        dest="syntrace",
        help="set syntactic tracing on; requires '-e' option",
    )
    opts.add_option(
        "-T",
        "--semtrace",
        action="count",
        dest="semtrace",
        help="set semantic tracing on",
    )

    (options, _args) = opts.parse_args()

    SPACER = "-" * 30

    # Populates the module-level globals ``m0`` (model) and ``g0`` (assignment).
    demo_model0()

    if options.sentences:
        # Previously the built-in list was always used, because the
        # ``sents is None`` test guarding ``read_sents`` could never be true.
        sents = read_sents(options.sentences)
    else:
        sents = [
            "Fido sees a boy with Mary",
            "John sees Mary",
            "every girl chases a dog",
            "every boy chases a girl",
            "John walks with a girl in Noosa",
            "who walks",
        ]

    gramfile = options.grammar or "grammars/sample_grammars/sem2.fcfg"

    if options.model:
        # Import by name instead of exec()-ing a string built from a
        # command-line argument.
        # NOTE(review): as before, the imported module is not used any
        # further -- evaluation below always runs against m0/g0.  Confirm
        # what ``-m`` is meant to provide before wiring it in.
        importlib.import_module(options.model)

    # Set model and assignment
    model = m0
    g = g0

    if options.evaluate:
        evaluations = evaluate_sents(sents, gramfile, model, g, trace=options.semtrace)
    else:
        semreps = interpret_sents(sents, gramfile, trace=options.syntrace)

    for i, sent in enumerate(sents):
        print("\nSentence: %s" % sent)
        print(SPACER)
        if options.evaluate:
            for n, (_syntree, semrep, value) in enumerate(evaluations[i], start=1):
                # A dict value is shown as the set of its keys.
                if isinstance(value, dict):
                    value = set(value.keys())
                print("%d: %s" % (n, semrep))
                print(value)
        else:
            for n, (_syntree, semrep) in enumerate(semreps[i], start=1):
                print("%d: %s" % (n, semrep))
def unigram_word_feats(self, words, top_n=None, min_freq=0):
    """
    Select the most frequent tokens of ``words`` as unigram features.

    :param words: a list of words/tokens.
    :param top_n: number of best words/tokens to use, sorted by frequency;
        ``None`` keeps them all.
    :param min_freq: a token is kept only if it occurs strictly more than
        ``min_freq`` times.
    :rtype: list(str)
    :return: the selected tokens (no duplicates), most frequent first.
    """
    # Stopwords are not removed
    counts = FreqDist(iter(words))
    selected = []
    for token, count in counts.most_common(top_n):
        if count > min_freq:
            selected.append(token)
    return selected
def extract_features(self, document):
    """
    Run every registered feature extractor over ``document``.

    Each extractor stored in ``self.feat_extractors`` is called once per
    registered parameter set, with ``document`` as its first argument and the
    parameter set as keyword arguments.  The dictionaries they return are
    merged in call order, so a later result overwrites an earlier one that
    uses the same feature name.

    :param document: the document that will be passed as argument to the
        feature extractor functions.
    :return: A dictionary of populated features extracted from the document.
    :rtype: dict
    """
    merged = {}
    for extractor, param_sets in self.feat_extractors.items():
        for params in param_sets:
            merged.update(extractor(document, **params))
    return merged
def evaluate(
    self,
    test_set,
    classifier=None,
    accuracy=True,
    f_measure=True,
    precision=True,
    recall=True,
    verbose=False,
):
    """
    Score a classifier on a gold-labelled test set.

    :param test_set: A list of (tokens, label) tuples to use as gold set.
    :param classifier: a classifier instance (previously trained); when
        ``None``, ``self.classifier`` is used.
    :param accuracy: if `True`, evaluate classifier accuracy.
    :param f_measure: if `True`, evaluate classifier f_measure.
    :param precision: if `True`, evaluate classifier precision.
    :param recall: if `True`, evaluate classifier recall.
    :param verbose: if `True`, print every metric, sorted by metric name.
    :return: evaluation results, keyed by metric name.
    :rtype: dict(str): float
    """
    clf = self.classifier if classifier is None else classifier
    print(f"Evaluating {type(clf).__name__} results...")
    scores = {}
    if accuracy:
        scores["Accuracy"] = eval_accuracy(clf, test_set)

    # For each label, the indices of the test items that carry it in the gold
    # data (reference) and that the classifier assigned it to (predicted).
    reference = defaultdict(set)
    predicted = defaultdict(set)
    seen_labels = set()
    for index, (feats, label) in enumerate(test_set):
        seen_labels.add(label)
        reference[label].add(index)
        predicted[clf.classify(feats)].add(index)

    # Per-label metrics are reported only for labels present in the gold data.
    for label in seen_labels:
        gold, guess = reference[label], predicted[label]
        if precision:
            scores[f"Precision [{label}]"] = eval_precision(gold, guess)
        if recall:
            scores[f"Recall [{label}]"] = eval_recall(gold, guess)
        if f_measure:
            scores[f"F-measure [{label}]"] = eval_f_measure(gold, guess)

    # Print evaluation results (in alphabetical order)
    if verbose:
        for metric in sorted(scores):
            print(f"{metric}: {scores[metric]}")

    return scores
+ """ + + ##Constants## + # (empirically derived mean sentiment intensity rating increase for booster words) + B_INCR = 0.293 + B_DECR = -0.293 + + # (empirically derived mean sentiment intensity rating increase for using + # ALLCAPs to emphasize a word) + C_INCR = 0.733 + + N_SCALAR = -0.74 + + NEGATE = { + "aint", + "arent", + "cannot", + "cant", + "couldnt", + "darent", + "didnt", + "doesnt", + "ain't", + "aren't", + "can't", + "couldn't", + "daren't", + "didn't", + "doesn't", + "dont", + "hadnt", + "hasnt", + "havent", + "isnt", + "mightnt", + "mustnt", + "neither", + "don't", + "hadn't", + "hasn't", + "haven't", + "isn't", + "mightn't", + "mustn't", + "neednt", + "needn't", + "never", + "none", + "nope", + "nor", + "not", + "nothing", + "nowhere", + "oughtnt", + "shant", + "shouldnt", + "uhuh", + "wasnt", + "werent", + "oughtn't", + "shan't", + "shouldn't", + "uh-uh", + "wasn't", + "weren't", + "without", + "wont", + "wouldnt", + "won't", + "wouldn't", + "rarely", + "seldom", + "despite", + } + + # booster/dampener 'intensifiers' or 'degree adverbs' + # https://en.wiktionary.org/wiki/Category:English_degree_adverbs + + BOOSTER_DICT = { + "absolutely": B_INCR, + "amazingly": B_INCR, + "awfully": B_INCR, + "completely": B_INCR, + "considerably": B_INCR, + "decidedly": B_INCR, + "deeply": B_INCR, + "effing": B_INCR, + "enormously": B_INCR, + "entirely": B_INCR, + "especially": B_INCR, + "exceptionally": B_INCR, + "extremely": B_INCR, + "fabulously": B_INCR, + "flipping": B_INCR, + "flippin": B_INCR, + "fricking": B_INCR, + "frickin": B_INCR, + "frigging": B_INCR, + "friggin": B_INCR, + "fully": B_INCR, + "fucking": B_INCR, + "greatly": B_INCR, + "hella": B_INCR, + "highly": B_INCR, + "hugely": B_INCR, + "incredibly": B_INCR, + "intensely": B_INCR, + "majorly": B_INCR, + "more": B_INCR, + "most": B_INCR, + "particularly": B_INCR, + "purely": B_INCR, + "quite": B_INCR, + "really": B_INCR, + "remarkably": B_INCR, + "so": B_INCR, + "substantially": B_INCR, + 
"thoroughly": B_INCR, + "totally": B_INCR, + "tremendously": B_INCR, + "uber": B_INCR, + "unbelievably": B_INCR, + "unusually": B_INCR, + "utterly": B_INCR, + "very": B_INCR, + "almost": B_DECR, + "barely": B_DECR, + "hardly": B_DECR, + "just enough": B_DECR, + "kind of": B_DECR, + "kinda": B_DECR, + "kindof": B_DECR, + "kind-of": B_DECR, + "less": B_DECR, + "little": B_DECR, + "marginally": B_DECR, + "occasionally": B_DECR, + "partly": B_DECR, + "scarcely": B_DECR, + "slightly": B_DECR, + "somewhat": B_DECR, + "sort of": B_DECR, + "sorta": B_DECR, + "sortof": B_DECR, + "sort-of": B_DECR, + } + + # check for special case idioms using a sentiment-laden keyword known to SAGE + SPECIAL_CASE_IDIOMS = { + "the shit": 3, + "the bomb": 3, + "bad ass": 1.5, + "yeah right": -2, + "cut the mustard": 2, + "kiss of death": -1.5, + "hand to mouth": -2, + } + + # for removing punctuation + REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]") + + PUNC_LIST = [ + ".", + "!", + "?", + ",", + ";", + ":", + "-", + "'", + '"', + "!!", + "!!!", + "??", + "???", + "?!?", + "!?!", + "?!?!", + "!?!?", + ] + + def __init__(self): + pass + + def negated(self, input_words, include_nt=True): + """ + Determine if input contains negation words + """ + neg_words = self.NEGATE + if any(word.lower() in neg_words for word in input_words): + return True + if include_nt: + if any("n't" in word.lower() for word in input_words): + return True + for first, second in pairwise(input_words): + if second.lower() == "least" and first.lower() != "at": + return True + return False + + def normalize(self, score, alpha=15): + """ + Normalize the score to be between -1 and 1 using an alpha that + approximates the max expected value + """ + norm_score = score / math.sqrt((score * score) + alpha) + return norm_score + + def scalar_inc_dec(self, word, valence, is_cap_diff): + """ + Check if the preceding words increase, decrease, or negate/nullify the + valence + """ + scalar = 0.0 + 
word_lower = word.lower() + if word_lower in self.BOOSTER_DICT: + scalar = self.BOOSTER_DICT[word_lower] + if valence < 0: + scalar *= -1 + # check if booster/dampener word is in ALLCAPS (while others aren't) + if word.isupper() and is_cap_diff: + if valence > 0: + scalar += self.C_INCR + else: + scalar -= self.C_INCR + return scalar + + +class SentiText: + """ + Identify sentiment-relevant string-level properties of input text. + """ + + def __init__(self, text, punc_list, regex_remove_punctuation): + if not isinstance(text, str): + text = str(text.encode("utf-8")) + self.text = text + self.PUNC_LIST = punc_list + self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation + self.words_and_emoticons = self._words_and_emoticons() + # doesn't separate words from + # adjacent punctuation (keeps emoticons & contractions) + self.is_cap_diff = self.allcap_differential(self.words_and_emoticons) + + def _words_plus_punc(self): + """ + Returns mapping of form: + { + 'cat,': 'cat', + ',cat': 'cat', + } + """ + no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text) + # removes punctuation (but loses emoticons & contractions) + words_only = no_punc_text.split() + # remove singletons + words_only = {w for w in words_only if len(w) > 1} + # the product gives ('cat', ',') and (',', 'cat') + punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)} + punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)} + words_punc_dict = punc_before + words_punc_dict.update(punc_after) + return words_punc_dict + + def _words_and_emoticons(self): + """ + Removes leading and trailing puncutation + Leaves contractions and most emoticons + Does not preserve punc-plus-letter emoticons (e.g. 
def polarity_scores(self, text):
    """
    Score the sentiment of ``text``.

    The text is wrapped in a ``SentiText``, each of its words/emoticons is
    given a valence, the valences are adjusted by ``_but_check``, and the
    result of ``self.score_valence(sentiments, text)`` is returned.

    Booster words (entries of ``self.constants.BOOSTER_DICT``) and the word
    "kind" when directly followed by "of" receive a valence of 0; every other
    token is scored by ``sentiment_valence``.

    :param text: the text to score
    :return: whatever ``score_valence`` returns for the collected valences
    """
    sentitext = SentiText(
        text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
    )
    sentiments = []
    words_and_emoticons = sentitext.words_and_emoticons
    last_index = len(words_and_emoticons) - 1
    # enumerate() yields the real position of each token.  Looking it up with
    # ``list.index(item)`` returns the FIRST occurrence, so a repeated word
    # was scored against the context of its first appearance, and the scan
    # was quadratic.
    for i, item in enumerate(words_and_emoticons):
        valence = 0
        item_lowercase = item.lower()
        if (
            i < last_index
            and item_lowercase == "kind"
            and words_and_emoticons[i + 1].lower() == "of"
        ) or item_lowercase in self.constants.BOOSTER_DICT:
            sentiments.append(valence)
            continue

        sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

    sentiments = self._but_check(words_and_emoticons, sentiments)

    return self.score_valence(sentiments, text)
+ s = self.constants.scalar_inc_dec( + words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff + ) + if start_i == 1 and s != 0: + s = s * 0.95 + if start_i == 2 and s != 0: + s = s * 0.9 + valence = valence + s + valence = self._never_check( + valence, words_and_emoticons, start_i, i + ) + if start_i == 2: + valence = self._idioms_check(valence, words_and_emoticons, i) + + # future work: consider other sentiment-laden idioms + # other_idioms = + # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2, + # "upper hand": 1, "break a leg": 2, + # "cooking with gas": 2, "in the black": 2, "in the red": -2, + # "on the ball": 2,"under the weather": -2} + + valence = self._least_check(valence, words_and_emoticons, i) + + sentiments.append(valence) + return sentiments + + def _least_check(self, valence, words_and_emoticons, i): + # check for negation case using "least" + if ( + i > 1 + and words_and_emoticons[i - 1].lower() not in self.lexicon + and words_and_emoticons[i - 1].lower() == "least" + ): + if ( + words_and_emoticons[i - 2].lower() != "at" + and words_and_emoticons[i - 2].lower() != "very" + ): + valence = valence * self.constants.N_SCALAR + elif ( + i > 0 + and words_and_emoticons[i - 1].lower() not in self.lexicon + and words_and_emoticons[i - 1].lower() == "least" + ): + valence = valence * self.constants.N_SCALAR + return valence + + def _but_check(self, words_and_emoticons, sentiments): + words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons] + but = {"but"} & set(words_and_emoticons) + if but: + bi = words_and_emoticons.index(next(iter(but))) + for sidx, sentiment in enumerate(sentiments): + if sidx < bi: + sentiments[sidx] = sentiment * 0.5 + elif sidx > bi: + sentiments[sidx] = sentiment * 1.5 + return sentiments + + def _idioms_check(self, valence, words_and_emoticons, i): + onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}" + + twoonezero = "{} {} {}".format( + words_and_emoticons[i - 2], + words_and_emoticons[i - 
1], + words_and_emoticons[i], + ) + + twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}" + + threetwoone = "{} {} {}".format( + words_and_emoticons[i - 3], + words_and_emoticons[i - 2], + words_and_emoticons[i - 1], + ) + + threetwo = "{} {}".format( + words_and_emoticons[i - 3], words_and_emoticons[i - 2] + ) + + sequences = [onezero, twoonezero, twoone, threetwoone, threetwo] + + for seq in sequences: + if seq in self.constants.SPECIAL_CASE_IDIOMS: + valence = self.constants.SPECIAL_CASE_IDIOMS[seq] + break + + if len(words_and_emoticons) - 1 > i: + zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}" + if zeroone in self.constants.SPECIAL_CASE_IDIOMS: + valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone] + if len(words_and_emoticons) - 1 > i + 1: + zeroonetwo = "{} {} {}".format( + words_and_emoticons[i], + words_and_emoticons[i + 1], + words_and_emoticons[i + 2], + ) + if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS: + valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo] + + # check for booster/dampener bi-grams such as 'sort of' or 'kind of' + if ( + threetwo in self.constants.BOOSTER_DICT + or twoone in self.constants.BOOSTER_DICT + ): + valence = valence + self.constants.B_DECR + return valence + + def _never_check(self, valence, words_and_emoticons, start_i, i): + if start_i == 0: + if self.constants.negated([words_and_emoticons[i - 1]]): + valence = valence * self.constants.N_SCALAR + if start_i == 1: + if words_and_emoticons[i - 2] == "never" and ( + words_and_emoticons[i - 1] == "so" + or words_and_emoticons[i - 1] == "this" + ): + valence = valence * 1.5 + elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): + valence = valence * self.constants.N_SCALAR + if start_i == 2: + if ( + words_and_emoticons[i - 3] == "never" + and ( + words_and_emoticons[i - 2] == "so" + or words_and_emoticons[i - 2] == "this" + ) + or ( + words_and_emoticons[i - 1] == "so" + or words_and_emoticons[i - 1] == "this" + 
) + ): + valence = valence * 1.25 + elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): + valence = valence * self.constants.N_SCALAR + return valence + + def _punctuation_emphasis(self, sum_s, text): + # add emphasis from exclamation points and question marks + ep_amplifier = self._amplify_ep(text) + qm_amplifier = self._amplify_qm(text) + punct_emph_amplifier = ep_amplifier + qm_amplifier + return punct_emph_amplifier + + def _amplify_ep(self, text): + # check for added emphasis resulting from exclamation points (up to 4 of them) + ep_count = text.count("!") + if ep_count > 4: + ep_count = 4 + # (empirically derived mean sentiment intensity rating increase for + # exclamation points) + ep_amplifier = ep_count * 0.292 + return ep_amplifier + + def _amplify_qm(self, text): + # check for added emphasis resulting from question marks (2 or 3+) + qm_count = text.count("?") + qm_amplifier = 0 + if qm_count > 1: + if qm_count <= 3: + # (empirically derived mean sentiment intensity rating increase for + # question marks) + qm_amplifier = qm_count * 0.18 + else: + qm_amplifier = 0.96 + return qm_amplifier + + def _sift_sentiment_scores(self, sentiments): + # want separate positive versus negative sentiment scores + pos_sum = 0.0 + neg_sum = 0.0 + neu_count = 0 + for sentiment_score in sentiments: + if sentiment_score > 0: + pos_sum += ( + float(sentiment_score) + 1 + ) # compensates for neutral words that are counted as 1 + if sentiment_score < 0: + neg_sum += ( + float(sentiment_score) - 1 + ) # when used with math.fabs(), compensates for neutrals + if sentiment_score == 0: + neu_count += 1 + return pos_sum, neg_sum, neu_count + + def score_valence(self, sentiments, text): + if sentiments: + sum_s = float(sum(sentiments)) + # compute and add emphasis from punctuation in text + punct_emph_amplifier = self._punctuation_emphasis(sum_s, text) + if sum_s > 0: + sum_s += punct_emph_amplifier + elif sum_s < 0: + sum_s -= punct_emph_amplifier + + compound = 
self.constants.normalize(sum_s) + # discriminate between positive, negative and neutral sentiment scores + pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments) + + if pos_sum > math.fabs(neg_sum): + pos_sum += punct_emph_amplifier + elif pos_sum < math.fabs(neg_sum): + neg_sum -= punct_emph_amplifier + + total = pos_sum + math.fabs(neg_sum) + neu_count + pos = math.fabs(pos_sum / total) + neg = math.fabs(neg_sum / total) + neu = math.fabs(neu_count / total) + + else: + compound = 0.0 + pos = 0.0 + neg = 0.0 + neu = 0.0 + + sentiment_dict = { + "neg": round(neg, 3), + "neu": round(neu, 3), + "pos": round(pos, 3), + "compound": round(compound, 4), + } + + return sentiment_dict diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/arlstem.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/arlstem.py new file mode 100644 index 0000000000000000000000000000000000000000..c990dde7b0cd0e58d800688f65726757645e4a76 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/arlstem.py @@ -0,0 +1,361 @@ +# +# Natural Language Toolkit: ARLSTem Stemmer +# +# Copyright (C) 2001-2022 NLTK Project +# +# Author: Kheireddine Abainia (x-programer) +# Algorithms: Kheireddine Abainia +# Siham Ouamour +# Halim Sayoud +# URL: +# For license information, see LICENSE.TXT + + +""" +ARLSTem Arabic Stemmer +The details about the implementation of this algorithm are described in: +K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer , +Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17), +Vol. 29, No. 3, 2017, pp. 557-573. +The ARLSTem is a light Arabic stemmer that is based on removing the affixes +from the word (i.e. prefixes, suffixes and infixes). It was evaluated and +compared to several other stemmers using Paice's parameters (under-stemming +index, over-stemming index and stemming weight), and the results showed that +ARLSTem is promising and producing high performances. 
class ARLSTem(StemmerI):
    """
    ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
    Department of Telecommunication & Information Processing. USTHB University,
    Algiers, Algeria.
    ARLSTem.stem(token) returns the Arabic stem for the input token.
    The ARLSTem Stemmer requires that all tokens are encoded using Unicode
    encoding.
    """

    def __init__(self):
        # different Alif with hamza
        self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
        self.re_alifMaqsura = re.compile(r"[\u0649]")
        self.re_diacritics = re.compile(r"[\u064B-\u065F]")

        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
        self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
        self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
        # Fa Laam Laam, Waaw Laam Laam
        self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
        self.pr4 = [
            "\u0641\u0628\u0627\u0644",
            "\u0648\u0628\u0627\u0644",
            "\u0641\u0643\u0627\u0644",
        ]

        # Kaf Yaa, Kaf Miim
        self.su2 = ["\u0643\u064A", "\u0643\u0645"]
        # Ha Alif, Ha Miim
        self.su22 = ["\u0647\u0627", "\u0647\u0645"]
        # Kaf Miim Alif, Kaf Noon Shadda
        self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
        # Ha Miim Alif, Ha Noon Shadda
        self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]

        # Alif Noon, Ya Noon, Waaw Noon
        self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
        # Taa Alif Noon, Taa Ya Noon
        self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]

        # Alif Noon, Waaw Noon
        self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
        # Siin Taa, Siin Yaa
        self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
        # Siin Alif, Siin Noon
        self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
        # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
        self.verb_pr33 = [
            "\u0644\u0646",
            "\u0644\u062A",
            "\u0644\u064A",
            "\u0644\u0623",
        ]
        # Taa Miim Alif, Taa Noon Shadda
        self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
        self.verb_suf2 = [
            "\u0646\u0627",
            "\u062A\u0645",
            "\u062A\u0627",
            "\u0648\u0627",
        ]
        # Taa, Alif, Noon
        self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]

    def stem(self, token):
        """
        call this function to get the word's stem based on ARLSTem .

        :param token: the Arabic word to stem
        :return: the stem, or ``None`` when a ``ValueError`` occurred (for
            example ``token`` is ``None``); in that case the error message
            is printed instead of being raised.
        """
        try:
            if token is None:
                raise ValueError(
                    "The word could not be stemmed, because it is empty !"
                )
            # remove Arabic diacritics and replace some letters with others
            token = self.norm(token)
            # strip common prefixes of the nouns
            pre = self.pref(token)
            if pre is not None:
                token = pre
            # strip the suffixes which are common to nouns and verbs
            token = self.suff(token)
            # transform a plural noun to a singular noun
            ps = self.plur2sing(token)
            if ps is None:
                # transform from the feminine form to the masculine form
                fm = self.fem2masc(token)
                if fm is not None:
                    return fm
                else:
                    if pre is None:  # if the prefixes are not stripped
                        # strip the verb prefixes and suffixes
                        return self.verb(token)
            else:
                return ps
            return token
        except ValueError as e:
            print(e)

    def norm(self, token):
        """
        normalize the word by removing diacritics, replacing hamzated Alif
        with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
        beginning.
        """
        # strip Arabic diacritics
        token = self.re_diacritics.sub("", token)
        # replace Hamzated Alif with Alif bare
        token = self.re_hamzated_alif.sub("\u0627", token)
        # replace alifMaqsura with Yaa
        token = self.re_alifMaqsura.sub("\u064A", token)
        # strip the Waaw from the word beginning if the remaining is 3 letters
        # at least
        if token.startswith("\u0648") and len(token) > 3:
            token = token[1:]
        return token

    def pref(self, token):
        """
        remove prefixes from the words' beginning.

        :return: the token without its prefix, or ``None`` when no prefix
            rule applied
        """
        if len(token) > 5:
            for p3 in self.pr3:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 6:
            for p4 in self.pr4:
                if token.startswith(p4):
                    return token[4:]
        if len(token) > 5:
            for p3 in self.pr32:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 4:
            for p2 in self.pr2:
                if token.startswith(p2):
                    return token[2:]

    def suff(self, token):
        """
        remove suffixes from the word's end.

        :return: the token without its suffix, or the token unchanged when
            no suffix rule applied
        """
        if token.endswith("\u0643") and len(token) > 3:
            return token[:-1]
        if len(token) > 4:
            for s2 in self.su2:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su3:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0647") and len(token) > 3:
            token = token[:-1]
            return token
        if len(token) > 4:
            for s2 in self.su22:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su32:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0646\u0627") and len(token) > 4:
            return token[:-2]
        return token

    def fem2masc(self, token):
        """
        transform the word from the feminine form to the masculine form.

        :return: the token without a final Taa Marbuta, or ``None`` when
            the rule did not apply
        """
        if token.endswith("\u0629") and len(token) > 3:
            return token[:-1]

    def plur2sing(self, token):
        """
        transform the word from the plural form to the singular form.

        :return: the singular form, or ``None`` when no rule applied
        """
        if len(token) > 4:
            for ps2 in self.pl_si2:
                if token.endswith(ps2):
                    return token[:-2]
        if len(token) > 5:
            for ps3 in self.pl_si3:
                if token.endswith(ps3):
                    return token[:-3]
        if len(token) > 3 and token.endswith("\u0627\u062A"):
            return token[:-2]
        if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
            return token[:2] + token[3:]
        if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
            return token[1:-2] + token[-1]

    def verb(self, token):
        """
        stem the verb prefixes and suffixes or both

        The rules ``verb_t1`` .. ``verb_t6`` are tried in order and the
        first one that changes the token wins.
        """
        vb = self.verb_t1(token)
        if vb is not None:
            return vb
        vb = self.verb_t2(token)
        if vb is not None:
            return vb
        vb = self.verb_t3(token)
        if vb is not None:
            return vb
        vb = self.verb_t4(token)
        if vb is not None:
            return vb
        vb = self.verb_t5(token)
        # verb_t5 hands the token back unchanged when none of its prefixes
        # matched; treat that as "no match" so that verb_t6 is reachable.
        if vb is not None and vb != token:
            return vb
        return self.verb_t6(token)

    def verb_t1(self, token):
        """
        stem the present prefixes and suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 5 and token.startswith("\u062A"):  # Taa
            for s2 in self.pl_si2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 5 and token.startswith("\u064A"):  # Yaa
            for s2 in self.verb_su2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 4 and token.startswith("\u0627"):  # Alif
            # Waaw Alif
            if len(token) > 5 and token.endswith("\u0648\u0627"):
                return token[1:-2]
            # Yaa
            if token.endswith("\u064A"):
                return token[1:-1]
            # Alif
            if token.endswith("\u0627"):
                return token[1:-1]
            # Noon
            if token.endswith("\u0646"):
                return token[1:-1]
        # ^Yaa, Noon$
        if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
            return token[1:-1]
        # ^Taa, Noon$
        if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
            return token[1:-1]

    def verb_t2(self, token):
        """
        stem the future prefixes and suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 6:
            for s2 in self.pl_si2:
                # ^Siin Taa
                if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
                    return token[2:-2]
            # ^Siin Yaa, Alif Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
                return token[2:-2]
            # ^Siin Yaa, Waaw Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
                return token[2:-2]
        # ^Siin Taa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[0])
            and token.endswith("\u0646")
        ):
            return token[2:-1]
        # ^Siin Yaa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[1])
            and token.endswith("\u0646")
        ):
            return token[2:-1]

    def verb_t3(self, token):
        """
        stem the present suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 5:
            for su3 in self.verb_suf3:
                if token.endswith(su3):
                    return token[:-3]
        if len(token) > 4:
            for su2 in self.verb_suf2:
                if token.endswith(su2):
                    return token[:-2]
        if len(token) > 3:
            for su1 in self.verb_suf1:
                if token.endswith(su1):
                    return token[:-1]

    def verb_t4(self, token):
        """
        stem the present prefixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 3:
            # the one-letter prefixes are the same letters as verb_suf1
            for pr1 in self.verb_suf1:
                if token.startswith(pr1):
                    return token[1:]
            if token.startswith("\u064A"):
                return token[1:]

    def verb_t5(self, token):
        """
        stem the future prefixes

        :return: the stripped token, or the token unchanged when no rule
            applied
        """
        if len(token) > 4:
            for pr2 in self.verb_pr22:
                if token.startswith(pr2):
                    return token[2:]
            for pr2 in self.verb_pr2:
                if token.startswith(pr2):
                    return token[2:]
        return token

    def verb_t6(self, token):
        """
        stem the order prefixes

        :return: the stripped token, or the token unchanged when no rule
            applied
        """
        if len(token) > 4:
            # every verb_pr33 prefix is two characters long
            for pr3 in self.verb_pr33:
                if token.startswith(pr3):
                    return token[2:]
        return token
class ARLSTem2(StemmerI):
    """
    Return a stemmed Arabic word after removing affixes. This an improved
    version of the previous algorithm, which reduces under-stemming errors.
    Typically used in Arabic search engine, information retrieval and NLP.

        >>> from nltk.stem.arlstem2 import ARLSTem2
        >>> stemmer = ARLSTem2()
        >>> word = stemmer.stem('يعمل')
        >>> print(word)
        عمل

    :param token: The input Arabic word (unicode) to be stemmed
    :type token: unicode
    :return: A unicode Arabic word
    """

    def __init__(self):
        # set by stem1(): True when the last token was stemmed by the verb
        # rules; initialised here so it can be read before any stem call
        self.is_verb = False

        # different Alif with hamza
        self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
        self.re_alifMaqsura = re.compile(r"[\u0649]")
        self.re_diacritics = re.compile(r"[\u064B-\u065F]")

        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
        self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
        self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
        # Fa Laam Laam, Waaw Laam Laam
        self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
        self.pr4 = [
            "\u0641\u0628\u0627\u0644",
            "\u0648\u0628\u0627\u0644",
            "\u0641\u0643\u0627\u0644",
        ]

        # Kaf Yaa, Kaf Miim
        self.su2 = ["\u0643\u064A", "\u0643\u0645"]
        # Ha Alif, Ha Miim
        self.su22 = ["\u0647\u0627", "\u0647\u0645"]
        # Kaf Miim Alif, Kaf Noon Shadda
        self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
        # Ha Miim Alif, Ha Noon Shadda
        self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]

        # Alif Noon, Ya Noon, Waaw Noon
        self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
        # Taa Alif Noon, Taa Ya Noon
        self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]

        # Alif Noon, Waaw Noon
        self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
        # Siin Taa, Siin Yaa
        self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
        # Siin Alif, Siin Noon
        self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
        # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
        self.verb_pr33 = [
            "\u0644\u0646",
            "\u0644\u062A",
            "\u0644\u064A",
            "\u0644\u0623",
        ]
        # Taa Miim Alif, Taa Noon Shadda
        self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
        self.verb_suf2 = [
            "\u0646\u0627",
            "\u062A\u0645",
            "\u062A\u0627",
            "\u0648\u0627",
        ]
        # Taa, Alif, Noon
        self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]

    def stem1(self, token):
        """
        call this function to get the first stem

        Sets ``self.is_verb`` to ``True`` when the verb rules produced the
        result.

        :return: the first-round stem, or ``None`` when a ``ValueError``
            occurred (for example ``token`` is ``None``); the error message
            is printed instead of being raised.
        """
        try:
            if token is None:
                raise ValueError(
                    "The word could not be stemmed, because it is empty !"
                )
            self.is_verb = False
            # remove Arabic diacritics and replace some letters with others
            token = self.norm(token)
            # strip the common noun prefixes
            pre = self.pref(token)
            if pre is not None:
                token = pre
            # transform the feminine form to masculine form
            fm = self.fem2masc(token)
            if fm is not None:
                return fm
            # strip the adjective affixes
            adj = self.adjective(token)
            if adj is not None:
                return adj
            # strip the suffixes that are common to nouns and verbs
            token = self.suff(token)
            # transform a plural noun to a singular noun
            ps = self.plur2sing(token)
            if ps is None:
                if pre is None:  # if the noun prefixes are not stripped
                    # strip the verb prefixes and suffixes
                    verb = self.verb(token)
                    if verb is not None:
                        self.is_verb = True
                        return verb
            else:
                return ps
            return token
        except ValueError as e:
            print(e)

    def stem(self, token):
        """
        stem the input word: run ``stem1`` and then strip a few additional
        noun affixes from its result.

        :return: the stem, or ``None`` when a ``ValueError`` occurred (for
            example ``token`` is ``None``); the error message is printed
            instead of being raised.
        """
        try:
            if token is None:
                raise ValueError(
                    "The word could not be stemmed, because it is empty !"
                )
            # run the first round of stemming
            token = self.stem1(token)
            # check if there is some additional noun affixes
            if len(token) > 4:
                # ^Taa, $Yaa + char
                if token.startswith("\u062A") and token[-2] == "\u064A":
                    token = token[1:-2] + token[-1]
                    return token
                # ^Miim, $Waaw + char
                if token.startswith("\u0645") and token[-2] == "\u0648":
                    token = token[1:-2] + token[-1]
                    return token
            if len(token) > 3:
                # !^Alif, $Yaa
                if not token.startswith("\u0627") and token.endswith("\u064A"):
                    token = token[:-1]
                    return token
                # ^Laam
                if token.startswith("\u0644"):
                    return token[1:]
            return token
        except ValueError as e:
            print(e)

    def norm(self, token):
        """
        normalize the word by removing diacritics, replace hamzated Alif
        with Alif bare, replace AlifMaqsura with Yaa and remove Waaw at the
        beginning.
        """
        # strip Arabic diacritics
        token = self.re_diacritics.sub("", token)
        # replace Hamzated Alif with Alif bare
        token = self.re_hamzated_alif.sub("\u0627", token)
        # replace alifMaqsura with Yaa
        token = self.re_alifMaqsura.sub("\u064A", token)
        # strip the Waaw from the word beginning if the remaining is
        # tri-literal at least
        if token.startswith("\u0648") and len(token) > 3:
            token = token[1:]
        return token

    def pref(self, token):
        """
        remove prefixes from the words' beginning.

        :return: the token without its prefix, or ``None`` when no prefix
            rule applied
        """
        if len(token) > 5:
            for p3 in self.pr3:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 6:
            for p4 in self.pr4:
                if token.startswith(p4):
                    return token[4:]
        if len(token) > 5:
            for p3 in self.pr32:
                if token.startswith(p3):
                    return token[3:]
        if len(token) > 4:
            for p2 in self.pr2:
                if token.startswith(p2):
                    return token[2:]

    def adjective(self, token):
        """
        remove the infixes from adjectives

        :return: the stripped token, or ``None`` when the rule did not apply
        """
        # ^Alif, Alif, $Yaa
        if len(token) > 5:
            if (
                token.startswith("\u0627")
                and token[-3] == "\u0627"
                and token.endswith("\u064A")
            ):
                return token[:-3] + token[-2]

    def suff(self, token):
        """
        remove the suffixes from the word's ending.

        :return: the token without its suffix, or the token unchanged when
            no suffix rule applied
        """
        if token.endswith("\u0643") and len(token) > 3:
            return token[:-1]
        if len(token) > 4:
            for s2 in self.su2:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su3:
                if token.endswith(s3):
                    return token[:-3]
        if token.endswith("\u0647") and len(token) > 3:
            token = token[:-1]
            return token
        if len(token) > 4:
            for s2 in self.su22:
                if token.endswith(s2):
                    return token[:-2]
        if len(token) > 5:
            for s3 in self.su32:
                if token.endswith(s3):
                    return token[:-3]
        # $Noon and Alif
        if token.endswith("\u0646\u0627") and len(token) > 4:
            return token[:-2]
        return token

    def fem2masc(self, token):
        """
        transform the word from the feminine form to the masculine form.

        :return: the masculine form, or ``None`` when no rule applied
        """
        if len(token) > 6:
            # ^Taa, Yaa, $Yaa and Taa Marbuta
            if (
                token.startswith("\u062A")
                and token[-4] == "\u064A"
                and token.endswith("\u064A\u0629")
            ):
                return token[1:-4] + token[-3]
            # ^Alif, Alif, $Yaa and Taa Marbuta
            if (
                token.startswith("\u0627")
                and token[-4] == "\u0627"
                and token.endswith("\u064A\u0629")
            ):
                return token[:-4] + token[-3]
        # $Alif, Yaa and Taa Marbuta
        if token.endswith("\u0627\u064A\u0629") and len(token) > 5:
            return token[:-2]
        if len(token) > 4:
            # Alif, $Taa Marbuta
            if token[1] == "\u0627" and token.endswith("\u0629"):
                return token[0] + token[2:-1]
            # $Yaa and Taa Marbuta
            if token.endswith("\u064A\u0629"):
                return token[:-2]
        # $Taa Marbuta
        if token.endswith("\u0629") and len(token) > 3:
            return token[:-1]

    def plur2sing(self, token):
        """
        transform the word from the plural form to the singular form.

        :return: the singular form, or ``None`` when no rule applied
        """
        # ^Miim, $Waaw Noon
        if len(token) > 5:
            if token.startswith("\u0645") and token.endswith("\u0648\u0646"):
                return token[1:-2]
        if len(token) > 4:
            for ps2 in self.pl_si2:
                if token.endswith(ps2):
                    return token[:-2]
        if len(token) > 5:
            for ps3 in self.pl_si3:
                if token.endswith(ps3):
                    return token[:-3]
        if len(token) > 4:
            # $Alif, Taa
            if token.endswith("\u0627\u062A"):
                return token[:-2]
            # ^Alif, Alif as the third letter
            if token.startswith("\u0627") and token[2] == "\u0627":
                return token[:2] + token[3:]
            # ^Alif, Alif as the second-to-last letter
            if token.startswith("\u0627") and token[-2] == "\u0627":
                return token[1:-2] + token[-1]

    def verb(self, token):
        """
        stem the verb prefixes and suffixes or both

        The rules ``verb_t1`` .. ``verb_t6`` are tried in order; the first
        one that applies wins.  ``verb_t6`` returns the token unchanged
        when nothing matched.
        """
        vb = self.verb_t1(token)
        if vb is not None:
            return vb
        vb = self.verb_t2(token)
        if vb is not None:
            return vb
        vb = self.verb_t3(token)
        if vb is not None:
            return vb
        vb = self.verb_t4(token)
        if vb is not None:
            return vb
        vb = self.verb_t5(token)
        if vb is not None:
            return vb
        vb = self.verb_t6(token)
        return vb

    def verb_t1(self, token):
        """
        stem the present tense co-occurred prefixes and suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 5 and token.startswith("\u062A"):  # Taa
            for s2 in self.pl_si2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 5 and token.startswith("\u064A"):  # Yaa
            for s2 in self.verb_su2:
                if token.endswith(s2):
                    return token[1:-2]
        if len(token) > 4 and token.startswith("\u0627"):  # Alif
            # Waaw Alif
            if len(token) > 5 and token.endswith("\u0648\u0627"):
                return token[1:-2]
            # Yaa
            if token.endswith("\u064A"):
                return token[1:-1]
            # Alif
            if token.endswith("\u0627"):
                return token[1:-1]
            # Noon
            if token.endswith("\u0646"):
                return token[1:-1]
        # ^Yaa, Noon$
        if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
            return token[1:-1]
        # ^Taa, Noon$
        if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
            return token[1:-1]

    def verb_t2(self, token):
        """
        stem the future tense co-occurred prefixes and suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 6:
            for s2 in self.pl_si2:
                # ^Siin Taa
                if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
                    return token[2:-2]
            # ^Siin Yaa, Alif Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
                return token[2:-2]
            # ^Siin Yaa, Waaw Noon$
            if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
                return token[2:-2]
        # ^Siin Taa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[0])
            and token.endswith("\u0646")
        ):
            return token[2:-1]
        # ^Siin Yaa, Noon$
        if (
            len(token) > 5
            and token.startswith(self.verb_pr2[1])
            and token.endswith("\u0646")
        ):
            return token[2:-1]

    def verb_t3(self, token):
        """
        stem the present tense suffixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 5:
            for su3 in self.verb_suf3:
                if token.endswith(su3):
                    return token[:-3]
        if len(token) > 4:
            for su2 in self.verb_suf2:
                if token.endswith(su2):
                    return token[:-2]
        if len(token) > 3:
            for su1 in self.verb_suf1:
                if token.endswith(su1):
                    return token[:-1]

    def verb_t4(self, token):
        """
        stem the present tense prefixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 3:
            # the one-letter prefixes are the same letters as verb_suf1
            for pr1 in self.verb_suf1:
                if token.startswith(pr1):
                    return token[1:]
            if token.startswith("\u064A"):
                return token[1:]

    def verb_t5(self, token):
        """
        stem the future tense prefixes

        :return: the stripped token, or ``None`` when no rule applied
        """
        if len(token) > 4:
            for pr2 in self.verb_pr22:
                if token.startswith(pr2):
                    return token[2:]
            for pr2 in self.verb_pr2:
                if token.startswith(pr2):
                    return token[2:]

    def verb_t6(self, token):
        """
        stem the imperative tense prefixes

        :return: the stripped token, or the token unchanged when no rule
            applied
        """
        if len(token) > 4:
            # every verb_pr33 prefix is two characters long
            for pr3 in self.verb_pr33:
                if token.startswith(pr3):
                    return token[2:]

        return token
We then proposed the stemmer implemented here + and show that it achieves slightly better f-measure than the other stemmers and + is thrice as fast as the Snowball stemmer for German while being about as fast + as most other stemmers. + + case_insensitive is a a boolean specifying if case-insensitive stemming + should be used. Case insensitivity improves performance only if words in the + text may be incorrectly upper case. For all-lowercase and correctly cased + text, best performance is achieved by setting case_insensitive for false. + + :param case_insensitive: if True, the stemming is case insensitive. False by default. + :type case_insensitive: bool + """ + + strip_ge = re.compile(r"^ge(.{4,})") + repl_xx = re.compile(r"(.)\1") + strip_emr = re.compile(r"e[mr]$") + strip_nd = re.compile(r"nd$") + strip_t = re.compile(r"t$") + strip_esn = re.compile(r"[esn]$") + repl_xx_back = re.compile(r"(.)\*") + + def __init__(self, case_insensitive: bool = False): + self._case_insensitive = case_insensitive + + @staticmethod + def replace_to(word: str) -> str: + word = word.replace("sch", "$") + word = word.replace("ei", "%") + word = word.replace("ie", "&") + word = Cistem.repl_xx.sub(r"\1*", word) + + return word + + @staticmethod + def replace_back(word: str) -> str: + word = Cistem.repl_xx_back.sub(r"\1\1", word) + word = word.replace("%", "ei") + word = word.replace("&", "ie") + word = word.replace("$", "sch") + + return word + + def stem(self, word: str) -> str: + """Stems the input word. + + :param word: The word that is to be stemmed. + :type word: str + :return: The stemmed word. 
+ :rtype: str + + >>> from nltk.stem.cistem import Cistem + >>> stemmer = Cistem() + >>> s1 = "Speicherbehältern" + >>> stemmer.stem(s1) + 'speicherbehalt' + >>> s2 = "Grenzpostens" + >>> stemmer.stem(s2) + 'grenzpost' + >>> s3 = "Ausgefeiltere" + >>> stemmer.stem(s3) + 'ausgefeilt' + >>> stemmer = Cistem(True) + >>> stemmer.stem(s1) + 'speicherbehal' + >>> stemmer.stem(s2) + 'grenzpo' + >>> stemmer.stem(s3) + 'ausgefeil' + """ + if len(word) == 0: + return word + + upper = word[0].isupper() + word = word.lower() + + word = word.replace("ü", "u") + word = word.replace("ö", "o") + word = word.replace("ä", "a") + word = word.replace("ß", "ss") + + word = Cistem.strip_ge.sub(r"\1", word) + + return self._segment_inner(word, upper)[0] + + def segment(self, word: str) -> Tuple[str, str]: + """ + This method works very similarly to stem (:func:'cistem.stem'). The difference is that in + addition to returning the stem, it also returns the rest that was removed at + the end. To be able to return the stem unchanged so the stem and the rest + can be concatenated to form the original word, all subsitutions that altered + the stem in any other way than by removing letters at the end were left out. + + :param word: The word that is to be stemmed. + :type word: str + :return: A tuple of the stemmed word and the removed suffix. 
+ :rtype: Tuple[str, str] + + >>> from nltk.stem.cistem import Cistem + >>> stemmer = Cistem() + >>> s1 = "Speicherbehältern" + >>> stemmer.segment(s1) + ('speicherbehält', 'ern') + >>> s2 = "Grenzpostens" + >>> stemmer.segment(s2) + ('grenzpost', 'ens') + >>> s3 = "Ausgefeiltere" + >>> stemmer.segment(s3) + ('ausgefeilt', 'ere') + >>> stemmer = Cistem(True) + >>> stemmer.segment(s1) + ('speicherbehäl', 'tern') + >>> stemmer.segment(s2) + ('grenzpo', 'stens') + >>> stemmer.segment(s3) + ('ausgefeil', 'tere') + """ + if len(word) == 0: + return ("", "") + + upper = word[0].isupper() + word = word.lower() + + return self._segment_inner(word, upper) + + def _segment_inner(self, word: str, upper: bool): + """Inner method for iteratively applying the code stemming regexes. + This method receives a pre-processed variant of the word to be stemmed, + or the word to be segmented, and returns a tuple of the word and the + removed suffix. + + :param word: A pre-processed variant of the word that is to be stemmed. + :type word: str + :param upper: Whether the original word started with a capital letter. + :type upper: bool + :return: A tuple of the stemmed word and the removed suffix. 
+ :rtype: Tuple[str, str] + """ + + rest_length = 0 + word_copy = word[:] + + # Pre-processing before applying the substitution patterns + word = Cistem.replace_to(word) + rest = "" + + # Apply the substitution patterns + while len(word) > 3: + if len(word) > 5: + word, n = Cistem.strip_emr.subn("", word) + if n != 0: + rest_length += 2 + continue + + word, n = Cistem.strip_nd.subn("", word) + if n != 0: + rest_length += 2 + continue + + if not upper or self._case_insensitive: + word, n = Cistem.strip_t.subn("", word) + if n != 0: + rest_length += 1 + continue + + word, n = Cistem.strip_esn.subn("", word) + if n != 0: + rest_length += 1 + continue + else: + break + + # Post-processing after applying the substitution patterns + word = Cistem.replace_back(word) + + if rest_length: + rest = word_copy[-rest_length:] + + return (word, rest) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/isri.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/isri.py new file mode 100644 index 0000000000000000000000000000000000000000..40f7a31396fe4d4e994454456456a50f3f2b96b9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/isri.py @@ -0,0 +1,395 @@ +# +# Natural Language Toolkit: The ISRI Arabic Stemmer +# +# Copyright (C) 2001-2022 NLTK Project +# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) +# Author: Hosam Algasaier +# URL: +# For license information, see LICENSE.TXT + +""" +ISRI Arabic Stemmer + +The algorithm for this stemmer is described in: + +Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary. +Information Science Research Institute. University of Nevada, Las Vegas, USA. + +The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features +with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root +dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than +returning the original unmodified word. 
+ +Additional adjustments were made to improve the algorithm: + +1- Adding 60 stop words. +2- Adding the pattern (تفاعيل) to ISRI pattern set. +3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it +increases the word ambiguities and changes the original root. + +""" +import re + +from nltk.stem.api import StemmerI + + +class ISRIStemmer(StemmerI): + """ + ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. + Information Science Research Institute. University of Nevada, Las Vegas, USA. + + A few minor modifications have been made to ISRI basic algorithm. + See the source code of this module for more information. + + isri.stem(token) returns Arabic root for the given token. + + The ISRI Stemmer requires that all tokens have Unicode string types. + If you use Python IDLE on Arabic Windows you have to decode text first + using Arabic '1256' coding. + """ + + def __init__(self): + # length three prefixes + self.p3 = [ + "\u0643\u0627\u0644", + "\u0628\u0627\u0644", + "\u0648\u0644\u0644", + "\u0648\u0627\u0644", + ] + + # length two prefixes + self.p2 = ["\u0627\u0644", "\u0644\u0644"] + + # length one prefixes + self.p1 = [ + "\u0644", + "\u0628", + "\u0641", + "\u0633", + "\u0648", + "\u064a", + "\u062a", + "\u0646", + "\u0627", + ] + + # length three suffixes + self.s3 = [ + "\u062a\u0645\u0644", + "\u0647\u0645\u0644", + "\u062a\u0627\u0646", + "\u062a\u064a\u0646", + "\u0643\u0645\u0644", + ] + + # length two suffixes + self.s2 = [ + "\u0648\u0646", + "\u0627\u062a", + "\u0627\u0646", + "\u064a\u0646", + "\u062a\u0646", + "\u0643\u0645", + "\u0647\u0646", + "\u0646\u0627", + "\u064a\u0627", + "\u0647\u0627", + "\u062a\u0645", + "\u0643\u0646", + "\u0646\u064a", + "\u0648\u0627", + "\u0645\u0627", + "\u0647\u0645", + ] + + # length one suffixes + self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"] + + # groups of length four patterns + self.pr4 = { + 0: 
["\u0645"], + 1: ["\u0627"], + 2: ["\u0627", "\u0648", "\u064A"], + 3: ["\u0629"], + } + + # Groups of length five patterns and length three roots + self.pr53 = { + 0: ["\u0627", "\u062a"], + 1: ["\u0627", "\u064a", "\u0648"], + 2: ["\u0627", "\u062a", "\u0645"], + 3: ["\u0645", "\u064a", "\u062a"], + 4: ["\u0645", "\u062a"], + 5: ["\u0627", "\u0648"], + 6: ["\u0627", "\u0645"], + } + + self.re_short_vowels = re.compile(r"[\u064B-\u0652]") + self.re_hamza = re.compile(r"[\u0621\u0624\u0626]") + self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]") + + self.stop_words = [ + "\u064a\u0643\u0648\u0646", + "\u0648\u0644\u064a\u0633", + "\u0648\u0643\u0627\u0646", + "\u0643\u0630\u0644\u0643", + "\u0627\u0644\u062a\u064a", + "\u0648\u0628\u064a\u0646", + "\u0639\u0644\u064a\u0647\u0627", + "\u0645\u0633\u0627\u0621", + "\u0627\u0644\u0630\u064a", + "\u0648\u0643\u0627\u0646\u062a", + "\u0648\u0644\u0643\u0646", + "\u0648\u0627\u0644\u062a\u064a", + "\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0648\u0645", + "\u0627\u0644\u0644\u0630\u064a\u0646", + "\u0639\u0644\u064a\u0647", + "\u0643\u0627\u0646\u062a", + "\u0644\u0630\u0644\u0643", + "\u0623\u0645\u0627\u0645", + "\u0647\u0646\u0627\u0643", + "\u0645\u0646\u0647\u0627", + "\u0645\u0627\u0632\u0627\u0644", + "\u0644\u0627\u0632\u0627\u0644", + "\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0645\u0627\u064a\u0632\u0627\u0644", + "\u0627\u0635\u0628\u062d", + "\u0623\u0635\u0628\u062d", + "\u0623\u0645\u0633\u0649", + "\u0627\u0645\u0633\u0649", + "\u0623\u0636\u062d\u0649", + "\u0627\u0636\u062d\u0649", + "\u0645\u0627\u0628\u0631\u062d", + "\u0645\u0627\u0641\u062a\u0626", + "\u0645\u0627\u0627\u0646\u0641\u0643", + "\u0644\u0627\u0633\u064a\u0645\u0627", + "\u0648\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0627\u0644\u062d\u0627\u0644\u064a", + "\u0627\u0644\u064a\u0647\u0627", + "\u0627\u0644\u0630\u064a\u0646", + "\u0641\u0627\u0646\u0647", + "\u0648\u0627\u0644\u0630\u064a", + 
"\u0648\u0647\u0630\u0627", + "\u0644\u0647\u0630\u0627", + "\u0641\u0643\u0627\u0646", + "\u0633\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0647", + "\u064a\u0645\u0643\u0646", + "\u0628\u0647\u0630\u0627", + "\u0627\u0644\u0630\u0649", + ] + + def stem(self, token): + """ + Stemming a word token using the ISRI stemmer. + """ + token = self.norm( + token, 1 + ) # remove diacritics which representing Arabic short vowels + if token in self.stop_words: + return token # exclude stop words from being processed + token = self.pre32( + token + ) # remove length three and length two prefixes in this order + token = self.suf32( + token + ) # remove length three and length two suffixes in this order + token = self.waw( + token + ) # remove connective ‘و’ if it precedes a word beginning with ‘و’ + token = self.norm(token, 2) # normalize initial hamza to bare alif + # if 4 <= word length <= 7, then stem; otherwise, no stemming + if len(token) == 4: # length 4 word + token = self.pro_w4(token) + elif len(token) == 5: # length 5 word + token = self.pro_w53(token) + token = self.end_w5(token) + elif len(token) == 6: # length 6 word + token = self.pro_w6(token) + token = self.end_w6(token) + elif len(token) == 7: # length 7 word + token = self.suf1(token) + if len(token) == 7: + token = self.pre1(token) + if len(token) == 6: + token = self.pro_w6(token) + token = self.end_w6(token) + return token + + def norm(self, word, num=3): + """ + normalization: + num=1 normalize diacritics + num=2 normalize initial hamza + num=3 both 1&2 + """ + if num == 1: + word = self.re_short_vowels.sub("", word) + elif num == 2: + word = self.re_initial_hamza.sub("\u0627", word) + elif num == 3: + word = self.re_short_vowels.sub("", word) + word = self.re_initial_hamza.sub("\u0627", word) + return word + + def pre32(self, word): + """remove length three and length two prefixes in this order""" + if len(word) >= 6: + for pre3 in self.p3: + if word.startswith(pre3): + return word[3:] + if 
len(word) >= 5: + for pre2 in self.p2: + if word.startswith(pre2): + return word[2:] + return word + + def suf32(self, word): + """remove length three and length two suffixes in this order""" + if len(word) >= 6: + for suf3 in self.s3: + if word.endswith(suf3): + return word[:-3] + if len(word) >= 5: + for suf2 in self.s2: + if word.endswith(suf2): + return word[:-2] + return word + + def waw(self, word): + """remove connective ‘و’ if it precedes a word beginning with ‘و’""" + if len(word) >= 4 and word[:2] == "\u0648\u0648": + word = word[1:] + return word + + def pro_w4(self, word): + """process length four patterns and extract length three roots""" + if word[0] in self.pr4[0]: # مفعل + word = word[1:] + elif word[1] in self.pr4[1]: # فاعل + word = word[:1] + word[2:] + elif word[2] in self.pr4[2]: # فعال - فعول - فعيل + word = word[:2] + word[3] + elif word[3] in self.pr4[3]: # فعلة + word = word[:-1] + else: + word = self.suf1(word) # do - normalize short sufix + if len(word) == 4: + word = self.pre1(word) # do - normalize short prefix + return word + + def pro_w53(self, word): + """process length five patterns and extract length three roots""" + if word[2] in self.pr53[0] and word[0] == "\u0627": # افتعل - افاعل + word = word[1] + word[3:] + elif word[3] in self.pr53[1] and word[0] == "\u0645": # مفعول - مفعال - مفعيل + word = word[1:3] + word[4] + elif word[0] in self.pr53[2] and word[4] == "\u0629": # مفعلة - تفعلة - افعلة + word = word[1:4] + elif word[0] in self.pr53[3] and word[2] == "\u062a": # مفتعل - يفتعل - تفتعل + word = word[1] + word[3:] + elif word[0] in self.pr53[4] and word[2] == "\u0627": # مفاعل - تفاعل + word = word[1] + word[3:] + elif word[2] in self.pr53[5] and word[4] == "\u0629": # فعولة - فعالة + word = word[:2] + word[3] + elif word[0] in self.pr53[6] and word[1] == "\u0646": # انفعل - منفعل + word = word[2:] + elif word[3] == "\u0627" and word[0] == "\u0627": # افعال + word = word[1:3] + word[4] + elif word[4] == "\u0646" and word[3] 
== "\u0627": # فعلان + word = word[:3] + elif word[3] == "\u064a" and word[0] == "\u062a": # تفعيل + word = word[1:3] + word[4] + elif word[3] == "\u0648" and word[1] == "\u0627": # فاعول + word = word[0] + word[2] + word[4] + elif word[2] == "\u0627" and word[1] == "\u0648": # فواعل + word = word[0] + word[3:] + elif word[3] == "\u0626" and word[2] == "\u0627": # فعائل + word = word[:2] + word[4] + elif word[4] == "\u0629" and word[1] == "\u0627": # فاعلة + word = word[0] + word[2:4] + elif word[4] == "\u064a" and word[2] == "\u0627": # فعالي + word = word[:2] + word[3] + else: + word = self.suf1(word) # do - normalize short sufix + if len(word) == 5: + word = self.pre1(word) # do - normalize short prefix + return word + + def pro_w54(self, word): + """process length five patterns and extract length four roots""" + if word[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل + word = word[1:] + elif word[4] == "\u0629": # فعللة + word = word[:4] + elif word[2] == "\u0627": # فعالل + word = word[:2] + word[3:] + return word + + def end_w5(self, word): + """ending step (word of length five)""" + if len(word) == 4: + word = self.pro_w4(word) + elif len(word) == 5: + word = self.pro_w54(word) + return word + + def pro_w6(self, word): + """process length six patterns and extract length three roots""" + if word.startswith("\u0627\u0633\u062a") or word.startswith( + "\u0645\u0633\u062a" + ): # مستفعل - استفعل + word = word[3:] + elif ( + word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629" + ): # مفعالة + word = word[1:3] + word[4] + elif ( + word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627" + ): # افتعال + word = word[1] + word[3] + word[5] + elif ( + word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4] + ): # افعوعل + word = word[1] + word[4:] + elif ( + word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a" + ): # تفاعيل new pattern + word = word[1] + word[3] + word[5] + else: + word = self.suf1(word) # do - normalize 
short sufix + if len(word) == 6: + word = self.pre1(word) # do - normalize short prefix + return word + + def pro_w64(self, word): + """process length six patterns and extract length four roots""" + if word[0] == "\u0627" and word[4] == "\u0627": # افعلال + word = word[1:4] + word[5] + elif word.startswith("\u0645\u062a"): # متفعلل + word = word[2:] + return word + + def end_w6(self, word): + """ending step (word of length six)""" + if len(word) == 5: + word = self.pro_w53(word) + word = self.end_w5(word) + elif len(word) == 6: + word = self.pro_w64(word) + return word + + def suf1(self, word): + """normalize short sufix""" + for sf1 in self.s1: + if word.endswith(sf1): + return word[:-1] + return word + + def pre1(self, word): + """normalize short prefix""" + for sp1 in self.p1: + if word.startswith(sp1): + return word[1:] + return word diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/porter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/porter.py new file mode 100644 index 0000000000000000000000000000000000000000..c84402d8083677ea9e727f5f5b0998529ad96ba6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/porter.py @@ -0,0 +1,715 @@ +""" +Porter Stemmer + +This is the Porter stemming algorithm. It follows the algorithm +presented in + +Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. + +with some optional deviations that can be turned on or off with the +`mode` argument to the constructor. + +Martin Porter, the algorithm's inventor, maintains a web page about the +algorithm at + + https://www.tartarus.org/~martin/PorterStemmer/ + +which includes another Python implementation and other implementations +in many languages. +""" + +__docformat__ = "plaintext" + +import re + +from nltk.stem.api import StemmerI + + +class PorterStemmer(StemmerI): + """ + A word stemmer based on the Porter stemming algorithm. + + Porter, M. "An algorithm for suffix stripping." + Program 14.3 (1980): 130-137. 
+ + See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage + of the algorithm. + + Martin Porter has endorsed several modifications to the Porter + algorithm since writing his original paper, and those extensions are + included in the implementations on his website. Additionally, others + have proposed further improvements to the algorithm, including NLTK + contributors. There are thus three modes that can be selected by + passing the appropriate constant to the class constructor's `mode` + attribute: + + - PorterStemmer.ORIGINAL_ALGORITHM + + An implementation that is faithful to the original paper. + + Note that Martin Porter has deprecated this version of the + algorithm. Martin distributes implementations of the Porter + Stemmer in many languages, hosted at: + + https://www.tartarus.org/~martin/PorterStemmer/ + + and all of these implementations include his extensions. He + strongly recommends against using the original, published + version of the algorithm; only use this mode if you clearly + understand why you are choosing to do so. + + - PorterStemmer.MARTIN_EXTENSIONS + + An implementation that only uses the modifications to the + algorithm that are included in the implementations on Martin + Porter's website. He has declared Porter frozen, so the + behaviour of those implementations should never change. + + - PorterStemmer.NLTK_EXTENSIONS (default) + + An implementation that includes further improvements devised by + NLTK contributors or taken from other modified implementations + found on the web. + + For the best stemming, you should use the default NLTK_EXTENSIONS + version. However, if you need to get the same results as either the + original algorithm or one of Martin Porter's hosted versions for + compatibility with an existing implementation or dataset, you can use + one of the other modes instead. 
+ """ + + # Modes the Stemmer can be instantiated in + NLTK_EXTENSIONS = "NLTK_EXTENSIONS" + MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS" + ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM" + + def __init__(self, mode=NLTK_EXTENSIONS): + if mode not in ( + self.NLTK_EXTENSIONS, + self.MARTIN_EXTENSIONS, + self.ORIGINAL_ALGORITHM, + ): + raise ValueError( + "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, " + "PorterStemmer.MARTIN_EXTENSIONS, or " + "PorterStemmer.ORIGINAL_ALGORITHM" + ) + + self.mode = mode + + if self.mode == self.NLTK_EXTENSIONS: + # This is a table of irregular forms. It is quite short, + # but still reflects the errors actually drawn to Martin + # Porter's attention over a 20 year period! + irregular_forms = { + "sky": ["sky", "skies"], + "die": ["dying"], + "lie": ["lying"], + "tie": ["tying"], + "news": ["news"], + "inning": ["innings", "inning"], + "outing": ["outings", "outing"], + "canning": ["cannings", "canning"], + "howe": ["howe"], + "proceed": ["proceed"], + "exceed": ["exceed"], + "succeed": ["succeed"], + } + + self.pool = {} + for key in irregular_forms: + for val in irregular_forms[key]: + self.pool[val] = key + + self.vowels = frozenset(["a", "e", "i", "o", "u"]) + + def _is_consonant(self, word, i): + """Returns True if word[i] is a consonant, False otherwise + + A consonant is defined in the paper as follows: + + A consonant in a word is a letter other than A, E, I, O or + U, and other than Y preceded by a consonant. (The fact that + the term `consonant' is defined to some extent in terms of + itself does not make it ambiguous.) So in TOY the consonants + are T and Y, and in SYZYGY they are S, Z and G. If a letter + is not a consonant it is a vowel. 
+ """ + if word[i] in self.vowels: + return False + if word[i] == "y": + if i == 0: + return True + else: + return not self._is_consonant(word, i - 1) + return True + + def _measure(self, stem): + r"""Returns the 'measure' of stem, per definition in the paper + + From the paper: + + A consonant will be denoted by c, a vowel by v. A list + ccc... of length greater than 0 will be denoted by C, and a + list vvv... of length greater than 0 will be denoted by V. + Any word, or part of a word, therefore has one of the four + forms: + + CVCV ... C + CVCV ... V + VCVC ... C + VCVC ... V + + These may all be represented by the single form + + [C]VCVC ... [V] + + where the square brackets denote arbitrary presence of their + contents. Using (VC){m} to denote VC repeated m times, this + may again be written as + + [C](VC){m}[V]. + + m will be called the \measure\ of any word or word part when + represented in this form. The case m = 0 covers the null + word. Here are some examples: + + m=0 TR, EE, TREE, Y, BY. + m=1 TROUBLE, OATS, TREES, IVY. + m=2 TROUBLES, PRIVATE, OATEN, ORRERY. + """ + cv_sequence = "" + + # Construct a string of 'c's and 'v's representing whether each + # character in `stem` is a consonant or a vowel. + # e.g. 
'falafel' becomes 'cvcvcvc', + # 'architecture' becomes 'vcccvcvccvcv' + for i in range(len(stem)): + if self._is_consonant(stem, i): + cv_sequence += "c" + else: + cv_sequence += "v" + + # Count the number of 'vc' occurrences, which is equivalent to + # the number of 'VC' occurrences in Porter's reduced form in the + # docstring above, which is in turn equivalent to `m` + return cv_sequence.count("vc") + + def _has_positive_measure(self, stem): + return self._measure(stem) > 0 + + def _contains_vowel(self, stem): + """Returns True if stem contains a vowel, else False""" + for i in range(len(stem)): + if not self._is_consonant(stem, i): + return True + return False + + def _ends_double_consonant(self, word): + """Implements condition *d from the paper + + Returns True if word ends with a double consonant + """ + return ( + len(word) >= 2 + and word[-1] == word[-2] + and self._is_consonant(word, len(word) - 1) + ) + + def _ends_cvc(self, word): + """Implements condition *o from the paper + + From the paper: + + *o - the stem ends cvc, where the second c is not W, X or Y + (e.g. -WIL, -HOP). 
+ """ + return ( + len(word) >= 3 + and self._is_consonant(word, len(word) - 3) + and not self._is_consonant(word, len(word) - 2) + and self._is_consonant(word, len(word) - 1) + and word[-1] not in ("w", "x", "y") + ) or ( + self.mode == self.NLTK_EXTENSIONS + and len(word) == 2 + and not self._is_consonant(word, 0) + and self._is_consonant(word, 1) + ) + + def _replace_suffix(self, word, suffix, replacement): + """Replaces `suffix` of `word` with `replacement""" + assert word.endswith(suffix), "Given word doesn't end with given suffix" + if suffix == "": + return word + replacement + else: + return word[: -len(suffix)] + replacement + + def _apply_rule_list(self, word, rules): + """Applies the first applicable suffix-removal rule to the word + + Takes a word and a list of suffix-removal rules represented as + 3-tuples, with the first element being the suffix to remove, + the second element being the string to replace it with, and the + final element being the condition for the rule to be applicable, + or None if the rule is unconditional. 
+ """ + for rule in rules: + suffix, replacement, condition = rule + if suffix == "*d" and self._ends_double_consonant(word): + stem = word[:-2] + if condition is None or condition(stem): + return stem + replacement + else: + # Don't try any further rules + return word + if word.endswith(suffix): + stem = self._replace_suffix(word, suffix, "") + if condition is None or condition(stem): + return stem + replacement + else: + # Don't try any further rules + return word + + return word + + def _step1a(self, word): + """Implements Step 1a from "An algorithm for suffix stripping" + + From the paper: + + SSES -> SS caresses -> caress + IES -> I ponies -> poni + ties -> ti + SS -> SS caress -> caress + S -> cats -> cat + """ + # this NLTK-only rule extends the original algorithm, so + # that 'flies'->'fli' but 'dies'->'die' etc + if self.mode == self.NLTK_EXTENSIONS: + if word.endswith("ies") and len(word) == 4: + return self._replace_suffix(word, "ies", "ie") + + return self._apply_rule_list( + word, + [ + ("sses", "ss", None), # SSES -> SS + ("ies", "i", None), # IES -> I + ("ss", "ss", None), # SS -> SS + ("s", "", None), # S -> + ], + ) + + def _step1b(self, word): + """Implements Step 1b from "An algorithm for suffix stripping" + + From the paper: + + (m>0) EED -> EE feed -> feed + agreed -> agree + (*v*) ED -> plastered -> plaster + bled -> bled + (*v*) ING -> motoring -> motor + sing -> sing + + If the second or third of the rules in Step 1b is successful, + the following is done: + + AT -> ATE conflat(ed) -> conflate + BL -> BLE troubl(ed) -> trouble + IZ -> IZE siz(ed) -> size + (*d and not (*L or *S or *Z)) + -> single letter + hopp(ing) -> hop + tann(ed) -> tan + fall(ing) -> fall + hiss(ing) -> hiss + fizz(ed) -> fizz + (m=1 and *o) -> E fail(ing) -> fail + fil(ing) -> file + + The rule to map to a single letter causes the removal of one of + the double letter pair. 
The -E is put back on -AT, -BL and -IZ, + so that the suffixes -ATE, -BLE and -IZE can be recognised + later. This E may be removed in step 4. + """ + # this NLTK-only block extends the original algorithm, so that + # 'spied'->'spi' but 'died'->'die' etc + if self.mode == self.NLTK_EXTENSIONS: + if word.endswith("ied"): + if len(word) == 4: + return self._replace_suffix(word, "ied", "ie") + else: + return self._replace_suffix(word, "ied", "i") + + # (m>0) EED -> EE + if word.endswith("eed"): + stem = self._replace_suffix(word, "eed", "") + if self._measure(stem) > 0: + return stem + "ee" + else: + return word + + rule_2_or_3_succeeded = False + + for suffix in ["ed", "ing"]: + if word.endswith(suffix): + intermediate_stem = self._replace_suffix(word, suffix, "") + if self._contains_vowel(intermediate_stem): + rule_2_or_3_succeeded = True + break + + if not rule_2_or_3_succeeded: + return word + + return self._apply_rule_list( + intermediate_stem, + [ + ("at", "ate", None), # AT -> ATE + ("bl", "ble", None), # BL -> BLE + ("iz", "ize", None), # IZ -> IZE + # (*d and not (*L or *S or *Z)) + # -> single letter + ( + "*d", + intermediate_stem[-1], + lambda stem: intermediate_stem[-1] not in ("l", "s", "z"), + ), + # (m=1 and *o) -> E + ( + "", + "e", + lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)), + ), + ], + ) + + def _step1c(self, word): + """Implements Step 1c from "An algorithm for suffix stripping" + + From the paper: + + Step 1c + + (*v*) Y -> I happy -> happi + sky -> sky + """ + + def nltk_condition(stem): + """ + This has been modified from the original Porter algorithm so + that y->i is only done when y is preceded by a consonant, + but not if the stem is only a single consonant, i.e. + + (*c and not c) Y -> I + + So 'happy' -> 'happi', but + 'enjoy' -> 'enjoy' etc + + This is a much better rule. Formerly 'enjoy'->'enjoi' and + 'enjoyment'->'enjoy'. 
Step 1c is perhaps done too soon; but + with this modification that no longer really matters. + + Also, the removal of the contains_vowel(z) condition means + that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and + conflate with 'spied', 'tried', 'flies' ... + """ + return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1) + + def original_condition(stem): + return self._contains_vowel(stem) + + return self._apply_rule_list( + word, + [ + ( + "y", + "i", + nltk_condition + if self.mode == self.NLTK_EXTENSIONS + else original_condition, + ) + ], + ) + + def _step2(self, word): + """Implements Step 2 from "An algorithm for suffix stripping" + + From the paper: + + Step 2 + + (m>0) ATIONAL -> ATE relational -> relate + (m>0) TIONAL -> TION conditional -> condition + rational -> rational + (m>0) ENCI -> ENCE valenci -> valence + (m>0) ANCI -> ANCE hesitanci -> hesitance + (m>0) IZER -> IZE digitizer -> digitize + (m>0) ABLI -> ABLE conformabli -> conformable + (m>0) ALLI -> AL radicalli -> radical + (m>0) ENTLI -> ENT differentli -> different + (m>0) ELI -> E vileli - > vile + (m>0) OUSLI -> OUS analogousli -> analogous + (m>0) IZATION -> IZE vietnamization -> vietnamize + (m>0) ATION -> ATE predication -> predicate + (m>0) ATOR -> ATE operator -> operate + (m>0) ALISM -> AL feudalism -> feudal + (m>0) IVENESS -> IVE decisiveness -> decisive + (m>0) FULNESS -> FUL hopefulness -> hopeful + (m>0) OUSNESS -> OUS callousness -> callous + (m>0) ALITI -> AL formaliti -> formal + (m>0) IVITI -> IVE sensitiviti -> sensitive + (m>0) BILITI -> BLE sensibiliti -> sensible + """ + + if self.mode == self.NLTK_EXTENSIONS: + # Instead of applying the ALLI -> AL rule after '(a)bli' per + # the published algorithm, instead we apply it first, and, + # if it succeeds, run the result through step2 again. 
+ if word.endswith("alli") and self._has_positive_measure( + self._replace_suffix(word, "alli", "") + ): + return self._step2(self._replace_suffix(word, "alli", "al")) + + bli_rule = ("bli", "ble", self._has_positive_measure) + abli_rule = ("abli", "able", self._has_positive_measure) + + rules = [ + ("ational", "ate", self._has_positive_measure), + ("tional", "tion", self._has_positive_measure), + ("enci", "ence", self._has_positive_measure), + ("anci", "ance", self._has_positive_measure), + ("izer", "ize", self._has_positive_measure), + abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule, + ("alli", "al", self._has_positive_measure), + ("entli", "ent", self._has_positive_measure), + ("eli", "e", self._has_positive_measure), + ("ousli", "ous", self._has_positive_measure), + ("ization", "ize", self._has_positive_measure), + ("ation", "ate", self._has_positive_measure), + ("ator", "ate", self._has_positive_measure), + ("alism", "al", self._has_positive_measure), + ("iveness", "ive", self._has_positive_measure), + ("fulness", "ful", self._has_positive_measure), + ("ousness", "ous", self._has_positive_measure), + ("aliti", "al", self._has_positive_measure), + ("iviti", "ive", self._has_positive_measure), + ("biliti", "ble", self._has_positive_measure), + ] + + if self.mode == self.NLTK_EXTENSIONS: + rules.append(("fulli", "ful", self._has_positive_measure)) + + # The 'l' of the 'logi' -> 'log' rule is put with the stem, + # so that short stems like 'geo' 'theo' etc work like + # 'archaeo' 'philo' etc. 
+ rules.append( + ("logi", "log", lambda stem: self._has_positive_measure(word[:-3])) + ) + + if self.mode == self.MARTIN_EXTENSIONS: + rules.append(("logi", "log", self._has_positive_measure)) + + return self._apply_rule_list(word, rules) + + def _step3(self, word): + """Implements Step 3 from "An algorithm for suffix stripping" + + From the paper: + + Step 3 + + (m>0) ICATE -> IC triplicate -> triplic + (m>0) ATIVE -> formative -> form + (m>0) ALIZE -> AL formalize -> formal + (m>0) ICITI -> IC electriciti -> electric + (m>0) ICAL -> IC electrical -> electric + (m>0) FUL -> hopeful -> hope + (m>0) NESS -> goodness -> good + """ + return self._apply_rule_list( + word, + [ + ("icate", "ic", self._has_positive_measure), + ("ative", "", self._has_positive_measure), + ("alize", "al", self._has_positive_measure), + ("iciti", "ic", self._has_positive_measure), + ("ical", "ic", self._has_positive_measure), + ("ful", "", self._has_positive_measure), + ("ness", "", self._has_positive_measure), + ], + ) + + def _step4(self, word): + """Implements Step 4 from "An algorithm for suffix stripping" + + Step 4 + + (m>1) AL -> revival -> reviv + (m>1) ANCE -> allowance -> allow + (m>1) ENCE -> inference -> infer + (m>1) ER -> airliner -> airlin + (m>1) IC -> gyroscopic -> gyroscop + (m>1) ABLE -> adjustable -> adjust + (m>1) IBLE -> defensible -> defens + (m>1) ANT -> irritant -> irrit + (m>1) EMENT -> replacement -> replac + (m>1) MENT -> adjustment -> adjust + (m>1) ENT -> dependent -> depend + (m>1 and (*S or *T)) ION -> adoption -> adopt + (m>1) OU -> homologou -> homolog + (m>1) ISM -> communism -> commun + (m>1) ATE -> activate -> activ + (m>1) ITI -> angulariti -> angular + (m>1) OUS -> homologous -> homolog + (m>1) IVE -> effective -> effect + (m>1) IZE -> bowdlerize -> bowdler + + The suffixes are now removed. All that remains is a little + tidying up. 
+ """ + measure_gt_1 = lambda stem: self._measure(stem) > 1 + + return self._apply_rule_list( + word, + [ + ("al", "", measure_gt_1), + ("ance", "", measure_gt_1), + ("ence", "", measure_gt_1), + ("er", "", measure_gt_1), + ("ic", "", measure_gt_1), + ("able", "", measure_gt_1), + ("ible", "", measure_gt_1), + ("ant", "", measure_gt_1), + ("ement", "", measure_gt_1), + ("ment", "", measure_gt_1), + ("ent", "", measure_gt_1), + # (m>1 and (*S or *T)) ION -> + ( + "ion", + "", + lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"), + ), + ("ou", "", measure_gt_1), + ("ism", "", measure_gt_1), + ("ate", "", measure_gt_1), + ("iti", "", measure_gt_1), + ("ous", "", measure_gt_1), + ("ive", "", measure_gt_1), + ("ize", "", measure_gt_1), + ], + ) + + def _step5a(self, word): + """Implements Step 5a from "An algorithm for suffix stripping" + + From the paper: + + Step 5a + + (m>1) E -> probate -> probat + rate -> rate + (m=1 and not *o) E -> cease -> ceas + """ + # Note that Martin's test vocabulary and reference + # implementations are inconsistent in how they handle the case + # where two rules both refer to a suffix that matches the word + # to be stemmed, but only the condition of the second one is + # true. + # Earlier in step2b we had the rules: + # (m>0) EED -> EE + # (*v*) ED -> + # but the examples in the paper included "feed"->"feed", even + # though (*v*) is true for "fe" and therefore the second rule + # alone would map "feed"->"fe". + # However, in THIS case, we need to handle the consecutive rules + # differently and try both conditions (obviously; the second + # rule here would be redundant otherwise). Martin's paper makes + # no explicit mention of the inconsistency; you have to infer it + # from the examples. + # For this reason, we can't use _apply_rule_list here. 
+ if word.endswith("e"): + stem = self._replace_suffix(word, "e", "") + if self._measure(stem) > 1: + return stem + if self._measure(stem) == 1 and not self._ends_cvc(stem): + return stem + return word + + def _step5b(self, word): + """Implements Step 5a from "An algorithm for suffix stripping" + + From the paper: + + Step 5b + + (m > 1 and *d and *L) -> single letter + controll -> control + roll -> roll + """ + return self._apply_rule_list( + word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)] + ) + + def stem(self, word, to_lowercase=True): + """ + :param to_lowercase: if `to_lowercase=True` the word always lowercase + """ + stem = word.lower() if to_lowercase else word + + if self.mode == self.NLTK_EXTENSIONS and word in self.pool: + return self.pool[stem] + + if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2: + # With this line, strings of length 1 or 2 don't go through + # the stemming process, although no mention is made of this + # in the published algorithm. + return stem + + stem = self._step1a(stem) + stem = self._step1b(stem) + stem = self._step1c(stem) + stem = self._step2(stem) + stem = self._step3(stem) + stem = self._step4(stem) + stem = self._step5a(stem) + stem = self._step5b(stem) + + return stem + + def __repr__(self): + return "" + + +def demo(): + """ + A demonstration of the porter stemmer on a sample from + the Penn Treebank corpus. + """ + + from nltk import stem + from nltk.corpus import treebank + + stemmer = stem.PorterStemmer() + + orig = [] + stemmed = [] + for item in treebank.fileids()[:3]: + for (word, tag) in treebank.tagged_words(item): + orig.append(word) + stemmed.append(stemmer.stem(word)) + + # Convert the results to a string, and word-wrap them. + results = " ".join(stemmed) + results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip() + + # Convert the original to a string, and word wrap it. 
+ original = " ".join(orig) + original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip() + + # Print the results. + print("-Original-".center(70).replace(" ", "*").replace("-", " ")) + print(original) + print("-Results-".center(70).replace(" ", "*").replace("-", " ")) + print(results) + print("*" * 70) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/regexp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/regexp.py new file mode 100644 index 0000000000000000000000000000000000000000..be4d6b062f0df2f7c6e58af4a000c2500e229177 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/regexp.py @@ -0,0 +1,56 @@ +# Natural Language Toolkit: Stemmers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Trevor Cohn +# Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT +import re + +from nltk.stem.api import StemmerI + + +class RegexpStemmer(StemmerI): + """ + A stemmer that uses regular expressions to identify morphological + affixes. Any substrings that match the regular expressions will + be removed. + + >>> from nltk.stem import RegexpStemmer + >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4) + >>> st.stem('cars') + 'car' + >>> st.stem('mass') + 'mas' + >>> st.stem('was') + 'was' + >>> st.stem('bee') + 'bee' + >>> st.stem('compute') + 'comput' + >>> st.stem('advisable') + 'advis' + + :type regexp: str or regexp + :param regexp: The regular expression that should be used to + identify morphological affixes. 
+ :type min: int + :param min: The minimum length of string to stem + """ + + def __init__(self, regexp, min=0): + + if not hasattr(regexp, "pattern"): + regexp = re.compile(regexp) + self._regexp = regexp + self._min = min + + def stem(self, word): + if len(word) < self._min: + return word + else: + return self._regexp.sub("", word) + + def __repr__(self): + return f"" diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/wordnet.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/wordnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6256230e33e33eed729129e237665474bea6dc8d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/wordnet.py @@ -0,0 +1,49 @@ +# Natural Language Toolkit: WordNet stemmer interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +from nltk.corpus import wordnet as wn + + +class WordNetLemmatizer: + """ + WordNet Lemmatizer + + Lemmatize using WordNet's built-in morphy function. + Returns the input word unchanged if it cannot be found in WordNet. + + >>> from nltk.stem import WordNetLemmatizer + >>> wnl = WordNetLemmatizer() + >>> print(wnl.lemmatize('dogs')) + dog + >>> print(wnl.lemmatize('churches')) + church + >>> print(wnl.lemmatize('aardwolves')) + aardwolf + >>> print(wnl.lemmatize('abaci')) + abacus + >>> print(wnl.lemmatize('hardrock')) + hardrock + """ + + def lemmatize(self, word: str, pos: str = "n") -> str: + """Lemmatize `word` using WordNet's built-in morphy function. + Returns the input word unchanged if it cannot be found in WordNet. + + :param word: The input word to lemmatize. + :type word: str + :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns, + `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` + for satellite adjectives. + :param pos: str + :return: The lemma of `word`, for the given `pos`. 
+ """ + lemmas = wn._morphy(word, pos) + return min(lemmas, key=len) if lemmas else word + + def __repr__(self): + return "" diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill_trainer.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..236fd9858e755b501f3a8f384b68a383b6902f99 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill_trainer.py @@ -0,0 +1,629 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2013 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +import bisect +import textwrap +from collections import defaultdict + +from nltk.tag import BrillTagger, untag + +###################################################################### +# Brill Tagger Trainer +###################################################################### + + +class BrillTaggerTrainer: + """ + A trainer for tbl taggers. 
+ """ + + def __init__( + self, initial_tagger, templates, trace=0, deterministic=None, ruleformat="str" + ): + """ + Construct a Brill tagger from a baseline tagger and a + set of templates + + :param initial_tagger: the baseline tagger + :type initial_tagger: Tagger + :param templates: templates to be used in training + :type templates: list of Templates + :param trace: verbosity level + :type trace: int + :param deterministic: if True, adjudicate ties deterministically + :type deterministic: bool + :param ruleformat: format of reported Rules + :type ruleformat: str + :return: An untrained BrillTagger + :rtype: BrillTagger + """ + + if deterministic is None: + deterministic = trace > 0 + self._initial_tagger = initial_tagger + self._templates = templates + self._trace = trace + self._deterministic = deterministic + self._ruleformat = ruleformat + + self._tag_positions = None + """Mapping from tags to lists of positions that use that tag.""" + + self._rules_by_position = None + """Mapping from positions to the set of rules that are known + to occur at that position. Position is (sentnum, wordnum). + Initially, this will only contain positions where each rule + applies in a helpful way; but when we examine a rule, we'll + extend this list to also include positions where each rule + applies in a harmful or neutral way.""" + + self._positions_by_rule = None + """Mapping from rule to position to effect, specifying the + effect that each rule has on the overall score, at each + position. Position is (sentnum, wordnum); and effect is + -1, 0, or 1. As with _rules_by_position, this mapping starts + out only containing rules with positive effects; but when + we examine a rule, we'll extend this mapping to include + the positions where the rule is harmful or neutral.""" + + self._rules_by_score = None + """Mapping from scores to the set of rules whose effect on the + overall score is upper bounded by that score. 
Invariant: + rulesByScore[s] will contain r iff the sum of + _positions_by_rule[r] is s.""" + + self._rule_scores = None + """Mapping from rules to upper bounds on their effects on the + overall score. This is the inverse mapping to _rules_by_score. + Invariant: ruleScores[r] = sum(_positions_by_rule[r])""" + + self._first_unknown_position = None + """Mapping from rules to the first position where we're unsure + if the rule applies. This records the next position we + need to check to see if the rule messed anything up.""" + + # Training + + def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): + r""" + Trains the Brill tagger on the corpus *train_sents*, + producing at most *max_rules* transformations, each of which + reduces the net number of errors in the corpus by at least + *min_score*, and each of which has accuracy not lower than + *min_acc*. + + >>> # Relevant imports + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Pos, Word + >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer + + >>> # Load some data + >>> from nltk.corpus import treebank + >>> training_data = treebank.tagged_sents()[:100] + >>> baseline_data = treebank.tagged_sents()[100:200] + >>> gold_data = treebank.tagged_sents()[200:300] + >>> testing_data = [untag(s) for s in gold_data] + + >>> backoff = RegexpTagger([ + ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'(The|the|A|a|An|an)$', 'AT'), # articles + ... (r'.*able$', 'JJ'), # adjectives + ... (r'.*ness$', 'NN'), # nouns formed from adjectives + ... (r'.*ly$', 'RB'), # adverbs + ... (r'.*s$', 'NNS'), # plural nouns + ... (r'.*ing$', 'VBG'), # gerunds + ... (r'.*ed$', 'VBD'), # past tense verbs + ... (r'.*', 'NN') # nouns (default) + ... ]) + + >>> baseline = backoff #see NOTE1 + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.243... 
+ + >>> # Set up templates + >>> Template._cleartemplates() #clear any templates created in earlier tests + >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] + + >>> # Construct a BrillTaggerTrainer + >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) + + >>> tagger1 = tt.train(training_data, max_rules=10) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) + Finding initial useful rules... + Found 847 useful rules. + + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 132 132 0 0 | AT->DT if Pos:NN@[-1] + 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] + 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] + 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] + 47 63 16 162 | NN->IN if Pos:NNS@[-1] + 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] + 26 26 0 0 | IN->. 
if Pos:NNS@[-1] & Word:.@[0] + 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] + 22 27 5 24 | NN->-NONE- if Pos:VBD@[-1] + 17 17 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] + + >>> tagger1.rules()[1:3] + (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')])) + + >>> train_stats = tagger1.train_stats() + >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] + [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] + + >>> tagger1.print_template_statistics(printunused=False) + TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) + TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746 + #ID | Score (train) | #Rules | Template + -------------------------------------------- + 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) + 000 | 201 0.397 | 3 0.300 | Template(Pos([-1])) + + + + >>> round(tagger1.accuracy(gold_data),5) + 0.43834 + + >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) + + >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'), + ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'), + ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] + True + + >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] + [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] + + >>> # A high-accuracy tagger + >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) + Finding initial useful rules... + Found 847 useful rules. 
+ + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 132 132 0 0 | AT->DT if Pos:NN@[-1] + 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] + 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] + 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] + 36 36 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] + 26 26 0 0 | NN->. if Pos:NNS@[-1] & Word:.@[0] + 24 24 0 0 | NN->, if Pos:NNS@[-1] & Word:,@[0] + 19 19 0 6 | NN->VB if Pos:TO@[-1] + 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] + 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] + + >>> round(tagger2.accuracy(gold_data), 8) + 0.43996744 + + >>> tagger2.rules()[2:4] + (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) + + # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger, + # with a RegexpTagger only as backoff. For instance, + # >>> baseline = UnigramTagger(baseline_data, backoff=backoff) + # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results + # between python versions. The simplistic backoff above is a workaround to make doctests + # get consistent input. 
+ + :param train_sents: training data + :type train_sents: list(list(tuple)) + :param max_rules: output at most max_rules rules + :type max_rules: int + :param min_score: stop training when no rules better than min_score can be found + :type min_score: int + :param min_acc: discard any rule with lower accuracy than min_acc + :type min_acc: float or None + :return: the learned tagger + :rtype: BrillTagger + """ + # FIXME: several tests are a bit too dependent on tracing format + # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates + + # Basic idea: Keep track of the rules that apply at each position. + # And keep track of the positions to which each rule applies. + + # Create a new copy of the training corpus, and run the + # initial tagger on it. We will progressively update this + # test corpus to look more like the training corpus. + test_sents = [ + list(self._initial_tagger.tag(untag(sent))) for sent in train_sents + ] + + # Collect some statistics on the training process + trainstats = {} + trainstats["min_acc"] = min_acc + trainstats["min_score"] = min_score + trainstats["tokencount"] = sum(len(t) for t in test_sents) + trainstats["sequencecount"] = len(test_sents) + trainstats["templatecount"] = len(self._templates) + trainstats["rulescores"] = [] + trainstats["initialerrors"] = sum( + tag[1] != truth[1] + for paired in zip(test_sents, train_sents) + for (tag, truth) in zip(*paired) + ) + trainstats["initialacc"] = ( + 1 - trainstats["initialerrors"] / trainstats["tokencount"] + ) + if self._trace > 0: + print( + "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; " + "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format( + **trainstats + ) + ) + + # Initialize our mappings. This will find any errors made + # by the initial tagger, and use those to generate repair + # rules, which are added to the rule mappings. 
+ if self._trace: + print("Finding initial useful rules...") + self._init_mappings(test_sents, train_sents) + if self._trace: + print(f" Found {len(self._rule_scores)} useful rules.") + + # Let the user know what we're up to. + if self._trace > 2: + self._trace_header() + elif self._trace == 1: + print("Selecting rules...") + + # Repeatedly select the best rule, and add it to `rules`. + rules = [] + try: + while len(rules) < max_rules: + # Find the best rule, and add it to our rule list. + rule = self._best_rule(train_sents, test_sents, min_score, min_acc) + if rule: + rules.append(rule) + score = self._rule_scores[rule] + trainstats["rulescores"].append(score) + else: + break # No more good rules left! + + # Report the rule that we found. + if self._trace > 1: + self._trace_rule(rule) + + # Apply the new rule at the relevant sites + self._apply_rule(rule, test_sents) + + # Update _tag_positions[rule.original_tag] and + # _tag_positions[rule.replacement_tag] for the affected + # positions (i.e., self._positions_by_rule[rule]). + self._update_tag_positions(rule) + + # Update rules that were affected by the change. + self._update_rules(rule, train_sents, test_sents) + + # The user can cancel training manually: + except KeyboardInterrupt: + print(f"Training stopped manually -- {len(rules)} rules found") + + # Discard our tag position mapping & rule mappings. + self._clean() + trainstats["finalerrors"] = trainstats["initialerrors"] - sum( + trainstats["rulescores"] + ) + trainstats["finalacc"] = ( + 1 - trainstats["finalerrors"] / trainstats["tokencount"] + ) + # Create and return a tagger from the rules we found. + return BrillTagger(self._initial_tagger, rules, trainstats) + + def _init_mappings(self, test_sents, train_sents): + """ + Initialize the tag position mapping & the rule related + mappings. For each error in test_sents, find new rules that + would correct them, and add them to the rule mappings. 
+ """ + self._tag_positions = defaultdict(list) + self._rules_by_position = defaultdict(set) + self._positions_by_rule = defaultdict(dict) + self._rules_by_score = defaultdict(set) + self._rule_scores = defaultdict(int) + self._first_unknown_position = defaultdict(int) + # Scan through the corpus, initializing the tag_positions + # mapping and all the rule-related mappings. + for sentnum, sent in enumerate(test_sents): + for wordnum, (word, tag) in enumerate(sent): + + # Initialize tag_positions + self._tag_positions[tag].append((sentnum, wordnum)) + + # If it's an error token, update the rule-related mappings. + correct_tag = train_sents[sentnum][wordnum][1] + if tag != correct_tag: + for rule in self._find_rules(sent, wordnum, correct_tag): + self._update_rule_applies(rule, sentnum, wordnum, train_sents) + + def _clean(self): + self._tag_positions = None + self._rules_by_position = None + self._positions_by_rule = None + self._rules_by_score = None + self._rule_scores = None + self._first_unknown_position = None + + def _find_rules(self, sent, wordnum, new_tag): + """ + Use the templates to find rules that apply at index *wordnum* + in the sentence *sent* and generate the tag *new_tag*. + """ + for template in self._templates: + yield from template.applicable_rules(sent, wordnum, new_tag) + + def _update_rule_applies(self, rule, sentnum, wordnum, train_sents): + """ + Update the rule data tables to reflect the fact that + *rule* applies at the position *(sentnum, wordnum)*. + """ + pos = sentnum, wordnum + + # If the rule is already known to apply here, ignore. + # (This only happens if the position's tag hasn't changed.) + if pos in self._positions_by_rule[rule]: + return + + # Update self._positions_by_rule. 
+ correct_tag = train_sents[sentnum][wordnum][1] + if rule.replacement_tag == correct_tag: + self._positions_by_rule[rule][pos] = 1 + elif rule.original_tag == correct_tag: + self._positions_by_rule[rule][pos] = -1 + else: # was wrong, remains wrong + self._positions_by_rule[rule][pos] = 0 + + # Update _rules_by_position + self._rules_by_position[pos].add(rule) + + # Update _rule_scores. + old_score = self._rule_scores[rule] + self._rule_scores[rule] += self._positions_by_rule[rule][pos] + + # Update _rules_by_score. + self._rules_by_score[old_score].discard(rule) + self._rules_by_score[self._rule_scores[rule]].add(rule) + + def _update_rule_not_applies(self, rule, sentnum, wordnum): + """ + Update the rule data tables to reflect the fact that *rule* + does not apply at the position *(sentnum, wordnum)*. + """ + pos = sentnum, wordnum + + # Update _rule_scores. + old_score = self._rule_scores[rule] + self._rule_scores[rule] -= self._positions_by_rule[rule][pos] + + # Update _rules_by_score. + self._rules_by_score[old_score].discard(rule) + self._rules_by_score[self._rule_scores[rule]].add(rule) + + # Update _positions_by_rule + del self._positions_by_rule[rule][pos] + self._rules_by_position[pos].remove(rule) + + # Optional addition: if the rule now applies nowhere, delete + # all its dictionary entries. + + def _best_rule(self, train_sents, test_sents, min_score, min_acc): + """ + Find the next best rule. This is done by repeatedly taking a + rule with the highest score and stepping through the corpus to + see where it applies. When it makes an error (decreasing its + score) it's bumped down, and we try a new rule with the + highest score. When we find a rule which has the highest + score *and* which has been tested against the entire corpus, we + can conclude that it's the next best rule. 
+ """ + for max_score in sorted(self._rules_by_score.keys(), reverse=True): + if len(self._rules_by_score) == 0: + return None + if max_score < min_score or max_score <= 0: + return None + best_rules = list(self._rules_by_score[max_score]) + if self._deterministic: + best_rules.sort(key=repr) + for rule in best_rules: + positions = self._tag_positions[rule.original_tag] + + unk = self._first_unknown_position.get(rule, (0, -1)) + start = bisect.bisect_left(positions, unk) + + for i in range(start, len(positions)): + sentnum, wordnum = positions[i] + if rule.applies(test_sents[sentnum], wordnum): + self._update_rule_applies(rule, sentnum, wordnum, train_sents) + if self._rule_scores[rule] < max_score: + self._first_unknown_position[rule] = (sentnum, wordnum + 1) + break # The update demoted the rule. + + if self._rule_scores[rule] == max_score: + self._first_unknown_position[rule] = (len(train_sents) + 1, 0) + # optimization: if no min_acc threshold given, don't bother computing accuracy + if min_acc is None: + return rule + else: + changes = self._positions_by_rule[rule].values() + num_fixed = len([c for c in changes if c == 1]) + num_broken = len([c for c in changes if c == -1]) + # acc here is fixed/(fixed+broken); could also be + # fixed/(fixed+broken+other) == num_fixed/len(changes) + acc = num_fixed / (num_fixed + num_broken) + if acc >= min_acc: + return rule + # else: rule too inaccurate, discard and try next + + # We demoted (or skipped due to < min_acc, if that was given) + # all the rules with score==max_score. + + assert min_acc is not None or not self._rules_by_score[max_score] + if not self._rules_by_score[max_score]: + del self._rules_by_score[max_score] + + def _apply_rule(self, rule, test_sents): + """ + Update *test_sents* by applying *rule* everywhere where its + conditions are met. 
+ """ + update_positions = set(self._positions_by_rule[rule]) + new_tag = rule.replacement_tag + + if self._trace > 3: + self._trace_apply(len(update_positions)) + + # Update test_sents. + for (sentnum, wordnum) in update_positions: + text = test_sents[sentnum][wordnum][0] + test_sents[sentnum][wordnum] = (text, new_tag) + + def _update_tag_positions(self, rule): + """ + Update _tag_positions to reflect the changes to tags that are + made by *rule*. + """ + # Update the tag index. + for pos in self._positions_by_rule[rule]: + # Delete the old tag. + old_tag_positions = self._tag_positions[rule.original_tag] + old_index = bisect.bisect_left(old_tag_positions, pos) + del old_tag_positions[old_index] + # Insert the new tag. + new_tag_positions = self._tag_positions[rule.replacement_tag] + bisect.insort_left(new_tag_positions, pos) + + def _update_rules(self, rule, train_sents, test_sents): + """ + Check if we should add or remove any rules from consideration, + given the changes made by *rule*. + """ + # Collect a list of all positions that might be affected. + neighbors = set() + for sentnum, wordnum in self._positions_by_rule[rule]: + for template in self._templates: + n = template.get_neighborhood(test_sents[sentnum], wordnum) + neighbors.update([(sentnum, i) for i in n]) + + # Update the rules at each position. + num_obsolete = num_new = num_unseen = 0 + for sentnum, wordnum in neighbors: + test_sent = test_sents[sentnum] + correct_tag = train_sents[sentnum][wordnum][1] + + # Check if the change causes any rule at this position to + # stop matching; if so, then update our rule mappings + # accordingly. + old_rules = set(self._rules_by_position[sentnum, wordnum]) + for old_rule in old_rules: + if not old_rule.applies(test_sent, wordnum): + num_obsolete += 1 + self._update_rule_not_applies(old_rule, sentnum, wordnum) + + # Check if the change causes our templates to propose any + # new rules for this position. 
+ for template in self._templates: + for new_rule in template.applicable_rules( + test_sent, wordnum, correct_tag + ): + if new_rule not in old_rules: + num_new += 1 + if new_rule not in self._rule_scores: + num_unseen += 1 + old_rules.add(new_rule) + self._update_rule_applies( + new_rule, sentnum, wordnum, train_sents + ) + + # We may have caused other rules to match here, that are + # not proposed by our templates -- in particular, rules + # that are harmful or neutral. We therefore need to + # update any rule whose first_unknown_position is past + # this rule. + for new_rule, pos in self._first_unknown_position.items(): + if pos > (sentnum, wordnum): + if new_rule not in old_rules: + num_new += 1 + if new_rule.applies(test_sent, wordnum): + self._update_rule_applies( + new_rule, sentnum, wordnum, train_sents + ) + + if self._trace > 3: + self._trace_update_rules(num_obsolete, num_new, num_unseen) + + # Tracing + + def _trace_header(self): + print( + """ + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e +------------------+------------------------------------------------------- + """.rstrip() + ) + + def _trace_rule(self, rule): + assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values()) + + changes = self._positions_by_rule[rule].values() + num_fixed = len([c for c in changes if c == 1]) + num_broken = len([c for c in changes if c == -1]) + num_other = len([c for c in changes if c == 0]) + score = self._rule_scores[rule] + + rulestr = rule.format(self._ruleformat) + if self._trace > 2: + print( + "{:4d}{:4d}{:4d}{:4d} |".format( + score, num_fixed, num_broken, num_other + ), + end=" ", + ) + print( + textwrap.fill( + rulestr, + initial_indent=" " * 20, + width=79, + subsequent_indent=" " * 18 + "| ", + ).strip() + ) + else: + print(rulestr) + + def 
_trace_apply(self, num_updates): + prefix = " " * 18 + "|" + print(prefix) + print(prefix, f"Applying rule to {num_updates} positions.") + + def _trace_update_rules(self, num_obsolete, num_new, num_unseen): + prefix = " " * 18 + "|" + print(prefix, "Updated rule tables:") + print(prefix, (f" - {num_obsolete} rule applications removed")) + print( + prefix, + (f" - {num_new} rule applications added ({num_unseen} novel)"), + ) + print(prefix) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/crf.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/crf.py new file mode 100644 index 0000000000000000000000000000000000000000..6b0131da1812cbe57485c9d6600b6a887ad459e4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/crf.py @@ -0,0 +1,207 @@ +# Natural Language Toolkit: Interface to the CRFSuite Tagger +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Long Duong +# URL: +# For license information, see LICENSE.TXT + +""" +A module for POS tagging using CRFSuite +""" + +import re +import unicodedata + +from nltk.tag.api import TaggerI + +try: + import pycrfsuite +except ImportError: + pass + + +class CRFTagger(TaggerI): + """ + A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite + + >>> from nltk.tag import CRFTagger + >>> ct = CRFTagger() + + >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')], + ... 
[('dog','Noun'),('eat','Verb'),('meat','Noun')]] + + >>> ct.train(train_data,'model.crf.tagger') + >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']]) + [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] + + >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] + >>> ct.accuracy(gold_sentences) + 1.0 + + Setting learned model file + >>> ct = CRFTagger() + >>> ct.set_model_file('model.crf.tagger') + >>> ct.accuracy(gold_sentences) + 1.0 + """ + + def __init__(self, feature_func=None, verbose=False, training_opt={}): + """ + Initialize the CRFSuite tagger + + :param feature_func: The function that extracts features for each token of a sentence. This function should take + 2 parameters: tokens and index which extract features at index position from tokens list. See the build in + _get_features function for more detail. + :param verbose: output the debugging messages during training. + :type verbose: boolean + :param training_opt: python-crfsuite training options + :type training_opt: dictionary + + Set of possible training options (using LBFGS training algorithm). + :'feature.minfreq': The minimum frequency of features. + :'feature.possible_states': Force to generate possible state features. + :'feature.possible_transitions': Force to generate possible transition features. + :'c1': Coefficient for L1 regularization. + :'c2': Coefficient for L2 regularization. + :'max_iterations': The maximum number of iterations for L-BFGS optimization. + :'num_memories': The number of limited memories for approximating the inverse hessian matrix. + :'epsilon': Epsilon for testing the convergence of the objective. + :'period': The duration of iterations to test the stopping criterion. 
+ :'delta': The threshold for the stopping criterion; an L-BFGS iteration stops when the + improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. + :'linesearch': The line search algorithm used in L-BFGS updates: + + - 'MoreThuente': More and Thuente's method, + - 'Backtracking': Backtracking method with regular Wolfe condition, + - 'StrongBacktracking': Backtracking method with strong Wolfe condition + :'max_linesearch': The maximum number of trials for the line search algorithm. + """ + + self._model_file = "" + self._tagger = pycrfsuite.Tagger() + + if feature_func is None: + self._feature_func = self._get_features + else: + self._feature_func = feature_func + + self._verbose = verbose + self._training_options = training_opt + self._pattern = re.compile(r"\d") + + def set_model_file(self, model_file): + self._model_file = model_file + self._tagger.open(self._model_file) + + def _get_features(self, tokens, idx): + """ + Extract basic features about this word including + - Current word + - is it capitalized? + - Does it have punctuation? + - Does it have a number? + - Suffixes up to length 3 + + Note that : we might include feature over previous word, next word etc. 
+ + :return: a list which contains the features + :rtype: list(str) + """ + token = tokens[idx] + + feature_list = [] + + if not token: + return feature_list + + # Capitalization + if token[0].isupper(): + feature_list.append("CAPITALIZATION") + + # Number + if re.search(self._pattern, token) is not None: + feature_list.append("HAS_NUM") + + # Punctuation + punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} + if all(unicodedata.category(x) in punc_cat for x in token): + feature_list.append("PUNCTUATION") + + # Suffix up to length 3 + if len(token) > 1: + feature_list.append("SUF_" + token[-1:]) + if len(token) > 2: + feature_list.append("SUF_" + token[-2:]) + if len(token) > 3: + feature_list.append("SUF_" + token[-3:]) + + feature_list.append("WORD_" + token) + + return feature_list + + def tag_sents(self, sents): + """ + Tag a list of sentences. NB before using this function, user should specify the mode_file either by + + - Train a new model using ``train`` function + - Use the pre-trained model which is set via ``set_model_file`` function + + :params sentences: list of sentences needed to tag. + :type sentences: list(list(str)) + :return: list of tagged sentences. + :rtype: list(list(tuple(str,str))) + """ + if self._model_file == "": + raise Exception( + " No model file is found !! Please use train or set_model_file function" + ) + + # We need the list of sentences instead of the list generator for matching the input and output + result = [] + for tokens in sents: + features = [self._feature_func(tokens, i) for i in range(len(tokens))] + labels = self._tagger.tag(features) + + if len(labels) != len(tokens): + raise Exception(" Predicted Length Not Matched, Expect Errors !") + + tagged_sent = list(zip(tokens, labels)) + result.append(tagged_sent) + + return result + + def train(self, train_data, model_file): + """ + Train the CRF tagger using CRFSuite + :params train_data : is the list of annotated sentences. 
+ :type train_data : list (list(tuple(str,str))) + :params model_file : the model will be saved to this file. + + """ + trainer = pycrfsuite.Trainer(verbose=self._verbose) + trainer.set_params(self._training_options) + + for sent in train_data: + tokens, labels = zip(*sent) + features = [self._feature_func(tokens, i) for i in range(len(tokens))] + trainer.append(features, labels) + + # Now train the model, the output should be model_file + trainer.train(model_file) + # Save the model file + self.set_model_file(model_file) + + def tag(self, tokens): + """ + Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by + + - Train a new model using ``train`` function + - Use the pre-trained model which is set via ``set_model_file`` function + + :params tokens: list of tokens needed to tag. + :type tokens: list(str) + :return: list of tagged tokens. + :rtype: list(tuple(str,str)) + """ + + return self.tag_sents([tokens])[0] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hunpos.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hunpos.py new file mode 100644 index 0000000000000000000000000000000000000000..316eab2225cd7e0171526d3e8d353a54f525ac16 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hunpos.py @@ -0,0 +1,142 @@ +# Natural Language Toolkit: Interface to the HunPos POS-tagger +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Peter Ljunglöf +# Dávid Márk Nemeskey (modifications) +# Attila Zséder (modifications) +# URL: +# For license information, see LICENSE.TXT + +""" +A module for interfacing with the HunPos open-source POS-tagger. +""" + +import os +from subprocess import PIPE, Popen + +from nltk.internals import find_binary, find_file +from nltk.tag.api import TaggerI + +_hunpos_url = "https://code.google.com/p/hunpos/" + +_hunpos_charset = "ISO-8859-1" +"""The default encoding used by hunpos: ISO-8859-1.""" + + +class HunposTagger(TaggerI): + """ + A class for pos tagging with HunPos. 
The input is the paths to: + - a model trained on training data + - (optionally) the path to the hunpos-tag binary + - (optionally) the encoding of the training data (default: ISO-8859-1) + + Check whether the required "hunpos-tag" binary is available: + + >>> from nltk.test.setup_fixt import check_binary + >>> check_binary('hunpos-tag') + + Example: + >>> from nltk.tag import HunposTagger + >>> ht = HunposTagger('en_wsj.model') + >>> ht.tag('What is the airspeed of an unladen swallow ?'.split()) + [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] + >>> ht.close() + + This class communicates with the hunpos-tag binary via pipes. When the + tagger object is no longer needed, the close() method should be called to + free system resources. The class supports the context manager interface; if + used in a with statement, the close() method is invoked automatically: + + >>> with HunposTagger('en_wsj.model') as ht: + ... ht.tag('What is the airspeed of an unladen swallow ?'.split()) + ... + [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] + """ + + def __init__( + self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False + ): + """ + Starts the hunpos-tag executable and establishes a connection with it. + + :param path_to_model: The model file. + :param path_to_bin: The hunpos-tag binary. + :param encoding: The encoding used by the model. Unicode tokens + passed to the tag() and tag_sents() methods are converted to + this charset when they are sent to hunpos-tag. + The default is ISO-8859-1 (Latin-1). + + This parameter is ignored for str tokens, which are sent as-is. + The caller must ensure that tokens are encoded in the right charset. 
+ """ + self._closed = True + hunpos_paths = [ + ".", + "/usr/bin", + "/usr/local/bin", + "/opt/local/bin", + "/Applications/bin", + "~/bin", + "~/Applications/bin", + ] + hunpos_paths = list(map(os.path.expanduser, hunpos_paths)) + + self._hunpos_bin = find_binary( + "hunpos-tag", + path_to_bin, + env_vars=("HUNPOS_TAGGER",), + searchpath=hunpos_paths, + url=_hunpos_url, + verbose=verbose, + ) + + self._hunpos_model = find_file( + path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose + ) + self._encoding = encoding + self._hunpos = Popen( + [self._hunpos_bin, self._hunpos_model], + shell=False, + stdin=PIPE, + stdout=PIPE, + stderr=PIPE, + ) + self._closed = False + + def __del__(self): + self.close() + + def close(self): + """Closes the pipe to the hunpos executable.""" + if not self._closed: + self._hunpos.communicate() + self._closed = True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def tag(self, tokens): + """Tags a single sentence: a list of words. + The tokens should not contain any newline characters. 
+ """ + for token in tokens: + assert "\n" not in token, "Tokens should not contain newlines" + if isinstance(token, str): + token = token.encode(self._encoding) + self._hunpos.stdin.write(token + b"\n") + # We write a final empty line to tell hunpos that the sentence is finished: + self._hunpos.stdin.write(b"\n") + self._hunpos.stdin.flush() + + tagged_tokens = [] + for token in tokens: + tagged = self._hunpos.stdout.readline().strip().split(b"\t") + tag = tagged[1] if len(tagged) > 1 else None + tagged_tokens.append((token, tag)) + # We have to read (and dismiss) the final empty line: + self._hunpos.stdout.readline() + + return tagged_tokens diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/mapping.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfa592fa02875bd329913aabd42011c218c8950 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/mapping.py @@ -0,0 +1,136 @@ +# Natural Language Toolkit: Tagset Mapping +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Nathan Schneider +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Interface for converting POS tags from various treebanks +to the universal tagset of Petrov, Das, & McDonald. + +The tagset consists of the following 12 coarse tags: + +VERB - verbs (all tenses and modes) +NOUN - nouns (common and proper) +PRON - pronouns +ADJ - adjectives +ADV - adverbs +ADP - adpositions (prepositions and postpositions) +CONJ - conjunctions +DET - determiners +NUM - cardinal numbers +PRT - particles or other function words +X - other: foreign words, typos, abbreviations +. 
- punctuation + +@see: https://arxiv.org/abs/1104.2086 and https://code.google.com/p/universal-pos-tags/ + +""" + +from collections import defaultdict +from os.path import join + +from nltk.data import load + +_UNIVERSAL_DATA = "taggers/universal_tagset" +_UNIVERSAL_TAGS = ( + "VERB", + "NOUN", + "PRON", + "ADJ", + "ADV", + "ADP", + "CONJ", + "DET", + "NUM", + "PRT", + "X", + ".", +) + +# _MAPPINGS = defaultdict(lambda: defaultdict(dict)) +# the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag +_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK"))) + + +def _load_universal_map(fileid): + contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text") + + # When mapping to the Universal Tagset, + # map unknown inputs to 'X' not 'UNK' + _MAPPINGS[fileid]["universal"].default_factory = lambda: "X" + + for line in contents.splitlines(): + line = line.strip() + if line == "": + continue + fine, coarse = line.split("\t") + + assert coarse in _UNIVERSAL_TAGS, f"Unexpected coarse tag: {coarse}" + assert ( + fine not in _MAPPINGS[fileid]["universal"] + ), f"Multiple entries for original tag: {fine}" + + _MAPPINGS[fileid]["universal"][fine] = coarse + + +def tagset_mapping(source, target): + """ + Retrieve the mapping dictionary between tagsets. + + >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\ + 'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\ + 'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'} + True + """ + + if source not in _MAPPINGS or target not in _MAPPINGS[source]: + if target == "universal": + _load_universal_map(source) + # Added the new Russian National Corpus mappings because the + # Russian model for nltk.pos_tag() uses it. 
+ _MAPPINGS["ru-rnc-new"]["universal"] = { + "A": "ADJ", + "A-PRO": "PRON", + "ADV": "ADV", + "ADV-PRO": "PRON", + "ANUM": "ADJ", + "CONJ": "CONJ", + "INTJ": "X", + "NONLEX": ".", + "NUM": "NUM", + "PARENTH": "PRT", + "PART": "PRT", + "PR": "ADP", + "PRAEDIC": "PRT", + "PRAEDIC-PRO": "PRON", + "S": "NOUN", + "S-PRO": "PRON", + "V": "VERB", + } + + return _MAPPINGS[source][target] + + +def map_tag(source, target, source_tag): + """ + Maps the tag from the source tagset to the target tagset. + + >>> map_tag('en-ptb', 'universal', 'VBZ') + 'VERB' + >>> map_tag('en-ptb', 'universal', 'VBP') + 'VERB' + >>> map_tag('en-ptb', 'universal', '``') + '.' + """ + + # we need a systematic approach to naming + if target == "universal": + if source == "wsj": + source = "en-ptb" + if source == "brown": + source = "en-brown" + + return tagset_mapping(source, target)[source_tag] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/senna.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/senna.py new file mode 100644 index 0000000000000000000000000000000000000000..97396e556622637c8b9e6512d325e36d11908c4d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/senna.py @@ -0,0 +1,134 @@ +# Natural Language Toolkit: Senna POS Tagger +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Rami Al-Rfou' +# URL: +# For license information, see LICENSE.TXT + +""" +Senna POS tagger, NER Tagger, Chunk Tagger + +The input is: + +- path to the directory that contains SENNA executables. 
If the path is incorrect, + SennaTagger will automatically search for executable file specified in SENNA environment variable +- (optionally) the encoding of the input data (default:utf-8) + +Note: Unit tests for this module can be found in test/unit/test_senna.py + +>>> from nltk.tag import SennaTagger +>>> tagger = SennaTagger('/usr/share/senna-v3.0') # doctest: +SKIP +>>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP +[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), +('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] + +>>> from nltk.tag import SennaChunkTagger +>>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') # doctest: +SKIP +>>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP +[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), +('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), +('?', 'O')] + +>>> from nltk.tag import SennaNERTagger +>>> nertagger = SennaNERTagger('/usr/share/senna-v3.0') # doctest: +SKIP +>>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP +[('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), +('London', 'B-LOC'), ('.', 'O')] +>>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP +[('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), +('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] +""" + +from nltk.classify import Senna + + +class SennaTagger(Senna): + def __init__(self, path, encoding="utf-8"): + super().__init__(path, ["pos"], encoding) + + def tag_sents(self, sentences): + """ + Applies the tag method over a list of sentences. This method will return + for each sentence a list of tuples of (word, tag). 
+ """ + tagged_sents = super().tag_sents(sentences) + for i in range(len(tagged_sents)): + for j in range(len(tagged_sents[i])): + annotations = tagged_sents[i][j] + tagged_sents[i][j] = (annotations["word"], annotations["pos"]) + return tagged_sents + + +class SennaChunkTagger(Senna): + def __init__(self, path, encoding="utf-8"): + super().__init__(path, ["chk"], encoding) + + def tag_sents(self, sentences): + """ + Applies the tag method over a list of sentences. This method will return + for each sentence a list of tuples of (word, tag). + """ + tagged_sents = super().tag_sents(sentences) + for i in range(len(tagged_sents)): + for j in range(len(tagged_sents[i])): + annotations = tagged_sents[i][j] + tagged_sents[i][j] = (annotations["word"], annotations["chk"]) + return tagged_sents + + def bio_to_chunks(self, tagged_sent, chunk_type): + """ + Extracts the chunks in a BIO chunk-tagged sentence. + + >>> from nltk.tag import SennaChunkTagger + >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') # doctest: +SKIP + >>> sent = 'What is the airspeed of an unladen swallow ?'.split() + >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP + >>> tagged_sent # doctest: +SKIP + [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), + ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), + ('?', 'O')] + >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP + [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] + + :param tagged_sent: A list of tuples of word and BIO chunk tag. + :type tagged_sent: list(tuple) + :param tagged_sent: The chunk tag that users want to extract, e.g. 'NP' or 'VP' + :type tagged_sent: str + + :return: An iterable of tuples of chunks that users want to extract + and their corresponding indices. 
+ :rtype: iter(tuple(str)) + """ + current_chunk = [] + current_chunk_position = [] + for idx, word_pos in enumerate(tagged_sent): + word, pos = word_pos + if "-" + chunk_type in pos: # Append the word to the current_chunk. + current_chunk.append(word) + current_chunk_position.append(idx) + else: + if current_chunk: # Flush the full chunk when out of an NP. + _chunk_str = " ".join(current_chunk) + _chunk_pos_str = "-".join(map(str, current_chunk_position)) + yield _chunk_str, _chunk_pos_str + current_chunk = [] + current_chunk_position = [] + if current_chunk: # Flush the last chunk. + yield " ".join(current_chunk), "-".join(map(str, current_chunk_position)) + + +class SennaNERTagger(Senna): + def __init__(self, path, encoding="utf-8"): + super().__init__(path, ["ner"], encoding) + + def tag_sents(self, sentences): + """ + Applies the tag method over a list of sentences. This method will return + for each sentence a list of tuples of (word, tag). + """ + tagged_sents = super().tag_sents(sentences) + for i in range(len(tagged_sents)): + for j in range(len(tagged_sents[i])): + annotations = tagged_sents[i][j] + tagged_sents[i][j] = (annotations["word"], annotations["ner"]) + return tagged_sents diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/tnt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/tnt.py new file mode 100644 index 0000000000000000000000000000000000000000..b1e41bbc834974b3ad3bb60b3042190be68037af --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/tnt.py @@ -0,0 +1,579 @@ +# Natural Language Toolkit: TnT Tagger +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Sam Huston +# +# URL: +# For license information, see LICENSE.TXT + +""" +Implementation of 'TnT - A Statisical Part of Speech Tagger' +by Thorsten Brants + +https://aclanthology.org/A00-1031.pdf +""" + +from math import log +from operator import itemgetter + +from nltk.probability import ConditionalFreqDist, FreqDist +from nltk.tag.api import TaggerI + + +class TnT(TaggerI): + """ + TnT - Statistical 
POS tagger + + IMPORTANT NOTES: + + * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS + + - It is possible to provide an untrained POS tagger to + create tags for unknown words, see __init__ function + + * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT + + - Due to the nature of this tagger, it works best when + trained over sentence delimited input. + - However it still produces good results if the training + data and testing data are separated on all punctuation eg: [,.?!] + - Input for training is expected to be a list of sentences + where each sentence is a list of (word, tag) tuples + - Input for tag function is a single sentence + Input for tagdata function is a list of sentences + Output is of a similar form + + * Function provided to process text that is unsegmented + + - Please see basic_sent_chop() + + + TnT uses a second order Markov model to produce tags for + a sequence of input, specifically: + + argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) + + IE: the maximum projection of a set of probabilities + + The set of possible tags for a given word is derived + from the training data. It is the set of all tags + that exact word has been assigned. + + To speed up and get more precision, we can use log addition + to instead multiplication, specifically: + + argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + + log(P(t_T+1|t_T)) + + The probability of a tag for a given word is the linear + interpolation of 3 markov models; a zero-order, first-order, + and a second order model. + + P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + + l3*P(t_i| t_i-1, t_i-2) + + A beam search is used to limit the memory usage of the algorithm. + The degree of the beam can be changed using N in the initialization. + N represents the maximum number of possible solutions to maintain + while tagging. + + It is possible to differentiate the tags which are assigned to + capitalized words. 
However this does not result in a significant + gain in the accuracy of the results. + """ + + def __init__(self, unk=None, Trained=False, N=1000, C=False): + """ + Construct a TnT statistical tagger. Tagger must be trained + before being used to tag input. + + :param unk: instance of a POS tagger, conforms to TaggerI + :type unk: TaggerI + :param Trained: Indication that the POS tagger is trained or not + :type Trained: bool + :param N: Beam search degree (see above) + :type N: int + :param C: Capitalization flag + :type C: bool + + Initializer, creates frequency distributions to be used + for tagging + + _lx values represent the portion of the tri/bi/uni taggers + to be used to calculate the probability + + N value is the number of possible solutions to maintain + while tagging. A good value for this is 1000 + + C is a boolean value which specifies to use or + not use the Capitalization of the word as additional + information for tagging. + NOTE: using capitalization may not increase the accuracy + of the tagger + """ + + self._uni = FreqDist() + self._bi = ConditionalFreqDist() + self._tri = ConditionalFreqDist() + self._wd = ConditionalFreqDist() + self._eos = ConditionalFreqDist() + self._l1 = 0.0 + self._l2 = 0.0 + self._l3 = 0.0 + self._N = N + self._C = C + self._T = Trained + + self._unk = unk + + # statistical tools (ignore or delete me) + self.unknown = 0 + self.known = 0 + + def train(self, data): + """ + Uses a set of tagged data to train the tagger. + If an unknown word tagger is specified, + it is trained on the same data. 
+ + :param data: List of lists of (word, tag) tuples + :type data: tuple(str) + """ + + # Ensure that local C flag is initialized before use + C = False + + if self._unk is not None and self._T == False: + self._unk.train(data) + + for sent in data: + history = [("BOS", False), ("BOS", False)] + for w, t in sent: + + # if capitalization is requested, + # and the word begins with a capital + # set local flag C to True + if self._C and w[0].isupper(): + C = True + + self._wd[w][t] += 1 + self._uni[(t, C)] += 1 + self._bi[history[1]][(t, C)] += 1 + self._tri[tuple(history)][(t, C)] += 1 + + history.append((t, C)) + history.pop(0) + + # set local flag C to false for the next word + C = False + + self._eos[t]["EOS"] += 1 + + # compute lambda values from the trained frequency distributions + self._compute_lambda() + + def _compute_lambda(self): + """ + creates lambda values based upon training data + + NOTE: no need to explicitly reference C, + it is contained within the tag variable :: tag == (tag,C) + + for each tag trigram (t1, t2, t3) + depending on the maximum value of + - f(t1,t2,t3)-1 / f(t1,t2)-1 + - f(t2,t3)-1 / f(t2)-1 + - f(t3)-1 / N-1 + + increment l3,l2, or l1 by f(t1,t2,t3) + + ISSUES -- Resolutions: + if 2 values are equal, increment both lambda values + by (f(t1,t2,t3) / 2) + """ + + # temporary lambda variables + tl1 = 0.0 + tl2 = 0.0 + tl3 = 0.0 + + # for each t1,t2 in system + for history in self._tri.conditions(): + (h1, h2) = history + + # for each t3 given t1,t2 in system + # (NOTE: tag actually represents (tag,C)) + # However no effect within this function + for tag in self._tri[history].keys(): + + # if there has only been 1 occurrence of this tag in the data + # then ignore this trigram. 
+ if self._uni[tag] == 1: + continue + + # safe_div provides a safe floating point division + # it returns -1 if the denominator is 0 + c3 = self._safe_div( + (self._tri[history][tag] - 1), (self._tri[history].N() - 1) + ) + c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1)) + c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1)) + + # if c1 is the maximum value: + if (c1 > c3) and (c1 > c2): + tl1 += self._tri[history][tag] + + # if c2 is the maximum value + elif (c2 > c3) and (c2 > c1): + tl2 += self._tri[history][tag] + + # if c3 is the maximum value + elif (c3 > c2) and (c3 > c1): + tl3 += self._tri[history][tag] + + # if c3, and c2 are equal and larger than c1 + elif (c3 == c2) and (c3 > c1): + tl2 += self._tri[history][tag] / 2.0 + tl3 += self._tri[history][tag] / 2.0 + + # if c1, and c2 are equal and larger than c3 + # this might be a dumb thing to do....(not sure yet) + elif (c2 == c1) and (c1 > c3): + tl1 += self._tri[history][tag] / 2.0 + tl2 += self._tri[history][tag] / 2.0 + + # otherwise there might be a problem + # eg: all values = 0 + else: + pass + + # Lambda normalisation: + # ensures that l1+l2+l3 = 1 + self._l1 = tl1 / (tl1 + tl2 + tl3) + self._l2 = tl2 / (tl1 + tl2 + tl3) + self._l3 = tl3 / (tl1 + tl2 + tl3) + + def _safe_div(self, v1, v2): + """ + Safe floating point division function, does not allow division by 0 + returns -1 if the denominator is 0 + """ + if v2 == 0: + return -1 + else: + return v1 / v2 + + def tagdata(self, data): + """ + Tags each sentence in a list of sentences + + :param data:list of list of words + :type data: [[string,],] + :return: list of list of (word, tag) tuples + + Invokes tag(sent) function for each sentence + compiles the results into a list of tagged sentences + each tagged sentence is a list of (word, tag) tuples + """ + res = [] + for sent in data: + res1 = self.tag(sent) + res.append(res1) + return res + + def tag(self, data): + """ + Tags a single sentence + + :param data: list 
of words + :type data: [string,] + + :return: [(word, tag),] + + Calls recursive function '_tagword' + to produce a list of tags + + Associates the sequence of returned tags + with the correct words in the input sequence + + returns a list of (word, tag) tuples + """ + + current_state = [(["BOS", "BOS"], 0.0)] + + sent = list(data) + + tags = self._tagword(sent, current_state) + + res = [] + for i in range(len(sent)): + # unpack and discard the C flags + (t, C) = tags[i + 2] + res.append((sent[i], t)) + + return res + + def _tagword(self, sent, current_states): + """ + :param sent : List of words remaining in the sentence + :type sent : [word,] + :param current_states : List of possible tag combinations for + the sentence so far, and the log probability + associated with each tag combination + :type current_states : [([tag, ], logprob), ] + + Tags the first word in the sentence and + recursively tags the reminder of sentence + + Uses formula specified above to calculate the probability + of a particular tag + """ + + # if this word marks the end of the sentence, + # return the most probable tag + if sent == []: + (h, logp) = current_states[0] + return h + + # otherwise there are more words to be tagged + word = sent[0] + sent = sent[1:] + new_states = [] + + # if the Capitalisation is requested, + # initialise the flag for this word + C = False + if self._C and word[0].isupper(): + C = True + + # if word is known + # compute the set of possible tags + # and their associated log probabilities + if word in self._wd: + self.known += 1 + + for (history, curr_sent_logprob) in current_states: + logprobs = [] + + for t in self._wd[word].keys(): + tC = (t, C) + p_uni = self._uni.freq(tC) + p_bi = self._bi[history[-1]].freq(tC) + p_tri = self._tri[tuple(history[-2:])].freq(tC) + p_wd = self._wd[word][t] / self._uni[tC] + p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri + p2 = log(p, 2) + log(p_wd, 2) + + # compute the result of appending each tag to this history + 
new_states.append((history + [tC], curr_sent_logprob + p2)) + + # otherwise a new word, set of possible tags is unknown + else: + self.unknown += 1 + + # since a set of possible tags, + # and the probability of each specific tag + # can not be returned from most classifiers: + # specify that any unknown words are tagged with certainty + p = 1 + + # if no unknown word tagger has been specified + # then use the tag 'Unk' + if self._unk is None: + tag = ("Unk", C) + + # otherwise apply the unknown word tagger + else: + [(_w, t)] = list(self._unk.tag([word])) + tag = (t, C) + + for (history, logprob) in current_states: + history.append(tag) + + new_states = current_states + + # now have computed a set of possible new_states + + # sort states by log prob + # set is now ordered greatest to least log probability + new_states.sort(reverse=True, key=itemgetter(1)) + + # del everything after N (threshold) + # this is the beam search cut + if len(new_states) > self._N: + new_states = new_states[: self._N] + + # compute the tags for the rest of the sentence + # return the best list of tags for the sentence + return self._tagword(sent, new_states) + + +######################################## +# helper function -- basic sentence tokenizer +######################################## + + +def basic_sent_chop(data, raw=True): + """ + Basic method for tokenizing input into sentences + for this tagger: + + :param data: list of tokens (words or (word, tag) tuples) + :type data: str or tuple(str, str) + :param raw: boolean flag marking the input data + as a list of words or a list of tagged words + :type raw: bool + :return: list of sentences + sentences are a list of tokens + tokens are the same as the input + + Function takes a list of tokens and separates the tokens into lists + where each list represents a sentence fragment + This function can separate both tagged and raw sequences into + basic sentences. + + Sentence markers are the set of [,.!?] 
+ + This is a simple method which enhances the performance of the TnT + tagger. Better sentence tokenization will further enhance the results. + """ + + new_data = [] + curr_sent = [] + sent_mark = [",", ".", "?", "!"] + + if raw: + for word in data: + if word in sent_mark: + curr_sent.append(word) + new_data.append(curr_sent) + curr_sent = [] + else: + curr_sent.append(word) + + else: + for (word, tag) in data: + if word in sent_mark: + curr_sent.append((word, tag)) + new_data.append(curr_sent) + curr_sent = [] + else: + curr_sent.append((word, tag)) + return new_data + + +def demo(): + from nltk.corpus import brown + + sents = list(brown.tagged_sents()) + test = list(brown.sents()) + + tagger = TnT() + tagger.train(sents[200:1000]) + + tagged_data = tagger.tagdata(test[100:120]) + + for j in range(len(tagged_data)): + s = tagged_data[j] + t = sents[j + 100] + for i in range(len(s)): + print(s[i], "--", t[i]) + print() + + +def demo2(): + from nltk.corpus import treebank + + d = list(treebank.tagged_sents()) + + t = TnT(N=1000, C=False) + s = TnT(N=1000, C=True) + t.train(d[(11) * 100 :]) + s.train(d[(11) * 100 :]) + + for i in range(10): + tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) + tp_un = t.unknown / (t.known + t.unknown) + tp_kn = t.known / (t.known + t.unknown) + t.unknown = 0 + t.known = 0 + + print("Capitalization off:") + print("Accuracy:", tacc) + print("Percentage known:", tp_kn) + print("Percentage unknown:", tp_un) + print("Accuracy over known words:", (tacc / tp_kn)) + + sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) + sp_un = s.unknown / (s.known + s.unknown) + sp_kn = s.known / (s.known + s.unknown) + s.unknown = 0 + s.known = 0 + + print("Capitalization on:") + print("Accuracy:", sacc) + print("Percentage known:", sp_kn) + print("Percentage unknown:", sp_un) + print("Accuracy over known words:", (sacc / sp_kn)) + + +def demo3(): + from nltk.corpus import brown, treebank + + d = list(treebank.tagged_sents()) + e = list(brown.tagged_sents()) + 
+ d = d[:1000] + e = e[:1000] + + d10 = int(len(d) * 0.1) + e10 = int(len(e) * 0.1) + + tknacc = 0 + sknacc = 0 + tallacc = 0 + sallacc = 0 + tknown = 0 + sknown = 0 + + for i in range(10): + + t = TnT(N=1000, C=False) + s = TnT(N=1000, C=False) + + dtest = d[(i * d10) : ((i + 1) * d10)] + etest = e[(i * e10) : ((i + 1) * e10)] + + dtrain = d[: (i * d10)] + d[((i + 1) * d10) :] + etrain = e[: (i * e10)] + e[((i + 1) * e10) :] + + t.train(dtrain) + s.train(etrain) + + tacc = t.accuracy(dtest) + tp_un = t.unknown / (t.known + t.unknown) + tp_kn = t.known / (t.known + t.unknown) + tknown += tp_kn + t.unknown = 0 + t.known = 0 + + sacc = s.accuracy(etest) + sp_un = s.unknown / (s.known + s.unknown) + sp_kn = s.known / (s.known + s.unknown) + sknown += sp_kn + s.unknown = 0 + s.known = 0 + + tknacc += tacc / tp_kn + sknacc += sacc / tp_kn + tallacc += tacc + sallacc += sacc + + # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc) + + print("brown: acc over words known:", 10 * tknacc) + print(" : overall accuracy:", 10 * tallacc) + print(" : words known:", 10 * tknown) + print("treebank: acc over words known:", 10 * sknacc) + print(" : overall accuracy:", 10 * sallacc) + print(" : words known:", 10 * sknown) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/api.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/demo.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..82db6d475bfdbb3a05b54657cdef82e5c891622f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/demo.py @@ -0,0 +1,418 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see 
import os
import pickle
import random
import time

from nltk.corpus import treebank
from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger
from nltk.tag.brill import Pos, Word
from nltk.tbl import Template, error_list


def demo():
    """
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    """
    postag()


def demo_repr_rule_format():
    """
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    """
    postag(ruleformat="repr")


def demo_str_rule_format():
    """
    Exemplify str(Rule) (see also repr(Rule) and Rule.format("verbose"))
    """
    postag(ruleformat="str")


def demo_verbose_rule_format():
    """
    Exemplify Rule.format("verbose")
    """
    postag(ruleformat="verbose")


def demo_multiposition_feature():
    """
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    """
    postag(templates=[Template(Pos([-3, -2, -1]))])


def demo_multifeature_template():
    """
    Templates can have more than a single feature.
    """
    postag(templates=[Template(Word([0]), Pos([-2, -1]))])


def demo_template_statistics():
    """
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    """
    postag(incremental_stats=True, template_stats=True)


def demo_generated_templates():
    """
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    memory, even on relatively small corpora
    """
    wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
    tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)
    templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3)))
    print(f"Generated {len(templates)} templates for transformation-based learning")
    postag(templates=templates, incremental_stats=True, template_stats=True)


def demo_learning_curve():
    """
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    """
    postag(
        incremental_stats=True,
        separate_baseline_data=True,
        learning_curve_output="learningcurve.png",
    )


def demo_error_analysis():
    """
    Writes a file with context for each erroneous word after tagging testing data
    """
    postag(error_output="errors.txt")


def demo_serialize_tagger():
    """
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    """
    postag(serialize_output="tagger.pcl")


def demo_high_accuracy_rules():
    """
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    """
    postag(num_sents=3000, min_acc=0.96, min_score=10)


def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None,
):
    """
    Brill Tagger Demonstration

    :param templates: the templates to learn rules from (defaults to the
        ``brill24`` template set)
    :type templates: list of Template

    :param tagged_data: the tagged sentences to draw training and testing
        data from (defaults to the treebank corpus)
    :type tagged_data: sequence of sentences, each a list of (word, tag) pairs

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available);
        only honoured together with ``incremental_stats``
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger of the baseline unigram
        tagger (defaults to ``REGEXP_TAGGER``)
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
        deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import brill24

        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with nltk.tag.brill.describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data(
        tagged_data, train, num_sents, randomize, separate_baseline_data
    )

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(
                baseline_data, backoff=baseline_backoff_tagger
            )
            # pickle streams are binary, so the file must be opened in "wb"/"rb"
            with open(cache_baseline_tagger, "wb") as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print(f"Trained baseline tagger, pickled it to {cache_baseline_tagger}")
        # NOTE: unpickling executes arbitrary code; only point this at a
        # cache file that this demo wrote itself.
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print(f"Reloaded pickled tagger from {cache_baseline_tagger}")
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print(
            " Accuracy on test set: {:0.4f}".format(
                baseline_tagger.accuracy(gold_data)
            )
        )

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(
        baseline_tagger, templates, trace, ruleformat=ruleformat
    )
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds")
    if gold_data:
        print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print(f"{ruleno:4d} {rule.format(ruleformat):s}")

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print(
            "Incrementally tagging the test data, collecting individual rule statistics"
        )
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(
            testing_data, gold_data
        )
        print(" Rule statistics collected")
        if not separate_baseline_data:
            print(
                "WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                "will be artificially high"
            )
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(
                learning_curve_output, teststats, trainstats, take=learning_curve_take
            )
            print(f"Wrote plot of learning curve to {learning_curve_output}")
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        # Let the file object do the encoding: mixing an encoded bytes
        # payload with a str newline raises TypeError.
        with open(error_output, "w", encoding="utf-8") as f:
            f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
            f.write("\n".join(error_list(gold_data, taggedtest)) + "\n")
        print(f"Wrote tagger errors including context to {error_output}")

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, "wb") as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print(f"Wrote pickled tagger to {serialize_output}")
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print(f"Reloaded pickled tagger from {serialize_output}")
        # The round trip is only validated if the *reloaded* tagger is the
        # one that re-tags the test data.
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")


def _demo_prepare_data(
    tagged_data, train, num_sents, randomize, separate_baseline_data
):
    """
    Split tagged sentences into training, baseline, gold and testing data.

    :param tagged_data: tagged sentences, or None to load the treebank corpus
    :param train: the proportion of the ``num_sents`` sentences used for
        training; the rest is reserved for testing
    :param num_sents: how many sentences to use in total (None means all)
    :param randomize: if true, shuffle the sentences (with a seed derived
        from the corpus size) before splitting
    :param separate_baseline_data: if true, the first third of the training
        sentences is set aside for the baseline tagger; otherwise the
        baseline reuses the training set
    :return: a tuple (training_data, baseline_data, gold_data, testing_data),
        where testing_data is gold_data with the tags stripped
    """
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        # random.shuffle works in place and needs a mutable sequence; the
        # copy also keeps the caller's data in its original order.
        tagged_data = list(tagged_data)
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (
            training_data[:bl_cutoff],
            training_data[bl_cutoff:],
        )
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print(f"Read testing data ({testseqs:d} sents/{testtokens:d} wds)")
    print(f"Read training data ({trainseqs:d} sents/{traintokens:d} wds)")
    print(
        "Read baseline data ({:d} sents/{:d} wds) {:s}".format(
            bltrainseqs,
            bltraintokens,
            "" if separate_baseline_data else "[reused the training set]",
        )
    )
    return (training_data, baseline_data, gold_data, testing_data)


def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
    """
    Plot accuracy after each learned rule and save the figure.

    :param learning_curve_output: filename the plot is saved to
    :param teststats: statistics dict with the keys "initialerrors",
        "rulescores" and "tokencount", collected on the test data
    :param trainstats: the same kind of dict for the training data; if
        None, only the test curve is drawn
    :param take: plot at most this many points per curve (None means all)
    """

    def accuracy_curve(stats):
        # Each rule score is subtracted from the running error count, which
        # is then turned into an accuracy.
        errors = [stats["initialerrors"]]
        for rulescore in stats["rulescores"]:
            errors.append(errors[-1] - rulescore)
        return [1 - x / stats["tokencount"] for x in errors[:take]]

    import matplotlib.pyplot as plt

    testcurve = accuracy_curve(teststats)
    series = [list(range(len(testcurve))), testcurve]
    if trainstats is not None:
        traincurve = accuracy_curve(trainstats)
        series += [list(range(len(traincurve))), traincurve]
    plt.plot(*series)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
    [
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "AT"),  # articles
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]
)


def corpus_size(seqs):
    """Return (number of sequences, total number of items in them)."""
    return (len(seqs), sum(len(x) for x in seqs))


if __name__ == "__main__":
    demo_learning_curve()
######################################################################
# Tag Rules
######################################################################
class TagRule(metaclass=ABCMeta):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by tbl taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific original tag and
    satisfy a specific condition, and replaces their tags with a
    replacement tag.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Tag rules must be comparable and hashable.
    """

    def __init__(self, original_tag, replacement_tag):

        #: The tag which this TagRule may cause to be replaced.
        self.original_tag = original_tag

        #: The tag with which this TagRule may replace another tag.
        self.replacement_tag = replacement_tag

    def apply(self, tokens, positions=None):
        """
        Apply this rule at every position in positions where it
        applies to the given sentence.  I.e., for each position p
        in *positions*, if *tokens[p]* is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        :param tokens: The tagged sentence; it is modified in place
        :type tokens: list(tuple(str, str))
        :type positions: list(int)
        :param positions: The positions where the transformation is to
            be tried.  If not specified, try it at all positions.
        :return: The indices of tokens whose tags were changed by this
            rule.
        :rtype: list(int)
        """
        if positions is None:
            positions = list(range(len(tokens)))

        # Determine the indices at which this rule applies.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes.  Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            tokens[i] = (tokens[i][0], self.replacement_tag)

        return change

    @abstractmethod
    def applies(self, tokens, index):
        """
        :return: True if the rule would change the tag of
            ``tokens[index]``, False otherwise
        :rtype: bool
        :param tokens: A tagged sentence
        :type tokens: list(tuple(str, str))
        :param index: The index to check
        :type index: int
        """

    # Rules must be comparable and hashable for the algorithm to work
    def __eq__(self, other):
        raise TypeError("Rules must implement __eq__()")

    def __ne__(self, other):
        raise TypeError("Rules must implement __ne__()")

    def __hash__(self):
        raise TypeError("Rules must implement __hash__()")


@jsontags.register_tag
class Rule(TagRule):
    """
    A Rule checks the current corpus position for a certain set of conditions;
    if they are all fulfilled, the Rule is triggered, meaning that it
    will change tag A to tag B. For other tags than A, nothing happens.

    The conditions are parameters to the Rule instance. Each condition is a feature-value pair,
    with a set of positions to check for the value of the corresponding feature.
    Conceptually, the positions are joined by logical OR, and the feature set by logical AND.

    More formally, the Rule is then applicable to the M{n}th token iff:

      - The M{n}th token is tagged with the Rule's original tag; and
      - For each (Feature(positions), M{value}) tuple:

        - The value of Feature of at least one token in {n+p for p in positions}
          is M{value}.
    """

    json_tag = "nltk.tbl.Rule"

    def __init__(self, templateid, original_tag, replacement_tag, conditions):
        """
        Construct a new Rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        :param templateid: the template id (a zero-padded string, '001' etc,
            so it will sort nicely)
        :type templateid: string

        :param conditions: A list of (Feature(positions), value) pairs,
            each of which specifies that the property (computed by
            Feature.extract_property()) of at least one
            token in M{n} + p in positions is C{value}.
        :type conditions: C{iterable} of (C{Feature}, value) pairs

        """
        TagRule.__init__(self, original_tag, replacement_tag)
        self._conditions = conditions
        self.templateid = templateid

    def encode_json_obj(self):
        return {
            "templateid": self.templateid,
            "original": self.original_tag,
            "replacement": self.replacement_tag,
            "conditions": self._conditions,
        }

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(
            obj["templateid"],
            obj["original"],
            obj["replacement"],
            tuple(tuple(feat) for feat in obj["conditions"]),
        )

    def applies(self, tokens, index):
        # Inherit docs from TagRule

        # Does the given token have this Rule's "original tag"?
        if tokens[index][1] != self.original_tag:
            return False

        # Check to make sure that every condition holds.
        for (feature, val) in self._conditions:

            # Look for *any* in-range token that satisfies the condition;
            # positions falling outside the sentence are ignored.
            if not any(
                0 <= index + pos < len(tokens)
                and feature.extract_property(tokens, index + pos) == val
                for pos in feature.positions
            ):
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the Rule is applicable.
        return True

    def __eq__(self, other):
        return self is other or (
            other is not None
            and other.__class__ == self.__class__
            and self.original_tag == other.original_tag
            and self.replacement_tag == other.replacement_tag
            and self._conditions == other._conditions
        )

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):

        # Cache our hash value (justified by profiling.)
        try:
            return self.__hash
        except AttributeError:
            # Hash only what __eq__ compares.  The templateid is left out
            # on purpose: __eq__ ignores it, and equal rules must hash
            # equal.  Conditions are hashed through their string form, the
            # same form __repr__ uses.
            self.__hash = hash(
                (
                    self.original_tag,
                    self.replacement_tag,
                    ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions),
                )
            )
            return self.__hash

    def __repr__(self):
        # Cache the repr (justified by profiling -- this is used as
        # a sort key when deterministic=True.)
        try:
            return self.__repr
        except AttributeError:
            self.__repr = "{}('{}', {}, {}, [{}])".format(
                self.__class__.__name__,
                self.templateid,
                repr(self.original_tag),
                repr(self.replacement_tag),
                # list(self._conditions) would be simpler but will not generate
                # the same Rule.__repr__ in python 2 and 3 and thus break some tests
                ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions),
            )

            return self.__repr

    def __str__(self):
        def _condition_to_logic(feature, value):
            """
            Return a compact, predicate-logic styled string representation
            of the given condition.
            """
            return "{}:{}@[{}]".format(
                feature.PROPERTY_NAME,
                value,
                ",".join(str(w) for w in feature.positions),
            )

        conditions = " & ".join(
            [_condition_to_logic(f, v) for (f, v) in self._conditions]
        )
        s = f"{self.original_tag}->{self.replacement_tag} if {conditions}"

        return s

    def format(self, fmt):
        """
        Return a string representation of this rule.

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos

        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])

        >>> r.format("str") == str(r)
        True
        >>> r.format("str")
        'VB->NN if Pos:DT@[-2,-1]'

        >>> r.format("repr") == repr(r)
        True
        >>> r.format("repr")
        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"

        >>> r.format("verbose")
        'VB -> NN if the Pos of words i-2...i-1 is "DT"'

        >>> r.format("not_found")
        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
          File "nltk/tbl/rule.py", line 256, in format
            raise ValueError("unknown rule format spec: {0}".format(fmt))
        ValueError: unknown rule format spec: not_found

        :param fmt: format specification, one of "str", "repr", "verbose"
        :type fmt: str
        :return: string representation
        :rtype: str
        :raises ValueError: if ``fmt`` is not a known format specification
        """
        if fmt == "str":
            return self.__str__()
        elif fmt == "repr":
            return self.__repr__()
        elif fmt == "verbose":
            return self._verbose_format()
        else:
            raise ValueError(f"unknown rule format spec: {fmt}")

    def _verbose_format(self):
        """
        Return a wordy, human-readable string representation
        of the given rule.

        Not sure how useful this is.
        """

        def condition_to_str(feature, value):
            return 'the {} of {} is "{}"'.format(
                feature.PROPERTY_NAME,
                range_to_str(feature.positions),
                value,
            )

        def range_to_str(positions):
            if len(positions) == 1:
                p = positions[0]
                if p == 0:
                    return "this word"
                if p == -1:
                    return "the preceding word"
                elif p == 1:
                    return "the following word"
                elif p < 0:
                    return "word i-%d" % -p
                elif p > 0:
                    return "word i+%d" % p
            else:
                # for complete compatibility with the wordy format of nltk2
                mx = max(positions)
                mn = min(positions)
                if mx - mn == len(positions) - 1:
                    return "words i%+d...i%+d" % (mn, mx)
                else:
                    return "words {{{}}}".format(
                        ",".join("i%+d" % d for d in positions)
                    )

        replacement = f"{self.original_tag} -> {self.replacement_tag}"
        conditions = (" if " if self._conditions else "") + ", and ".join(
            condition_to_str(f, v) for (f, v) in self._conditions
        )
        return replacement + conditions
def additional_tests():
    """
    Build a test suite from every ``*.doctest`` file next to this module.

    The files are collected in sorted order so that the suite is assembled
    the same way on every run (``glob`` itself guarantees no ordering).

    :return: a suite holding one doctest file suite per ``*.doctest`` file
    :rtype: unittest.TestSuite
    """
    test_dir = os.path.dirname(__file__)
    paths = sorted(glob(os.path.join(test_dir, "*.doctest")))
    # Only the base name is handed to DocFileSuite, as in the original
    # code; every file found lives in this module's own directory.
    files = [os.path.basename(path) for path in paths]
    return unittest.TestSuite([doctest.DocFileSuite(file) for file in files])
DTV :: TV/NP + ... + ... the => Det + ... + ... that => Det + ... that => NP + ... + ... I => Pro + ... you => Pro + ... we => Pro + ... + ... chef => N + ... cake => N + ... children => N + ... dough => N + ... + ... will => Modal + ... should => Modal + ... might => Modal + ... must => Modal + ... + ... and => var\\.,var/.,var + ... + ... to => VP[to]/VP + ... + ... without => (VP\\VP)/VP[ing] + ... + ... be => TV + ... cook => TV + ... eat => TV + ... + ... cooking => VP[ing]/NP + ... + ... give => DTV + ... + ... is => (S\\NP)/NP + ... prefer => (S\\NP)/NP + ... + ... which => (N\\N)/(S/NP) + ... + ... persuade => (VP/VP[to])/NP + ... ''') + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> for parse in parser.parse("you prefer that cake".split()): + ... chart.printCCGDerivation(parse) + ... break + ... + you prefer that cake + NP ((S\NP)/NP) (NP/N) N + --------------> + NP + ---------------------------> + (S\NP) + --------------------------------< + S + + >>> for parse in parser.parse("that is the cake which you prefer".split()): + ... chart.printCCGDerivation(parse) + ... break + ... + that is the cake which you prefer + NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP) + ----->T + (S/(S\NP)) + ------------------>B + (S/NP) + ----------------------------------> + (N\N) + ----------------------------------------< + N + ------------------------------------------------> + NP + -------------------------------------------------------------> + (S\NP) + -------------------------------------------------------------------< + S + + +Some other sentences to try: +"that is the cake which we will persuade the chef to cook" +"that is the cake which we will persuade the chef to give the children" + + >>> sent = "that is the dough which you will eat without cooking".split() + >>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet + + ... 
chart.CompositionRuleSet + chart.TypeRaiseRuleSet) + +Without Substitution (no output) + + >>> for parse in nosub_parser.parse(sent): + ... chart.printCCGDerivation(parse) + +With Substitution: + + >>> for parse in parser.parse(sent): + ... chart.printCCGDerivation(parse) + ... break + ... + that is the dough which you will eat without cooking + NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) + ----->T + (S/(S\NP)) + ------------------------------------->B + ((VP\VP)/NP) + ----------------------------------------------B + ((S\NP)/NP) + ---------------------------------------------------------------->B + (S/NP) + --------------------------------------------------------------------------------> + (N\N) + ---------------------------------------------------------------------------------------< + N + -----------------------------------------------------------------------------------------------> + NP + ------------------------------------------------------------------------------------------------------------> + (S\NP) + ------------------------------------------------------------------------------------------------------------------< + S + + +Conjunction +----------- + + >>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet + >>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation + >>> from nltk.ccg import lexicon + +Lexicons for the tests: + + >>> test1_lex = ''' + ... :- S,N,NP,VP + ... I => NP + ... you => NP + ... will => S\\NP/VP + ... cook => VP/NP + ... which => (N\\N)/(S/NP) + ... and => var\\.,var/.,var + ... might => S\\NP/VP + ... eat => VP/NP + ... the => NP/N + ... mushrooms => N + ... parsnips => N''' + >>> test2_lex = ''' + ... :- N, S, NP, VP + ... articles => N + ... the => NP/N + ... and => var\\.,var/.,var + ... which => (N\\N)/(S/NP) + ... I => NP + ... anyone => NP + ... will => (S/VP)\\NP + ... file => VP/NP + ... 
without => (VP\\VP)/VP[ing] + ... forget => VP/NP + ... reading => VP[ing]/NP + ... ''' + +Tests handling of conjunctions. +Note that while the two derivations are different, they are semantically equivalent. + + >>> lex = lexicon.fromstring(test1_lex) + >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) + >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()): + ... printCCGDerivation(parse) + I will cook and might eat the mushrooms and parsnips + NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N + ---------------------->B + ((S\NP)/NP) + ---------------------->B + ((S\NP)/NP) + -------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) + -----------------------------------------------------------------------< + ((S\NP)/NP) + -------------------------------------> + (N\.,N) + ------------------------------------------------< + N + --------------------------------------------------------> + NP + -------------------------------------------------------------------------------------------------------------------------------> + (S\NP) + -----------------------------------------------------------------------------------------------------------------------------------< + S + I will cook and might eat the mushrooms and parsnips + NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N + ---------------------->B + ((S\NP)/NP) + ---------------------->B + ((S\NP)/NP) + -------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) + -----------------------------------------------------------------------< + ((S\NP)/NP) + ------------------------------------------------------------------------------->B + ((S\NP)/N) + -------------------------------------> + (N\.,N) + ------------------------------------------------< + N + 
-------------------------------------------------------------------------------------------------------------------------------> + (S\NP) + -----------------------------------------------------------------------------------------------------------------------------------< + S + + +Tests handling subject extraction. +Interesting to point that the two parses are clearly semantically different. + + >>> lex = lexicon.fromstring(test2_lex) + >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) + >>> for parse in parser.parse("articles which I will file and forget without reading".split()): + ... printCCGDerivation(parse) + articles which I will file and forget without reading + N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) + -----------------< + (S/VP) + ------------------------------------->B + ((VP\VP)/NP) + ---------------------------------------------- + ((VP/NP)\.,(VP/NP)) + ----------------------------------------------------------------------------------< + (VP/NP) + --------------------------------------------------------------------------------------------------->B + (S/NP) + -------------------------------------------------------------------------------------------------------------------> + (N\N) + -----------------------------------------------------------------------------------------------------------------------------< + N + articles which I will file and forget without reading + N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) + -----------------< + (S/VP) + ------------------------------------> + ((VP/NP)\.,(VP/NP)) + ---------------------------------------------< + (VP/NP) + ------------------------------------->B + ((VP\VP)/NP) + ----------------------------------------------------------------------------------B + (S/NP) + 
-------------------------------------------------------------------------------------------------------------------> + (N\N) + -----------------------------------------------------------------------------------------------------------------------------< + N + + +Unicode support +--------------- + +Unicode words are supported. + + >>> from nltk.ccg import chart, lexicon + +Lexicons for the tests: + + >>> lex = lexicon.fromstring(''' + ... :- S, N, NP, PP + ... + ... AdjI :: N\\N + ... AdjD :: N/N + ... AdvD :: S/S + ... AdvI :: S\\S + ... Det :: NP/N + ... PrepNPCompl :: PP/NP + ... PrepNAdjN :: S\\S/N + ... PrepNAdjNP :: S\\S/NP + ... VPNP :: S\\NP/NP + ... VPPP :: S\\NP/PP + ... VPser :: S\\NP/AdjI + ... + ... auto => N + ... bebidas => N + ... cine => N + ... ley => N + ... libro => N + ... ministro => N + ... panadería => N + ... presidente => N + ... super => N + ... + ... el => Det + ... la => Det + ... las => Det + ... un => Det + ... + ... Ana => NP + ... Pablo => NP + ... + ... y => var\\.,var/.,var + ... + ... pero => (S/NP)\\(S/NP)/(S/NP) + ... + ... anunció => VPNP + ... compró => VPNP + ... cree => S\\NP/S[dep] + ... desmintió => VPNP + ... lee => VPNP + ... fueron => VPPP + ... + ... es => VPser + ... + ... interesante => AdjD + ... interesante => AdjI + ... nueva => AdjD + ... nueva => AdjI + ... + ... a => PrepNPCompl + ... en => PrepNAdjN + ... en => PrepNAdjNP + ... + ... ayer => AdvI + ... + ... que => (NP\\NP)/(S/NP) + ... que => S[dep]/S + ... ''') + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()): + ... printCCGDerivation(parse) # doctest: +SKIP + ... # it fails on python2.7 because of the unicode problem explained in https://github.com/nltk/nltk/pull/1354 + ... 
break + el ministro anunció pero el presidente desmintió la nueva ley + (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N + ------------------> + NP + ------------------>T + (S/(S\NP)) + --------------------> + NP + -------------------->T + (S/(S\NP)) + --------------------------------->B + (S/NP) + -----------------------------------------------------------> + ((S/NP)\(S/NP)) + ------------> + N + --------------------> + NP + -------------------- + S diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/ccg_semantics.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/ccg_semantics.doctest new file mode 100644 index 0000000000000000000000000000000000000000..368ca548434350d8200fdc9b8c4bfbfa23ad535f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/ccg_semantics.doctest @@ -0,0 +1,552 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +============================================== +Combinatory Categorial Grammar with semantics +============================================== + +----- +Chart +----- + + + >>> from nltk.ccg import chart, lexicon + >>> from nltk.ccg.chart import printCCGDerivation + +No semantics +------------------- + + >>> lex = lexicon.fromstring(''' + ... :- S, NP, N + ... She => NP + ... has => (S\\NP)/NP + ... books => NP + ... ''', + ... 
False) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("She has books".split())) + >>> print(str(len(parses)) + " parses") + 3 parses + + >>> printCCGDerivation(parses[0]) + She has books + NP ((S\NP)/NP) NP + --------------------> + (S\NP) + -------------------------< + S + + >>> printCCGDerivation(parses[1]) + She has books + NP ((S\NP)/NP) NP + ----->T + (S/(S\NP)) + --------------------> + (S\NP) + -------------------------> + S + + + >>> printCCGDerivation(parses[2]) + She has books + NP ((S\NP)/NP) NP + ----->T + (S/(S\NP)) + ------------------>B + (S/NP) + -------------------------> + S + +Simple semantics +------------------- + + >>> lex = lexicon.fromstring(''' + ... :- S, NP, N + ... She => NP {she} + ... has => (S\\NP)/NP {\\x y.have(y, x)} + ... a => NP/N {\\P.exists z.P(z)} + ... book => N {book} + ... ''', + ... True) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("She has a book".split())) + >>> print(str(len(parses)) + " parses") + 7 parses + + >>> printCCGDerivation(parses[0]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + -------------------------------------> + NP {exists z.book(z)} + -------------------------------------------------------------------> + (S\NP) {\y.have(y,exists z.book(z))} + -----------------------------------------------------------------------------< + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[1]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + --------------------------------------------------------->B + ((S\NP)/N) {\P y.have(y,exists z.P(z))} + -------------------------------------------------------------------> + (S\NP) {\y.have(y,exists z.book(z))} + -----------------------------------------------------------------------------< + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[2]) + She has a 
book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + -------------------------------------> + NP {exists z.book(z)} + -------------------------------------------------------------------> + (S\NP) {\y.have(y,exists z.book(z))} + -----------------------------------------------------------------------------> + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[3]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + --------------------------------------------------------->B + ((S\NP)/N) {\P y.have(y,exists z.P(z))} + -------------------------------------------------------------------> + (S\NP) {\y.have(y,exists z.book(z))} + -----------------------------------------------------------------------------> + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[4]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + ---------------------------------------->B + (S/NP) {\x.have(she,x)} + -------------------------------------> + NP {exists z.book(z)} + -----------------------------------------------------------------------------> + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[5]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + --------------------------------------------------------->B + ((S\NP)/N) {\P y.have(y,exists z.P(z))} + ------------------------------------------------------------------->B + (S/N) {\P.have(she,exists z.P(z))} + -----------------------------------------------------------------------------> + S {have(she,exists z.book(z))} + + >>> printCCGDerivation(parses[6]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + 
---------------------------------------->B + (S/NP) {\x.have(she,x)} + ------------------------------------------------------------------->B + (S/N) {\P.have(she,exists z.P(z))} + -----------------------------------------------------------------------------> + S {have(she,exists z.book(z))} + +Complex semantics +------------------- + + >>> lex = lexicon.fromstring(''' + ... :- S, NP, N + ... She => NP {she} + ... has => (S\\NP)/NP {\\x y.have(y, x)} + ... a => ((S\\NP)\\((S\\NP)/NP))/N {\\P R x.(exists z.P(z) & R(z,x))} + ... book => N {book} + ... ''', + ... True) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("She has a book".split())) + >>> print(str(len(parses)) + " parses") + 2 parses + + >>> printCCGDerivation(parses[0]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} + ----------------------------------------------------------------------> + ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} + ----------------------------------------------------------------------------------------------------< + (S\NP) {\x.(exists z.book(z) & have(x,z))} + --------------------------------------------------------------------------------------------------------------< + S {(exists z.book(z) & have(she,z))} + + >>> printCCGDerivation(parses[1]) + She has a book + NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} + ---------->T + (S/(S\NP)) {\F.F(she)} + ----------------------------------------------------------------------> + ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} + ----------------------------------------------------------------------------------------------------< + (S\NP) {\x.(exists z.book(z) & have(x,z))} + --------------------------------------------------------------------------------------------------------------> + S {(exists z.book(z) & have(she,z))} + +Using 
conjunctions +--------------------- + + # TODO: The semantics of "and" should have been more flexible + >>> lex = lexicon.fromstring(''' + ... :- S, NP, N + ... I => NP {I} + ... cook => (S\\NP)/NP {\\x y.cook(x,y)} + ... and => var\\.,var/.,var {\\P Q x y.(P(x,y) & Q(x,y))} + ... eat => (S\\NP)/NP {\\x y.eat(x,y)} + ... the => NP/N {\\x.the(x)} + ... bacon => N {bacon} + ... ''', + ... True) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("I cook and eat the bacon".split())) + >>> print(str(len(parses)) + " parses") + 7 parses + + >>> printCCGDerivation(parses[0]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + -------------------------------> + NP {the(bacon)} + --------------------------------------------------------------------------------------------------------------------------------------------------> + (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------< + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[1]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + 
-------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + --------------------------------------------------------------------------------------------------------------------------------------->B + ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} + --------------------------------------------------------------------------------------------------------------------------------------------------> + (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------< + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[2]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------->T + (S/(S\NP)) {\F.F(I)} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + -------------------------------> + NP {the(bacon)} + --------------------------------------------------------------------------------------------------------------------------------------------------> + (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------> + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[3]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N 
{bacon} + -------->T + (S/(S\NP)) {\F.F(I)} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + --------------------------------------------------------------------------------------------------------------------------------------->B + ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} + --------------------------------------------------------------------------------------------------------------------------------------------------> + (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------> + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[4]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------->T + (S/(S\NP)) {\F.F(I)} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + --------------------------------------------------------------------------------------------------------------------------->B + (S/NP) {\x.(eat(x,I) & cook(x,I))} + -------------------------------> + NP {the(bacon)} + ----------------------------------------------------------------------------------------------------------------------------------------------------------> + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[5]) + I cook and eat 
the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------->T + (S/(S\NP)) {\F.F(I)} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + --------------------------------------------------------------------------------------------------------------------------------------->B + ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} + ----------------------------------------------------------------------------------------------------------------------------------------------->B + (S/N) {\x.(eat(the(x),I) & cook(the(x),I))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------> + S {(eat(the(bacon),I) & cook(the(bacon),I))} + + >>> printCCGDerivation(parses[6]) + I cook and eat the bacon + NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} + -------->T + (S/(S\NP)) {\F.F(I)} + -------------------------------------------------------------------------------------> + (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} + -------------------------------------------------------------------------------------------------------------------< + ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} + --------------------------------------------------------------------------------------------------------------------------->B + (S/NP) {\x.(eat(x,I) & cook(x,I))} + ----------------------------------------------------------------------------------------------------------------------------------------------->B + (S/N) 
{\x.(eat(the(x),I) & cook(the(x),I))} + ----------------------------------------------------------------------------------------------------------------------------------------------------------> + S {(eat(the(bacon),I) & cook(the(bacon),I))} + +Tests from published papers +------------------------------ + +An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf + + >>> lex = lexicon.fromstring(''' + ... :- S, NP + ... I => NP {I} + ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} + ... them => NP {them} + ... money => NP {money} + ... ''', + ... True) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("I give them money".split())) + >>> print(str(len(parses)) + " parses") + 3 parses + + >>> printCCGDerivation(parses[0]) + I give them money + NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} + --------------------------------------------------> + ((S\NP)/NP) {\y z.give(y,them,z)} + --------------------------------------------------------------> + (S\NP) {\z.give(money,them,z)} + ----------------------------------------------------------------------< + S {give(money,them,I)} + + >>> printCCGDerivation(parses[1]) + I give them money + NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} + -------->T + (S/(S\NP)) {\F.F(I)} + --------------------------------------------------> + ((S\NP)/NP) {\y z.give(y,them,z)} + --------------------------------------------------------------> + (S\NP) {\z.give(money,them,z)} + ----------------------------------------------------------------------> + S {give(money,them,I)} + + + >>> printCCGDerivation(parses[2]) + I give them money + NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} + -------->T + (S/(S\NP)) {\F.F(I)} + --------------------------------------------------> + ((S\NP)/NP) {\y 
z.give(y,them,z)} + ---------------------------------------------------------->B + (S/NP) {\y.give(y,them,I)} + ----------------------------------------------------------------------> + S {give(money,them,I)} + + +An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf + + >>> lex = lexicon.fromstring(''' + ... :- N, NP, S + ... money => N {money} + ... that => (N\\N)/(S/NP) {\\P Q x.(P(x) & Q(x))} + ... I => NP {I} + ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} + ... them => NP {them} + ... ''', + ... True) + + >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) + >>> parses = list(parser.parse("money that I give them".split())) + >>> print(str(len(parses)) + " parses") + 3 parses + + >>> printCCGDerivation(parses[0]) + money that I give them + N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} + -------->T + (S/(S\NP)) {\F.F(I)} + --------------------------------------------------> + ((S\NP)/NP) {\y z.give(y,them,z)} + ---------------------------------------------------------->B + (S/NP) {\y.give(y,them,I)} + -------------------------------------------------------------------------------------------------> + (N\N) {\Q x.(give(x,them,I) & Q(x))} + ------------------------------------------------------------------------------------------------------------< + N {\x.(give(x,them,I) & money(x))} + + >>> printCCGDerivation(parses[1]) + money that I give them + N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} + ----------->T + (N/(N\N)) {\F.F(money)} + -------->T + (S/(S\NP)) {\F.F(I)} + --------------------------------------------------> + ((S\NP)/NP) {\y z.give(y,them,z)} + ---------------------------------------------------------->B + (S/NP) {\y.give(y,them,I)} + 
-------------------------------------------------------------------------------------------------> + (N\N) {\Q x.(give(x,them,I) & Q(x))} + ------------------------------------------------------------------------------------------------------------> + N {\x.(give(x,them,I) & money(x))} + + >>> printCCGDerivation(parses[2]) + money that I give them + N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} + ----------->T + (N/(N\N)) {\F.F(money)} + -------------------------------------------------->B + (N/(S/NP)) {\P x.(P(x) & money(x))} + -------->T + (S/(S\NP)) {\F.F(I)} + --------------------------------------------------> + ((S\NP)/NP) {\y z.give(y,them,z)} + ---------------------------------------------------------->B + (S/NP) {\y.give(y,them,I)} + ------------------------------------------------------------------------------------------------------------> + N {\x.(give(x,them,I) & money(x))} + + +------- +Lexicon +------- + + >>> from nltk.ccg import lexicon + +Parse lexicon with semantics + + >>> print(str(lexicon.fromstring( + ... ''' + ... :- S,NP + ... + ... IntransVsg :: S\\NP[sg] + ... + ... sleeps => IntransVsg {\\x.sleep(x)} + ... eats => S\\NP[sg]/NP {\\x y.eat(x,y)} + ... + ... and => var\\var/var {\\x y.x & y} + ... ''', + ... True + ... ))) + and => ((_var0\_var0)/_var0) {(\x y.x & y)} + eats => ((S\NP['sg'])/NP) {\x y.eat(x,y)} + sleeps => (S\NP['sg']) {\x.sleep(x)} + +Parse lexicon without semantics + + >>> print(str(lexicon.fromstring( + ... ''' + ... :- S,NP + ... + ... IntransVsg :: S\\NP[sg] + ... + ... sleeps => IntransVsg + ... eats => S\\NP[sg]/NP {sem=\\x y.eat(x,y)} + ... + ... and => var\\var/var + ... ''', + ... False + ... ))) + and => ((_var0\_var0)/_var0) + eats => ((S\NP['sg'])/NP) + sleeps => (S\NP['sg']) + +Semantics are missing + + >>> print(str(lexicon.fromstring( + ... ''' + ... :- S,NP + ... + ... eats => S\\NP[sg]/NP + ... ''', + ... True + ... 
))) + Traceback (most recent call last): + ... + AssertionError: eats => S\NP[sg]/NP must contain semantics because include_semantics is set to True + + +------------------------------------ +CCG combinator semantics computation +------------------------------------ + + >>> from nltk.sem.logic import * + >>> from nltk.ccg.logic import * + + >>> read_expr = Expression.fromstring + +Compute semantics from function application + + >>> print(str(compute_function_semantics(read_expr(r'\x.P(x)'), read_expr(r'book')))) + P(book) + + >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'read')))) + read(book) + + >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'\x.read(x)')))) + read(book) + +Compute semantics from composition + + >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'\x.Q(x)')))) + \x.P(Q(x)) + + >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) + Traceback (most recent call last): + ... + AssertionError: `read` must be a lambda expression + +Compute semantics from substitution + + >>> print(str(compute_substitution_semantics(read_expr(r'\x y.P(x,y)'), read_expr(r'\x.Q(x)')))) + \x.P(x,Q(x)) + + >>> print(str(compute_substitution_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) + Traceback (most recent call last): + ... 
+ AssertionError: `\x.P(x)` must be a lambda expression with 2 arguments + +Compute type-raise semantics + + >>> print(str(compute_type_raised_semantics(read_expr(r'\x.P(x)')))) + \F x.F(P(x)) + + >>> print(str(compute_type_raised_semantics(read_expr(r'\x.F(x)')))) + \F1 x.F1(F(x)) + + >>> print(str(compute_type_raised_semantics(read_expr(r'\x y z.P(x,y,z)')))) + \F x y z.F(P(x,y,z)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/conftest.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..d5e89a36725cb1da9ec3865c215c357ef98cabbe --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/conftest.py @@ -0,0 +1,33 @@ +import pytest + +from nltk.corpus.reader import CorpusReader + + +@pytest.fixture(autouse=True) +def mock_plot(mocker): + """Disable matplotlib plotting in test code""" + + try: + import matplotlib.pyplot as plt + + mocker.patch.object(plt, "gca") + mocker.patch.object(plt, "show") + except ImportError: + pass + + +@pytest.fixture(scope="module", autouse=True) +def teardown_loaded_corpora(): + """ + After each test session ends (either doctest or unit test), + unload any loaded corpora + """ + + yield # first, wait for the test to end + + import nltk.corpus + + for name in dir(nltk.corpus): + obj = getattr(nltk.corpus, name, None) + if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"): + obj._unload() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/crubadan.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/crubadan.doctest new file mode 100644 index 0000000000000000000000000000000000000000..2ad9b79aba830e0525ddbac85d51117ef75f491e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/crubadan.doctest @@ -0,0 +1,65 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +Crubadan Corpus Reader +====================== + +Crubadan is an NLTK corpus reader for ngram files provided +by the Crubadan project. It supports several languages. 
+ + >>> from nltk.corpus import crubadan + >>> crubadan.langs() + ['abk', 'abn',..., 'zpa', 'zul'] + +---------------------------------------- +Language code mapping and helper methods +---------------------------------------- + +The web crawler that generates the 3-gram frequencies works at the +level of "writing systems" rather than languages. Writing systems +are assigned internal 2-3 letter codes that require mapping to the +standard ISO 639-3 codes. For more information, please refer to +the README in the nltk_data/crubadan folder after installing it. + +To translate ISO 639-3 codes to "Crubadan Code": + + >>> crubadan.iso_to_crubadan('eng') + 'en' + >>> crubadan.iso_to_crubadan('fra') + 'fr' + >>> crubadan.iso_to_crubadan('aaa') + +In reverse, print ISO 639-3 code if we have the Crubadan Code: + + >>> crubadan.crubadan_to_iso('en') + 'eng' + >>> crubadan.crubadan_to_iso('fr') + 'fra' + >>> crubadan.crubadan_to_iso('aa') + +--------------------------- +Accessing ngram frequencies +--------------------------- + +On initialization the reader will create a dictionary of every +language supported by the Crubadan project, mapping the ISO 639-3 +language code to its corresponding ngram frequency. + +You can access individual language FreqDist and the ngrams within them as follows: + + >>> english_fd = crubadan.lang_freq('eng') + >>> english_fd['the'] + 728135 + +Above accesses the FreqDist of English and returns the frequency of the ngram 'the'. +An ngram that isn't found within the language will return 0: + + >>> english_fd['sometest'] + 0 + +A language that isn't supported will raise an exception: + + >>> crubadan.lang_freq('elvish') + Traceback (most recent call last): + ... + RuntimeError: Unsupported language.
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/drt.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/drt.doctest new file mode 100644 index 0000000000000000000000000000000000000000..b577dba34b92269bf935623b7432949a6e2e81be --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/drt.doctest @@ -0,0 +1,515 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +================================ + Discourse Representation Theory +================================ + + >>> from nltk.sem import logic + >>> from nltk.inference import TableauProver + +Overview +======== + +A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of +discourse referents and a list of conditions. + + >>> from nltk.sem.drt import * + >>> dexpr = DrtExpression.fromstring + >>> man_x = dexpr('man(x)') + >>> walk_x = dexpr('walk(x)') + >>> x = dexpr('x') + >>> print(DRS([x], [man_x, walk_x])) + ([x],[man(x), walk(x)]) + +The ``parse()`` method can also be applied directly to DRS +expressions, which allows them to be specified more +easily. + + >>> drs1 = dexpr('([x],[man(x),walk(x)])') + >>> print(drs1) + ([x],[man(x), walk(x)]) + +DRSs can be *merged* using the ``+`` operator. + + >>> drs2 = dexpr('([y],[woman(y),stop(y)])') + >>> drs3 = drs1 + drs2 + >>> print(drs3) + (([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)])) + >>> print(drs3.simplify()) + ([x,y],[man(x), walk(x), woman(y), stop(y)]) + +We can embed DRSs as components of an ``implies`` condition. + + >>> s = '([], [(%s -> %s)])' % (drs1, drs2) + >>> print(dexpr(s)) + ([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))]) + +The ``fol()`` method converts DRSs into FOL formulae. + + >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) + exists x.(man(x) & walks(x)) + >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) + all x.(man(x) -> walks(x)) + +In order to visualize a DRS, the ``pretty_format()`` method can be used.
+ + >>> print(drs3.pretty_format()) + _________ __________ + | x | | y | + (|---------| + |----------|) + | man(x) | | woman(y) | + | walk(x) | | stop(y) | + |_________| |__________| + + +Parse to semantics +------------------ + +.. + >>> logic._counter._value = 0 + +DRSs can be used for building compositional semantics in a feature +based grammar. To specify that we want to use DRSs, the appropriate +logic parser needs to be passed as a parameter to ``load_parser()`` + + >>> from nltk.parse import load_parser + >>> from nltk.sem.drt import DrtParser + >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser()) + >>> for tree in parser.parse('a dog barks'.split()): + ... print(tree.label()['SEM'].simplify()) + ... + ([x],[dog(x), bark(x)]) + +Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it + + >>> from nltk.featstruct import FeatStructReader + >>> from nltk.grammar import FeatStructNonterminal + >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser())) + >>> for tree in parser.parse('every girl chases a dog'.split()): + ... print(tree.label()['SEM'].simplify().normalize()) + ...
+ ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))]) + + + +Unit Tests +========== + +Parser +------ + + >>> print(dexpr(r'([x,y],[sees(x,y)])')) + ([x,y],[sees(x,y)]) + >>> print(dexpr(r'([x],[man(x), walks(x)])')) + ([x],[man(x), walks(x)]) + >>> print(dexpr(r'\x.([],[man(x), walks(x)])')) + \x.([],[man(x), walks(x)]) + >>> print(dexpr(r'\x.\y.([],[sees(x,y)])')) + \x y.([],[sees(x,y)]) + + >>> print(dexpr(r'([x,y],[(x = y)])')) + ([x,y],[(x = y)]) + >>> print(dexpr(r'([x,y],[(x != y)])')) + ([x,y],[-(x = y)]) + + >>> print(dexpr(r'\x.([],[walks(x)])(john)')) + (\x.([],[walks(x)]))(john) + >>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))')) + (\R x.([],[big(x,R)]))(\y.([],[mouse(y)])) + + >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))')) + (([x],[walks(x)]) + ([y],[runs(y)])) + >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))')) + (([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)])) + >>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))')) + (([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)])) + >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))')) + (([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])) + + >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))')) + (([],[walks(x)]) -> ([],[runs(x)])) + + >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])')) + ([x],[PRO(x), sees(John,x)]) + >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])')) + ([x],[man(x), -([],[walks(x)])]) + >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])')) + ([],[(([x],[man(x)]) -> ([],[walks(x)]))]) + + >>> print(dexpr(r'DRS([x],[walk(x)])')) + ([x],[walk(x)]) + >>> print(dexpr(r'DRS([x][walk(x)])')) + ([x],[walk(x)]) + >>> print(dexpr(r'([x][walk(x)])')) + ([x],[walk(x)]) + +``simplify()`` +-------------- + + >>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify()) + ([],[man(john), walks(john)]) + >>> 
print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify()) + ([z],[dog(z), sees(john,mary)]) + >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify()) + \x.([],[big(x,\y.([],[mouse(y)]))]) + + >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify()) + ([x,y],[walks(x), runs(y)]) + >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify()) + ([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)]) + >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify()) + ([],[walks(x), runs(x), threes(x), fours(x)]) + >>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \ + ... dexpr(r'([x,z1],[man(x), walks(z1)])') + True + >>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \ + ... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])') + True + + >>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \ + ... 
dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])') + True + + >>> logic._counter._value = 0 + >>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize()) + ([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)]) + +``fol()`` +----------- + + >>> print(dexpr(r'([x,y],[sees(x,y)])').fol()) + exists x y.sees(x,y) + >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) + exists x.(man(x) & walks(x)) + >>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol()) + \x.(man(x) & walks(x)) + >>> print(dexpr(r'\x y.([],[sees(x,y)])').fol()) + \x y.sees(x,y) + + >>> print(dexpr(r'\x.([],[walks(x)])(john)').fol()) + \x.walks(x)(john) + >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol()) + (\R x.big(x,R))(\y.mouse(y)) + + >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol()) + (exists x.walks(x) & exists y.runs(y)) + + >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol()) + (walks(x) -> runs(x)) + + >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol()) + exists x.(PRO(x) & sees(John,x)) + >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol()) + exists x.(man(x) & -walks(x)) + >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) + all x.(man(x) -> walks(x)) + + >>> print(dexpr(r'([x],[man(x) | walks(x)])').fol()) + exists x.(man(x) | walks(x)) + >>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol()) + (P(x) & exists x.walks(x)) + +``resolve_anaphora()`` +---------------------- + + >>> from nltk.sem.drt import AnaphoraResolutionException + + >>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])'))) + ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])]) + >>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))) + ([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))]) + >>> 
print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify()) + ([x,y],[(x = y)]) + >>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])'))) + ... except AnaphoraResolutionException as e: print(e) + Variable 'x' does not resolve to anything. + >>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])'))) + ([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)]) + +``equiv()``: +---------------- + + >>> a = dexpr(r'([x],[man(x), walks(x)])') + >>> b = dexpr(r'([x],[walks(x), man(x)])') + >>> print(a.equiv(b, TableauProver())) + True + + +``replace()``: +-------------- + + >>> a = dexpr(r'a') + >>> w = dexpr(r'w') + >>> x = dexpr(r'x') + >>> y = dexpr(r'y') + >>> z = dexpr(r'z') + + +replace bound +------------- + + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False)) + ([x],[give(x,y,z)]) + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True)) + ([a],[give(a,y,z)]) + +replace unbound +--------------- + + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False)) + ([x],[give(x,a,z)]) + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True)) + ([x],[give(x,a,z)]) + +replace unbound with bound +-------------------------- + + >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \ + ... dexpr('([z1],[give(z1,x,z)])') + True + >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \ + ... 
dexpr('([z1],[give(z1,x,z)])') + True + +replace unbound with unbound +---------------------------- + + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False)) + ([x],[give(x,z,z)]) + >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True)) + ([x],[give(x,z,z)]) + + +replace unbound +--------------- + + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) + (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) + (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) + +replace bound +------------- + + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False)) + (([x],[P(x,y,z)]) + ([y],[Q(x,y,z)])) + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True)) + (([a],[P(a,y,z)]) + ([y],[Q(a,y,z)])) + +replace unbound with unbound +---------------------------- + + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) + (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) + >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) + (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) + +replace unbound with bound on same side +--------------------------------------- + + >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \ + ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') + True + >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \ + ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') + True + +replace unbound with bound on other side +---------------------------------------- + + >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \ + ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') + True + >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \ + ... 
dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') + True + +replace unbound with double bound +--------------------------------- + + >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \ + ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') + True + >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \ + ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') + True + + +regression tests +---------------- + + >>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])') + >>> print(d) + ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) + >>> print(d.pretty_format()) + ____________________________________ + | x | + |------------------------------------| + | A(c) | + | ____________ ____________ | + | | y | | z | | + | (|------------| -> |------------|) | + | | B(x,y,z,a) | | C(x,y,z,a) | | + | |____________| |____________| | + |____________________________________| + >>> print(str(d)) + ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) + >>> print(d.fol()) + exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a))) + >>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r')))) + ([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))]) + >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')))) + ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) + >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')))) + ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) + >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')))) + ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))]) + >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True)) + ([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))]) + >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True)) + ([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))]) + >>> print(d.replace(Variable('z'), 
DrtVariableExpression(Variable('r')), True)) + ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))]) + >>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])')) + True + >>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])') + >>> sorted(d.free()) + [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] + >>> sorted(d.variables()) + [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] + >>> sorted(d.get_refs(True)) + [Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] + >>> sorted(d.conds[0].get_refs(False)) + [Variable('x'), Variable('y')] + >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality()) + ([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))]) + >>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality()) + (([x],[A(x,x)]) -> ([],[B(x,x)])) + >>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality()) + (([x,y],[A(x,y)]) -> ([],[B(x,x)])) + >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality()) + ([x],[A(x,x), ([],[B(x,x)])]) + >>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality()) + ([x,y],[A(x,y), ([],[B(x,x)])]) + >>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality()) + ([z9],[A(z9), B(z9), C(z9), D(z9)]) + + >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality()) + ([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])]) + >>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality()) + ([x],[A(x,x), B(x,x), C(x,x)]) + >>> 
print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) + (([x,y],[B(x,y)]) + ([x,y],[C(x,y)])) + >>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) + (([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)])) + >>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) + (([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)])) + >>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize()) + (([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)])) + + +Parse errors +============ + + >>> def parse_error(drtstring): + ... try: dexpr(drtstring) + ... except logic.LogicalExpressionException as e: print(e) + + >>> parse_error(r'') + End of input found. Expression expected. + + ^ + >>> parse_error(r'(') + End of input found. Expression expected. + ( + ^ + >>> parse_error(r'()') + Unexpected token: ')'. Expression expected. + () + ^ + >>> parse_error(r'([') + End of input found. Expected token ']'. + ([ + ^ + >>> parse_error(r'([,') + ',' is an illegal variable name. Constants may not be quantified. + ([, + ^ + >>> parse_error(r'([x,') + End of input found. Variable expected. + ([x, + ^ + >>> parse_error(r'([]') + End of input found. Expected token '['. + ([] + ^ + >>> parse_error(r'([][') + End of input found. Expected token ']'. + ([][ + ^ + >>> parse_error(r'([][,') + Unexpected token: ','. Expression expected. + ([][, + ^ + >>> parse_error(r'([][]') + End of input found. Expected token ')'. + ([][] + ^ + >>> parse_error(r'([x][man(x)]) |') + End of input found. Expression expected. 
+ ([x][man(x)]) | + ^ + +Pretty Printing +=============== + + >>> dexpr(r"([],[])").pretty_print() + __ + | | + |--| + |__| + + >>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print() + _____________________________ + | | + |-----------------------------| + | ________ _________ | + | | x | | | | + | (|--------| -> |---------|) | + | | big(x) | | bark(x) | | + | | dog(x) | |_________| | + | |________| | + | _________ | + | | x | | + | __ |---------| | + | | | walk(x) | | + | |_________| | + |_____________________________| + + >>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() + _________ _________ + | x y | | z | + (|---------| + |---------|) + | (x = y) | | dog(z) | + |_________| | walk(z) | + |_________| + + >>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() + _______________________________ + | | + |-------------------------------| + | ___ ___ _________ | + | | x | | y | | z | | + | (|---| | |---| | |---------|) | + | |___| |___| | dog(z) | | + | | walk(z) | | + | |_________| | + |_______________________________| + + >>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() + ___ ________ + \ | x | \ | | + /\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|) + |___| | dog(x) | + |________| diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/featstruct.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/featstruct.doctest new file mode 100644 index 0000000000000000000000000000000000000000..cfa70c9c4905c03630aa5caad2fdeecfa3adcfa0 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/featstruct.doctest @@ -0,0 +1,1229 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +================================== + Feature Structures & Unification +================================== + >>> from nltk.featstruct import FeatStruct + >>> from nltk.sem.logic import Variable, VariableExpression, Expression + +.. 
note:: For now, featstruct uses the older lambdalogic semantics + module. Eventually, it should be updated to use the new first + order predicate logic module. + +Overview +~~~~~~~~ +A feature structure is a mapping from feature identifiers to feature +values, where feature values can be simple values (like strings or +ints), nested feature structures, or variables: + + >>> fs1 = FeatStruct(number='singular', person=3) + >>> print(fs1) + [ number = 'singular' ] + [ person = 3 ] + +Feature structures may be nested: + + >>> fs2 = FeatStruct(type='NP', agr=fs1) + >>> print(fs2) + [ agr = [ number = 'singular' ] ] + [ [ person = 3 ] ] + [ ] + [ type = 'NP' ] + +Variables are used to indicate that two features should be assigned +the same value. For example, the following feature structure requires +that the feature fs3['agr']['number'] be bound to the same value as the +feature fs3['subj']['number']. + + >>> fs3 = FeatStruct(agr=FeatStruct(number=Variable('?n')), + ... subj=FeatStruct(number=Variable('?n'))) + >>> print(fs3) + [ agr = [ number = ?n ] ] + [ ] + [ subj = [ number = ?n ] ] + +Feature structures are typically used to represent partial information +about objects. A feature name that is not mapped to a value stands +for a feature whose value is unknown (*not* a feature without a +value). Two feature structures that represent (potentially +overlapping) information about the same object can be combined by +*unification*. + + >>> print(fs2.unify(fs3)) + [ agr = [ number = 'singular' ] ] + [ [ person = 3 ] ] + [ ] + [ subj = [ number = 'singular' ] ] + [ ] + [ type = 'NP' ] + +When two inconsistent feature structures are unified, the unification +fails and returns ``None``. + + >>> fs4 = FeatStruct(agr=FeatStruct(person=1)) + >>> print(fs4.unify(fs2)) + None + >>> print(fs2.unify(fs4)) + None + +..
+ >>> del fs1, fs2, fs3, fs4 # clean-up + +Feature Structure Types +----------------------- +There are actually two types of feature structure: + +- *feature dictionaries*, implemented by `FeatDict`, act like + Python dictionaries. Feature identifiers may be strings or + instances of the `Feature` class. +- *feature lists*, implemented by `FeatList`, act like Python + lists. Feature identifiers are integers. + +When you construct a feature structure using the `FeatStruct` +constructor, it will automatically decide which type is appropriate: + + >>> type(FeatStruct(number='singular')) + <class 'nltk.featstruct.FeatDict'> + >>> type(FeatStruct([1,2,3])) + <class 'nltk.featstruct.FeatList'> + +Usually, we will just use feature dictionaries; but sometimes feature +lists can be useful too. Two feature lists will unify with each other +only if they have equal lengths, and all of their feature values +match. If you wish to write a feature list that contains 'unknown' +values, you must use variables: + + >>> fs1 = FeatStruct([1,2,Variable('?y')]) + >>> fs2 = FeatStruct([1,Variable('?x'),3]) + >>> fs1.unify(fs2) + [1, 2, 3] + +.. + >>> del fs1, fs2 # clean-up + +Parsing Feature Structure Strings +--------------------------------- +Feature structures can be constructed directly from strings. Often, +this is more convenient than constructing them directly. NLTK can +parse most feature strings to produce the corresponding feature +structures. (But you must restrict your base feature values to +strings, ints, logic expressions (`nltk.sem.logic.Expression`), and a +few other types discussed below).
+ +Feature dictionaries are written like Python dictionaries, except that +keys are not put in quotes; and square brackets (``[]``) are used +instead of braces (``{}``): + + >>> FeatStruct('[tense="past", agr=[number="sing", person=3]]') + [agr=[number='sing', person=3], tense='past'] + +If a feature value is a single alphanumeric word, then it does not +need to be quoted -- it will be automatically treated as a string: + + >>> FeatStruct('[tense=past, agr=[number=sing, person=3]]') + [agr=[number='sing', person=3], tense='past'] + +Feature lists are written like python lists: + + >>> FeatStruct('[1, 2, 3]') + [1, 2, 3] + +The expression ``[]`` is treated as an empty feature dictionary, not +an empty feature list: + + >>> type(FeatStruct('[]')) + <class 'nltk.featstruct.FeatDict'> + +Feature Paths +------------- +Features can be specified using *feature paths*, or tuples of feature +identifiers that specify path through the nested feature structures to +a value. + + >>> fs1 = FeatStruct('[x=1, y=[1,2,[z=3]]]') + >>> fs1['y'] + [1, 2, [z=3]] + >>> fs1['y', 2] + [z=3] + >>> fs1['y', 2, 'z'] + 3 + +.. + >>> del fs1 # clean-up + +Reentrance +---------- +Feature structures may contain reentrant feature values. A *reentrant +feature value* is a single feature structure that can be accessed via +multiple feature paths. + + >>> fs1 = FeatStruct(x='val') + >>> fs2 = FeatStruct(a=fs1, b=fs1) + >>> print(fs2) + [ a = (1) [ x = 'val' ] ] + [ ] + [ b -> (1) ] + >>> fs2 + [a=(1)[x='val'], b->(1)] + +As you can see, reentrance is displayed by marking a feature structure +with a unique identifier, in this case ``(1)``, the first time it is +encountered; and then using the special form ``var -> id`` whenever it +is encountered again. You can use the same notation to directly +create reentrant feature structures from strings.
+ + >>> FeatStruct('[a=(1)[], b->(1), c=[d->(1)]]') + [a=(1)[], b->(1), c=[d->(1)]] + +Reentrant feature structures may contain cycles: + + >>> fs3 = FeatStruct('(1)[a->(1)]') + >>> fs3['a', 'a', 'a', 'a'] + (1)[a->(1)] + >>> fs3['a', 'a', 'a', 'a'] is fs3 + True + +Unification preserves the reentrance relations imposed by both of the +unified feature structures. In the feature structure resulting from +unification, any modifications to a reentrant feature value will be +visible using any of its feature paths. + + >>> fs3.unify(FeatStruct('[a=[b=12], c=33]')) + (1)[a->(1), b=12, c=33] + +.. + >>> del fs1, fs2, fs3 # clean-up + +Feature Structure Equality +-------------------------- +Two feature structures are considered equal if they assign the same +values to all features, *and* they contain the same reentrances. + + >>> fs1 = FeatStruct('[a=(1)[x=1], b->(1)]') + >>> fs2 = FeatStruct('[a=(1)[x=1], b->(1)]') + >>> fs3 = FeatStruct('[a=[x=1], b=[x=1]]') + >>> fs1 == fs1, fs1 is fs1 + (True, True) + >>> fs1 == fs2, fs1 is fs2 + (True, False) + >>> fs1 == fs3, fs1 is fs3 + (False, False) + +Note that this differs from how Python dictionaries and lists define +equality -- in particular, Python dictionaries and lists ignore +reentrance relations. To test two feature structures for equality +while ignoring reentrance relations, use the `equal_values()` method: + + >>> fs1.equal_values(fs1) + True + >>> fs1.equal_values(fs2) + True + >>> fs1.equal_values(fs3) + True + +.. + >>> del fs1, fs2, fs3 # clean-up + +Feature Value Sets & Feature Value Tuples +----------------------------------------- +`nltk.featstruct` defines two new data types that are intended to be +used as feature values: `FeatureValueTuple` and `FeatureValueSet`. +Both of these types are considered base values -- i.e., unification +does *not* apply to them. However, variable binding *does* apply to +any values that they contain. 
+ +Feature value tuples are written with parentheses: + + >>> fs1 = FeatStruct('[x=(?x, ?y)]') + >>> fs1 + [x=(?x, ?y)] + >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) + [x=(1, 2)] + +Feature sets are written with braces: + + >>> fs1 = FeatStruct('[x={?x, ?y}]') + >>> fs1 + [x={?x, ?y}] + >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) + [x={1, 2}] + +In addition to the basic feature value tuple & set classes, nltk +defines feature value unions (for sets) and feature value +concatenations (for tuples). These are written using '+', and can be +used to combine sets & tuples: + + >>> fs1 = FeatStruct('[x=((1, 2)+?z), z=?z]') + >>> fs1 + [x=((1, 2)+?z), z=?z] + >>> fs1.unify(FeatStruct('[z=(3, 4, 5)]')) + [x=(1, 2, 3, 4, 5), z=(3, 4, 5)] + +Thus, feature value tuples and sets can be used to build up tuples +and sets of values over the course of unification. For example, when +parsing sentences using a semantic feature grammar, feature sets or +feature tuples can be used to build a list of semantic predicates as +the sentence is parsed. + +As was mentioned above, unification does not apply to feature value +tuples and sets. One reason for this is that it's impossible to define a +single correct answer for unification when concatenation is used. +Consider the following example: + + >>> fs1 = FeatStruct('[x=(1, 2, 3, 4)]') + >>> fs2 = FeatStruct('[x=(?a+?b), a=?a, b=?b]') + +If unification applied to feature tuples, then the unification +algorithm would have to arbitrarily choose how to divide the tuple +(1,2,3,4) into two parts. Instead, the unification algorithm refuses +to make this decision, and simply unifies based on value. Because +(1,2,3,4) is not equal to (?a+?b), fs1 and fs2 will not unify: + + >>> print(fs1.unify(fs2)) + None + +If you need a list-like structure that unification does apply to, use +`FeatList`. + +..
+ >>> del fs1, fs2 # clean-up + +Light-weight Feature Structures +------------------------------- +Many of the functions defined by `nltk.featstruct` can be applied +directly to simple Python dictionaries and lists, rather than to +full-fledged `FeatDict` and `FeatList` objects. In other words, +Python ``dicts`` and ``lists`` can be used as "light-weight" feature +structures. + + >>> # Note: pprint prints dicts sorted + >>> from pprint import pprint + >>> from nltk.featstruct import unify + >>> pprint(unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))) + {'a': 'a', 'x': 1, 'y': {'b': 'b'}} + +However, you should keep in mind the following caveats: + +- Python dictionaries & lists ignore reentrance when checking for + equality between values. But two FeatStructs with different + reentrances are considered nonequal, even if all their base + values are equal. + +- FeatStructs can be easily frozen, allowing them to be used as + keys in hash tables. Python dictionaries and lists can not. + +- FeatStructs display reentrance in their string representations; + Python dictionaries and lists do not. + +- FeatStructs may *not* be mixed with Python dictionaries and lists + (e.g., when performing unification). + +- FeatStructs provide a number of useful methods, such as `walk()` + and `cyclic()`, which are not available for Python dicts & lists. + +In general, if your feature structures will contain any reentrances, +or if you plan to use them as dictionary keys, it is strongly +recommended that you use full-fledged `FeatStruct` objects. + +Custom Feature Values +--------------------- +The abstract base class `CustomFeatureValue` can be used to define new +base value types that have custom unification methods. 
For example, +the following feature value type encodes a range, and defines +unification as taking the intersection on the ranges: + + >>> from functools import total_ordering + >>> from nltk.featstruct import CustomFeatureValue, UnificationFailure + >>> @total_ordering + ... class Range(CustomFeatureValue): + ... def __init__(self, low, high): + ... assert low <= high + ... self.low = low + ... self.high = high + ... def unify(self, other): + ... if not isinstance(other, Range): + ... return UnificationFailure + ... low = max(self.low, other.low) + ... high = min(self.high, other.high) + ... if low <= high: return Range(low, high) + ... else: return UnificationFailure + ... def __repr__(self): + ... return '(%s<x<%s)' % (self.low, self.high) + ... def __eq__(self, other): + ... if not isinstance(other, Range): + ... return False + ... return (self.low == other.low) and (self.high == other.high) + ... def __lt__(self, other): + ... if not isinstance(other, Range): + ... return True + ... return (self.low, self.high) < (other.low, other.high) + + >>> fs1 = FeatStruct(x=Range(5,8), y=FeatStruct(z=Range(7,22))) + >>> print(fs1.unify(FeatStruct(x=Range(6, 22)))) + [ x = (6<x<8) ] + [ ] + [ y = [ z = (7<x<22) ] ] + >>> print(fs1.unify(FeatStruct(x=Range(9, 12)))) + None + >>> print(fs1.unify(FeatStruct(x=12))) + None + >>> print(fs1.unify(FeatStruct('[x=?x, y=[z=?x]]'))) + [ x = (7<x<8) ] + [ ] + [ y = [ z = (7<x<8) ] ] + +Regression tests +~~~~~~~~~~~~~~~~ + +Dictionary access methods (non-mutating) +---------------------------------------- + >>> fs1 = FeatStruct(a=1, b=2, c=3) + >>> fs2 = FeatStruct(x=fs1, y='x') + +Feature structures support all dictionary methods (excluding the class +method `dict.fromkeys()`). Non-mutating methods: + + >>> sorted(fs2.keys()) # keys() + ['x', 'y'] + >>> sorted(fs2.values()) # values() + [[a=1, b=2, c=3], 'x'] + >>> sorted(fs2.items()) # items() + [('x', [a=1, b=2, c=3]), ('y', 'x')] + >>> sorted(fs2) # __iter__() + ['x', 'y'] + >>> 'a' in fs2, 'x' in fs2 # __contains__() + (False, True) + >>> fs2.has_key('a'), fs2.has_key('x') # has_key() + (False, True) + >>> fs2['x'], fs2['y'] # __getitem__() + ([a=1, b=2, c=3], 'x') + >>> fs2['a'] # __getitem__() + Traceback (most recent call last): + . . .
+ KeyError: 'a' + >>> fs2.get('x'), fs2.get('y'), fs2.get('a') # get() + ([a=1, b=2, c=3], 'x', None) + >>> fs2.get('x', 'hello'), fs2.get('a', 'hello') # get() + ([a=1, b=2, c=3], 'hello') + >>> len(fs1), len(fs2) # __len__ + (3, 2) + >>> fs2.copy() # copy() + [x=[a=1, b=2, c=3], y='x'] + >>> fs2.copy() is fs2 # copy() + False + +Note: by default, `FeatStruct.copy()` does a deep copy. Use +`FeatStruct.copy(deep=False)` for a shallow copy. + +.. + >>> del fs1, fs2 # clean-up. + +Dictionary access methods (mutating) +------------------------------------ + >>> fs1 = FeatStruct(a=1, b=2, c=3) + >>> fs2 = FeatStruct(x=fs1, y='x') + +Setting features (`__setitem__()`) + + >>> fs1['c'] = 5 + >>> fs1 + [a=1, b=2, c=5] + >>> fs1['x'] = 12 + >>> fs1 + [a=1, b=2, c=5, x=12] + >>> fs2['x', 'a'] = 2 + >>> fs2 + [x=[a=2, b=2, c=5, x=12], y='x'] + >>> fs1 + [a=2, b=2, c=5, x=12] + +Deleting features (`__delitem__()`) + + >>> del fs1['x'] + >>> fs1 + [a=2, b=2, c=5] + >>> del fs2['x', 'a'] + >>> fs1 + [b=2, c=5] + +`setdefault()`: + + >>> fs1.setdefault('b', 99) + 2 + >>> fs1 + [b=2, c=5] + >>> fs1.setdefault('x', 99) + 99 + >>> fs1 + [b=2, c=5, x=99] + +`update()`: + + >>> fs2.update({'a':'A', 'b':'B'}, c='C') + >>> fs2 + [a='A', b='B', c='C', x=[b=2, c=5, x=99], y='x'] + +`pop()`: + + >>> fs2.pop('a') + 'A' + >>> fs2 + [b='B', c='C', x=[b=2, c=5, x=99], y='x'] + >>> fs2.pop('a') + Traceback (most recent call last): + . . . + KeyError: 'a' + >>> fs2.pop('a', 'foo') + 'foo' + >>> fs2 + [b='B', c='C', x=[b=2, c=5, x=99], y='x'] + +`clear()`: + + >>> fs1.clear() + >>> fs1 + [] + >>> fs2 + [b='B', c='C', x=[], y='x'] + +`popitem()`: + + >>> sorted([fs2.popitem() for i in range(len(fs2))]) + [('b', 'B'), ('c', 'C'), ('x', []), ('y', 'x')] + >>> fs2 + [] + +Once a feature structure has been frozen, it may not be mutated. 
+ + >>> fs1 = FeatStruct('[x=1, y=2, z=[a=3]]') + >>> fs1.freeze() + >>> fs1.frozen() + True + >>> fs1['z'].frozen() + True + + >>> fs1['x'] = 5 + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> del fs1['x'] + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> fs1.clear() + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> fs1.pop('x') + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> fs1.popitem() + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> fs1.setdefault('x') + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + >>> fs1.update(z=22) + Traceback (most recent call last): + . . . + ValueError: Frozen FeatStructs may not be modified. + +.. + >>> del fs1, fs2 # clean-up. + +Feature Paths +------------- +Make sure that __getitem__ with feature paths works as intended: + + >>> fs1 = FeatStruct(a=1, b=2, + ... c=FeatStruct( + ... d=FeatStruct(e=12), + ... f=FeatStruct(g=55, h='hello'))) + >>> fs1[()] + [a=1, b=2, c=[d=[e=12], f=[g=55, h='hello']]] + >>> fs1['a'], fs1[('a',)] + (1, 1) + >>> fs1['c','d','e'] + 12 + >>> fs1['c','f','g'] + 55 + +Feature paths that select unknown features raise KeyError: + + >>> fs1['c', 'f', 'e'] + Traceback (most recent call last): + . . . + KeyError: ('c', 'f', 'e') + >>> fs1['q', 'p'] + Traceback (most recent call last): + . . . + KeyError: ('q', 'p') + +Feature paths that try to go 'through' a feature that's not a feature +structure raise KeyError: + + >>> fs1['a', 'b'] + Traceback (most recent call last): + . . . 
+ KeyError: ('a', 'b') + +Feature paths can go through reentrant structures: + + >>> fs2 = FeatStruct('(1)[a=[b=[c->(1), d=5], e=11]]') + >>> fs2['a', 'b', 'c', 'a', 'e'] + 11 + >>> fs2['a', 'b', 'c', 'a', 'b', 'd'] + 5 + >>> fs2[tuple('abcabcabcabcabcabcabcabcabcabca')] + (1)[b=[c=[a->(1)], d=5], e=11] + +Indexing requires strings, `Feature`\s, or tuples; other types raise a +TypeError: + + >>> fs2[12] + Traceback (most recent call last): + . . . + TypeError: Expected feature name or path. Got 12. + >>> fs2[list('abc')] + Traceback (most recent call last): + . . . + TypeError: Expected feature name or path. Got ['a', 'b', 'c']. + +Feature paths can also be used with `get()`, `has_key()`, and +`__contains__()`. + + >>> fpath1 = tuple('abcabc') + >>> fpath2 = tuple('abcabz') + >>> fs2.get(fpath1), fs2.get(fpath2) + ((1)[a=[b=[c->(1), d=5], e=11]], None) + >>> fpath1 in fs2, fpath2 in fs2 + (True, False) + >>> fs2.has_key(fpath1), fs2.has_key(fpath2) + (True, False) + +.. + >>> del fs1, fs2 # clean-up + +Reading Feature Structures +-------------------------- + +Empty feature struct: + + >>> FeatStruct('[]') + [] + +Test features with integer values: + + >>> FeatStruct('[a=12, b=-33, c=0]') + [a=12, b=-33, c=0] + +Test features with string values. Either single or double quotes may +be used. Strings are evaluated just like python strings -- in +particular, you can use escape sequences and 'u' and 'r' prefixes, and +triple-quoted strings. + + >>> FeatStruct('[a="", b="hello", c="\'", d=\'\', e=\'"\']') + [a='', b='hello', c="'", d='', e='"'] + >>> FeatStruct(r'[a="\\", b="\"", c="\x6f\\y", d="12"]') + [a='\\', b='"', c='o\\y', d='12'] + >>> FeatStruct(r'[b=r"a\b\c"]') + [b='a\\b\\c'] + >>> FeatStruct('[x="""a"""]') + [x='a'] + +Test parsing of reentrant feature structures. + + >>> FeatStruct('[a=(1)[], b->(1)]') + [a=(1)[], b->(1)] + >>> FeatStruct('[a=(1)[x=1, y=2], b->(1)]') + [a=(1)[x=1, y=2], b->(1)] + +Test parsing of cyclic feature structures. 
+ + >>> FeatStruct('[a=(1)[b->(1)]]') + [a=(1)[b->(1)]] + >>> FeatStruct('(1)[a=[b=[c->(1)]]]') + (1)[a=[b=[c->(1)]]] + +Strings of the form "+name" and "-name" may be used to specify boolean +values. + + >>> FeatStruct('[-bar, +baz, +foo]') + [-bar, +baz, +foo] + +None, True, and False are recognized as values: + + >>> FeatStruct('[bar=True, baz=False, foo=None]') + [+bar, -baz, foo=None] + +Special features: + + >>> FeatStruct('NP/VP') + NP[]/VP[] + >>> FeatStruct('?x/?x') + ?x[]/?x[] + >>> print(FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) + [ *type* = 'VP' ] + [ ] + [ [ *type* = 'NP' ] ] + [ *slash* = [ agr = ?x ] ] + [ [ pl = True ] ] + [ ] + [ agr = ?x ] + [ fin = True ] + [ tense = 'past' ] + +Here the slash feature gets coerced: + + >>> FeatStruct('[*slash*=a, x=b, *type*="NP"]') + NP[x='b']/a[] + + >>> FeatStruct('NP[sem=]/NP') + NP[sem=]/NP[] + >>> FeatStruct('S[sem=]') + S[sem=] + >>> print(FeatStruct('NP[sem=]/NP')) + [ *type* = 'NP' ] + [ ] + [ *slash* = [ *type* = 'NP' ] ] + [ ] + [ sem = ] + +Playing with ranges: + + >>> from nltk.featstruct import RangeFeature, FeatStructReader + >>> width = RangeFeature('width') + >>> reader = FeatStructReader([width]) + >>> fs1 = reader.fromstring('[*width*=-5:12]') + >>> fs2 = reader.fromstring('[*width*=2:123]') + >>> fs3 = reader.fromstring('[*width*=-7:-2]') + >>> fs1.unify(fs2) + [*width*=(2, 12)] + >>> fs1.unify(fs3) + [*width*=(-5, -2)] + >>> print(fs2.unify(fs3)) # no overlap in width. + None + +The slash feature has a default value of 'False': + + >>> print(FeatStruct('NP[]/VP').unify(FeatStruct('NP[]'), trace=1)) + + Unification trace: + / NP[]/VP[] + |\ NP[] + | + | Unify feature: *type* + | / 'NP' + | |\ 'NP' + | | + | +-->'NP' + | + | Unify feature: *slash* + | / VP[] + | |\ False + | | + X X <-- FAIL + None + +The demo structures from category.py. They all parse, but they don't +do quite the right thing, -- ?x vs x. 
+ + >>> FeatStruct(pos='n', agr=FeatStruct(number='pl', gender='f')) + [agr=[gender='f', number='pl'], pos='n'] + >>> FeatStruct(r'NP[sem=]/NP') + NP[sem=]/NP[] + >>> FeatStruct(r'S[sem=]') + S[sem=] + >>> FeatStruct('?x/?x') + ?x[]/?x[] + >>> FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') + VP[agr=?x, +fin, tense='past']/NP[agr=?x, +pl] + >>> FeatStruct('S[sem = ]') + S[sem=] + + >>> FeatStruct('S') + S[] + +The parser also includes support for reading sets and tuples. + + >>> FeatStruct('[x={1,2,2,2}, y={/}]') + [x={1, 2}, y={/}] + >>> FeatStruct('[x=(1,2,2,2), y=()]') + [x=(1, 2, 2, 2), y=()] + >>> print(FeatStruct('[x=(1,[z=(1,2,?x)],?z,{/})]')) + [ x = (1, [ z = (1, 2, ?x) ], ?z, {/}) ] + +Note that we can't put a featstruct inside a set, because doing so +would hash it, and it's not frozen yet: + + >>> print(FeatStruct('[x={[]}]')) + Traceback (most recent call last): + . . . + TypeError: FeatStructs must be frozen before they can be hashed. + +There's a special syntax for taking the union of sets: "{...+...}". +The elements should only be variables or sets. + + >>> FeatStruct('[x={?a+?b+{1,2,3}}]') + [x={?a+?b+{1, 2, 3}}] + +There's a special syntax for taking the concatenation of tuples: +"(...+...)". The elements should only be variables or tuples. + + >>> FeatStruct('[x=(?a+?b+(1,2,3))]') + [x=(?a+?b+(1, 2, 3))] + +Parsing gives helpful messages if your string contains an error. + + >>> FeatStruct('[a=, b=5]]') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + [a=, b=5]] + ^ Expected value + >>> FeatStruct('[a=12 22, b=33]') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + [a=12 22, b=33] + ^ Expected comma + >>> FeatStruct('[a=5] [b=6]') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + [a=5] [b=6] + ^ Expected end of string + >>> FeatStruct(' *++*') + Traceback (most recent call last): + . . .
+ ValueError: Error parsing feature structure + *++* + ^ Expected open bracket or identifier + >>> FeatStruct('[x->(1)]') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + [x->(1)] + ^ Expected bound identifier + >>> FeatStruct('[x->y]') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + [x->y] + ^ Expected identifier + >>> FeatStruct('') + Traceback (most recent call last): + . . . + ValueError: Error parsing feature structure + + ^ Expected open bracket or identifier + + +Unification +----------- +Very simple unifications give the expected results: + + >>> FeatStruct().unify(FeatStruct()) + [] + >>> FeatStruct(number='singular').unify(FeatStruct()) + [number='singular'] + >>> FeatStruct().unify(FeatStruct(number='singular')) + [number='singular'] + >>> FeatStruct(number='singular').unify(FeatStruct(person=3)) + [number='singular', person=3] + +Merging nested structures: + + >>> fs1 = FeatStruct('[A=[B=b]]') + >>> fs2 = FeatStruct('[A=[C=c]]') + >>> fs1.unify(fs2) + [A=[B='b', C='c']] + >>> fs2.unify(fs1) + [A=[B='b', C='c']] + +A basic case of reentrant unification + + >>> fs4 = FeatStruct('[A=(1)[B=b], E=[F->(1)]]') + >>> fs5 = FeatStruct("[A=[C='c'], E=[F=[D='d']]]") + >>> fs4.unify(fs5) + [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] + >>> fs5.unify(fs4) + [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] + +More than 2 paths to a value + + >>> fs1 = FeatStruct("[a=[],b=[],c=[],d=[]]") + >>> fs2 = FeatStruct('[a=(1)[], b->(1), c->(1), d->(1)]') + >>> fs1.unify(fs2) + [a=(1)[], b->(1), c->(1), d->(1)] + +fs1[a] gets unified with itself + + >>> fs1 = FeatStruct('[x=(1)[], y->(1)]') + >>> fs2 = FeatStruct('[x=(1)[], y->(1)]') + >>> fs1.unify(fs2) + [x=(1)[], y->(1)] + +Bound variables should get forwarded appropriately + + >>> fs1 = FeatStruct('[A=(1)[X=x], B->(1), C=?cvar, D=?dvar]') + >>> fs2 = FeatStruct('[A=(1)[Y=y], B=(2)[Z=z], C->(1), D->(2)]') + >>> fs1.unify(fs2) + [A=(1)[X='x', Y='y', 
Z='z'], B->(1), C->(1), D->(1)] + >>> fs2.unify(fs1) + [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] + +Cyclic structure created by unification. + + >>> fs1 = FeatStruct('[F=(1)[], G->(1)]') + >>> fs2 = FeatStruct('[F=[H=(2)[]], G->(2)]') + >>> fs3 = fs1.unify(fs2) + >>> fs3 + [F=(1)[H->(1)], G->(1)] + >>> fs3['F'] is fs3['G'] + True + >>> fs3['F'] is fs3['G']['H'] + True + >>> fs3['F'] is fs3['G']['H']['H'] + True + >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] + True + +Cyclic structure created w/ variables. + + >>> fs1 = FeatStruct('[F=[H=?x]]') + >>> fs2 = FeatStruct('[F=?x]') + >>> fs3 = fs1.unify(fs2, rename_vars=False) + >>> fs3 + [F=(1)[H->(1)]] + >>> fs3['F'] is fs3['F']['H'] + True + >>> fs3['F'] is fs3['F']['H']['H'] + True + >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] + True + +Unifying w/ a cyclic feature structure. + + >>> fs4 = FeatStruct('[F=[H=[H=[H=(1)[]]]], K->(1)]') + >>> fs3.unify(fs4) + [F=(1)[H->(1)], K->(1)] + >>> fs4.unify(fs3) + [F=(1)[H->(1)], K->(1)] + +Variable bindings should preserve reentrance. + + >>> bindings = {} + >>> fs1 = FeatStruct("[a=?x]") + >>> fs2 = fs1.unify(FeatStruct("[a=[]]"), bindings) + >>> fs2['a'] is bindings[Variable('?x')] + True + >>> fs2.unify(FeatStruct("[b=?x]"), bindings) + [a=(1)[], b->(1)] + +Aliased variable tests + + >>> fs1 = FeatStruct("[a=?x, b=?x]") + >>> fs2 = FeatStruct("[b=?y, c=?y]") + >>> bindings = {} + >>> fs3 = fs1.unify(fs2, bindings) + >>> fs3 + [a=?x, b=?x, c=?x] + >>> bindings + {Variable('?y'): Variable('?x')} + >>> fs3.unify(FeatStruct("[a=1]")) + [a=1, b=1, c=1] + +If we keep track of the bindings, then we can use the same variable +over multiple calls to unify. + + >>> bindings = {} + >>> fs1 = FeatStruct('[a=?x]') + >>> fs2 = fs1.unify(FeatStruct('[a=[]]'), bindings) + >>> fs2.unify(FeatStruct('[b=?x]'), bindings) + [a=(1)[], b->(1)] + >>> bindings + {Variable('?x'): []} + +.. 
+ >>> del fs1, fs2, fs3, fs4, fs5 # clean-up + +Unification Bindings +-------------------- + + >>> bindings = {} + >>> fs1 = FeatStruct('[a=?x]') + >>> fs2 = FeatStruct('[a=12]') + >>> fs3 = FeatStruct('[b=?x]') + >>> fs1.unify(fs2, bindings) + [a=12] + >>> bindings + {Variable('?x'): 12} + >>> fs3.substitute_bindings(bindings) + [b=12] + >>> fs3 # substitute_bindings didn't mutate fs3. + [b=?x] + >>> fs2.unify(fs3, bindings) + [a=12, b=12] + + >>> bindings = {} + >>> fs1 = FeatStruct('[a=?x, b=1]') + >>> fs2 = FeatStruct('[a=5, b=?x]') + >>> fs1.unify(fs2, bindings) + [a=5, b=1] + >>> sorted(bindings.items()) + [(Variable('?x'), 5), (Variable('?x2'), 1)] + +.. + >>> del fs1, fs2, fs3 # clean-up + +Expressions +----------- + + >>> e = Expression.fromstring('\\P y.P(z,y)') + >>> fs1 = FeatStruct(x=e, y=Variable('z')) + >>> fs2 = FeatStruct(y=VariableExpression(Variable('John'))) + >>> fs1.unify(fs2) + [x=<\P y.P(John,y)>, y=] + +Remove Variables +---------------- + + >>> FeatStruct('[a=?x, b=12, c=[d=?y]]').remove_variables() + [b=12, c=[]] + >>> FeatStruct('(1)[a=[b=?x,c->(1)]]').remove_variables() + (1)[a=[c->(1)]] + +Equality & Hashing +------------------ +The `equal_values` method checks whether two feature structures assign +the same value to every feature. If the optional argument +``check_reentrances`` is supplied, then it also returns false if there +is any difference in the reentrances. + + >>> a = FeatStruct('(1)[x->(1)]') + >>> b = FeatStruct('(1)[x->(1)]') + >>> c = FeatStruct('(1)[x=[x->(1)]]') + >>> d = FeatStruct('[x=(1)[x->(1)]]') + >>> e = FeatStruct('(1)[x=[x->(1), y=1], y=1]') + >>> def compare(x,y): + ... assert x.equal_values(y, True) == y.equal_values(x, True) + ... assert x.equal_values(y, False) == y.equal_values(x, False) + ... if x.equal_values(y, True): + ... assert x.equal_values(y, False) + ... print('equal values, same reentrance') + ... elif x.equal_values(y, False): + ... print('equal values, different reentrance') + ... else: + ... 
print('different values') + + >>> compare(a, a) + equal values, same reentrance + >>> compare(a, b) + equal values, same reentrance + >>> compare(a, c) + equal values, different reentrance + >>> compare(a, d) + equal values, different reentrance + >>> compare(c, d) + equal values, different reentrance + >>> compare(a, e) + different values + >>> compare(c, e) + different values + >>> compare(d, e) + different values + >>> compare(e, e) + equal values, same reentrance + +Feature structures may not be hashed until they are frozen: + + >>> hash(a) + Traceback (most recent call last): + . . . + TypeError: FeatStructs must be frozen before they can be hashed. + >>> a.freeze() + >>> v = hash(a) + +Feature structures define hash consistently. The following example +looks at the hash value for each (fs1,fs2) pair; if their hash values +are not equal, then they must not be equal. If their hash values are +equal, then display a message, and indicate whether their values are +indeed equal. Note that c and d currently have the same hash value, +even though they are not equal. That is not a bug, strictly speaking, +but it wouldn't be a bad thing if it changed. + + >>> for fstruct in (a, b, c, d, e): + ... fstruct.freeze() + >>> for fs1_name in 'abcde': + ... for fs2_name in 'abcde': + ... fs1 = locals()[fs1_name] + ... fs2 = locals()[fs2_name] + ... if hash(fs1) != hash(fs2): + ... assert fs1 != fs2 + ... else: + ... print('%s and %s have the same hash value,' % + ... (fs1_name, fs2_name)) + ... if fs1 == fs2: print('and are equal') + ... 
else: print('and are not equal') + a and a have the same hash value, and are equal + a and b have the same hash value, and are equal + b and a have the same hash value, and are equal + b and b have the same hash value, and are equal + c and c have the same hash value, and are equal + c and d have the same hash value, and are not equal + d and c have the same hash value, and are not equal + d and d have the same hash value, and are equal + e and e have the same hash value, and are equal + +.. + >>> del a, b, c, d, e, v # clean-up + +Tracing +------- + + >>> fs1 = FeatStruct('[a=[b=(1)[], c=?x], d->(1), e=[f=?x]]') + >>> fs2 = FeatStruct('[a=(1)[c="C"], e=[g->(1)]]') + >>> fs1.unify(fs2, trace=True) + + Unification trace: + / [a=[b=(1)[], c=?x], d->(1), e=[f=?x]] + |\ [a=(1)[c='C'], e=[g->(1)]] + | + | Unify feature: a + | / [b=[], c=?x] + | |\ [c='C'] + | | + | | Unify feature: a.c + | | / ?x + | | |\ 'C' + | | | + | | +-->Variable('?x') + | | + | +-->[b=[], c=?x] + | Bindings: {?x: 'C'} + | + | Unify feature: e + | / [f=?x] + | |\ [g=[c='C']] + | | + | +-->[f=?x, g=[b=[], c=?x]] + | Bindings: {?x: 'C'} + | + +-->[a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] + Bindings: {?x: 'C'} + [a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] + >>> + >>> fs1 = FeatStruct('[a=?x, b=?z, c=?z]') + >>> fs2 = FeatStruct('[a=?y, b=?y, c=?q]') + >>> #fs1.unify(fs2, trace=True) + >>> + +.. + >>> del fs1, fs2 # clean-up + +Unification on Dicts & Lists +---------------------------- +It's possible to do unification on dictionaries: + + >>> from nltk.featstruct import unify + >>> pprint(unify(dict(x=1, y=dict(z=2)), dict(x=1, q=5)), width=1) + {'q': 5, 'x': 1, 'y': {'z': 2}} + +It's possible to do unification on lists as well: + + >>> unify([1, 2, 3], [1, Variable('x'), 3]) + [1, 2, 3] + +Mixing dicts and lists is fine: + + >>> pprint(unify([dict(x=1, y=dict(z=2)),3], [dict(x=1, q=5),3]), + ... 
width=1) + [{'q': 5, 'x': 1, 'y': {'z': 2}}, 3] + +Mixing dicts and FeatStructs is discouraged: + + >>> unify(dict(x=1), FeatStruct(x=1)) + Traceback (most recent call last): + . . . + ValueError: Mixing FeatStruct objects with Python dicts and lists is not supported. + +But you can do it if you really want, by explicitly stating that both +dictionaries and FeatStructs should be treated as feature structures: + + >>> unify(dict(x=1), FeatStruct(x=1), fs_class=(dict, FeatStruct)) + {'x': 1} + +Finding Conflicts +----------------- + + >>> from nltk.featstruct import conflicts + >>> fs1 = FeatStruct('[a=[b=(1)[c=2], d->(1), e=[f->(1)]]]') + >>> fs2 = FeatStruct('[a=[b=[c=[x=5]], d=[c=2], e=[f=[c=3]]]]') + >>> for path in conflicts(fs1, fs2): + ... print('%-8s: %r vs %r' % ('.'.join(path), fs1[path], fs2[path])) + a.b.c : 2 vs [x=5] + a.e.f.c : 2 vs 3 + +.. + >>> del fs1, fs2 # clean-up + +Retracting Bindings +------------------- + + >>> from nltk.featstruct import retract_bindings + >>> bindings = {} + >>> fs1 = FeatStruct('[a=?x, b=[c=?y]]') + >>> fs2 = FeatStruct('[a=(1)[c=[d=1]], b->(1)]') + >>> fs3 = fs1.unify(fs2, bindings) + >>> print(fs3) + [ a = (1) [ c = [ d = 1 ] ] ] + [ ] + [ b -> (1) ] + >>> pprint(bindings) + {Variable('?x'): [c=[d=1]], Variable('?y'): [d=1]} + >>> retract_bindings(fs3, bindings) + [a=?x, b=?x] + >>> pprint(bindings) + {Variable('?x'): [c=?y], Variable('?y'): [d=1]} + +Squashed Bugs +~~~~~~~~~~~~~ +In svn rev 5167, unifying two feature structures that used the same +variable would cause those variables to become aliased in the output. + + >>> fs1 = FeatStruct('[a=?x]') + >>> fs2 = FeatStruct('[b=?x]') + >>> fs1.unify(fs2) + [a=?x, b=?x2] + +There was a bug in svn revision 5172 that caused `rename_variables` to +rename variables to names that are already used. + + >>> FeatStruct('[a=?x, b=?x2]').rename_variables( + ... 
vars=[Variable('?x')]) + [a=?x3, b=?x2] + >>> fs1 = FeatStruct('[a=?x]') + >>> fs2 = FeatStruct('[a=?x, b=?x2]') + >>> fs1.unify(fs2) + [a=?x, b=?x2] + +There was a bug in svn rev 5167 that caused us to get the following +example wrong. Basically the problem was that we only followed +'forward' pointers for other, not self, when unifying two feature +structures. (nb: this test assumes that features are unified in +alphabetical order -- if they are not, it might pass even if the bug +is present.) + + >>> fs1 = FeatStruct('[a=[x=1], b=?x, c=?x]') + >>> fs2 = FeatStruct('[a=(1)[], b->(1), c=[x=2]]') + >>> print(fs1.unify(fs2)) + None + +.. + >>> del fs1, fs2 # clean-up diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/framenet.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/framenet.doctest new file mode 100644 index 0000000000000000000000000000000000000000..e4ca41dd019bf53c72393233154c576e5003dda3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/framenet.doctest @@ -0,0 +1,288 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +======== +FrameNet +======== + +The FrameNet corpus is a lexical database of English that is both human- +and machine-readable, based on annotating examples of how words are used +in actual texts. FrameNet is based on a theory of meaning called Frame +Semantics, deriving from the work of Charles J. Fillmore and colleagues. +The basic idea is straightforward: that the meanings of most words can +best be understood on the basis of a semantic frame: a description of a +type of event, relation, or entity and the participants in it. For +example, the concept of cooking typically involves a person doing the +cooking (Cook), the food that is to be cooked (Food), something to hold +the food while cooking (Container) and a source of heat +(Heating_instrument). 
In the FrameNet project, this is represented as a +frame called Apply_heat, and the Cook, Food, Heating_instrument and +Container are called frame elements (FEs). Words that evoke this frame, +such as fry, bake, boil, and broil, are called lexical units (LUs) of +the Apply_heat frame. The job of FrameNet is to define the frames +and to annotate sentences to show how the FEs fit syntactically around +the word that evokes the frame. + +------ +Frames +------ + +A Frame is a script-like conceptual structure that describes a +particular type of situation, object, or event along with the +participants and props that are needed for that Frame. For +example, the "Apply_heat" frame describes a common situation +involving a Cook, some Food, and a Heating_instrument, and is +evoked by words such as bake, blanch, boil, broil, brown, +simmer, steam, etc. + +We call the roles of a Frame "frame elements" (FEs) and the +frame-evoking words are called "lexical units" (LUs). + +FrameNet includes relations between Frames. Several types of +relations are defined, of which the most important are: + +- Inheritance: An IS-A relation. The child frame is a subtype + of the parent frame, and each FE in the parent is bound to + a corresponding FE in the child. An example is the + "Revenge" frame which inherits from the + "Rewards_and_punishments" frame. + +- Using: The child frame presupposes the parent frame as + background, e.g. the "Speed" frame "uses" (or presupposes) + the "Motion" frame; however, not all parent FEs need to be + bound to child FEs. + +- Subframe: The child frame is a subevent of a complex event + represented by the parent, e.g. the "Criminal_process" frame + has subframes of "Arrest", "Arraignment", "Trial", and + "Sentencing". + +- Perspective_on: The child frame provides a particular + perspective on an un-perspectivized parent frame.
A pair of + examples consists of the "Hiring" and "Get_a_job" frames, + which perspectivize the "Employment_start" frame from the + Employer's and the Employee's point of view, respectively. + +To get a list of all of the Frames in FrameNet, you can use the +`frames()` function. If you supply a regular expression pattern to the +`frames()` function, you will get a list of all Frames whose names match +that pattern: + + >>> from pprint import pprint + >>> from operator import itemgetter + >>> from nltk.corpus import framenet as fn + >>> from nltk.corpus.reader.framenet import PrettyList + >>> x = fn.frames(r'(?i)crim') + >>> x.sort(key=itemgetter('ID')) + >>> x + [, , ...] + >>> PrettyList(sorted(x, key=itemgetter('ID'))) + [, , ...] + +To get the details of a particular Frame, you can use the `frame()` +function passing in the frame number: + + >>> from pprint import pprint + >>> from nltk.corpus import framenet as fn + >>> f = fn.frame(202) + >>> f.ID + 202 + >>> f.name + 'Arrest' + >>> f.definition + "Authorities charge a Suspect, who is under suspicion of having committed a crime..." + >>> len(f.lexUnit) + 11 + >>> pprint(sorted([x for x in f.FE])) + ['Authorities', + 'Charges', + 'Co-participant', + 'Manner', + 'Means', + 'Offense', + 'Place', + 'Purpose', + 'Source_of_legal_authority', + 'Suspect', + 'Time', + 'Type'] + >>> pprint(f.frameRelations) + [ Child=Arrest>, Component=Arrest>, ...] + +The `frame()` function shown above returns a dict object containing +detailed information about the Frame. See the documentation on the +`frame()` function for the specifics. + +You can also search for Frames by their Lexical Units (LUs). The +`frames_by_lemma()` function returns a list of all frames that contain +LUs in which the 'name' attribute of the LU matches the given regular +expression. Note that LU names are composed of "lemma.POS", where the +"lemma" part can be made up of either a single lexeme (e.g. 'run') or +multiple lexemes (e.g. 'a little') (see below). 
+ + >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) + [, ] + +------------- +Lexical Units +------------- + +A lexical unit (LU) is a pairing of a word with a meaning. For +example, the "Apply_heat" Frame describes a common situation +involving a Cook, some Food, and a Heating Instrument, and is +_evoked_ by words such as bake, blanch, boil, broil, brown, +simmer, steam, etc. These frame-evoking words are the LUs in the +Apply_heat frame. Each sense of a polysemous word is a different +LU. + +We have used the word "word" in talking about LUs. The reality +is actually rather complex. When we say that the word "bake" is +polysemous, we mean that the lemma "bake.v" (which has the +word-forms "bake", "bakes", "baked", and "baking") is linked to +three different frames: + +- Apply_heat: "Michelle baked the potatoes for 45 minutes." + +- Cooking_creation: "Michelle baked her mother a cake for her birthday." + +- Absorb_heat: "The potatoes have to bake for more than 30 minutes." + +These constitute three different LUs, with different +definitions. + +Multiword expressions such as "given name" and hyphenated words +like "shut-eye" can also be LUs. Idiomatic phrases such as +"middle of nowhere" and "give the slip (to)" are also defined as +LUs in the appropriate frames ("Isolated_places" and "Evading", +respectively), and their internal structure is not analyzed. + +Framenet provides multiple annotated examples of each sense of a +word (i.e. each LU). Moreover, the set of examples +(approximately 20 per LU) illustrates all of the combinatorial +possibilities of the lexical unit. + +Each LU is linked to a Frame, and hence to the other words which +evoke that Frame. This makes the FrameNet database similar to a +thesaurus, grouping together semantically similar words. + +In the simplest case, frame-evoking words are verbs such as +"fried" in: + + "Matilde fried the catfish in a heavy iron skillet." + +Sometimes event nouns may evoke a Frame. 
For example, +"reduction" evokes "Cause_change_of_scalar_position" in: + + "...the reduction of debt levels to $665 million from $2.6 billion." + +Adjectives may also evoke a Frame. For example, "asleep" may +evoke the "Sleep" frame as in: + + "They were asleep for hours." + +Many common nouns, such as artifacts like "hat" or "tower", +typically serve as dependents rather than clearly evoking their +own frames. + +Details for a specific lexical unit can be obtained using this class's +`lus()` function, which takes an optional regular expression +pattern that will be matched against the name of the lexical unit: + + >>> from pprint import pprint + >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID'))) + [, , ...] + +You can obtain detailed information on a particular LU by calling the +`lu()` function and passing in an LU's 'ID' number: + + >>> from pprint import pprint + >>> from nltk.corpus import framenet as fn + >>> fn.lu(256).name + 'foresee.v' + >>> fn.lu(256).definition + 'COD: be aware of beforehand; predict.' + >>> fn.lu(256).frame.name + 'Expectation' + >>> fn.lu(256).lexemes[0].name + 'foresee' + +Note that LU names take the form of a dotted string (e.g. "run.v" or "a +little.adv") in which a lemma precedes the "." and a part of speech +(POS) follows the dot. The lemma may be composed of a single lexeme +(e.g. "run") or of multiple lexemes (e.g. "a little"). The list of +POSs used in the LUs is: + +v - verb +n - noun +a - adjective +adv - adverb +prep - preposition +num - numbers +intj - interjection +art - article +c - conjunction +scon - subordinating conjunction + +For more detailed information about the info that is contained in the +dict that is returned by the `lu()` function, see the documentation on +the `lu()` function. + +------------------- +Annotated Documents +------------------- + +The FrameNet corpus contains a small set of annotated documents. 
A list +of these documents can be obtained by calling the `docs()` function: + + >>> from pprint import pprint + >>> from nltk.corpus import framenet as fn + >>> d = fn.docs('BellRinging')[0] + >>> d.corpname + 'PropBank' + >>> d.sentence[49] + full-text sentence (...) in BellRinging: + + + [POS] 17 tags + + [POS_tagset] PENN + + [text] + [annotationSet] + + `` I live in hopes that the ringers themselves will be drawn into + ***** ******* ***** + Desir Cause_t Cause + [1] [3] [2] + + that fuller life . + ****** + Comple + [4] + (Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness) + + + >>> d.sentence[49].annotationSet[1] + annotation set (...): + + [status] MANUAL + + [LU] (6605) hope.n in Desiring + + [frame] (366) Desiring + + [GF] 2 relations + + [PT] 2 phrases + + [text] + [Target] + [FE] + [Noun] + + `` I live in hopes that the ringers themselves will be drawn into + - ^^^^ ^^ ***** ---------------------------------------------- + E supp su Event + + that fuller life . + ----------------- + + (E=Experiencer, su=supp) + + diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/generate.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/generate.doctest new file mode 100644 index 0000000000000000000000000000000000000000..423b88aa70b974e5c94d6ad7a342a8d62a814662 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/generate.doctest @@ -0,0 +1,78 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT + +=============================================== +Generating sentences from context-free grammars +=============================================== + +An example grammar: + + >>> from nltk.parse.generate import generate, demo_grammar + >>> from nltk import CFG + >>> grammar = CFG.fromstring(demo_grammar) + >>> print(grammar) + Grammar with 13 productions (start state = S) + S -> NP VP + NP -> Det N + PP -> P NP + VP -> 'slept' + VP -> 'saw' NP + VP -> 'walked' PP + Det -> 'the' + Det -> 'a' + N -> 'man' + N -> 'park' + N -> 'dog' + P -> 'in' + P -> 'with' + +The first 10 generated sentences: + + >>> for sentence in generate(grammar, n=10): + ... print(' '.join(sentence)) + the man slept + the man saw the man + the man saw the park + the man saw the dog + the man saw a man + the man saw a park + the man saw a dog + the man walked in the man + the man walked in the park + the man walked in the dog + +All sentences of max depth 4: + + >>> for sentence in generate(grammar, depth=4): + ... print(' '.join(sentence)) + the man slept + the park slept + the dog slept + a man slept + a park slept + a dog slept + +The number of sentences of different max depths: + + >>> len(list(generate(grammar, depth=3))) + 0 + >>> len(list(generate(grammar, depth=4))) + 6 + >>> len(list(generate(grammar, depth=5))) + 42 + >>> len(list(generate(grammar, depth=6))) + 114 + >>> len(list(generate(grammar))) + 114 + +Infinite grammars will raise a RuntimeError when not bounded by some ``depth``: + + >>> grammar = CFG.fromstring(""" + ... S -> A B + ... A -> B + ... B -> "b" | A + ... """) + >>> list(generate(grammar)) + Traceback (most recent call last): + ... + RuntimeError: The grammar has rule(s) that yield infinite recursion!
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt_fixt.py new file mode 100644 index 0000000000000000000000000000000000000000..ad278231a9c9798936f9c8236dc8c16ed4437a28 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt_fixt.py @@ -0,0 +1,9 @@ +def setup_module(): + import pytest + + from nltk.parse.malt import MaltParser + + try: + depparser = MaltParser() + except (AssertionError, LookupError) as e: + pytest.skip("MaltParser is not available") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammar.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammar.doctest new file mode 100644 index 0000000000000000000000000000000000000000..aa30dcab1fd7257329ce8db0f088914edb47f95c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammar.doctest @@ -0,0 +1,69 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=============== +Grammar Parsing +=============== + +Grammars can be parsed from strings: + + >>> from nltk import CFG + >>> grammar = CFG.fromstring(""" + ... S -> NP VP + ... PP -> P NP + ... NP -> Det N | NP PP + ... VP -> V NP | VP PP + ... Det -> 'a' | 'the' + ... N -> 'dog' | 'cat' + ... V -> 'chased' | 'sat' + ... P -> 'on' | 'in' + ... """) + >>> grammar + + >>> grammar.start() + S + >>> grammar.productions() + [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP, + Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat', + P -> 'on', P -> 'in'] + +Probabilistic CFGs: + + >>> from nltk import PCFG + >>> toy_pcfg1 = PCFG.fromstring(""" + ... S -> NP VP [1.0] + ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] + ... Det -> 'the' [0.8] | 'my' [0.2] + ... N -> 'man' [0.5] | 'telescope' [0.5] + ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] + ... V -> 'ate' [0.35] | 'saw' [0.65] + ... PP -> P NP [1.0] + ... P -> 'with' [0.61] | 'under' [0.39] + ... 
""") + +Chomsky Normal Form grammar (Test for bug 474) + + >>> g = CFG.fromstring("VP^ -> VBP NP^") + >>> g.productions()[0].lhs() + VP^ + +Grammars can contain both empty strings and empty productions: + + >>> from nltk.grammar import CFG + >>> from nltk.parse.generate import generate + >>> grammar = CFG.fromstring(""" + ... S -> A B + ... A -> 'a' + ... # An empty string: + ... B -> 'b' | '' + ... """) + >>> list(generate(grammar)) + [['a', 'b'], ['a', '']] + >>> grammar = CFG.fromstring(""" + ... S -> A B + ... A -> 'a' + ... # An empty production: + ... B -> 'b' | + ... """) + >>> list(generate(grammar)) + [['a', 'b'], ['a']] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammartestsuites.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammartestsuites.doctest new file mode 100644 index 0000000000000000000000000000000000000000..ff5943c3ccad8aa73930b8c9da81080a5b2be348 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/grammartestsuites.doctest @@ -0,0 +1,109 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================== + Test Suites for Grammars +========================== + +Sentences in the test suite are divided into two classes: + +- grammatical (*accept*) and +- ungrammatical (*reject*). + +If a sentence should parse according to the grammar, the value of +``trees`` will be a non-empty list. If a sentence should be rejected +according to the grammar, then the value of ``trees`` will be ``None``. + + >>> from nltk.parse import TestGrammar + >>> germantest1 = {} + >>> germantest1['doc'] = "Tests for person agreement" + >>> germantest1['accept'] = [ + ... 'ich komme', + ... 'ich sehe mich', + ... 'du kommst', + ... 'du siehst mich', + ... 'sie kommt', + ... 'sie sieht mich', + ... 'ihr kommt', + ... 'wir kommen', + ... 'sie kommen', + ... 'du magst mich', + ... 'er mag mich', + ... 'du folgst mir', + ... 'sie hilft mir', + ... ] + >>> germantest1['reject'] = [ + ... 'ich kommt', + ... 
'ich kommst', + ... 'ich siehst mich', + ... 'du komme', + ... 'du sehe mich', + ... 'du kommt', + ... 'er komme', + ... 'er siehst mich', + ... 'wir komme', + ... 'wir kommst', + ... 'die Katzen kommst', + ... 'sie komme', + ... 'sie kommst', + ... 'du mag mich', + ... 'er magst mich', + ... 'du folgt mir', + ... 'sie hilfst mir', + ... ] + >>> germantest2 = {} + >>> germantest2['doc'] = "Tests for number agreement" + >>> germantest2['accept'] = [ + ... 'der Hund kommt', + ... 'die Hunde kommen', + ... 'ich komme', + ... 'wir kommen', + ... 'ich sehe die Katzen', + ... 'ich folge den Katzen', + ... 'ich sehe die Katzen', + ... 'ich folge den Katzen', + ... 'wir sehen die Katzen', + ... 'wir folgen den Katzen' + ... ] + >>> germantest2['reject'] = [ + ... 'ich kommen', + ... 'wir komme', + ... 'der Hunde kommt', + ... 'der Hunde kommen', + ... 'die Katzen kommt', + ... 'ich sehe der Hunde', + ... 'ich folge den Hund', + ... 'ich sehen der Hunde', + ... 'ich folgen den Hund', + ... 'wir sehe die Katzen', + ... 'wir folge den Katzen' + ... ] + >>> germantest3 = {} + >>> germantest3['doc'] = "Tests for case government and subcategorization" + >>> germantest3['accept'] = [ + ... 'der Hund sieht mich', + ... 'der Hund kommt', + ... 'ich sehe den Hund', + ... 'ich helfe dem Hund', + ... ] + >>> germantest3['reject'] = [ + ... 'ich sehe', + ... 'ich helfe', + ... 'ich komme den Hund', + ... 'ich sehe den Hund die Katzen', + ... 'du hilfst mich', + ... 'du siehst mir', + ... 'du siehst ich', + ... 'der Hunde kommt mich', + ... 'die Hunde sehe die Hunde', + ... 'der Hund sehe die Hunde', + ... 'ich hilft den Hund', + ... 'ich hilft der Hund', + ... 'ich sehe dem Hund', + ... ] + >>> germantestsuites = [germantest1, germantest2, germantest3] + >>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites) + >>> tester.run() + Tests for person agreement: All tests passed! + Tests for number agreement: All tests passed! 
+ Tests for case government and subcategorization: All tests passed! diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/inference.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/inference.doctest new file mode 100644 index 0000000000000000000000000000000000000000..662138b8100015a9f3133924524eb2754126bce8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/inference.doctest @@ -0,0 +1,536 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +==================================== +Logical Inference and Model Building +==================================== + + >>> from nltk.test.setup_fixt import check_binary + >>> check_binary('mace4') + + >>> from nltk import * + >>> from nltk.sem.drt import DrtParser + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + +------------ +Introduction +------------ + +Within the area of automated reasoning, first order theorem proving +and model building (or model generation) have both received much +attention, and have given rise to highly sophisticated techniques. We +focus therefore on providing an NLTK interface to third party tools +for these tasks. In particular, the module ``nltk.inference`` can be +used to access both theorem provers and model builders. + +--------------------------------- +NLTK Interface to Theorem Provers +--------------------------------- + +The main class used to interface with a theorem prover is the ``Prover`` +class, found in ``nltk.api``. The ``prove()`` method takes three optional +arguments: a goal, a list of assumptions, and a ``verbose`` boolean to +indicate whether the proof should be printed to the console. The proof goal +and any assumptions need to be instances of the ``Expression`` class +specified by ``nltk.sem.logic``. There are currently three theorem provers +included with NLTK: ``Prover9``, ``TableauProver``, and +``ResolutionProver``. 
The first is an off-the-shelf prover, while the other +two are written in Python and included in the ``nltk.inference`` package. + + >>> from nltk.sem import Expression + >>> read_expr = Expression.fromstring + >>> p1 = read_expr('man(socrates)') + >>> p2 = read_expr('all x.(man(x) -> mortal(x))') + >>> c = read_expr('mortal(socrates)') + >>> Prover9().prove(c, [p1,p2]) + True + >>> TableauProver().prove(c, [p1,p2]) + True + >>> ResolutionProver().prove(c, [p1,p2], verbose=True) + [1] {-mortal(socrates)} A + [2] {man(socrates)} A + [3] {-man(z2), mortal(z2)} A + [4] {-man(socrates)} (1, 3) + [5] {mortal(socrates)} (2, 3) + [6] {} (1, 5) + + True + +--------------------- +The ``ProverCommand`` +--------------------- + +A ``ProverCommand`` is a stateful holder for a theorem +prover. The command stores a theorem prover instance (of type ``Prover``), +a goal, a list of assumptions, the result of the proof, and a string version +of the entire proof. Corresponding to the three included ``Prover`` +implementations, there are three ``ProverCommand`` implementations: +``Prover9Command``, ``TableauProverCommand``, and +``ResolutionProverCommand``. + +The ``ProverCommand``'s constructor takes its goal and assumptions. The +``prove()`` command executes the ``Prover`` and ``proof()`` +returns a String form of the proof. +If the ``prove()`` method has not been called, +then the prover command will be unable to display a proof. + + >>> prover = ResolutionProverCommand(c, [p1,p2]) + >>> print(prover.proof()) + Traceback (most recent call last): + File "...", line 1212, in __run + compileflags, 1) in test.globs + File "", line 1, in + File "...", line ..., in proof + raise LookupError("You have to call prove() first to get a proof!") + LookupError: You have to call prove() first to get a proof!
+ >>> prover.prove() + True + >>> print(prover.proof()) + [1] {-mortal(socrates)} A + [2] {man(socrates)} A + [3] {-man(z4), mortal(z4)} A + [4] {-man(socrates)} (1, 3) + [5] {mortal(socrates)} (2, 3) + [6] {} (1, 5) + + +The prover command stores the result of proving so that if ``prove()`` is +called again, then the command can return the result without executing the +prover again. This allows the user to access the result of the proof without +wasting time re-computing what it already knows. + + >>> prover.prove() + True + >>> prover.prove() + True + +The assumptions and goal may be accessed using the ``assumptions()`` and +``goal()`` methods, respectively. + + >>> prover.assumptions() + [, mortal(x))>] + >>> prover.goal() + + +The assumptions list may be modified using the ``add_assumptions()`` and +``retract_assumptions()`` methods. Both methods take a list of ``Expression`` +objects. Since adding or removing assumptions may change the result of the +proof, the stored result is cleared when either of these methods are called. +That means that ``proof()`` will be unavailable until ``prove()`` is called and +a call to ``prove()`` will execute the theorem prover. + + >>> prover.retract_assumptions([read_expr('man(socrates)')]) + >>> print(prover.proof()) + Traceback (most recent call last): + File "...", line 1212, in __run + compileflags, 1) in test.globs + File "", line 1, in + File "...", line ..., in proof + raise LookupError("You have to call prove() first to get a proof!") + LookupError: You have to call prove() first to get a proof! + >>> prover.prove() + False + >>> print(prover.proof()) + [1] {-mortal(socrates)} A + [2] {-man(z6), mortal(z6)} A + [3] {-man(socrates)} (1, 2) + + >>> prover.add_assumptions([read_expr('man(socrates)')]) + >>> prover.prove() + True + +------- +Prover9 +------- + +Prover9 Installation +~~~~~~~~~~~~~~~~~~~~ + +You can download Prover9 from https://www.cs.unm.edu/~mccune/prover9/. 
+ +Extract the source code into a suitable directory and follow the +instructions in the Prover9 ``README.make`` file to compile the executables. +Install these into an appropriate location; the +``prover9_search`` variable is currently configured to look in the +following locations: + + >>> p = Prover9() + >>> p.binary_locations() + ['/usr/local/bin/prover9', + '/usr/local/bin/prover9/bin', + '/usr/local/bin', + '/usr/bin', + '/usr/local/prover9', + '/usr/local/share/prover9'] + +Alternatively, the environment variable ``PROVER9HOME`` may be configured with +the binary's location. + +The path to the correct directory can be set manually in the following +manner: + + >>> config_prover9(path='/usr/local/bin') # doctest: +SKIP + [Found prover9: /usr/local/bin/prover9] + +If the executables cannot be found, ``Prover9`` will issue a warning message: + + >>> p.prove() # doctest: +SKIP + Traceback (most recent call last): + ... + LookupError: + =========================================================================== + NLTK was unable to find the prover9 executable! Use config_prover9() or + set the PROVER9HOME environment variable. + + >> config_prover9('/path/to/prover9') + + For more information, on prover9, see: + + =========================================================================== + + +Using Prover9 +~~~~~~~~~~~~~ + +The general case in theorem proving is to determine whether ``S |- g`` +holds, where ``S`` is a possibly empty set of assumptions, and ``g`` +is a proof goal. + +As mentioned earlier, NLTK input to ``Prover9`` must be +``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is +initialized with a proof goal and, possibly, some assumptions. The +``prove()`` method attempts to find a proof of the goal, given the +list of assumptions (in this case, none). 
+ + >>> goal = read_expr('(man(x) <-> --man(x))') + >>> prover = Prover9Command(goal) + >>> prover.prove() + True + +Given a ``ProverCommand`` instance ``prover``, the method +``prover.proof()`` will return a String of the extensive proof information +provided by Prover9, shown in abbreviated form here:: + + ============================== Prover9 =============================== + Prover9 (32) version ... + Process ... was started by ... on ... + ... + The command was ".../prover9 -f ...". + ============================== end of head =========================== + + ============================== INPUT ================================= + + % Reading from file /var/... + + + formulas(goals). + (all x (man(x) -> man(x))). + end_of_list. + + ... + ============================== end of search ========================= + + THEOREM PROVED + + Exiting with 1 proof. + + Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008 + + +As mentioned earlier, we may want to list some assumptions for +the proof, as shown here. + + >>> g = read_expr('mortal(socrates)') + >>> a1 = read_expr('all x.(man(x) -> mortal(x))') + >>> prover = Prover9Command(g, assumptions=[a1]) + >>> prover.print_assumptions() + all x.(man(x) -> mortal(x)) + +However, the assumptions are not sufficient to derive the goal: + + >>> print(prover.prove()) + False + +So let's add another assumption: + + >>> a2 = read_expr('man(socrates)') + >>> prover.add_assumptions([a2]) + >>> prover.print_assumptions() + all x.(man(x) -> mortal(x)) + man(socrates) + >>> print(prover.prove()) + True + +We can also show the assumptions in ``Prover9`` format. + + >>> prover.print_assumptions(output_format='Prover9') + all x (man(x) -> mortal(x)) + man(socrates) + + >>> prover.print_assumptions(output_format='Spass') + Traceback (most recent call last): + . . . + NameError: Unrecognized value for 'output_format': Spass + +Assumptions can be retracted from the list of assumptions. 
+ + >>> prover.retract_assumptions([a1]) + >>> prover.print_assumptions() + man(socrates) + >>> prover.retract_assumptions([a1]) + +Statements can be loaded from a file and parsed. We can then add these +statements as new assumptions. + + >>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))') + >>> prover = Prover9Command(g) + >>> prover.prove() + False + >>> import nltk.data + >>> new = nltk.data.load('grammars/sample_grammars/background0.fol') + >>> for a in new: + ... print(a) + all x.(boxerdog(x) -> dog(x)) + all x.(boxer(x) -> person(x)) + all x.-(dog(x) & person(x)) + exists x.boxer(x) + exists x.boxerdog(x) + >>> prover.add_assumptions(new) + >>> print(prover.prove()) + True + >>> print(prover.proof()) + ============================== prooftrans ============================ + Prover9 (...) version ... + Process ... was started by ... on ... + ... + The command was ".../prover9". + ============================== end of head =========================== + + ============================== end of input ========================== + + ============================== PROOF ================================= + + % -------- Comments from original proof -------- + % Proof 1 at ... seconds. + % Length of proof is 13. + % Level of proof is 4. + % Maximum clause weight is 0. + % Given clauses 0. + + 1 (all x (boxerdog(x) -> dog(x))). [assumption]. + 2 (all x (boxer(x) -> person(x))). [assumption]. + 3 (all x -(dog(x) & person(x))). [assumption]. + 6 (all x (boxer(x) -> -boxerdog(x))). [goal]. + 8 -boxerdog(x) | dog(x). [clausify(1)]. + 9 boxerdog(c3). [deny(6)]. + 11 -boxer(x) | person(x). [clausify(2)]. + 12 boxer(c3). [deny(6)]. + 14 -dog(x) | -person(x). [clausify(3)]. + 15 dog(c3). [resolve(9,a,8,a)]. + 18 person(c3). [resolve(12,a,11,a)]. + 19 -person(c3). [resolve(15,a,14,a)]. + 20 $F. [resolve(19,a,18,a)]. 
+ + ============================== end of proof ========================== + +---------------------- +The equiv() method +---------------------- + +One application of the theorem prover functionality is to check if +two Expressions have the same meaning. +The ``equiv()`` method calls a theorem prover to determine whether two +Expressions are logically equivalent. + + >>> a = read_expr(r'exists x.(man(x) & walks(x))') + >>> b = read_expr(r'exists x.(walks(x) & man(x))') + >>> print(a.equiv(b)) + True + +The same method can be used on Discourse Representation Structures (DRSs). +In this case, each DRS is converted to a first order logic form, and then +passed to the theorem prover. + + >>> dp = DrtParser() + >>> a = dp.parse(r'([x],[man(x), walks(x)])') + >>> b = dp.parse(r'([x],[walks(x), man(x)])') + >>> print(a.equiv(b)) + True + + +-------------------------------- +NLTK Interface to Model Builders +-------------------------------- + +The top-level interface to model builders is parallel to that for +theorem-provers. The ``ModelBuilder`` interface is located +in ``nltk.inference.api``. It is currently only implemented by +``Mace``, which interfaces with the Mace4 model builder. + +Typically we use a model builder to show that some set of formulas has +a model, and is therefore consistent. One way of doing this is by +treating our candidate set of sentences as assumptions, and leaving +the goal unspecified. +Thus, the following interaction shows how both ``{a3, c1}`` and ``{a3, c2}`` +are consistent sets, since Mace succeeds in building a +model for each of them, while ``{c1, c2}`` is inconsistent. + + >>> a3 = read_expr('exists x.(man(x) and walks(x))') + >>> c1 = read_expr('mortal(socrates)') + >>> c2 = read_expr('-mortal(socrates)') + >>> mace = Mace() + >>> print(mace.build_model(None, [a3, c1])) + True + >>> print(mace.build_model(None, [a3, c2])) + True + +We can also use the model builder as an adjunct to a theorem prover.
+Let's suppose we are trying to prove ``S |- g``, i.e. that ``g`` +is logically entailed by assumptions ``S = {s1, s2, ..., sn}``. +We can give this same input to Mace4, and the model builder will try to +find a counterexample, that is, to show that ``g`` does *not* follow +from ``S``. So, given this input, Mace4 will try to find a model for +the set ``S' = {s1, s2, ..., sn, (not g)}``. If ``g`` fails to follow +from ``S``, then Mace4 may well return with a counterexample faster +than Prover9 concludes that it cannot find the required proof. +Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long +time unsuccessfully trying to find a counter model, and will eventually give up. + +In the following example, we see that the model builder does succeed +in building a model of the assumptions together with the negation of +the goal. That is, it succeeds in finding a model +where there is a woman that every man loves; Adam is a man; Eve is a +woman; but Adam does not love Eve. + + >>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))') + >>> a5 = read_expr('man(adam)') + >>> a6 = read_expr('woman(eve)') + >>> g = read_expr('love(adam,eve)') + >>> print(mace.build_model(g, [a4, a5, a6])) + True + +The Model Builder will fail to find a model if the assumptions do entail +the goal. Mace will continue to look for models of ever-increasing sizes +until the end_size number is reached. By default, end_size is 500, +but it can be set manually for quicker response time. + + >>> a7 = read_expr('all x.(man(x) -> mortal(x))') + >>> a8 = read_expr('man(socrates)') + >>> g2 = read_expr('mortal(socrates)') + >>> print(Mace(end_size=50).build_model(g2, [a7, a8])) + False + +There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``, +stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The +only implementation in NLTK is ``MaceCommand``.
+ + +----- +Mace4 +----- + +Mace4 Installation +~~~~~~~~~~~~~~~~~~ + +Mace4 is packaged with Prover9, and can be downloaded from the same +source, namely https://www.cs.unm.edu/~mccune/prover9/. It is installed +in the same manner as Prover9. + +Using Mace4 +~~~~~~~~~~~ + +Check whether Mace4 can find a model. + + >>> a = read_expr('(see(mary,john) & -(mary = john))') + >>> mb = MaceCommand(assumptions=[a]) + >>> mb.build_model() + True + +Show the model in 'tabular' format. + + >>> print(mb.model(format='tabular')) + % number = 1 + % seconds = 0 + + % Interpretation of size 2 + + john : 0 + + mary : 1 + + see : + | 0 1 + ---+---- + 0 | 0 0 + 1 | 1 0 + + +Show the model in 'cooked' format. + + >>> print(mb.model(format='cooked')) + % number = 1 + % seconds = 0 + + % Interpretation of size 2 + + john = 0. + + mary = 1. + + - see(0,0). + - see(0,1). + see(1,0). + - see(1,1). + + +The property ``valuation`` accesses the stored ``Valuation``. + + >>> print(mb.valuation) + {'john': 'a', 'mary': 'b', 'see': {('b', 'a')}} + +We can return to our earlier example and inspect the model: + + >>> mb = MaceCommand(g, assumptions=[a4, a5, a6]) + >>> m = mb.build_model() + >>> print(mb.model(format='cooked')) + % number = 1 + % seconds = 0 + + % Interpretation of size 2 + + adam = 0. + + eve = 0. + + c1 = 1. + + man(0). + - man(1). + + woman(0). + woman(1). + + - love(0,0). + love(0,1). + - love(1,0). + - love(1,1). + + +Here, we can see that ``adam`` and ``eve`` have been assigned the same +individual, namely ``0`` as value; ``0`` is both a man and a woman; a second +individual ``1`` is also a woman; and ``0`` loves ``1``. Thus, this is +an interpretation in which there is a woman that every man loves but +Adam doesn't love Eve. + +Mace can also be used with propositional logic.
+ + >>> p = read_expr('P') + >>> q = read_expr('Q') + >>> mb = MaceCommand(q, [p, p>-q]) + >>> mb.build_model() + True + >>> mb.valuation['P'] + True + >>> mb.valuation['Q'] + False diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/meteor.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/meteor.doctest new file mode 100644 index 0000000000000000000000000000000000000000..7b26d773e9f57e6299d6686bdccec3a25ed0eec8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/meteor.doctest @@ -0,0 +1,54 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +.. -*- coding: utf-8 -*- + +============= +METEOR tests +============= + +No Alignment test +------------------ + + >>> from nltk.translate import meteor + >>> from nltk import word_tokenize + +If the candidate has no alignment to any of the references, the METEOR score is 0. + + >>> round(meteor( + ... [word_tokenize('The candidate has no alignment to any of the references')], + ... word_tokenize('John loves Mary') + ... ), 4) + 0.0 + +Tests based on wikipedia examples +--------------------------------- + +Testing on `wikipedia examples <https://en.wikipedia.org/wiki/METEOR#Examples>`_ + + >>> same_res = round(meteor( + ... [word_tokenize('The cat sat on the mat')], + ... word_tokenize('The cat sat on the mat') + ... ), 4) + >>> abs(same_res - 0.9977) < 1e-2 + True + + >>> meteor( + ... [word_tokenize('The cat sat on the mat')], + ... word_tokenize('on the mat sat the cat') + ... ) + 0.5 + + >>> round(meteor( + ... [word_tokenize('The cat sat on the mat')], + ... word_tokenize('The cat was sat on the mat') + ... ), 4) + 0.9654 + +Test corresponding to issue #2751, where METEOR score > 1 + + >>> round(meteor( + ... [word_tokenize('create or update a vm set')], + ... word_tokenize('creates or updates a virtual machine scale set') + ...
), 4) + 0.7806 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/paice.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/paice.doctest new file mode 100644 index 0000000000000000000000000000000000000000..3759a44bd17ae6234b970b87ee39d6424e6d6f2c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/paice.doctest @@ -0,0 +1,35 @@ + +===================================================== +PAICE's evaluation statistics for stemming algorithms +===================================================== + +Given a list of words with their real lemmas and stems according to stemming algorithm under evaluation, +counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT). + + >>> from nltk.metrics import Paice + + +------------------------------------- +Understemming and Overstemming values +------------------------------------- + + >>> lemmas = {'kneel': ['kneel', 'knelt'], + ... 'range': ['range', 'ranged'], + ... 'ring': ['ring', 'rang', 'rung']} + >>> stems = {'kneel': ['kneel'], + ... 'knelt': ['knelt'], + ... 'rang': ['rang', 'range', 'ranged'], + ... 'ring': ['ring'], + ... 'rung': ['rung']} + >>> p = Paice(lemmas, stems) + >>> p.gumt, p.gdmt, p.gwmt, p.gdnt + (4.0, 5.0, 2.0, 16.0) + + >>> p.ui, p.oi, p.sw + (0.8..., 0.125..., 0.15625...) + + >>> p.errt + 1.0 + + >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords] + [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/parse.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/parse.doctest new file mode 100644 index 0000000000000000000000000000000000000000..48101eac2366b55d68ecc7a05da4ebdcee7fcba3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/parse.doctest @@ -0,0 +1,933 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT + +========= + Parsing +========= + +Unit tests for the Context Free Grammar class +--------------------------------------------- + + >>> import pickle + >>> import subprocess + >>> import sys + >>> from nltk import Nonterminal, nonterminals, Production, CFG + + >>> nt1 = Nonterminal('NP') + >>> nt2 = Nonterminal('VP') + + >>> nt1.symbol() + 'NP' + + >>> nt1 == Nonterminal('NP') + True + + >>> nt1 == nt2 + False + + >>> S, NP, VP, PP = nonterminals('S, NP, VP, PP') + >>> N, V, P, DT = nonterminals('N, V, P, DT') + + >>> prod1 = Production(S, [NP, VP]) + >>> prod2 = Production(NP, [DT, NP]) + + >>> prod1.lhs() + S + + >>> prod1.rhs() + (NP, VP) + + >>> prod1 == Production(S, [NP, VP]) + True + + >>> prod1 == prod2 + False + + >>> grammar = CFG.fromstring(""" + ... S -> NP VP + ... PP -> P NP + ... NP -> 'the' N | N PP | 'the' N PP + ... VP -> V NP | V PP | V NP PP + ... N -> 'cat' + ... N -> 'dog' + ... N -> 'rug' + ... V -> 'chased' + ... V -> 'sat' + ... P -> 'in' + ... P -> 'on' + ... """) + + >>> cmd = """import pickle + ... from nltk import Production + ... p = Production('S', ['NP', 'VP']) + ... print(pickle.dumps(p)) + ... """ + + >>> # Start a subprocess to simulate pickling in another process + >>> proc = subprocess.run([sys.executable, '-c', cmd], stdout=subprocess.PIPE) + >>> p1 = pickle.loads(eval(proc.stdout)) + >>> p2 = Production('S', ['NP', 'VP']) + >>> print(hash(p1) == hash(p2)) + True + +Unit tests for the rd (Recursive Descent Parser) class +------------------------------------------------------ + +Create and run a recursive descent parser over both a syntactically ambiguous +and unambiguous sentence. + + >>> from nltk.parse import RecursiveDescentParser + >>> rd = RecursiveDescentParser(grammar) + + >>> sentence1 = 'the cat chased the dog'.split() + >>> sentence2 = 'the cat chased the dog on the rug'.split() + + >>> for t in rd.parse(sentence1): + ... 
print(t) + (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) + + >>> for t in rd.parse(sentence2): + ... print(t) + (S + (NP the (N cat)) + (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) + (S + (NP the (N cat)) + (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) + + +(dolist (expr doctest-font-lock-keywords) + (add-to-list 'font-lock-keywords expr)) + + font-lock-keywords +(add-to-list 'font-lock-keywords + (car doctest-font-lock-keywords)) + + +Unit tests for the sr (Shift Reduce Parser) class +------------------------------------------------- + +Create and run a shift reduce parser over both a syntactically ambiguous +and unambiguous sentence. Note that unlike the recursive descent parser, one +and only one parse is ever returned. + + >>> from nltk.parse import ShiftReduceParser + >>> sr = ShiftReduceParser(grammar) + + >>> sentence1 = 'the cat chased the dog'.split() + >>> sentence2 = 'the cat chased the dog on the rug'.split() + + >>> for t in sr.parse(sentence1): + ... print(t) + (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) + + +The shift reduce parser uses heuristics to decide what to do when there are +multiple possible shift or reduce operations available - for the supplied +grammar clearly the wrong operation is selected. + + >>> for t in sr.parse(sentence2): + ... print(t) + + +Unit tests for the Chart Parser class +------------------------------------- + +We use the demo() function for testing. +We must turn off showing of times. + + >>> import nltk + +First we test tracing with a short sentence + + >>> nltk.parse.chart.demo(2, print_times=False, trace=1, + ... sent='I saw a dog', numparses=1) + * Sentence: + I saw a dog + ['I', 'saw', 'a', 'dog'] + + * Strategy: Bottom-up + + |. I . saw . a . dog .| + |[---------] . . .| [0:1] 'I' + |. [---------] . .| [1:2] 'saw' + |. . [---------] .| [2:3] 'a' + |. . . [---------]| [3:4] 'dog' + |> . . . .| [0:0] NP -> * 'I' + |[---------] . . .| [0:1] NP -> 'I' * + |> . . . 
.| [0:0] S -> * NP VP + |> . . . .| [0:0] NP -> * NP PP + |[---------> . . .| [0:1] S -> NP * VP + |[---------> . . .| [0:1] NP -> NP * PP + |. > . . .| [1:1] Verb -> * 'saw' + |. [---------] . .| [1:2] Verb -> 'saw' * + |. > . . .| [1:1] VP -> * Verb NP + |. > . . .| [1:1] VP -> * Verb + |. [---------> . .| [1:2] VP -> Verb * NP + |. [---------] . .| [1:2] VP -> Verb * + |. > . . .| [1:1] VP -> * VP PP + |[-------------------] . .| [0:2] S -> NP VP * + |. [---------> . .| [1:2] VP -> VP * PP + |. . > . .| [2:2] Det -> * 'a' + |. . [---------] .| [2:3] Det -> 'a' * + |. . > . .| [2:2] NP -> * Det Noun + |. . [---------> .| [2:3] NP -> Det * Noun + |. . . > .| [3:3] Noun -> * 'dog' + |. . . [---------]| [3:4] Noun -> 'dog' * + |. . [-------------------]| [2:4] NP -> Det Noun * + |. . > . .| [2:2] S -> * NP VP + |. . > . .| [2:2] NP -> * NP PP + |. [-----------------------------]| [1:4] VP -> Verb NP * + |. . [------------------->| [2:4] S -> NP * VP + |. . [------------------->| [2:4] NP -> NP * PP + |[=======================================]| [0:4] S -> NP VP * + |. [----------------------------->| [1:4] VP -> VP * PP + Nr edges in chart: 33 + (S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog)))) + + +Then we test the different parsing Strategies. +Note that the number of edges differ between the strategies. + +Top-down + + >>> nltk.parse.chart.demo(1, print_times=False, trace=0, + ... sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + * Strategy: Top-down + + Nr edges in chart: 48 + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + + +Bottom-up + + >>> nltk.parse.chart.demo(2, print_times=False, trace=0, + ... 
sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + * Strategy: Bottom-up + + Nr edges in chart: 53 + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + + +Bottom-up Left-Corner + + >>> nltk.parse.chart.demo(3, print_times=False, trace=0, + ... sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + * Strategy: Bottom-up left-corner + + Nr edges in chart: 36 + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + + +Left-Corner with Bottom-Up Filter + + >>> nltk.parse.chart.demo(4, print_times=False, trace=0, + ... sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + * Strategy: Filtered left-corner + + Nr edges in chart: 28 + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + + +The stepping chart parser + + >>> nltk.parse.chart.demo(5, print_times=False, trace=1, + ... sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + * Strategy: Stepping (top-down vs bottom-up) + + *** SWITCH TO TOP DOWN + |[------] . . . . .| [0:1] 'I' + |. [------] . . . .| [1:2] 'saw' + |. . [------] . . .| [2:3] 'John' + |. . . [------] . .| [3:4] 'with' + |. . . . [------] .| [4:5] 'a' + |. . . . . [------]| [5:6] 'dog' + |> . . . . . .| [0:0] S -> * NP VP + |> . . . . . .| [0:0] NP -> * NP PP + |> . . . . . .| [0:0] NP -> * Det Noun + |> . . . . . .| [0:0] NP -> * 'I' + |[------] . . . . .| [0:1] NP -> 'I' * + |[------> . . . . .| [0:1] S -> NP * VP + |[------> . . 
. . .| [0:1] NP -> NP * PP + |. > . . . . .| [1:1] VP -> * VP PP + |. > . . . . .| [1:1] VP -> * Verb NP + |. > . . . . .| [1:1] VP -> * Verb + |. > . . . . .| [1:1] Verb -> * 'saw' + |. [------] . . . .| [1:2] Verb -> 'saw' * + |. [------> . . . .| [1:2] VP -> Verb * NP + |. [------] . . . .| [1:2] VP -> Verb * + |[-------------] . . . .| [0:2] S -> NP VP * + |. [------> . . . .| [1:2] VP -> VP * PP + *** SWITCH TO BOTTOM UP + |. . > . . . .| [2:2] NP -> * 'John' + |. . . > . . .| [3:3] PP -> * 'with' NP + |. . . > . . .| [3:3] Prep -> * 'with' + |. . . . > . .| [4:4] Det -> * 'a' + |. . . . . > .| [5:5] Noun -> * 'dog' + |. . [------] . . .| [2:3] NP -> 'John' * + |. . . [------> . .| [3:4] PP -> 'with' * NP + |. . . [------] . .| [3:4] Prep -> 'with' * + |. . . . [------] .| [4:5] Det -> 'a' * + |. . . . . [------]| [5:6] Noun -> 'dog' * + |. [-------------] . . .| [1:3] VP -> Verb NP * + |[--------------------] . . .| [0:3] S -> NP VP * + |. [-------------> . . .| [1:3] VP -> VP * PP + |. . > . . . .| [2:2] S -> * NP VP + |. . > . . . .| [2:2] NP -> * NP PP + |. . . . > . .| [4:4] NP -> * Det Noun + |. . [------> . . .| [2:3] S -> NP * VP + |. . [------> . . .| [2:3] NP -> NP * PP + |. . . . [------> .| [4:5] NP -> Det * Noun + |. . . . [-------------]| [4:6] NP -> Det Noun * + |. . . [--------------------]| [3:6] PP -> 'with' NP * + |. [----------------------------------]| [1:6] VP -> VP PP * + *** SWITCH TO TOP DOWN + |. . > . . . .| [2:2] NP -> * Det Noun + |. . . . > . .| [4:4] NP -> * NP PP + |. . . > . . .| [3:3] VP -> * VP PP + |. . . > . . .| [3:3] VP -> * Verb NP + |. . . > . . .| [3:3] VP -> * Verb + |[=========================================]| [0:6] S -> NP VP * + |. [---------------------------------->| [1:6] VP -> VP * PP + |. . [---------------------------]| [2:6] NP -> NP PP * + |. . . . [------------->| [4:6] NP -> NP * PP + |. [----------------------------------]| [1:6] VP -> Verb NP * + |. . 
[--------------------------->| [2:6] S -> NP * VP + |. . [--------------------------->| [2:6] NP -> NP * PP + |[=========================================]| [0:6] S -> NP VP * + |. [---------------------------------->| [1:6] VP -> VP * PP + |. . . . . . >| [6:6] VP -> * VP PP + |. . . . . . >| [6:6] VP -> * Verb NP + |. . . . . . >| [6:6] VP -> * Verb + *** SWITCH TO BOTTOM UP + |. . . . > . .| [4:4] S -> * NP VP + |. . . . [------------->| [4:6] S -> NP * VP + *** SWITCH TO TOP DOWN + *** SWITCH TO BOTTOM UP + *** SWITCH TO TOP DOWN + *** SWITCH TO BOTTOM UP + *** SWITCH TO TOP DOWN + *** SWITCH TO BOTTOM UP + Nr edges in chart: 61 + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + + + +Unit tests for the Incremental Chart Parser class +------------------------------------------------- + +The incremental chart parsers are defined in earleychart.py. +We use the demo() function for testing. We must turn off showing of times. + + >>> import nltk + +Earley Chart Parser + + >>> nltk.parse.earleychart.demo(print_times=False, trace=1, + ... sent='I saw John with a dog', numparses=2) + * Sentence: + I saw John with a dog + ['I', 'saw', 'John', 'with', 'a', 'dog'] + + |. I . saw . John . with . a . dog .| + |[------] . . . . .| [0:1] 'I' + |. [------] . . . .| [1:2] 'saw' + |. . [------] . . .| [2:3] 'John' + |. . . [------] . .| [3:4] 'with' + |. . . . [------] .| [4:5] 'a' + |. . . . . [------]| [5:6] 'dog' + |> . . . . . .| [0:0] S -> * NP VP + |> . . . . . .| [0:0] NP -> * NP PP + |> . . . . . .| [0:0] NP -> * Det Noun + |> . . . . . .| [0:0] NP -> * 'I' + |[------] . . . . .| [0:1] NP -> 'I' * + |[------> . . . . .| [0:1] S -> NP * VP + |[------> . . . . .| [0:1] NP -> NP * PP + |. > . . . . .| [1:1] VP -> * VP PP + |. > . . . . .| [1:1] VP -> * Verb NP + |. > . . . . .| [1:1] VP -> * Verb + |. > . . . . .| [1:1] Verb -> * 'saw' + |. [------] . . . 
.| [1:2] Verb -> 'saw' * + |. [------> . . . .| [1:2] VP -> Verb * NP + |. [------] . . . .| [1:2] VP -> Verb * + |[-------------] . . . .| [0:2] S -> NP VP * + |. [------> . . . .| [1:2] VP -> VP * PP + |. . > . . . .| [2:2] NP -> * NP PP + |. . > . . . .| [2:2] NP -> * Det Noun + |. . > . . . .| [2:2] NP -> * 'John' + |. . [------] . . .| [2:3] NP -> 'John' * + |. [-------------] . . .| [1:3] VP -> Verb NP * + |. . [------> . . .| [2:3] NP -> NP * PP + |. . . > . . .| [3:3] PP -> * 'with' NP + |[--------------------] . . .| [0:3] S -> NP VP * + |. [-------------> . . .| [1:3] VP -> VP * PP + |. . . [------> . .| [3:4] PP -> 'with' * NP + |. . . . > . .| [4:4] NP -> * NP PP + |. . . . > . .| [4:4] NP -> * Det Noun + |. . . . > . .| [4:4] Det -> * 'a' + |. . . . [------] .| [4:5] Det -> 'a' * + |. . . . [------> .| [4:5] NP -> Det * Noun + |. . . . . > .| [5:5] Noun -> * 'dog' + |. . . . . [------]| [5:6] Noun -> 'dog' * + |. . . . [-------------]| [4:6] NP -> Det Noun * + |. . . [--------------------]| [3:6] PP -> 'with' NP * + |. . . . [------------->| [4:6] NP -> NP * PP + |. . [---------------------------]| [2:6] NP -> NP PP * + |. [----------------------------------]| [1:6] VP -> VP PP * + |[=========================================]| [0:6] S -> NP VP * + |. [---------------------------------->| [1:6] VP -> VP * PP + |. [----------------------------------]| [1:6] VP -> Verb NP * + |. . [--------------------------->| [2:6] NP -> NP * PP + |[=========================================]| [0:6] S -> NP VP * + |. [---------------------------------->| [1:6] VP -> VP * PP + (S + (NP I) + (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) + (S + (NP I) + (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) + + +Unit tests for LARGE context-free grammars +------------------------------------------ + +Reading the ATIS grammar. + + >>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg') + >>> grammar + + +Reading the test sentences. 
+ + >>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt') + >>> sentences = nltk.parse.util.extract_test_sentences(sentences) + >>> len(sentences) + 98 + >>> testsentence = sentences[22] + >>> testsentence[0] + ['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.'] + >>> testsentence[1] + 17 + >>> sentence = testsentence[0] + +Now we test all different parsing strategies. +Note that the number of edges differ between the strategies. + +Bottom-up parsing. + + >>> parser = nltk.parse.BottomUpChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 7661 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Bottom-up Left-corner parsing. + + >>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 4986 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Left-corner parsing with bottom-up filter. + + >>> parser = nltk.parse.LeftCornerChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 1342 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Top-down parsing. + + >>> parser = nltk.parse.TopDownChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 28352 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Incremental Bottom-up parsing. + + >>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 7661 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Incremental Bottom-up Left-corner parsing. + + >>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 4986 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Incremental Left-corner parsing with bottom-up filter. 
+ + >>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 1342 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Incremental Top-down parsing. + + >>> parser = nltk.parse.IncrementalTopDownChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 28352 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + +Earley parsing. This is similar to the incremental top-down algorithm. + + >>> parser = nltk.parse.EarleyChartParser(grammar) + >>> chart = parser.chart_parse(sentence) + >>> print((chart.num_edges())) + 28352 + >>> print((len(list(chart.parses(grammar.start()))))) + 17 + + +Unit tests for the Probabilistic CFG class +------------------------------------------ + + >>> from nltk.corpus import treebank + >>> from itertools import islice + >>> from nltk.grammar import PCFG, induce_pcfg + >>> toy_pcfg1 = PCFG.fromstring(""" + ... S -> NP VP [1.0] + ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] + ... Det -> 'the' [0.8] | 'my' [0.2] + ... N -> 'man' [0.5] | 'telescope' [0.5] + ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] + ... V -> 'ate' [0.35] | 'saw' [0.65] + ... PP -> P NP [1.0] + ... P -> 'with' [0.61] | 'under' [0.39] + ... """) + + >>> toy_pcfg2 = PCFG.fromstring(""" + ... S -> NP VP [1.0] + ... VP -> V NP [.59] + ... VP -> V [.40] + ... VP -> VP PP [.01] + ... NP -> Det N [.41] + ... NP -> Name [.28] + ... NP -> NP PP [.31] + ... PP -> P NP [1.0] + ... V -> 'saw' [.21] + ... V -> 'ate' [.51] + ... V -> 'ran' [.28] + ... N -> 'boy' [.11] + ... N -> 'cookie' [.12] + ... N -> 'table' [.13] + ... N -> 'telescope' [.14] + ... N -> 'hill' [.5] + ... Name -> 'Jack' [.52] + ... Name -> 'Bob' [.48] + ... P -> 'with' [.61] + ... P -> 'under' [.39] + ... Det -> 'the' [.41] + ... Det -> 'a' [.31] + ... Det -> 'my' [.28] + ... """) + +Create a set of PCFG productions. 
+ + >>> grammar = PCFG.fromstring(""" + ... A -> B B [.3] | C B C [.7] + ... B -> B D [.5] | C [.5] + ... C -> 'a' [.1] | 'b' [0.9] + ... D -> 'b' [1.0] + ... """) + >>> prod = grammar.productions()[0] + >>> prod + A -> B B [0.3] + + >>> prod.lhs() + A + + >>> prod.rhs() + (B, B) + + >>> print((prod.prob())) + 0.3 + + >>> grammar.start() + A + + >>> grammar.productions() + [A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]] + +Induce some productions using parsed Treebank data. + + >>> productions = [] + >>> for fileid in treebank.fileids()[:2]: + ... for t in treebank.parsed_sents(fileid): + ... productions += t.productions() + + >>> grammar = induce_pcfg(S, productions) + >>> grammar + + + >>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2] + [PP -> IN NP [1.0]] + >>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2] + [NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]] + >>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2] + [JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]] + >>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2] + [NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]] + +Unit tests for the Probabilistic Chart Parse classes +---------------------------------------------------- + + >>> tokens = "Jack saw Bob with my cookie".split() + >>> grammar = toy_pcfg2 + >>> print(grammar) + Grammar with 23 productions (start state = S) + S -> NP VP [1.0] + VP -> V NP [0.59] + VP -> V [0.4] + VP -> VP PP [0.01] + NP -> Det N [0.41] + NP -> Name [0.28] + NP -> NP PP [0.31] + PP -> P NP [1.0] + V -> 'saw' [0.21] + V -> 'ate' [0.51] + V -> 'ran' [0.28] + N -> 'boy' [0.11] + N -> 'cookie' [0.12] + N -> 'table' [0.13] + N -> 'telescope' [0.14] + N -> 'hill' [0.5] + Name -> 'Jack' [0.52] + Name -> 'Bob' [0.48] + P -> 'with' [0.61] + P -> 'under' [0.39] + Det -> 'the' [0.41] + Det -> 'a' [0.31] + Det -> 'my' [0.28] + +Create several parsers using different 
queuing strategies and show the +resulting parses. + + >>> from nltk.parse import pchart + + >>> parser = pchart.InsideChartParser(grammar) + >>> for t in parser.parse(tokens): + ... print(t) + (S + (NP (Name Jack)) + (VP + (V saw) + (NP + (NP (Name Bob)) + (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) + (S + (NP (Name Jack)) + (VP + (VP (V saw) (NP (Name Bob))) + (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) + + >>> parser = pchart.RandomChartParser(grammar) + >>> for t in parser.parse(tokens): + ... print(t) + (S + (NP (Name Jack)) + (VP + (V saw) + (NP + (NP (Name Bob)) + (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) + (S + (NP (Name Jack)) + (VP + (VP (V saw) (NP (Name Bob))) + (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) + + >>> parser = pchart.UnsortedChartParser(grammar) + >>> for t in parser.parse(tokens): + ... print(t) + (S + (NP (Name Jack)) + (VP + (V saw) + (NP + (NP (Name Bob)) + (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) + (S + (NP (Name Jack)) + (VP + (VP (V saw) (NP (Name Bob))) + (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) + + >>> parser = pchart.LongestChartParser(grammar) + >>> for t in parser.parse(tokens): + ... print(t) + (S + (NP (Name Jack)) + (VP + (V saw) + (NP + (NP (Name Bob)) + (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) + (S + (NP (Name Jack)) + (VP + (VP (V saw) (NP (Name Bob))) + (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) + + >>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) + >>> for t in parser.parse(tokens): + ... print(t) + + +Unit tests for the Viterbi Parse classes +---------------------------------------- + + >>> from nltk.parse import ViterbiParser + >>> tokens = "Jack saw Bob with my cookie".split() + >>> grammar = toy_pcfg2 + +Parse the tokenized sentence. + + >>> parser = ViterbiParser(grammar) + >>> for t in parser.parse(tokens): + ... 
print(t) + (S + (NP (Name Jack)) + (VP + (V saw) + (NP + (NP (Name Bob)) + (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) + + +Unit tests for the FeatStructNonterminal class +---------------------------------------------- + + >>> from nltk.grammar import FeatStructNonterminal + >>> FeatStructNonterminal( + ... pos='n', agr=FeatStructNonterminal(number='pl', gender='f')) + [agr=[gender='f', number='pl'], pos='n'] + + >>> FeatStructNonterminal('VP[+fin]/NP[+pl]') + VP[+fin]/NP[+pl] + + +Tracing the Feature Chart Parser +-------------------------------- + +We use the featurechart.demo() function for tracing the Feature Chart Parser. + + >>> nltk.parse.featurechart.demo(print_times=False, + ... print_grammar=True, + ... parser=nltk.parse.featurechart.FeatureChartParser, + ... sent='I saw John with a dog') + + Grammar with 18 productions (start state = S[]) + S[] -> NP[] VP[] + PP[] -> Prep[] NP[] + NP[] -> NP[] PP[] + VP[] -> VP[] PP[] + VP[] -> Verb[] NP[] + VP[] -> Verb[] + NP[] -> Det[pl=?x] Noun[pl=?x] + NP[] -> 'John' + NP[] -> 'I' + Det[] -> 'the' + Det[] -> 'my' + Det[-pl] -> 'a' + Noun[-pl] -> 'dog' + Noun[-pl] -> 'cookie' + Verb[] -> 'ate' + Verb[] -> 'saw' + Prep[] -> 'with' + Prep[] -> 'under' + + * FeatureChartParser + Sentence: I saw John with a dog + |.I.s.J.w.a.d.| + |[-] . . . . .| [0:1] 'I' + |. [-] . . . .| [1:2] 'saw' + |. . [-] . . .| [2:3] 'John' + |. . . [-] . .| [3:4] 'with' + |. . . . [-] .| [4:5] 'a' + |. . . . . [-]| [5:6] 'dog' + |[-] . . . . .| [0:1] NP[] -> 'I' * + |[-> . . . . .| [0:1] S[] -> NP[] * VP[] {} + |[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {} + |. [-] . . . .| [1:2] Verb[] -> 'saw' * + |. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {} + |. [-] . . . .| [1:2] VP[] -> Verb[] * + |. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {} + |[---] . . . .| [0:2] S[] -> NP[] VP[] * + |. . [-] . . .| [2:3] NP[] -> 'John' * + |. . [-> . . .| [2:3] S[] -> NP[] * VP[] {} + |. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {} + |. [---] . . 
.| [1:3] VP[] -> Verb[] NP[] * + |. [---> . . .| [1:3] VP[] -> VP[] * PP[] {} + |[-----] . . .| [0:3] S[] -> NP[] VP[] * + |. . . [-] . .| [3:4] Prep[] -> 'with' * + |. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {} + |. . . . [-] .| [4:5] Det[-pl] -> 'a' * + |. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False} + |. . . . . [-]| [5:6] Noun[-pl] -> 'dog' * + |. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] * + |. . . . [--->| [4:6] S[] -> NP[] * VP[] {} + |. . . . [--->| [4:6] NP[] -> NP[] * PP[] {} + |. . . [-----]| [3:6] PP[] -> Prep[] NP[] * + |. . [-------]| [2:6] NP[] -> NP[] PP[] * + |. [---------]| [1:6] VP[] -> VP[] PP[] * + |. [--------->| [1:6] VP[] -> VP[] * PP[] {} + |[===========]| [0:6] S[] -> NP[] VP[] * + |. . [------->| [2:6] S[] -> NP[] * VP[] {} + |. . [------->| [2:6] NP[] -> NP[] * PP[] {} + |. [---------]| [1:6] VP[] -> Verb[] NP[] * + |. [--------->| [1:6] VP[] -> VP[] * PP[] {} + |[===========]| [0:6] S[] -> NP[] VP[] * + (S[] + (NP[] I) + (VP[] + (VP[] (Verb[] saw) (NP[] John)) + (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))) + (S[] + (NP[] I) + (VP[] + (Verb[] saw) + (NP[] + (NP[] John) + (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))) + + +Unit tests for the Feature Chart Parser classes +----------------------------------------------- + +The list of parsers we want to test. + + >>> parsers = [nltk.parse.featurechart.FeatureChartParser, + ... nltk.parse.featurechart.FeatureTopDownChartParser, + ... nltk.parse.featurechart.FeatureBottomUpChartParser, + ... nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser, + ... nltk.parse.earleychart.FeatureIncrementalChartParser, + ... nltk.parse.earleychart.FeatureEarleyChartParser, + ... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser, + ... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser, + ... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser, + ... 
] + +A helper function that tests each parser on the given grammar and sentence. +We check that the number of trees are correct, and that all parsers +return the same trees. Otherwise an error is printed. + + >>> def unittest(grammar, sentence, nr_trees): + ... sentence = sentence.split() + ... trees = None + ... for P in parsers: + ... result = P(grammar).parse(sentence) + ... result = set(tree.freeze() for tree in result) + ... if len(result) != nr_trees: + ... print("Wrong nr of trees:", len(result)) + ... elif trees is None: + ... trees = result + ... elif result != trees: + ... print("Trees differ for parser:", P.__name__) + +The demo grammar from before, with an ambiguous sentence. + + >>> isawjohn = nltk.parse.featurechart.demo_grammar() + >>> unittest(isawjohn, "I saw John with a dog with my cookie", 5) + +This grammar tests that variables in different grammar rules are renamed +before unification. (The problematic variable is in this case ?X). + + >>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring(''' + ... S[] -> NP[num=?N] VP[num=?N, slash=?X] + ... NP[num=?X] -> "what" + ... NP[num=?X] -> "that" + ... VP[num=?P, slash=none] -> V[num=?P] NP[] + ... V[num=sg] -> "was" + ... ''') + >>> unittest(whatwasthat, "what was that", 1) + +This grammar tests that the same rule can be used in different places +in another rule, and that the variables are properly renamed. + + >>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring(''' + ... S[] -> NP[case=nom] V[] NP[case=acc] + ... NP[case=?X] -> Pron[case=?X] + ... Pron[] -> "this" + ... Pron[] -> "that" + ... V[] -> "loves" + ... ''') + >>> unittest(thislovesthat, "this loves that", 1) + + +Tests for loading feature grammar files +--------------------------------------- + +Alternative 1: first load the grammar, then create the parser. 
+ + >>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg') + >>> fcp1 = nltk.parse.FeatureChartParser(fcfg) + >>> print((type(fcp1))) + + +Alternative 2: directly load the parser. + + >>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg') + >>> print((type(fcp2))) + diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/resolution.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/resolution.doctest new file mode 100644 index 0000000000000000000000000000000000000000..09410be37142e6a4b321cd5869d1922fb6af9a19 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/resolution.doctest @@ -0,0 +1,222 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================= +Resolution Theorem Prover +========================= + + >>> from nltk.inference.resolution import * + >>> from nltk.sem import logic + >>> from nltk.sem.logic import * + >>> logic._counter._value = 0 + >>> read_expr = logic.Expression.fromstring + + >>> P = read_expr('P') + >>> Q = read_expr('Q') + >>> R = read_expr('R') + >>> A = read_expr('A') + >>> B = read_expr('B') + >>> x = read_expr('x') + >>> y = read_expr('y') + >>> z = read_expr('z') + +------------------------------- +Test most_general_unification() +------------------------------- + >>> print(most_general_unification(x, x)) + {} + >>> print(most_general_unification(A, A)) + {} + >>> print(most_general_unification(A, x)) + {x: A} + >>> print(most_general_unification(x, A)) + {x: A} + >>> print(most_general_unification(x, y)) + {x: y} + >>> print(most_general_unification(P(x), P(A))) + {x: A} + >>> print(most_general_unification(P(x,B), P(A,y))) + {x: A, y: B} + >>> print(most_general_unification(P(x,B), P(B,x))) + {x: B} + >>> print(most_general_unification(P(x,y), P(A,x))) + {x: A, y: x} + >>> print(most_general_unification(P(Q(x)), P(y))) + {y: Q(x)} + +------------ +Test unify() +------------ + >>> print(Clause([]).unify(Clause([]))) + [] + >>> 
print(Clause([P(x)]).unify(Clause([-P(A)]))) + [{}] + >>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)]))) + [{R(A), Q(A)}] + >>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)]))) + [{Q(y), Q(A), R(A,y)}] + >>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)]))) + [{}] + >>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)]))) + [{-Q(B), Q(A)}, {-P(A), P(B)}] + >>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)]))) + [{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}] + + >>> a = clausify(read_expr('P(A)')) + >>> b = clausify(read_expr('A=B')) + >>> print(a[0].unify(b[0])) + [{P(B)}] + +------------------------- +Test is_tautology() +------------------------- + >>> print(Clause([P(A), -P(A)]).is_tautology()) + True + >>> print(Clause([-P(A), P(A)]).is_tautology()) + True + >>> print(Clause([P(x), -P(A)]).is_tautology()) + False + >>> print(Clause([Q(B), -P(A), P(A)]).is_tautology()) + True + >>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology()) + True + >>> print(Clause([P(x), -Q(A)]).is_tautology()) + False + +------------------------- +Test subsumes() +------------------------- + >>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)]))) + True + >>> print(Clause([-P(A)]).subsumes(Clause([P(A)]))) + False + >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) + True + >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)]))) + True + >>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) + False + >>> print(Clause([P(x)]).subsumes(Clause([P(A)]))) + True + >>> print(Clause([P(A)]).subsumes(Clause([P(x)]))) + True + +------------ +Test prove() +------------ + >>> print(ResolutionProverCommand(read_expr('man(x)')).prove()) + False + >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('-(man(x) & 
-man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove()) + True + >>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove()) + False + >>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove()) + False + >>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove()) + False + + >>> p1 = read_expr('all x.(man(x) -> mortal(x))') + >>> p2 = read_expr('man(Socrates)') + >>> c = read_expr('mortal(Socrates)') + >>> ResolutionProverCommand(c, [p1,p2]).prove() + True + + >>> p1 = read_expr('all x.(man(x) -> walks(x))') + >>> p2 = read_expr('man(John)') + >>> c = read_expr('some y.walks(y)') + >>> ResolutionProverCommand(c, [p1,p2]).prove() + True + + >>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))') + >>> c = read_expr('some e0.walk(e0,mary)') + >>> ResolutionProverCommand(c, [p]).prove() + True + +------------ +Test proof() +------------ + >>> p1 = read_expr('all x.(man(x) -> mortal(x))') + >>> p2 = read_expr('man(Socrates)') + >>> c = read_expr('mortal(Socrates)') + >>> logic._counter._value = 0 + >>> tp = ResolutionProverCommand(c, [p1,p2]) + >>> tp.prove() + True + >>> print(tp.proof()) + [1] {-mortal(Socrates)} A + [2] {-man(z2), mortal(z2)} A + [3] {man(Socrates)} A + [4] {-man(Socrates)} (1, 2) + [5] {mortal(Socrates)} (2, 3) + [6] {} (1, 5) + + +------------------ +Question Answering +------------------ +One answer + 
+ >>> p1 = read_expr('father_of(art,john)') + >>> p2 = read_expr('father_of(bob,kim)') + >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') + >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') + >>> logic._counter._value = 0 + >>> tp = ResolutionProverCommand(None, [p1,p2,p3,c]) + >>> sorted(tp.find_answers()) + [] + >>> print(tp.proof()) # doctest: +SKIP + [1] {father_of(art,john)} A + [2] {father_of(bob,kim)} A + [3] {-father_of(z3,z4), parent_of(z3,z4)} A + [4] {-parent_of(z6,john), ANSWER(z6)} A + [5] {parent_of(art,john)} (1, 3) + [6] {parent_of(bob,kim)} (2, 3) + [7] {ANSWER(z6), -father_of(z6,john)} (3, 4) + [8] {ANSWER(art)} (1, 7) + [9] {ANSWER(art)} (4, 5) + + +Multiple answers + + >>> p1 = read_expr('father_of(art,john)') + >>> p2 = read_expr('mother_of(ann,john)') + >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') + >>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))') + >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') + >>> logic._counter._value = 0 + >>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c]) + >>> sorted(tp.find_answers()) + [, ] + >>> print(tp.proof()) # doctest: +SKIP + [ 1] {father_of(art,john)} A + [ 2] {mother_of(ann,john)} A + [ 3] {-father_of(z3,z4), parent_of(z3,z4)} A + [ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A + [ 5] {-parent_of(z10,john), ANSWER(z10)} A + [ 6] {parent_of(art,john)} (1, 3) + [ 7] {parent_of(ann,john)} (2, 4) + [ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5) + [ 9] {ANSWER(art)} (1, 8) + [10] {ANSWER(z10), -mother_of(z10,john)} (4, 5) + [11] {ANSWER(ann)} (2, 10) + [12] {ANSWER(art)} (5, 6) + [13] {ANSWER(ann)} (5, 7) + diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/simple.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/simple.doctest new file mode 100644 index 0000000000000000000000000000000000000000..ab6d1169e026fe4dc6ce724f1be670dbdd68112b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/simple.doctest @@ -0,0 +1,83 @@ +.. 
Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +================= +EasyInstall Tests +================= + +This file contains some simple tests that will be run by EasyInstall in +order to test the installation when NLTK-Data is absent. + + +------------ +Tokenization +------------ + + >>> from nltk.tokenize import wordpunct_tokenize + >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" + ... "two of them.\n\nThanks.") + >>> wordpunct_tokenize(s) + ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', + 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + +------- +Metrics +------- + + >>> from nltk.metrics import precision, recall, f_measure + >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() + >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() + >>> reference_set = set(reference) + >>> test_set = set(test) + >>> precision(reference_set, test_set) + 1.0 + >>> print(recall(reference_set, test_set)) + 0.8 + >>> print(f_measure(reference_set, test_set)) + 0.88888888888... + +------------------ +Feature Structures +------------------ + + >>> from nltk import FeatStruct + >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem') + >>> fs2 = FeatStruct(POS='N', AGR=fs1) + >>> print(fs2) + [ [ GND = 'fem' ] ] + [ AGR = [ NUM = 'pl' ] ] + [ [ PER = 3 ] ] + [ ] + [ POS = 'N' ] + >>> print(fs2['AGR']) + [ GND = 'fem' ] + [ NUM = 'pl' ] + [ PER = 3 ] + >>> print(fs2['AGR']['PER']) + 3 + +------- +Parsing +------- + + >>> from nltk.parse.recursivedescent import RecursiveDescentParser + >>> from nltk.grammar import CFG + >>> grammar = CFG.fromstring(""" + ... S -> NP VP + ... PP -> P NP + ... NP -> 'the' N | N PP | 'the' N PP + ... VP -> V NP | V PP | V NP PP + ... N -> 'cat' | 'dog' | 'rug' + ... V -> 'chased' + ... P -> 'on' + ... """) + >>> rd = RecursiveDescentParser(grammar) + >>> sent = 'the cat chased the dog on the rug'.split() + >>> for t in rd.parse(sent): + ... 
class TestNgramCounter:
    """Tests for NgramCounter that only involve lookup, no modification."""

    @classmethod
    def setup_class(cls):
        # Fixtures are built once per class and shared by every test below,
        # so tests must not mutate them (see
        # ``test_counter_len_changes_with_lookup``).
        cls.text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in cls.text
        )
        cls.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in cls.text
        )
        cls.case = unittest.TestCase()

    def test_N(self):
        assert self.bigram_counter.N() == 16
        assert self.trigram_counter.N() == 21

    def test_counter_len_changes_with_lookup(self):
        # Looking up an unseen ngram order changes the counter's length, so
        # this test works on its own counter instead of the shared fixture;
        # that keeps it independent of test ordering and safe to re-run.
        counter = NgramCounter(everygrams(sent, max_len=2) for sent in self.text)
        assert len(counter) == 2
        counter[50]
        assert len(counter) == 3

    def test_ngram_order_access_unigrams(self):
        assert self.bigram_counter[1] == self.bigram_counter.unigrams

    def test_ngram_conditional_freqdist(self):
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        # Order of conditions is not significant, only their multiset.
        self.case.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
        self.case.assertCountEqual(expected_trigram_contexts, trigrams.conditions())

    def test_bigram_counts_seen_ngrams(self):
        assert self.bigram_counter[["a"]]["b"] == 1
        assert self.bigram_counter[["b"]]["c"] == 1

    def test_bigram_counts_unseen_ngrams(self):
        assert self.bigram_counter[["b"]]["z"] == 0

    def test_unigram_counts_seen_words(self):
        assert self.bigram_counter["b"] == 2

    def test_unigram_counts_completely_unseen_words(self):
        assert self.bigram_counter["z"] == 0
self.case.assertCountEqual(bigram_contexts, counter[2].keys()) + self.case.assertCountEqual(trigram_contexts, counter[3].keys()) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/text.py b/.eggs/nltk-3.8-py3.10.egg/nltk/text.py new file mode 100644 index 0000000000000000000000000000000000000000..da97b15286af4c1a82aa5b398429b025dc18cae4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/text.py @@ -0,0 +1,779 @@ +# Natural Language Toolkit: Texts +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +This module brings together a variety of NLTK functionality for +text analysis, and provides simple, interactive interfaces. +Functionality includes: concordancing, collocation discovery, +regular expression search over tokenized strings, and +distributional similarity. +""" + +import re +import sys +from collections import Counter, defaultdict, namedtuple +from functools import reduce +from math import log + +from nltk.collocations import BigramCollocationFinder +from nltk.lm import MLE +from nltk.lm.preprocessing import padded_everygram_pipeline +from nltk.metrics import BigramAssocMeasures, f_measure +from nltk.probability import ConditionalFreqDist as CFD +from nltk.probability import FreqDist +from nltk.tokenize import sent_tokenize +from nltk.util import LazyConcatenation, tokenwrap + +ConcordanceLine = namedtuple( + "ConcordanceLine", + ["left", "query", "right", "offset", "left_print", "right_print", "line"], +) + + +class ContextIndex: + """ + A bidirectional index between words and their 'contexts' in a text. 
@staticmethod
def _default_context(tokens, i):
    """Return the lowercased neighbours of ``tokens[i]`` as ``(left, right)``.

    When ``i`` is the first index the left neighbour is the marker
    ``"*START*"``; when it is the last index the right neighbour is the
    marker ``"*END*"``.
    """
    if i == 0:
        left = "*START*"
    else:
        left = tokens[i - 1].lower()

    if i == len(tokens) - 1:
        right = "*END*"
    else:
        right = tokens[i + 1].lower()

    return left, right
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: list(str)
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    :type fail_on_unknown: bool
    :raise ValueError: If ``fail_on_unknown`` is true and at least one
        of the (key-normalized) words has no recorded context.
    :return: A ``FreqDist`` over the contexts shared by all the words;
        empty when ``words`` is empty or nothing is shared.
    """
    words = [self._key(w) for w in words]
    if not words:
        # Nothing to intersect -- an empty query has no common contexts.
        return FreqDist()

    contexts = [set(self._word_to_contexts[w]) for w in words]
    # Only the words that have no context at all are reported as missing.
    unknown = [w for w, seen in zip(words, contexts) if not seen]
    if unknown and fail_on_unknown:
        raise ValueError(
            "The following word(s) were not found: " + " ".join(unknown)
        )

    common = set.intersection(*contexts)
    if not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    return FreqDist(
        c for w in words for c in self._word_to_contexts[w] if c in common
    )
def __repr__(self):
    """Return a summary with the number of tokens and of distinct index keys."""
    # The template needs one ``%d`` slot per tuple element; formatting an
    # empty template with two arguments raises TypeError.
    return "<ConcordanceIndex for %d tokens (%d types)>" % (
        len(self._tokens),
        len(self._offsets),
    )
def print_concordance(self, word, width=80, lines=25):
    """
    Print concordance lines given the query word.

    Prints ``no matches`` when ``find_concordance`` returns nothing;
    otherwise prints a header line followed by at most ``lines``
    concordance lines.

    :param word: The target word or phrase (a list of strings)
    :type word: str or list
    :param width: The width of each line, in characters (default=80)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    concordance_list = self.find_concordance(word, width=width)
    if not concordance_list:
        print("no matches")
        return

    # Never announce more lines than exist, nor a negative count.
    lines = max(0, min(lines, len(concordance_list)))
    print(f"Displaying {lines} of {len(concordance_list)} matches:")
    for concordance_line in concordance_list[:lines]:
        print(concordance_line.line)
def findall(self, regexp):
    """
    Find instances of the regular expression in the text.
    The text is a list of tokens, and a regexp pattern to match
    a single token must be surrounded by angle brackets. E.g.

    >>> from nltk.text import TokenSearcher
    >>> TokenSearcher(["a", "man", "walks"]).findall("<a><.*>")
    [['a', 'man']]

    Before searching, whitespace is removed from ``regexp``, each
    ``<``/``>`` pair is turned into a non-capturing group around one
    bracketed token, and every unescaped ``.`` is narrowed to ``[^>]`` so
    that it cannot run past the end of a token.

    :param regexp: A regular expression
    :type regexp: str
    :return: One list of tokens per match
    :rtype: list(list(str))
    :raise ValueError: If a hit is not a whole bracketed token sequence
        (for instance because ``regexp`` captures part of a token or
        contains several capturing groups).
    """
    # preprocess the regular expression
    regexp = re.sub(r"\s", "", regexp)
    regexp = re.sub(r"<", "(?:<(?:", regexp)
    regexp = re.sub(r">", ")>)", regexp)
    # Only dots that are not backslash-escaped are rewritten.
    regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)

    # perform the search
    hits = re.findall(regexp, self._raw)

    # Sanity check: a hit must start AND end on a token boundary.  With
    # several capturing groups ``re.findall`` yields tuples, which are
    # rejected here as well.
    for h in hits:
        if not (isinstance(h, str) and h.startswith("<") and h.endswith(">")):
            raise ValueError("Bad regexp for TokenSearcher.findall")

    # postprocess the output: drop the outer brackets, split on the inner ones
    return [h[1:-1].split("><") for h in hits]
def __init__(self, tokens, name=None):
    """
    Create a Text object.

    When no ``name`` is given, one is derived from the tokens: if a
    ``"]"`` occurs among the first 20 tokens, the tokens between index 1
    and that bracket are joined; otherwise the first 8 tokens are joined
    and followed by ``"..."``.

    :param tokens: The source text.
    :type tokens: sequence of str
    :param name: An explicit name for the text (optional).
    """
    self.tokens = list(tokens) if self._COPY_TOKENS else tokens
    tokens = self.tokens

    if name:
        self.name = name
        return

    head = tokens[:20]
    if "]" in head:
        closing = head.index("]")
        title_tokens = tokens[1:closing]
        self.name = " ".join(str(tok) for tok in title_tokens)
    else:
        self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."
def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

        >>> from nltk.book import text4
        >>> text4.collocation_list()[:2]
        [('United', 'States'), ('fellow', 'citizens')]

    The result is cached on the instance and reused while ``num`` and
    ``window_size`` stay the same.

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    :rtype: list(tuple(str, str))
    """
    cached = (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cached:
        from nltk.corpus import stopwords

        # A set keeps the per-token stopword test constant-time.
        ignored_words = set(stopwords.words("english"))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        collocations = list(finder.nbest(bigram_measures.likelihood_ratio, num))

        # Record the cache keys only once the computation has succeeded, so
        # that a failure cannot pair new keys with a stale result.
        self._collocations = collocations
        self._num = num
        self._window_size = window_size
    return self._collocations
def similar(self, word, num=20):
    """
    Distributional similarity: print the words that share contexts with
    ``word``, those sharing the most contexts first. Prints
    ``No matches`` when the lowercased word is not in the index.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    # Build the word-context index on first use and keep it on the instance.
    if "_word_context_index" not in self.__dict__:
        self._word_context_index = ContextIndex(
            self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
        )

    target = word.lower()
    by_word = self._word_context_index._word_to_contexts
    if target not in by_word.conditions():
        print("No matches")
        return

    target_contexts = set(by_word[target])
    # Count, for every other word, how many of its contexts the target shares.
    overlap = Counter()
    for other in by_word.conditions():
        for ctx in by_word[other]:
            if ctx in target_contexts and not other == target:
                overlap[other] += 1

    ranked = [other for other, _ in overlap.most_common(num)]
    print(tokenwrap(ranked))
def generate(self, length=100, text_seed=None, random_seed=42):
    """
    Print random text, generated using a trigram language model.
    See also `help(nltk.lm)`.

    :param length: The length of text to generate (default=100)
    :type length: int

    :param text_seed: Generation can be conditioned on preceding context.
    :type text_seed: list(str)

    :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible. (default=42)
    :type random_seed: int or random.Random

    :return: The text that was printed (the joined ``text_seed``, if any,
        followed by the generated tokens).
    :rtype: str
    """
    assert length > 0, "The `length` must be more than 0."

    # Create the model when using it the first time; the sentence split is
    # only needed for training, so it is not repeated on later calls.
    if not hasattr(self, "_trigram_model"):
        print("Building ngram index...", file=sys.stderr)
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        self._trigram_model = self._train_default_ngram_lm(
            self._tokenized_sents, n=3
        )

    generated_tokens = []
    while len(generated_tokens) < length:
        for token in self._trigram_model.generate(
            length, text_seed=text_seed, random_seed=random_seed
        ):
            # Sentence padding symbols: skip the opener, stop at the closer.
            if token == "<s>":
                continue
            if token == "</s>":
                break
            generated_tokens.append(token)
        # An integer seed is bumped before the next pass (presumably so the
        # pass does not replay the same sample).  A ``random.Random``
        # instance cannot be incremented and is passed through unchanged.
        if isinstance(random_seed, int):
            random_seed += 1

    prefix = " ".join(text_seed) + " " if text_seed else ""
    output_str = prefix + tokenwrap(generated_tokens[:length])
    print(output_str)
    return output_str
def _context(self, tokens, i):
    """
    One left & one right token, both case-normalized.  Skip over
    non-sentence-final punctuation (tokens that ``_CONTEXT_RE`` does
    not match).

    :param tokens: The token sequence being indexed
    :param i: Index of the token whose context is wanted
    :type i: int
    :return: ``(left, right)``; ``"*START*"`` / ``"*END*"`` stand in when
        no matching token exists on that side.
    :rtype: tuple(str, str)
    """
    # Left context: walk left until a matching token or the start is passed.
    j = i - 1
    while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
        j -= 1
    # ``j`` is -1 when the scan ran off the start; index 0 is a real token.
    left = tokens[j].lower() if j >= 0 else "*START*"

    # Right context: walk right until a matching token or the end is passed.
    j = i + 1
    while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
        j += 1
    right = tokens[j].lower() if j < len(tokens) else "*END*"

    return (left, right)
def demo():
    """Print a short tour of ``Text`` built from the Brown corpus news category."""
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()

    # Each step prints its heading, runs, and is followed by a blank line.
    # (Text generation is not part of the tour.)
    steps = (
        ("Concordance:", lambda: text.concordance("news")),
        ("Distributionally similar words:", lambda: text.similar("news")),
        ("Collocations:", lambda: text.collocations()),
        (
            "Dispersion plot:",
            lambda: text.dispersion_plot(["news", "report", "said", "announced"]),
        ),
        ("Vocabulary plot:", lambda: text.plot(50)),
    )
    for heading, run in steps:
        print(heading)
        run()
        print()

    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])
+ +External links: + +- `Tgrep tutorial `_ +- `Tgrep2 manual `_ +- `Tgrep2 source `_ + +Usage +===== + +>>> from nltk.tree import ParentedTree +>>> from nltk.tgrep import tgrep_nodes, tgrep_positions +>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') +>>> list(tgrep_nodes('NN', [tree])) +[[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]] +>>> list(tgrep_positions('NN', [tree])) +[[(0, 2), (2, 1)]] +>>> list(tgrep_nodes('DT', [tree])) +[[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]] +>>> list(tgrep_nodes('DT $ JJ', [tree])) +[[ParentedTree('DT', ['the'])]] + +This implementation adds syntax to select nodes based on their NLTK +tree position. This syntax is ``N`` plus a Python tuple representing +the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are +valid node selectors. Example: + +>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') +>>> tree[0,0] +ParentedTree('DT', ['the']) +>>> tree[0,0].treeposition() +(0, 0) +>>> list(tgrep_nodes('N(0,0)', [tree])) +[[ParentedTree('DT', ['the'])]] + +Caveats: +======== + +- Link modifiers: "?" and "=" are not implemented. +- Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are + not implemented. +- The "=" and "~" links are not implemented. + +Known Issues: +============= + +- There are some issues with link relations involving leaf nodes + (which are represented as bare strings in NLTK trees). For + instance, consider the tree:: + + (S (A x)) + + The search string ``* !>> S`` should select all nodes which are not + dominated in some way by an ``S`` node (i.e., all nodes which are + not descendants of an ``S``). Clearly, in this tree, the only node + which fulfills this criterion is the top node (since it is not + dominated by anything). However, the code here will find both the + top node and the leaf node ``x``. 
def ancestors(node):
    """
    Return the nodes dominating ``node``, nearest parent first.

    A node without a ``parent()`` method (a leaf stored as a bare
    string) yields an empty list, because its parent cannot be
    recovered.
    """
    try:
        parent = node.parent()
    except AttributeError:
        # if node is a leaf, we cannot retrieve its parent
        return []

    chain = []
    while parent:
        chain.append(parent)
        parent = parent.parent()
    return chain
def _before(node):
    """
    Return the list of all nodes that are before the given node.

    Positions are truncated to their common length before they are
    compared, so a position that is a prefix of the node's position (or
    the reverse) is never selected.  A node lacking ``treeposition()`` or
    ``root()`` yields an empty list.
    """
    try:
        here = node.treeposition()
        root = node.root()
    except AttributeError:
        return []

    depth = len(here)
    preceding = []
    for other in root.treepositions():
        if other[:depth] < here[: len(other)]:
            preceding.append(root[other])
    return preceding
def _tgrep_macro_use_action(_s, _l, tokens):
    """
    Build a predicate that defers to the macro named by the single
    ``@name`` token.

    The macro is looked up in the ``m`` dictionary each time the returned
    predicate is called; a ``TgrepException`` is raised if ``m`` is
    ``None`` or does not contain the name.
    """
    assert len(tokens) == 1
    reference = tokens[0]
    assert reference[0] == "@"
    macro_name = reference[1:]

    def macro_use(n, m=None, l=None):
        defined = m is not None and macro_name in m
        if not defined:
            raise TgrepException(f"macro {macro_name} not defined")
        predicate = m[macro_name]
        return predicate(n, m, l)

    return macro_use
+ """ + if tokens[0] == "'": + # strip initial apostrophe (tgrep2 print command) + tokens = tokens[1:] + if len(tokens) > 1: + # disjunctive definition of a node name + assert list(set(tokens[1::2])) == ["|"] + # recursively call self to interpret each node name definition + tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]] + # capture tokens and return the disjunction + return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens) + else: + if hasattr(tokens[0], "__call__"): + # this is a previously interpreted parenthetical node + # definition (lambda function) + return tokens[0] + elif tokens[0] == "*" or tokens[0] == "__": + return lambda n, m=None, l=None: True + elif tokens[0].startswith('"'): + assert tokens[0].endswith('"') + node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\") + return ( + lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s + )(node_lit) + elif tokens[0].startswith("/"): + assert tokens[0].endswith("/") + node_lit = tokens[0][1:-1] + return ( + lambda r: lambda n, m=None, l=None: r.search( + _tgrep_node_literal_value(n) + ) + )(re.compile(node_lit)) + elif tokens[0].startswith("i@"): + node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()]) + return ( + lambda f: lambda n, m=None, l=None: f( + _tgrep_node_literal_value(n).lower() + ) + )(node_func) + else: + return ( + lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s + )(tokens[0]) + + +def _tgrep_parens_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + from a parenthetical notation. + """ + assert len(tokens) == 3 + assert tokens[0] == "(" + assert tokens[2] == ")" + return tokens[1] + + +def _tgrep_nltk_tree_pos_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + which returns true if the node is located at a specific tree + position. 
+ """ + # recover the tuple from the parsed string + node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) + # capture the node's tree position + return ( + lambda i: lambda n, m=None, l=None: ( + hasattr(n, "treeposition") and n.treeposition() == i + ) + )(node_tree_position) + + +def _tgrep_relation_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + depending on its relation to other nodes in the tree. + """ + # process negation first if needed + negated = False + if tokens[0] == "!": + negated = True + tokens = tokens[1:] + if tokens[0] == "[": + # process square-bracketed relation expressions + assert len(tokens) == 3 + assert tokens[2] == "]" + retval = tokens[1] + else: + # process operator-node relation expressions + assert len(tokens) == 2 + operator, predicate = tokens + # A < B A is the parent of (immediately dominates) B. + if operator == "<": + retval = lambda n, m=None, l=None: ( + _istree(n) and any(predicate(x, m, l) for x in n) + ) + # A > B A is the child of B. + elif operator == ">": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and predicate(n.parent(), m, l) + ) + # A <, B Synonymous with A <1 B. + elif operator == "<," or operator == "<1": + retval = lambda n, m=None, l=None: ( + _istree(n) and bool(list(n)) and predicate(n[0], m, l) + ) + # A >, B Synonymous with A >1 B. + elif operator == ">," or operator == ">1": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and (n is n.parent()[0]) + and predicate(n.parent(), m, l) + ) + # A N B A is the Nth child of B (the first child is >1). 
+ elif operator[0] == ">" and operator[1:].isdigit(): + idx = int(operator[1:]) + # capture the index parameter + retval = ( + lambda i: lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and 0 <= i < len(n.parent()) + and (n is n.parent()[i]) + and predicate(n.parent(), m, l) + ) + )(idx - 1) + # A <' B B is the last child of A (also synonymous with A <-1 B). + # A <- B B is the last child of A (synonymous with A <-1 B). + elif operator == "<'" or operator == "<-" or operator == "<-1": + retval = lambda n, m=None, l=None: ( + _istree(n) and bool(list(n)) and predicate(n[-1], m, l) + ) + # A >' B A is the last child of B (also synonymous with A >-1 B). + # A >- B A is the last child of B (synonymous with A >-1 B). + elif operator == ">'" or operator == ">-" or operator == ">-1": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and (n is n.parent()[-1]) + and predicate(n.parent(), m, l) + ) + # A <-N B B is the N th-to-last child of A (the last child is <-1). + elif operator[:2] == "<-" and operator[2:].isdigit(): + idx = -int(operator[2:]) + # capture the index parameter + retval = ( + lambda i: lambda n, m=None, l=None: ( + _istree(n) + and bool(list(n)) + and 0 <= (i + len(n)) < len(n) + and predicate(n[i + len(n)], m, l) + ) + )(idx) + # A >-N B A is the N th-to-last child of B (the last child is >-1). + elif operator[:2] == ">-" and operator[2:].isdigit(): + idx = -int(operator[2:]) + # capture the index parameter + retval = ( + lambda i: lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and 0 <= (i + len(n.parent())) < len(n.parent()) + and (n is n.parent()[i + len(n.parent())]) + and predicate(n.parent(), m, l) + ) + )(idx) + # A <: B B is the only child of A + elif operator == "<:": + retval = lambda n, m=None, l=None: ( + _istree(n) and len(n) == 1 and predicate(n[0], m, l) + ) + # A >: B A is the only child of B. 
+ elif operator == ">:": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and len(n.parent()) == 1 + and predicate(n.parent(), m, l) + ) + # A << B A dominates B (A is an ancestor of B). + elif operator == "<<": + retval = lambda n, m=None, l=None: ( + _istree(n) and any(predicate(x, m, l) for x in _descendants(n)) + ) + # A >> B A is dominated by B (A is a descendant of B). + elif operator == ">>": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in ancestors(n) + ) + # A <<, B B is a left-most descendant of A. + elif operator == "<<," or operator == "<<1": + retval = lambda n, m=None, l=None: ( + _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n)) + ) + # A >>, B A is a left-most descendant of B. + elif operator == ">>,": + retval = lambda n, m=None, l=None: any( + (predicate(x, m, l) and n in _leftmost_descendants(x)) + for x in ancestors(n) + ) + # A <<' B B is a right-most descendant of A. + elif operator == "<<'": + retval = lambda n, m=None, l=None: ( + _istree(n) + and any(predicate(x, m, l) for x in _rightmost_descendants(n)) + ) + # A >>' B A is a right-most descendant of B. + elif operator == ">>'": + retval = lambda n, m=None, l=None: any( + (predicate(x, m, l) and n in _rightmost_descendants(x)) + for x in ancestors(n) + ) + # A <<: B There is a single path of descent from A and B is on it. + elif operator == "<<:": + retval = lambda n, m=None, l=None: ( + _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n)) + ) + # A >>: B There is a single path of descent from B and A is on it. + elif operator == ">>:": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in unique_ancestors(n) + ) + # A . B A immediately precedes B. + elif operator == ".": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in _immediately_after(n) + ) + # A , B A immediately follows B. 
+ elif operator == ",": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in _immediately_before(n) + ) + # A .. B A precedes B. + elif operator == "..": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in _after(n) + ) + # A ,, B A follows B. + elif operator == ",,": + retval = lambda n, m=None, l=None: any( + predicate(x, m, l) for x in _before(n) + ) + # A $ B A is a sister of B (and A != B). + elif operator == "$" or operator == "%": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and bool(n.parent()) + and any(predicate(x, m, l) for x in n.parent() if x is not n) + ) + # A $. B A is a sister of and immediately precedes B. + elif operator == "$." or operator == "%.": + retval = lambda n, m=None, l=None: ( + hasattr(n, "right_sibling") + and bool(n.right_sibling()) + and predicate(n.right_sibling(), m, l) + ) + # A $, B A is a sister of and immediately follows B. + elif operator == "$," or operator == "%,": + retval = lambda n, m=None, l=None: ( + hasattr(n, "left_sibling") + and bool(n.left_sibling()) + and predicate(n.left_sibling(), m, l) + ) + # A $.. B A is a sister of and precedes B. + elif operator == "$.." or operator == "%..": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and hasattr(n, "parent_index") + and bool(n.parent()) + and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :]) + ) + # A $,, B A is a sister of and follows B. 
+ elif operator == "$,," or operator == "%,,": + retval = lambda n, m=None, l=None: ( + hasattr(n, "parent") + and hasattr(n, "parent_index") + and bool(n.parent()) + and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()]) + ) + else: + raise TgrepException(f'cannot interpret tgrep operator "{operator}"') + # now return the built function + if negated: + return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval) + else: + return retval + + +def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"): + """ + Builds a lambda function representing a predicate on a tree node + from the conjunction of several other such lambda functions. + + This is prototypically called for expressions like + (`tgrep_rel_conjunction`):: + + < NP & < AP < VP + + where tokens is a list of predicates representing the relations + (`< NP`, `< AP`, and `< VP`), possibly with the character `&` + included (as in the example here). + + This is also called for expressions like (`tgrep_node_expr2`):: + + NP < NN + S=s < /NP/=n : s < /VP/=v : n .. v + + tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional) + list of segmented patterns (`tgrep_expr_labeled`, processed by + `_tgrep_segmented_pattern_action`). + """ + # filter out the ampersand + tokens = [x for x in tokens if x != join_char] + if len(tokens) == 1: + return tokens[0] + else: + return ( + lambda ts: lambda n, m=None, l=None: all( + predicate(n, m, l) for predicate in ts + ) + )(tokens) + + +def _tgrep_segmented_pattern_action(_s, _l, tokens): + """ + Builds a lambda function representing a segmented pattern. + + Called for expressions like (`tgrep_expr_labeled`):: + + =s .. =v < =n + + This is a segmented pattern, a tgrep2 expression which begins with + a node label. 
+ + The problem is that for segemented_pattern_action (': =v < =s'), + the first element (in this case, =v) is specifically selected by + virtue of matching a particular node in the tree; to retrieve + the node, we need the label, not a lambda function. For node + labels inside a tgrep_node_expr, we need a lambda function which + returns true if the node visited is the same as =v. + + We solve this by creating two copies of a node_label_use in the + grammar; the label use inside a tgrep_expr_labeled has a separate + parse action to the pred use inside a node_expr. See + `_tgrep_node_label_use_action` and + `_tgrep_node_label_pred_use_action`. + """ + # tokens[0] is a string containing the node label + node_label = tokens[0] + # tokens[1:] is an (optional) list of predicates which must all + # hold of the bound node + reln_preds = tokens[1:] + + def pattern_segment_pred(n, m=None, l=None): + """This predicate function ignores its node argument.""" + # look up the bound node using its label + if l is None or node_label not in l: + raise TgrepException(f"node_label ={node_label} not bound in pattern") + node = l[node_label] + # match the relation predicates against the node + return all(pred(node, m, l) for pred in reln_preds) + + return pattern_segment_pred + + +def _tgrep_node_label_use_action(_s, _l, tokens): + """ + Returns the node label used to begin a tgrep_expr_labeled. See + `_tgrep_segmented_pattern_action`. + + Called for expressions like (`tgrep_node_label_use`):: + + =s + + when they appear as the first element of a `tgrep_expr_labeled` + expression (see `_tgrep_segmented_pattern_action`). + + It returns the node label. + """ + assert len(tokens) == 1 + assert tokens[0].startswith("=") + return tokens[0][1:] + + +def _tgrep_node_label_pred_use_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + which describes the use of a previously bound node label. 
+ + Called for expressions like (`tgrep_node_label_use_pred`):: + + =s + + when they appear inside a tgrep_node_expr (for example, inside a + relation). The predicate returns true if and only if its node + argument is identical the the node looked up in the node label + dictionary using the node's label. + """ + assert len(tokens) == 1 + assert tokens[0].startswith("=") + node_label = tokens[0][1:] + + def node_label_use_pred(n, m=None, l=None): + # look up the bound node using its label + if l is None or node_label not in l: + raise TgrepException(f"node_label ={node_label} not bound in pattern") + node = l[node_label] + # truth means the given node is this node + return n is node + + return node_label_use_pred + + +def _tgrep_bind_node_label_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + which can optionally bind a matching node into the tgrep2 string's + label_dict. + + Called for expressions like (`tgrep_node_expr2`):: + + /NP/ + @NP=n + """ + # tokens[0] is a tgrep_node_expr + if len(tokens) == 1: + return tokens[0] + else: + # if present, tokens[1] is the character '=', and tokens[2] is + # a tgrep_node_label, a string value containing the node label + assert len(tokens) == 3 + assert tokens[1] == "=" + node_pred = tokens[0] + node_label = tokens[2] + + def node_label_bind_pred(n, m=None, l=None): + if node_pred(n, m, l): + # bind `n` into the dictionary `l` + if l is None: + raise TgrepException( + "cannot bind node_label {}: label_dict is None".format( + node_label + ) + ) + l[node_label] = n + return True + else: + return False + + return node_label_bind_pred + + +def _tgrep_rel_disjunction_action(_s, _l, tokens): + """ + Builds a lambda function representing a predicate on a tree node + from the disjunction of several other such lambda functions. 
+ """ + # filter out the pipe + tokens = [x for x in tokens if x != "|"] + if len(tokens) == 1: + return tokens[0] + elif len(tokens) == 2: + return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))( + tokens[0], tokens[1] + ) + + +def _macro_defn_action(_s, _l, tokens): + """ + Builds a dictionary structure which defines the given macro. + """ + assert len(tokens) == 3 + assert tokens[0] == "@" + return {tokens[1]: tokens[2]} + + +def _tgrep_exprs_action(_s, _l, tokens): + """ + This is the top-lebel node in a tgrep2 search string; the + predicate function it returns binds together all the state of a + tgrep2 search string. + + Builds a lambda function representing a predicate on a tree node + from the disjunction of several tgrep expressions. Also handles + macro definitions and macro name binding, and node label + definitions and node label binding. + """ + if len(tokens) == 1: + return lambda n, m=None, l=None: tokens[0](n, None, {}) + # filter out all the semicolons + tokens = [x for x in tokens if x != ";"] + # collect all macro definitions + macro_dict = {} + macro_defs = [tok for tok in tokens if isinstance(tok, dict)] + for macro_def in macro_defs: + macro_dict.update(macro_def) + # collect all tgrep expressions + tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)] + # create a new scope for the node label dictionary + def top_level_pred(n, m=macro_dict, l=None): + label_dict = {} + # bind macro definitions and OR together all tgrep_exprs + return any(predicate(n, m, label_dict) for predicate in tgrep_exprs) + + return top_level_pred + + +def _build_tgrep_parser(set_parse_actions=True): + """ + Builds a pyparsing-based parser object for tokenizing and + interpreting tgrep search strings. 
+ """ + tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*") + tgrep_qstring = pyparsing.QuotedString( + quoteChar='"', escChar="\\", unquoteResults=False + ) + tgrep_node_regex = pyparsing.QuotedString( + quoteChar="/", escChar="\\", unquoteResults=False + ) + tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"') + tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/") + tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+") + tgrep_expr = pyparsing.Forward() + tgrep_relations = pyparsing.Forward() + tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")" + tgrep_nltk_tree_pos = ( + pyparsing.Literal("N(") + + pyparsing.Optional( + pyparsing.Word(pyparsing.nums) + + "," + + pyparsing.Optional( + pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",") + + pyparsing.Optional(",") + ) + ) + + ")" + ) + tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+") + tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label) + # see _tgrep_segmented_pattern_action + tgrep_node_label_use_pred = tgrep_node_label_use.copy() + macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+") + macro_name.setWhitespaceChars("") + macro_use = pyparsing.Combine("@" + macro_name) + tgrep_node_expr = ( + tgrep_node_label_use_pred + | macro_use + | tgrep_nltk_tree_pos + | tgrep_qstring_icase + | tgrep_node_regex_icase + | tgrep_qstring + | tgrep_node_regex + | "*" + | tgrep_node_literal + ) + tgrep_node_expr2 = ( + tgrep_node_expr + + pyparsing.Literal("=").setWhitespaceChars("") + + tgrep_node_label.copy().setWhitespaceChars("") + ) | tgrep_node_expr + tgrep_node = tgrep_parens | ( + pyparsing.Optional("'") + + tgrep_node_expr2 + + pyparsing.ZeroOrMore("|" + tgrep_node_expr) + ) + tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]" + tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node) + tgrep_rel_conjunction = pyparsing.Forward() + tgrep_rel_conjunction << 
( + tgrep_relation + + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction) + ) + tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( + "|" + tgrep_relations + ) + tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) + tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations) + tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled) + macro_defn = ( + pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2 + ) + tgrep_exprs = ( + pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";") + + tgrep_expr2 + + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2)) + + pyparsing.ZeroOrMore(";").suppress() + ) + if set_parse_actions: + tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action) + tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action) + macro_use.setParseAction(_tgrep_macro_use_action) + tgrep_node.setParseAction(_tgrep_node_action) + tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action) + tgrep_parens.setParseAction(_tgrep_parens_action) + tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action) + tgrep_relation.setParseAction(_tgrep_relation_action) + tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action) + tgrep_relations.setParseAction(_tgrep_rel_disjunction_action) + macro_defn.setParseAction(_macro_defn_action) + # the whole expression is also the conjunction of two + # predicates: the first node predicate, and the remaining + # relation predicates + tgrep_expr.setParseAction(_tgrep_conjunction_action) + tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action) + tgrep_expr2.setParseAction( + functools.partial(_tgrep_conjunction_action, join_char=":") + ) + tgrep_exprs.setParseAction(_tgrep_exprs_action) + return tgrep_exprs.ignore("#" + pyparsing.restOfLine) + + +def tgrep_tokenize(tgrep_string): + """ + Tokenizes a TGrep search string into separate tokens. 
+ """ + parser = _build_tgrep_parser(False) + if isinstance(tgrep_string, bytes): + tgrep_string = tgrep_string.decode() + return list(parser.parseString(tgrep_string)) + + +def tgrep_compile(tgrep_string): + """ + Parses (and tokenizes, if necessary) a TGrep search string into a + lambda function. + """ + parser = _build_tgrep_parser(True) + if isinstance(tgrep_string, bytes): + tgrep_string = tgrep_string.decode() + return list(parser.parseString(tgrep_string, parseAll=True))[0] + + +def treepositions_no_leaves(tree): + """ + Returns all the tree positions in the given tree which are not + leaf nodes. + """ + treepositions = tree.treepositions() + # leaves are treeposition tuples that are not prefixes of any + # other treeposition + prefixes = set() + for pos in treepositions: + for length in range(len(pos)): + prefixes.add(pos[:length]) + return [pos for pos in treepositions if pos in prefixes] + + +def tgrep_positions(pattern, trees, search_leaves=True): + """ + Return the tree positions in the trees which match the given pattern. + + :param pattern: a tgrep search pattern + :type pattern: str or output of tgrep_compile() + :param trees: a sequence of NLTK trees (usually ParentedTrees) + :type trees: iter(ParentedTree) or iter(Tree) + :param search_leaves: whether to return matching leaf nodes + :type search_leaves: bool + :rtype: iter(tree positions) + """ + + if isinstance(pattern, (bytes, str)): + pattern = tgrep_compile(pattern) + + for tree in trees: + try: + if search_leaves: + positions = tree.treepositions() + else: + positions = treepositions_no_leaves(tree) + yield [position for position in positions if pattern(tree[position])] + except AttributeError: + yield [] + + +def tgrep_nodes(pattern, trees, search_leaves=True): + """ + Return the tree nodes in the trees which match the given pattern. 
+ + :param pattern: a tgrep search pattern + :type pattern: str or output of tgrep_compile() + :param trees: a sequence of NLTK trees (usually ParentedTrees) + :type trees: iter(ParentedTree) or iter(Tree) + :param search_leaves: whether to return matching leaf nodes + :type search_leaves: bool + :rtype: iter(tree nodes) + """ + + if isinstance(pattern, (bytes, str)): + pattern = tgrep_compile(pattern) + + for tree in trees: + try: + if search_leaves: + positions = tree.treepositions() + else: + positions = treepositions_no_leaves(tree) + yield [tree[position] for position in positions if pattern(tree[position])] + except AttributeError: + yield [] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sexpr.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sexpr.py new file mode 100644 index 0000000000000000000000000000000000000000..f315d9a6aa7b854567cbf2b573dea2d3555f8a64 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sexpr.py @@ -0,0 +1,140 @@ +# Natural Language Toolkit: Tokenizers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Yoav Goldberg +# Steven Bird (minor edits) +# URL: +# For license information, see LICENSE.TXT + +""" +S-Expression Tokenizer + +``SExprTokenizer`` is used to find parenthesized expressions in a +string. In particular, it divides a string into a sequence of +substrings that are either parenthesized expressions (including any +nested parenthesized expressions), or other whitespace-separated +tokens. + + >>> from nltk.tokenize import SExprTokenizer + >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') + ['(a b (c d))', 'e', 'f', '(g)'] + +By default, `SExprTokenizer` will raise a ``ValueError`` exception if +used to tokenize an expression with non-matching parentheses: + + >>> SExprTokenizer().tokenize('c) d) e (f (g') + Traceback (most recent call last): + ... + ValueError: Un-matched close paren at char 1 + +The ``strict`` argument can be set to False to allow for +non-matching parentheses. 
Any unmatched close parentheses will be +listed as their own s-expression; and the last partial sexpr with +unmatched open parentheses will be listed as its own sexpr: + + >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') + ['c', ')', 'd', ')', 'e', '(f (g'] + +The characters used for open and close parentheses may be customized +using the ``parens`` argument to the `SExprTokenizer` constructor: + + >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}') + ['{a b {c d}}', 'e', 'f', '{g}'] + +The s-expression tokenizer is also available as a function: + + >>> from nltk.tokenize import sexpr_tokenize + >>> sexpr_tokenize('(a b (c d)) e f (g)') + ['(a b (c d))', 'e', 'f', '(g)'] + +""" + +import re + +from nltk.tokenize.api import TokenizerI + + +class SExprTokenizer(TokenizerI): + """ + A tokenizer that divides strings into s-expressions. + An s-expresion can be either: + + - a parenthesized expression, including any nested parenthesized + expressions, or + - a sequence of non-whitespace non-parenthesis characters. + + For example, the string ``(a (b c)) d e (f)`` consists of four + s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``. + + By default, the characters ``(`` and ``)`` are treated as open and + close parentheses, but alternative strings may be specified. + + :param parens: A two-element sequence specifying the open and close parentheses + that should be used to find sexprs. This will typically be either a + two-character string, or a list of two strings. + :type parens: str or list + :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr. 
+ """ + + def __init__(self, parens="()", strict=True): + if len(parens) != 2: + raise ValueError("parens must contain exactly two strings") + self._strict = strict + self._open_paren = parens[0] + self._close_paren = parens[1] + self._paren_regexp = re.compile( + f"{re.escape(parens[0])}|{re.escape(parens[1])}" + ) + + def tokenize(self, text): + """ + Return a list of s-expressions extracted from *text*. + For example: + + >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') + ['(a b (c d))', 'e', 'f', '(g)'] + + All parentheses are assumed to mark s-expressions. + (No special processing is done to exclude parentheses that occur + inside strings, or following backslash characters.) + + If the given expression contains non-matching parentheses, + then the behavior of the tokenizer depends on the ``strict`` + parameter to the constructor. If ``strict`` is ``True``, then + raise a ``ValueError``. If ``strict`` is ``False``, then any + unmatched close parentheses will be listed as their own + s-expression; and the last partial s-expression with unmatched open + parentheses will be listed as its own s-expression: + + >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') + ['c', ')', 'd', ')', 'e', '(f (g'] + + :param text: the string to be tokenized + :type text: str or iter(str) + :rtype: iter(str) + """ + result = [] + pos = 0 + depth = 0 + for m in self._paren_regexp.finditer(text): + paren = m.group() + if depth == 0: + result += text[pos : m.start()].split() + pos = m.start() + if paren == self._open_paren: + depth += 1 + if paren == self._close_paren: + if self._strict and depth == 0: + raise ValueError("Un-matched close paren at char %d" % m.start()) + depth = max(0, depth - 1) + if depth == 0: + result.append(text[pos : m.end()]) + pos = m.end() + if self._strict and depth > 0: + raise ValueError("Un-matched open paren at char %d" % pos) + if pos < len(text): + result.append(text[pos:]) + return result + + +sexpr_tokenize = SExprTokenizer().tokenize diff 
--git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/simple.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/simple.py new file mode 100644 index 0000000000000000000000000000000000000000..d27a0b4bd5f444b815806f1120dc5fe4fab9b3e3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/simple.py @@ -0,0 +1,137 @@ +# Natural Language Toolkit: Simple Tokenizers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +r""" +Simple Tokenizers + +These tokenizers divide strings into substrings using the string +``split()`` method. +When tokenizing using a particular delimiter string, use +the string ``split()`` method directly, as this is more efficient. + +The simple tokenizers are *not* available as separate functions; +instead, you should just use the string ``split()`` method directly: + + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." + >>> s.split() # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', + 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] + >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', + 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] + >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE + ['Good muffins cost $3.88', 'in New York. Please buy me', + 'two of them.', '', 'Thanks.'] + +The simple tokenizers are mainly useful because they follow the +standard ``TokenizerI`` interface, and so can be used with any code +that expects a tokenizer. For example, these tokenizers can be used +to specify the tokenization conventions when building a `CorpusReader`. 
+ +""" + +from nltk.tokenize.api import StringTokenizer, TokenizerI +from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize + + +class SpaceTokenizer(StringTokenizer): + r"""Tokenize a string using the space character as a delimiter, + which is the same as ``s.split(' ')``. + + >>> from nltk.tokenize import SpaceTokenizer + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." + >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', + 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] + """ + + _string = " " + + +class TabTokenizer(StringTokenizer): + r"""Tokenize a string use the tab character as a delimiter, + the same as ``s.split('\t')``. + + >>> from nltk.tokenize import TabTokenizer + >>> TabTokenizer().tokenize('a\tb c\n\t d') + ['a', 'b c\n', ' d'] + """ + + _string = "\t" + + +class CharTokenizer(StringTokenizer): + """Tokenize a string into individual characters. If this functionality + is ever required directly, use ``for char in string``. + """ + + def tokenize(self, s): + return list(s) + + def span_tokenize(self, s): + yield from enumerate(range(1, len(s) + 1)) + + +class LineTokenizer(TokenizerI): + r"""Tokenize a string into its lines, optionally discarding blank lines. + This is similar to ``s.split('\n')``. + + >>> from nltk.tokenize import LineTokenizer + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." + >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good muffins cost $3.88', 'in New York. Please buy me', + 'two of them.', '', 'Thanks.'] + >>> # same as [l for l in s.split('\n') if l.strip()]: + >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good muffins cost $3.88', 'in New York. Please buy me', + 'two of them.', 'Thanks.'] + + :param blanklines: Indicates how blank lines should be handled. 
Valid values are: + + - ``discard``: strip blank lines out of the token list before returning it. + A line is considered blank if it contains only whitespace characters. + - ``keep``: leave all blank lines in the token list. + - ``discard-eof``: if the string ends with a newline, then do not generate + a corresponding token ``''`` after that newline. + """ + + def __init__(self, blanklines="discard"): + valid_blanklines = ("discard", "keep", "discard-eof") + if blanklines not in valid_blanklines: + raise ValueError( + "Blank lines must be one of: %s" % " ".join(valid_blanklines) + ) + + self._blanklines = blanklines + + def tokenize(self, s): + lines = s.splitlines() + # If requested, strip off blank lines. + if self._blanklines == "discard": + lines = [l for l in lines if l.rstrip()] + elif self._blanklines == "discard-eof": + if lines and not lines[-1].strip(): + lines.pop() + return lines + + # discard-eof not implemented + def span_tokenize(self, s): + if self._blanklines == "keep": + yield from string_span_tokenize(s, r"\n") + else: + yield from regexp_span_tokenize(s, r"\n(\s+\n)*") + + +###################################################################### +# { Tokenization Functions +###################################################################### +# XXX: it is stated in module docs that there is no function versions + + +def line_tokenize(text, blanklines="discard"): + return LineTokenizer(blanklines).tokenize(text) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford_segmenter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford_segmenter.py new file mode 100644 index 0000000000000000000000000000000000000000..ea09c5eccaa9c18f2bcba374c0488d65f7afdcd3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford_segmenter.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python +# Natural Language Toolkit: Interface to the Stanford Segmenter +# for Chinese and Arabic +# +# Copyright (C) 2001-2022 NLTK Project +# Author: 52nlp <52nlpcn@gmail.com> 
+# Casper Lehmann-Strøm +# Alex Constantin +# +# URL: +# For license information, see LICENSE.TXT + +import json +import os +import tempfile +import warnings +from subprocess import PIPE + +from nltk.internals import ( + _java_options, + config_java, + find_dir, + find_file, + find_jar, + java, +) +from nltk.tokenize.api import TokenizerI + +_stanford_url = "https://nlp.stanford.edu/software" + + +class StanfordSegmenter(TokenizerI): + """Interface to the Stanford Segmenter + + If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j + should be provieded, for example:: + + seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar') + + >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter + >>> seg = StanfordSegmenter() # doctest: +SKIP + >>> seg.default_config('zh') # doctest: +SKIP + >>> sent = u'这是斯坦福中文分词器测试' + >>> print(seg.segment(sent)) # doctest: +SKIP + \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5 + + >>> seg.default_config('ar') # doctest: +SKIP + >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات' + >>> print(seg.segment(sent.split())) # doctest: +SKIP + \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a + + """ + + _JAR = "stanford-segmenter.jar" + + def __init__( + self, + path_to_jar=None, + path_to_slf4j=None, + java_class=None, + path_to_model=None, + path_to_dict=None, + path_to_sihan_corpora_dict=None, + sihan_post_processing="false", + keep_whitespaces="false", + encoding="UTF-8", + options=None, + verbose=False, + java_options="-mx2g", + ): + # Raise deprecation warning. 
+ warnings.simplefilter("always", DeprecationWarning) + warnings.warn( + str( + "\nThe StanfordTokenizer will " + "be deprecated in version 3.2.5.\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'" + ), + DeprecationWarning, + stacklevel=2, + ) + warnings.simplefilter("ignore", DeprecationWarning) + + stanford_segmenter = find_jar( + self._JAR, + path_to_jar, + env_vars=("STANFORD_SEGMENTER",), + searchpath=(), + url=_stanford_url, + verbose=verbose, + ) + if path_to_slf4j is not None: + slf4j = find_jar( + "slf4j-api.jar", + path_to_slf4j, + env_vars=("SLF4J", "STANFORD_SEGMENTER"), + searchpath=(), + url=_stanford_url, + verbose=verbose, + ) + else: + slf4j = None + + # This is passed to java as the -cp option, the old version of segmenter needs slf4j. + # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j + self._stanford_jar = os.pathsep.join( + _ for _ in [stanford_segmenter, slf4j] if _ is not None + ) + + self._java_class = java_class + self._model = path_to_model + self._sihan_corpora_dict = path_to_sihan_corpora_dict + self._sihan_post_processing = sihan_post_processing + self._keep_whitespaces = keep_whitespaces + self._dict = path_to_dict + + self._encoding = encoding + self.java_options = java_options + options = {} if options is None else options + self._options_cmd = ",".join( + f"{key}={json.dumps(val)}" for key, val in options.items() + ) + + def default_config(self, lang): + """ + Attempt to initialize Stanford Word Segmenter for the specified language + using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables + """ + + search_path = () + if os.environ.get("STANFORD_SEGMENTER"): + search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")} + + # init for Chinese-specific files + self._dict = None + self._sihan_corpora_dict = None + self._sihan_post_processing = "false" + + if lang == "ar": + self._java_class = ( + 
"edu.stanford.nlp.international.arabic.process.ArabicSegmenter" + ) + model = "arabic-segmenter-atb+bn+arztrain.ser.gz" + + elif lang == "zh": + self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier" + model = "pku.gz" + self._sihan_post_processing = "true" + + path_to_dict = "dict-chris6.ser.gz" + try: + self._dict = find_file( + path_to_dict, + searchpath=search_path, + url=_stanford_url, + verbose=False, + env_vars=("STANFORD_MODELS",), + ) + except LookupError as e: + raise LookupError( + "Could not find '%s' (tried using env. " + "variables STANFORD_MODELS and /data/)" + % path_to_dict + ) from e + + sihan_dir = "./data/" + try: + path_to_sihan_dir = find_dir( + sihan_dir, + url=_stanford_url, + verbose=False, + env_vars=("STANFORD_SEGMENTER",), + ) + self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir) + except LookupError as e: + raise LookupError( + "Could not find '%s' (tried using the " + "STANFORD_SEGMENTER environment variable)" % sihan_dir + ) from e + else: + raise LookupError(f"Unsupported language {lang}") + + try: + self._model = find_file( + model, + searchpath=search_path, + url=_stanford_url, + verbose=False, + env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"), + ) + except LookupError as e: + raise LookupError( + "Could not find '%s' (tried using env. 
" + "variables STANFORD_MODELS and /data/)" % model + ) from e + + def tokenize(self, s): + super().tokenize(s) + + def segment_file(self, input_file_path): + """ """ + cmd = [ + self._java_class, + "-loadClassifier", + self._model, + "-keepAllWhitespaces", + self._keep_whitespaces, + "-textFile", + input_file_path, + ] + if self._sihan_corpora_dict is not None: + cmd.extend( + [ + "-serDictionary", + self._dict, + "-sighanCorporaDict", + self._sihan_corpora_dict, + "-sighanPostProcessing", + self._sihan_post_processing, + ] + ) + + stdout = self._execute(cmd) + + return stdout + + def segment(self, tokens): + return self.segment_sents([tokens]) + + def segment_sents(self, sentences): + """ """ + encoding = self._encoding + # Create a temporary input file + _input_fh, self._input_file_path = tempfile.mkstemp(text=True) + + # Write the actural sentences to the temporary input file + _input_fh = os.fdopen(_input_fh, "wb") + _input = "\n".join(" ".join(x) for x in sentences) + if isinstance(_input, str) and encoding: + _input = _input.encode(encoding) + _input_fh.write(_input) + _input_fh.close() + + cmd = [ + self._java_class, + "-loadClassifier", + self._model, + "-keepAllWhitespaces", + self._keep_whitespaces, + "-textFile", + self._input_file_path, + ] + if self._sihan_corpora_dict is not None: + cmd.extend( + [ + "-serDictionary", + self._dict, + "-sighanCorporaDict", + self._sihan_corpora_dict, + "-sighanPostProcessing", + self._sihan_post_processing, + ] + ) + + stdout = self._execute(cmd) + + # Delete the temporary file + os.unlink(self._input_file_path) + + return stdout + + def _execute(self, cmd, verbose=False): + encoding = self._encoding + cmd.extend(["-inputEncoding", encoding]) + _options_cmd = self._options_cmd + if _options_cmd: + cmd.extend(["-options", self._options_cmd]) + + default_options = " ".join(_java_options) + + # Configure java. 
+ config_java(options=self.java_options, verbose=verbose) + + stdout, _stderr = java( + cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE + ) + stdout = stdout.decode(encoding) + + # Return java configurations to their default values. + config_java(options=default_options, verbose=False) + + return stdout diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/toktok.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/toktok.py new file mode 100644 index 0000000000000000000000000000000000000000..4229a7327743ad9788449a82c8d2350b9c8db392 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/toktok.py @@ -0,0 +1,179 @@ +# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer. +# +# Copyright (C) 2001-2015 NLTK Project +# Author: Jon Dehdari +# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters +# +# URL: +# For license information, see LICENSE.TXT + +""" +The tok-tok tokenizer is a simple, general tokenizer, where the input has one +sentence per line; thus only final period is tokenized. + +Tok-tok has been tested on, and gives reasonably good results for English, +Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. +The input should be in UTF-8 encoding. + +Reference: +Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language +Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University. +""" + +import re + +from nltk.tokenize.api import TokenizerI + + +class ToktokTokenizer(TokenizerI): + """ + This is a Python port of the tok-tok.pl from + https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl + + >>> toktok = ToktokTokenizer() + >>> text = u'Is 9.5 or 525,600 my favorite number?' + >>> print(toktok.tokenize(text, return_str=True)) + Is 9.5 or 525,600 my favorite number ? 
+ >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things' + >>> print(toktok.tokenize(text, return_str=True)) + The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things + >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf' + >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf' + >>> assert toktok.tokenize(text, return_str=True) == expected + >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf'] + True + """ + + # Replace non-breaking spaces with normal spaces. + NON_BREAKING = re.compile("\u00A0"), " " + + # Pad some funky punctuation. + FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 " + # Pad more funky punctuation. + FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 " + # Pad En dash and em dash + EN_EM_DASHES = re.compile("([–—])"), r" \1 " + + # Replace problematic character with numeric character reference. + AMPERCENT = re.compile("& "), "& " + TAB = re.compile("\t"), " " + PIPE = re.compile(r"\|"), " | " + + # Pad numbers with commas to keep them from further tokenization. + COMMA_IN_NUM = re.compile(r"(? "something ..." + # "something." -> "something ." + FINAL_PERIOD_1 = re.compile(r"(? "... stuff ." + FINAL_PERIOD_2 = re.compile(r"""(? +# Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) +# Tom Aarsen <> (modifications) +# +# URL: +# For license information, see LICENSE.TXT + +r""" + +Penn Treebank Tokenizer + +The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. +This implementation is a port of the tokenizer sed script written by Robert McIntyre +and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed. 
+""" + +import re +import warnings +from typing import Iterator, List, Tuple + +from nltk.tokenize.api import TokenizerI +from nltk.tokenize.destructive import MacIntyreContractions +from nltk.tokenize.util import align_tokens + + +class TreebankWordTokenizer(TokenizerI): + r""" + The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. + + This tokenizer performs the following steps: + + - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll`` + - treat most punctuation characters as separate tokens + - split off commas and single quotes, when followed by whitespace + - separate periods that appear at the end of line + + >>> from nltk.tokenize import TreebankWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> TreebankWordTokenizer().tokenize(s) + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] + >>> s = "They'll save and invest more." + >>> TreebankWordTokenizer().tokenize(s) + ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] + >>> s = "hi, my name can't hello," + >>> TreebankWordTokenizer().tokenize(s) + ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] + """ + + # starting quotes + STARTING_QUOTES = [ + (re.compile(r"^\""), r"``"), + (re.compile(r"(``)"), r" \1 "), + (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), + ] + + # punctuation + PUNCTUATION = [ + (re.compile(r"([:,])([^\d])"), r" \1 \2"), + (re.compile(r"([:,])$"), r" \1 "), + (re.compile(r"\.\.\."), r" ... "), + (re.compile(r"[;@#$%&]"), r" \g<0> "), + ( + re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), + r"\1 \2\3 ", + ), # Handles the final period. + (re.compile(r"[?!]"), r" \g<0> "), + (re.compile(r"([^'])' "), r"\1 ' "), + ] + + # Pads parentheses + PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") + + # Optionally: Convert parentheses, brackets and converts them to PTB symbols. 
+ CONVERT_PARENTHESES = [ + (re.compile(r"\("), "-LRB-"), + (re.compile(r"\)"), "-RRB-"), + (re.compile(r"\["), "-LSB-"), + (re.compile(r"\]"), "-RSB-"), + (re.compile(r"\{"), "-LCB-"), + (re.compile(r"\}"), "-RCB-"), + ] + + DOUBLE_DASHES = (re.compile(r"--"), r" -- ") + + # ending quotes + ENDING_QUOTES = [ + (re.compile(r"''"), " '' "), + (re.compile(r'"'), " '' "), + (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), + (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), + ] + + # List of contractions adapted from Robert MacIntyre's tokenizer. + _contractions = MacIntyreContractions() + CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) + CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) + + def tokenize( + self, text: str, convert_parentheses: bool = False, return_str: bool = False + ) -> List[str]: + r"""Return a tokenized copy of `text`. + + >>> from nltk.tokenize import TreebankWordTokenizer + >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', + 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', + 'of', 'them.', 'Thanks', '.'] + >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', + 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', + 'of', 'them.', 'Thanks', '.'] + + :param text: A string with a sentence or sentences. + :type text: str + :param convert_parentheses: if True, replace parentheses to PTB symbols, + e.g. `(` to `-LRB-`. Defaults to False. + :type convert_parentheses: bool, optional + :param return_str: If True, return tokens as space-separated string, + defaults to False. 
+ :type return_str: bool, optional + :return: List of tokens from `text`. + :rtype: List[str] + """ + if return_str is not False: + warnings.warn( + "Parameter 'return_str' has been deprecated and should no " + "longer be used.", + category=DeprecationWarning, + stacklevel=2, + ) + + for regexp, substitution in self.STARTING_QUOTES: + text = regexp.sub(substitution, text) + + for regexp, substitution in self.PUNCTUATION: + text = regexp.sub(substitution, text) + + # Handles parentheses. + regexp, substitution = self.PARENS_BRACKETS + text = regexp.sub(substitution, text) + # Optionally convert parentheses + if convert_parentheses: + for regexp, substitution in self.CONVERT_PARENTHESES: + text = regexp.sub(substitution, text) + + # Handles double dash. + regexp, substitution = self.DOUBLE_DASHES + text = regexp.sub(substitution, text) + + # add extra space to make things easier + text = " " + text + " " + + for regexp, substitution in self.ENDING_QUOTES: + text = regexp.sub(substitution, text) + + for regexp in self.CONTRACTIONS2: + text = regexp.sub(r" \1 \2 ", text) + for regexp in self.CONTRACTIONS3: + text = regexp.sub(r" \1 \2 ", text) + + # We are not using CONTRACTIONS4 since + # they are also commented out in the SED scripts + # for regexp in self._contractions.CONTRACTIONS4: + # text = regexp.sub(r' \1 \2 \3 ', text) + + return text.split() + + def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: + r""" + Returns the spans of the tokens in ``text``. + Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. + + >>> from nltk.tokenize import TreebankWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... 
(60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected + True + + :param text: A string with a sentence or sentences. + :type text: str + :yield: Tuple[int, int] + """ + raw_tokens = self.tokenize(text) + + # Convert converted quotes back to original double quotes + # Do this only if original text contains double quote(s) or double + # single-quotes (because '' might be transformed to `` if it is + # treated as starting quotes). + if ('"' in text) or ("''" in text): + # Find double quotes and converted quotes + matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] + + # Replace converted quotes back to double quotes + tokens = [ + matched.pop(0) if tok in ['"', "``", "''"] else tok + for tok in raw_tokens + ] + else: + tokens = raw_tokens + + yield from align_tokens(tokens, text) + + +class TreebankWordDetokenizer(TokenizerI): + r""" + The Treebank detokenizer uses the reverse regex operations corresponding to + the Treebank tokenizer's regexes. + + Note: + + - There're additional assumption mades when undoing the padding of ``[;@#$%&]`` + punctuation symbols that isn't presupposed in the TreebankTokenizer. + - There're additional regexes added in reversing the parentheses tokenization, + such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right + padding added to the closing parentheses precedding ``[:;,.]``. + - It's not possible to return the original whitespaces as they were because + there wasn't explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at + the text.split() operation. 
+ + >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> d = TreebankWordDetokenizer() + >>> t = TreebankWordTokenizer() + >>> toks = t.tokenize(s) + >>> d.detokenize(toks) + 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.' + + The MXPOST parentheses substitution can be undone using the ``convert_parentheses`` + parameter: + + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy', + ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.'] + >>> expected_tokens == t.tokenize(s, convert_parentheses=True) + True + >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).' + >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True) + True + + During tokenization it's safe to add more spaces but during detokenization, + simply undoing the padding doesn't really help. + + - During tokenization, left and right pad is added to ``[!?]``, when + detokenizing, only left shift the ``[!?]`` is needed. + Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``. + + - During tokenization ``[:,]`` are left and right padded but when detokenizing, + only left shift is necessary and we keep right pad after comma/colon + if the string after is a non-digit. + Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``. + + >>> from nltk.tokenize.treebank import TreebankWordDetokenizer + >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!'] + >>> twd = TreebankWordDetokenizer() + >>> twd.detokenize(toks) + "hello, i can't feel my feet! Help!!" + + >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!', + ... 
'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!'] + >>> twd.detokenize(toks) + "hello, i can't feel; my feet! Help!! He said: Help, help?!" + """ + + _contractions = MacIntyreContractions() + CONTRACTIONS2 = [ + re.compile(pattern.replace("(?#X)", r"\s")) + for pattern in _contractions.CONTRACTIONS2 + ] + CONTRACTIONS3 = [ + re.compile(pattern.replace("(?#X)", r"\s")) + for pattern in _contractions.CONTRACTIONS3 + ] + + # ending quotes + ENDING_QUOTES = [ + (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "), + (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "), + (re.compile(r"(\S)\s(\'\')"), r"\1\2"), + ( + re.compile(r"(\'\')\s([.,:)\]>};%])"), + r"\1\2", + ), # Quotes followed by no-left-padded punctuations. + (re.compile(r"''"), '"'), + ] + + # Handles double dashes + DOUBLE_DASHES = (re.compile(r" -- "), r"--") + + # Optionally: Convert parentheses, brackets and converts them from PTB symbols. + CONVERT_PARENTHESES = [ + (re.compile("-LRB-"), "("), + (re.compile("-RRB-"), ")"), + (re.compile("-LSB-"), "["), + (re.compile("-RSB-"), "]"), + (re.compile("-LCB-"), "{"), + (re.compile("-RCB-"), "}"), + ] + + # Undo padding on parentheses. + PARENS_BRACKETS = [ + (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"), + (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"), + (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"), + ] + + # punctuation + PUNCTUATION = [ + (re.compile(r"([^'])\s'\s"), r"\1' "), + (re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!] + # (re.compile(r'\s([?!])\s'), r'\g<1>'), + (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"), + # When tokenizing, [;@#$%&] are padded with whitespace regardless of + # whether there are spaces before or after them. + # But during detokenization, we need to distinguish between left/right + # pad, so we split this up. + (re.compile(r"([#$])\s"), r"\g<1>"), # Left pad. + (re.compile(r"\s([;%])"), r"\g<1>"), # Right pad. 
+ # (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad. + (re.compile(r"\s\.\.\.\s"), r"..."), + # (re.compile(r"\s([:,])\s$"), r"\1"), # .strip() takes care of it. + ( + re.compile(r"\s([:,])"), + r"\1", + ), # Just remove left padding. Punctuation in numbers won't be padded. + ] + + # starting quotes + STARTING_QUOTES = [ + (re.compile(r"([ (\[{<])\s``"), r"\1``"), + (re.compile(r"(``)\s"), r"\1"), + (re.compile(r"``"), r'"'), + ] + + def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: + """ + Treebank detokenizer, created by undoing the regexes from + the TreebankWordTokenizer.tokenize. + + :param tokens: A list of strings, i.e. tokenized text. + :type tokens: List[str] + :param convert_parentheses: if True, replace PTB symbols with parentheses, + e.g. `-LRB-` to `(`. Defaults to False. + :type convert_parentheses: bool, optional + :return: str + """ + text = " ".join(tokens) + + # Add extra space to make things easier + text = " " + text + " " + + # Reverse the contractions regexes. + # Note: CONTRACTIONS4 are not used in tokenization. + for regexp in self.CONTRACTIONS3: + text = regexp.sub(r"\1\2", text) + for regexp in self.CONTRACTIONS2: + text = regexp.sub(r"\1\2", text) + + # Reverse the regexes applied for ending quotes. + for regexp, substitution in self.ENDING_QUOTES: + text = regexp.sub(substitution, text) + + # Undo the space padding. + text = text.strip() + + # Reverse the padding on double dashes. + regexp, substitution = self.DOUBLE_DASHES + text = regexp.sub(substitution, text) + + if convert_parentheses: + for regexp, substitution in self.CONVERT_PARENTHESES: + text = regexp.sub(substitution, text) + + # Reverse the padding regexes applied for parenthesis/brackets. + for regexp, substitution in self.PARENS_BRACKETS: + text = regexp.sub(substitution, text) + + # Reverse the regexes applied for punctuations. 
+ for regexp, substitution in self.PUNCTUATION: + text = regexp.sub(substitution, text) + + # Reverse the regexes applied for starting quotes. + for regexp, substitution in self.STARTING_QUOTES: + text = regexp.sub(substitution, text) + + return text.strip() + + def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: + """Duck-typing the abstract *tokenize()*.""" + return self.tokenize(tokens, convert_parentheses) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/util.py new file mode 100644 index 0000000000000000000000000000000000000000..d5481b82c8dccd9bb20c7bc5e6aa617839fbd2ff --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/util.py @@ -0,0 +1,295 @@ +# Natural Language Toolkit: Tokenizer Utilities +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from re import finditer +from xml.sax.saxutils import escape, unescape + + +def string_span_tokenize(s, sep): + r""" + Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` + tuples, by splitting the string at each occurrence of *sep*. + + >>> from nltk.tokenize.util import string_span_tokenize + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me + ... 
two of them.\n\nThanks.''' + >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE + [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37), + (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)] + + :param s: the string to be tokenized + :type s: str + :param sep: the token separator + :type sep: str + :rtype: iter(tuple(int, int)) + """ + if len(sep) == 0: + raise ValueError("Token delimiter must not be empty") + left = 0 + while True: + try: + right = s.index(sep, left) + if right != 0: + yield left, right + except ValueError: + if left != len(s): + yield left, len(s) + break + + left = right + len(sep) + + +def regexp_span_tokenize(s, regexp): + r""" + Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` + tuples, by splitting the string at each successive match of *regexp*. + + >>> from nltk.tokenize.util import regexp_span_tokenize + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me + ... two of them.\n\nThanks.''' + >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE + [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), + (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] + + :param s: the string to be tokenized + :type s: str + :param regexp: regular expression that matches token separators (must not be empty) + :type regexp: str + :rtype: iter(tuple(int, int)) + """ + left = 0 + for m in finditer(regexp, s): + right, next = m.span() + if right != left: + yield left, right + left = next + yield left, len(s) + + +def spans_to_relative(spans): + r""" + Return a sequence of relative spans, given a sequence of spans. + + >>> from nltk.tokenize import WhitespaceTokenizer + >>> from nltk.tokenize.util import spans_to_relative + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me + ... 
two of them.\n\nThanks.''' + >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE + [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6), + (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)] + + :param spans: a sequence of (start, end) offsets of the tokens + :type spans: iter(tuple(int, int)) + :rtype: iter(tuple(int, int)) + """ + prev = 0 + for left, right in spans: + yield left - prev, right - left + prev = right + + +class CJKChars: + """ + An object that enumerates the code points of the CJK characters as listed on + https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane + + This is a Python port of the CJK code point enumerations of Moses tokenizer: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309 + """ + + # Hangul Jamo (1100–11FF) + Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff")) + + # CJK Radicals Supplement (2E80–2EFF) + # Kangxi Radicals (2F00–2FDF) + # Ideographic Description Characters (2FF0–2FFF) + # CJK Symbols and Punctuation (3000–303F) + # Hiragana (3040–309F) + # Katakana (30A0–30FF) + # Bopomofo (3100–312F) + # Hangul Compatibility Jamo (3130–318F) + # Kanbun (3190–319F) + # Bopomofo Extended (31A0–31BF) + # CJK Strokes (31C0–31EF) + # Katakana Phonetic Extensions (31F0–31FF) + # Enclosed CJK Letters and Months (3200–32FF) + # CJK Compatibility (3300–33FF) + # CJK Unified Ideographs Extension A (3400–4DBF) + # Yijing Hexagram Symbols (4DC0–4DFF) + # CJK Unified Ideographs (4E00–9FFF) + # Yi Syllables (A000–A48F) + # Yi Radicals (A490–A4CF) + CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf")) + + # Phags-pa (A840–A87F) + Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f")) + + # Hangul Syllables (AC00–D7AF) + Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF")) + + # CJK Compatibility Ideographs (F900–FAFF) + CJK_Compatibility_Ideographs = (63744, 64255) # 
(ord(u"\uF900"), ord(u"\uFAFF")) + + # CJK Compatibility Forms (FE30–FE4F) + CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F")) + + # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters + Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC")) + + # Supplementary Ideographic Plane 20000–2FFFF + Supplementary_Ideographic_Plane = ( + 131072, + 196607, + ) # (ord(u"\U00020000"), ord(u"\U0002FFFF")) + + ranges = [ + Hangul_Jamo, + CJK_Radicals, + Phags_Pa, + Hangul_Syllables, + CJK_Compatibility_Ideographs, + CJK_Compatibility_Forms, + Katakana_Hangul_Halfwidth, + Supplementary_Ideographic_Plane, + ] + + +def is_cjk(character): + """ + Python port of Moses' code to check for CJK character. + + >>> CJKChars().ranges + [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)] + >>> is_cjk(u'\u33fe') + True + >>> is_cjk(u'\uFE5F') + False + + :param character: The character that needs to be checked. + :type character: char + :return: bool + """ + return any( + [ + start <= ord(character) <= end + for start, end in [ + (4352, 4607), + (11904, 42191), + (43072, 43135), + (44032, 55215), + (63744, 64255), + (65072, 65103), + (65381, 65500), + (131072, 196607), + ] + ] + ) + + +def xml_escape(text): + """ + This function transforms the input text into an "escaped" version suitable + for well-formed XML formatting. + + Note that the default xml.sax.saxutils.escape() function don't escape + some characters that Moses does so we have to manually add them to the + entities dictionary. + + >>> input_str = ''')| & < > ' " ] [''' + >>> expected_output = ''')| & < > ' " ] [''' + >>> escape(input_str) == expected_output + True + >>> xml_escape(input_str) + ')| & < > ' " ] [' + + :param text: The text that needs to be escaped. 
+ :type text: str + :rtype: str + """ + return escape( + text, + entities={ + r"'": r"'", + r'"': r""", + r"|": r"|", + r"[": r"[", + r"]": r"]", + }, + ) + + +def xml_unescape(text): + """ + This function transforms the "escaped" version suitable + for well-formed XML formatting into humanly-readable string. + + Note that the default xml.sax.saxutils.unescape() function don't unescape + some characters that Moses does so we have to manually add them to the + entities dictionary. + + >>> from xml.sax.saxutils import unescape + >>> s = ')| & < > ' " ] [' + >>> expected = ''')| & < > \' " ] [''' + >>> xml_unescape(s) == expected + True + + :param text: The text that needs to be unescaped. + :type text: str + :rtype: str + """ + return unescape( + text, + entities={ + r"'": r"'", + r""": r'"', + r"|": r"|", + r"[": r"[", + r"]": r"]", + }, + ) + + +def align_tokens(tokens, sentence): + """ + This module attempt to find the offsets of the tokens in *s*, as a sequence + of ``(start, end)`` tuples, given the tokens and also the source string. + + >>> from nltk.tokenize import TreebankWordTokenizer + >>> from nltk.tokenize.util import align_tokens + >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's " + ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh " + ... "on Saturday.") + >>> tokens = TreebankWordTokenizer().tokenize(s) + >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), + ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), + ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), + ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), + ... (123, 131), (131, 132)] + >>> output = list(align_tokens(tokens, s)) + >>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same. + True + >>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected. 
class StandardFormat:
    """
    Class for reading and processing standard format marker files and strings.
    """

    def __init__(self, filename=None, encoding=None):
        """
        :param filename: standard format marker file to open immediately;
            nothing is opened when it is None
        :type filename: str or PathPointer or None
        :param encoding: encoding handed to the opener in ``open()``
        :type encoding: str or None
        """
        self._encoding = encoding
        if filename is not None:
            self.open(filename)

    def open(self, sfm_file):
        """
        Open a standard format marker file for sequential reading.

        :param sfm_file: name of the standard format marker input file
        :type sfm_file: str or PathPointer
        """
        if isinstance(sfm_file, PathPointer):
            self._file = sfm_file.open(self._encoding)
        else:
            self._file = codecs.open(sfm_file, "r", self._encoding)

    def open_string(self, s):
        """
        Open a standard format marker string for sequential reading.

        :param s: string to parse as a standard format marker input file
        :type s: str
        """
        self._file = StringIO(s)

    def raw_fields(self):
        """
        Return an iterator that returns the next field in a (marker, value)
        tuple. Linebreaks and trailing white space are preserved except
        for the final newline in each field.

        While iterating, ``self.line_num`` is updated as lines are consumed.
        Text before the first marker is yielded with a marker of None.

        :rtype: iter(tuple(str, str))
        """
        join_string = "\n"
        line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$"
        # Discard a BOM in the first line. A stream decoded as UTF-8 delivers
        # it as U+FEFF, while a single-byte decoding (e.g. latin-1) delivers
        # the three raw bytes as separate characters; accept both forms so
        # the first marker is still recognised.
        first_line_pat = re.compile(line_regexp % "(?:\ufeff|\xef\xbb\xbf)?")
        line_pat = re.compile(line_regexp % "")
        # need to get first line outside the loop for correct handling
        # of the first marker if it spans multiple lines
        file_iter = iter(self._file)
        # PEP 479, prevent RuntimeError when StopIteration is raised inside generator
        try:
            line = next(file_iter)
        except StopIteration:
            # no more data is available, terminate the generator
            return
        mkr, line_value = first_line_pat.match(line).groups()
        value_lines = [line_value]
        self.line_num = 0
        for line in file_iter:
            self.line_num += 1
            line_mkr, line_value = line_pat.match(line).groups()
            if line_mkr:
                # A new marker closes the field collected so far.
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [line_value]
            else:
                # Continuation line of the current field.
                value_lines.append(line_value)
        self.line_num += 1
        yield (mkr, join_string.join(value_lines))

    def fields(
        self,
        strip=True,
        unwrap=True,
        encoding=None,
        errors="strict",
        unicode_fields=None,
    ):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, post-processing the values produced by ``raw_fields()``.

        NOTE(review): ``encoding``, ``errors`` and ``unicode_fields`` are only
        validated here; this method performs no decoding itself, so values
        are returned exactly as the underlying stream delivered them.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding. Only checked for consistency
            with ``unicode_fields``.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method. Currently unused.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Must be None when ``encoding`` is None. If the whole file is UTF-8
            encoded set ``encoding='utf8'`` and leave ``unicode_fields`` with
            its default value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        :raises ValueError: if ``unicode_fields`` is given without ``encoding``
            (raised when the iterator is first advanced).
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError("unicode_fields is set but not encoding.")
        unwrap_pat = re.compile(r"\n+")
        for mkr, val in self.raw_fields():
            if unwrap:
                val = unwrap_pat.sub(" ", val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        # line_num only exists once raw_fields() has read at least one line.
        try:
            del self.line_num
        except AttributeError:
            pass
+ + Thus the following Toolbox database:: + \_sh v3.0 400 Rotokas Dictionary + \_DateStampHasFourDigitYear + + \lx kaa + \ps V.A + \ge gag + \gp nek i pas + + \lx kaa + \ps V.B + \ge strangle + \gp pasim nek + + after parsing will end up with the same structure (ignoring the extra + whitespace) as the following XML fragment after being parsed by + ElementTree:: + +
    + <_sh>v3.0 400 Rotokas Dictionary + <_DateStampHasFourDigitYear/> +
    + + + kaa + V.A + gag + nek i pas + + + + kaa + V.B + strangle + pasim nek + +
    + + :param key: Name of key marker at the start of each record. If set to + None (the default value) the first marker that doesn't begin with + an underscore is assumed to be the key. + :type key: str + :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` + :type kwargs: dict + :rtype: ElementTree._ElementInterface + :return: contents of toolbox data divided into header and records + """ + builder = TreeBuilder() + builder.start("toolbox_data", {}) + builder.start("header", {}) + in_records = False + for mkr, value in self.fields(**kwargs): + if key is None and not in_records and mkr[0] != "_": + key = mkr + if mkr == key: + if in_records: + builder.end("record") + else: + builder.end("header") + in_records = True + builder.start("record", {}) + builder.start(mkr, {}) + builder.data(value) + builder.end(mkr) + if in_records: + builder.end("record") + else: + builder.end("header") + builder.end("toolbox_data") + return builder.close() + + def _tree2etree(self, parent): + from nltk.tree import Tree + + root = Element(parent.label()) + for child in parent: + if isinstance(child, Tree): + root.append(self._tree2etree(child)) + else: + text, tag = child + e = SubElement(root, tag) + e.text = text + return root + + def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs): + """ + Returns an element tree structure corresponding to a toolbox data file + parsed according to the chunk grammar. + + :type grammar: str + :param grammar: Contains the chunking rules used to parse the + database. See ``chunk.RegExp`` for documentation. + :type root_label: str + :param root_label: The node value that should be used for the + top node of the chunk structure. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + ``1`` will generate normal tracing output; and ``2`` or + higher will generate verbose tracing output. 
_is_value = re.compile(r"\S")


def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
    """
    Return a standard format representation of the toolbox data in tree
    (tree can be a toolbox database or a single record).

    Each field becomes one ``\\marker value`` line and consecutive records
    are separated by a blank line. A field without text is written as the
    bare marker.

    :param tree: flat representation of toolbox data (whole database or single record)
    :type tree: ElementTree._ElementInterface
    :param encoding: Name of an encoding to use. When given, every line is
        encoded and the result is ``bytes``; otherwise the result is ``str``.
    :type encoding: str
    :param errors: Error handling scheme for codec. Same as the ``encode()``
        builtin string method.
    :type errors: str
    :param unicode_fields: markers whose lines are encoded as UTF-8 instead
        of ``encoding``
    :type unicode_fields: dict(str) or set(str)
    :rtype: str, or bytes when ``encoding`` is given
    :raises ValueError: if ``tree`` is neither a ``record`` nor a
        ``toolbox_data`` element, or if ``unicode_fields`` is given without
        ``encoding``.
    """
    if tree.tag == "record":
        # Wrap a lone record so the loop below can treat both inputs alike.
        root = Element("toolbox_data")
        root.append(tree)
        tree = root

    if tree.tag != "toolbox_data":
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields"
        )

    # Keep the separator and the joiner in the same type as the field lines:
    # mixing str and bytes in one join raises TypeError.
    if encoding is None:
        blank_line, joiner = "\n", ""
    else:
        blank_line, joiner = "\n".encode(encoding, errors), b""

    parts = []
    for rec in tree:
        parts.append(blank_line)
        for field in rec:
            mkr = field.tag
            # Elements without text have ``text is None``; write them as a
            # bare marker instead of failing in the regex search.
            value = field.text or ""
            separator = " " if _is_value.search(value) else ""
            line = f"\\{mkr}{separator}{value}\n"
            if encoding is not None:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = "utf8"
                else:
                    cur_encoding = encoding
                line = line.encode(cur_encoding, errors)
            parts.append(line)
    # Drop the separator emitted before the first record.
    return joiner.join(parts[1:])
+ :type errors: str + :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` + :type kwargs: dict + :rtype: ElementTree._ElementInterface + """ + builder = TreeBuilder() + for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs): + # Check whether the first char of the field marker + # indicates a block start (+) or end (-) + block = mkr[0] + if block in ("+", "-"): + mkr = mkr[1:] + else: + block = None + # Build tree on the basis of block char + if block == "+": + builder.start(mkr, {}) + builder.data(value) + elif block == "-": + builder.end(mkr) + else: + builder.start(mkr, {}) + builder.data(value) + builder.end(mkr) + return builder.close() + + +def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None): + # write XML to file + l = list() + _to_settings_string( + tree.getroot(), + l, + encoding=encoding, + errors=errors, + unicode_fields=unicode_fields, + ) + return "".join(l) + + +def _to_settings_string(node, l, **kwargs): + # write XML to file + tag = node.tag + text = node.text + if len(node) == 0: + if text: + l.append(f"\\{tag} {text}\n") + else: + l.append("\\%s\n" % tag) + else: + if text: + l.append(f"\\+{tag} {text}\n") + else: + l.append("\\+%s\n" % tag) + for n in node: + _to_settings_string(n, l, **kwargs) + l.append("\\-%s\n" % tag) + return + + +def remove_blanks(elem): + """ + Remove all elements and subelements with no text and no child elements. + + :param elem: toolbox data in an elementtree structure + :type elem: ElementTree._ElementInterface + """ + out = list() + for child in elem: + remove_blanks(child) + if child.text or len(child) > 0: + out.append(child) + elem[:] = out + + +def add_default_fields(elem, default_fields): + """ + Add blank elements and subelements specified in default_fields. 
def add_blank_lines(tree, blanks_before, blanks_between):
    """
    Add blank lines before the elements and subelements named in
    ``blanks_before`` and between repeated elements named in
    ``blanks_between``.

    A blank line is produced by appending a newline to the text of the last
    element (in document order) of the preceding sibling's subtree. The tree
    is modified in place.

    :param tree: toolbox data in an elementtree structure
    :type tree: ElementTree._ElementInterface
    :param blanks_before: for each parent tag, the child tags that get a
        blank line before them when they follow a sibling with a different tag
    :type blanks_before: dict(tuple)
    :param blanks_between: for each parent tag, the child tags that get a
        blank line before them when they follow a sibling with the same tag
    :type blanks_between: dict(tuple)
    """
    try:
        before = blanks_before[tree.tag]
        between = blanks_between[tree.tag]
    except KeyError:
        # No rules for this parent: only descend into children that have
        # children of their own.
        for elem in tree:
            if len(elem):
                add_blank_lines(elem, blanks_before, blanks_between)
        return

    last_elem = None
    for elem in tree:
        tag = elem.tag
        # The first child has no predecessor to attach a blank line to.
        if last_elem is not None:
            triggers = before if last_elem.tag != tag else between
            if tag in triggers:
                # Element.getiterator() was removed in Python 3.9; iter()
                # walks the same document order and always yields at least
                # last_elem itself, so the unpacking cannot be empty.
                *_, tail = last_elem.iter()
                tail.text = (tail.text or "") + "\n"
        if len(elem):
            add_blank_lines(elem, blanks_before, blanks_between)
        last_elem = elem
file mode 100644 index 0000000000000000000000000000000000000000..0744f9ceb667869f68a7e5af33e7925ab2a38951 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/bleu_score.py @@ -0,0 +1,685 @@ +# Natural Language Toolkit: BLEU Score +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim +# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan +# URL: +# For license information, see LICENSE.TXT + +"""BLEU score implementation.""" + +import math +import sys +import warnings +from collections import Counter +from fractions import Fraction + +from nltk.util import ngrams + + +def sentence_bleu( + references, + hypothesis, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, +): + """ + Calculate BLEU score (Bilingual Evaluation Understudy) from + Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. + "BLEU: a method for automatic evaluation of machine translation." + In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + + >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', + ... 'forever', 'hearing', 'the', 'activity', 'guidebook', + ... 'that', 'party', 'direct'] + + >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', + ... 'ensures', 'that', 'the', 'military', 'will', 'forever', + ... 'heed', 'Party', 'commands'] + + >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', + ... 'Party'] + + >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 
'of', 'the', 'party'] + + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS + 0.5045... + + If there is no ngrams overlap for any order of n-grams, BLEU returns the + value 0. This is because the precision for the order of n-grams without + overlap is 0, and the geometric mean in the final BLEU score computation + multiplies the 0 with the precision of other n-grams. This results in 0 + (independently of the precision of the other n-gram orders). The following + example has zero 3-gram and 4-gram overlaps: + + >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS + 0.0 + + To avoid this harsh behaviour when no ngram overlaps are found a smoothing + function can be used. + + >>> chencherry = SmoothingFunction() + >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, + ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS + 0.0370... + + The default BLEU calculates a score for up to 4-grams using uniform + weights (this is called BLEU-4). To evaluate your translations with + higher/lower order ngrams, use customized weights. E.g. when accounting + for up to 5-grams with uniform weights (this is called BLEU-5) use: + + >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS + 0.3920... + + Multiple BLEU scores can be computed at once, by supplying a list of weights. + E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: + >>> weights = [ + ... (1./2., 1./2.), + ... (1./3., 1./3., 1./3.), + ... (1./4., 1./4., 1./4., 1./4.) + ... ] + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS + [0.7453..., 0.6240..., 0.5045...] 
def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    Custom weights may be supplied to fine-tune the BLEU score further.
    A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.

    >>> weights = (0.1, 0.3, 0.5, 0.1)
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    0.5818...

    This particular weight gave extra value to trigrams.
    Furthermore, multiple weights can be given, resulting in multiple BLEU scores.

    >>> weights = [
    ...     (0.5, 0.5),
    ...     (0.333, 0.333, 0.334),
    ...     (0.25, 0.25, 0.25, 0.25),
    ...     (0.2, 0.2, 0.2, 0.2, 0.2)
    ... ]
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    [0.8242..., 0.7067..., 0.5920..., 0.4719...]

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
    :type weights: tuple(float) / list(tuple(float))
    :param smoothing_function: callable applied to the list of modified
        precisions; ``SmoothingFunction().method0`` is used when omitted
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score. Returns a list if multiple weights
        were supplied. 0 (or a list of zeros) is returned when there is no
        unigram match at all, which includes an empty corpus.
    :rtype: float / list(float)
    :raises AssertionError: if the number of hypotheses differs from the
        number of reference lists.
    """
    # Sanity check before computing anything. Raised explicitly so the check
    # is not stripped when Python runs with -O; the exception type is the one
    # an ``assert`` would produce.
    if len(list_of_references) != len(hypotheses):
        raise AssertionError(
            "The number of hypotheses and their reference(s) should be the " "same "
        )

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    # A single weight tuple is wrapped so both call styles share one code
    # path; indexing a float raises TypeError, which identifies that case.
    try:
        weights[0][0]
    except TypeError:
        weights = [weights]
    max_weight_length = max(len(weight) for weight in weights)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, max_weight_length + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    # This is done before any Fraction is built: for an empty corpus every
    # denominator is 0 and constructing the fractions would raise
    # ZeroDivisionError.
    if p_numerators[1] == 0:
        return 0 if len(weights) == 1 else [0] * len(weights)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    # NOTE(review): ``_normalize`` is a private keyword of fractions.Fraction
    # used to keep the summed counts unreduced; confirm it is accepted by
    # every Python version this module has to run on.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i in range(1, max_weight_length + 1)
    ]

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    # ``references`` and ``hypothesis`` are the values left over from the
    # last iteration of the loop above (the corpus is non-empty here).
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    bleu_scores = []
    for weight in weights:
        # Uniformly re-weighting based on maximum hypothesis lengths if largest
        # order of n-grams < 4 and weights is set at default.
        if auto_reweigh:
            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
                weight = (1 / hyp_lengths,) * hyp_lengths

        # Weighted geometric mean of the non-zero precisions, in log space.
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
        s = bp * math.exp(math.fsum(s))
        bleu_scores.append(s)
    return bleu_scores[0] if len(weights) == 1 else bleu_scores
'ensures', 'that', 'the', 'military', 'will', + ... 'forever', 'heed', 'Party', 'commands'] + >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', + ... 'Party'] + >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 'of', 'the', 'party'] + >>> hypothesis = 'of the'.split() + >>> references = [reference1, reference2, reference3] + >>> float(modified_precision(references, hypothesis, n=1)) + 1.0 + >>> float(modified_precision(references, hypothesis, n=2)) + 1.0 + + An example of a normal machine translation hypothesis: + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + + >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', + ... 'forever', 'hearing', 'the', 'activity', 'guidebook', + ... 'that', 'party', 'direct'] + + >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', + ... 'ensures', 'that', 'the', 'military', 'will', + ... 'forever', 'heed', 'Party', 'commands'] + + >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', + ... 'Party'] + + >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 'of', 'the', 'party'] + >>> references = [reference1, reference2, reference3] + >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS + 0.9444... + >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS + 0.5714... 
+ >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS + 0.5882352941176471 + >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS + 0.07692... + + + :param references: A list of reference translations. + :type references: list(list(str)) + :param hypothesis: A hypothesis translation. + :type hypothesis: list(str) + :param n: The ngram order. + :type n: int + :return: BLEU's modified precision for the nth order ngram. + :rtype: Fraction + """ + # Extracts all ngrams in hypothesis + # Set an empty Counter if hypothesis is empty. + counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() + # Extract a union of references' counts. + # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) + max_counts = {} + for reference in references: + reference_counts = ( + Counter(ngrams(reference, n)) if len(reference) >= n else Counter() + ) + for ngram in counts: + max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) + + # Assigns the intersection between hypothesis and references' counts. + clipped_counts = { + ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() + } + + numerator = sum(clipped_counts.values()) + # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. + # Usually this happens when the ngram order is > len(reference). + denominator = max(1, sum(counts.values())) + + return Fraction(numerator, denominator, _normalize=False) + + +def closest_ref_length(references, hyp_len): + """ + This function finds the reference that is the closest length to the + hypothesis. The closest reference length is referred to as *r* variable + from the brevity penalty formula in Papineni et. al. (2002) + + :param references: A list of reference translations. + :type references: list(list(str)) + :param hyp_len: The length of the hypothesis. + :type hyp_len: int + :return: The length of the reference that's closest to the hypothesis. 
def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    An example from the paper. There are three references with length 12, 15
    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.

    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> references = [reference1, reference2, reference3]
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    In case a hypothesis translation is shorter than the references, penalty is
    applied.

    >>> references = [['a'] * 28, ['a'] * 28]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    0.2635971381157267

    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).

    >>> references = [['a'] * 13, ['a'] * 2]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.9200...

    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

    >>> references = [['a'] * 13, ['a'] * 11]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(reversed(references), hyp_len)
    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
    >>> bp1 == bp2 == 1
    True

    A test example from mteval-v13a.pl (starting from the line 705):

    >>> references = [['a'] * 11, ['a'] * 8]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.8668...

    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypotheses.
    :type closest_ref_len: int
    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus
    :type hyp_len: int
    :return: BLEU's brevity penalty.
    :rtype: float
    """
    # A hypothesis longer than its closest reference is not penalised.
    if hyp_len > closest_ref_len:
        return 1.0
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
    # (this also keeps the division below away from a zero denominator).
    if hyp_len == 0:
        return 0.0
    return math.exp(1 - closest_ref_len / hyp_len)
+ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf + """ + + def __init__(self, epsilon=0.1, alpha=5, k=5): + """ + This will initialize the parameters required for the various smoothing + techniques, the default values are set to the numbers used in the + experiments from Chen and Cherry (2014). + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', + ... 'that', 'the', 'military', 'always', 'obeys', 'the', + ... 'commands', 'of', 'the', 'party'] + >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', + ... 'that', 'the', 'military', 'will', 'forever', 'heed', + ... 'Party', 'commands'] + + >>> chencherry = SmoothingFunction() + >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS + 0.4118... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS + 0.4118... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS + 0.4118... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS + 0.4452... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS + 0.4118... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS + 0.4118... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS + 0.4905... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS + 0.4135... + >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS + 0.4905... 
+ + :param epsilon: the epsilon value use in method 1 + :type epsilon: float + :param alpha: the alpha value use in method 6 + :type alpha: int + :param k: the k value use in method 4 + :type k: int + """ + self.epsilon = epsilon + self.alpha = alpha + self.k = k + + def method0(self, p_n, *args, **kwargs): + """ + No smoothing. + """ + p_n_new = [] + for i, p_i in enumerate(p_n): + if p_i.numerator != 0: + p_n_new.append(p_i) + else: + _msg = str( + "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" + "Therefore the BLEU score evaluates to 0, independently of\n" + "how many N-gram overlaps of lower order it contains.\n" + "Consider using lower n-gram order or use " + "SmoothingFunction()" + ).format(i + 1) + warnings.warn(_msg) + # When numerator==0 where denonminator==0 or !=0, the result + # for the precision score should be equal to 0 or undefined. + # Due to BLEU geometric mean computation in logarithm space, + # we we need to take the return sys.float_info.min such that + # math.log(sys.float_info.min) returns a 0 precision score. + p_n_new.append(sys.float_info.min) + return p_n_new + + def method1(self, p_n, *args, **kwargs): + """ + Smoothing method 1: Add *epsilon* counts to precision with 0 counts. + """ + return [ + (p_i.numerator + self.epsilon) / p_i.denominator + if p_i.numerator == 0 + else p_i + for p_i in p_n + ] + + def method2(self, p_n, *args, **kwargs): + """ + Smoothing method 2: Add 1 to both numerator and denominator from + Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for + Evaluating Automatic Evaluation Metrics for Machine Translation. + In COLING 2004. 
+ """ + return [ + Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) + if i != 0 + else p_n[0] + for i in range(len(p_n)) + ] + + def method3(self, p_n, *args, **kwargs): + """ + Smoothing method 3: NIST geometric sequence smoothing + The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each + precision score whose matching n-gram count is null. + k is 1 for the first 'n' value for which the n-gram match count is null/ + + For example, if the text contains: + + - one 2-gram match + - and (consequently) two 1-gram matches + + the n-gram count for each individual precision score would be: + + - n=1 => prec_count = 2 (two unigrams) + - n=2 => prec_count = 1 (one bigram) + - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) + - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) + """ + incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. + for i, p_i in enumerate(p_n): + if p_i.numerator == 0: + p_n[i] = 1 / (2**incvnt * p_i.denominator) + incvnt += 1 + return p_n + + def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): + """ + Smoothing method 4: + Shorter translations may have inflated precision values due to having + smaller denominators; therefore, we give them proportionally + smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry + suggests dividing by 1/ln(len(T)), where T is the length of the translation. + """ + incvnt = 1 + hyp_len = hyp_len if hyp_len else len(hypothesis) + for i, p_i in enumerate(p_n): + if p_i.numerator == 0 and hyp_len > 1: + # incvnt = i + 1 * self.k / math.log( + # hyp_len + # ) # Note that this K is different from the K from NIST. 
+ # p_n[i] = incvnt / p_i.denominator\ + numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) + p_n[i] = numerator / p_i.denominator + incvnt += 1 + return p_n + + def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): + """ + Smoothing method 5: + The matched counts for similar values of n should be similar. To a + calculate the n-gram matched count, it averages the n−1, n and n+1 gram + matched counts. + """ + hyp_len = hyp_len if hyp_len else len(hypothesis) + m = {} + # Requires an precision value for an addition ngram order. + p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] + m[-1] = p_n[0] + 1 + for i, p_i in enumerate(p_n): + p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 + m[i] = p_n[i] + return p_n + + def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): + """ + Smoothing method 6: + Interpolates the maximum likelihood estimate of the precision *p_n* with + a prior estimate *pi0*. The prior is estimated by assuming that the ratio + between pn and pn−1 will be the same as that between pn−1 and pn−2; from + Gao and He (2013) Training MRF-Based Phrase Translation Models using + Gradient Ascent. In NAACL. + """ + hyp_len = hyp_len if hyp_len else len(hypothesis) + # This smoothing only works when p_1 and p_2 is non-zero. + # Raise an error with an appropriate message when the input is too short + # to use this smoothing technique. + assert p_n[2], "This smoothing method requires non-zero precision for bigrams." + for i, p_i in enumerate(p_n): + if i in [0, 1]: # Skips the first 2 orders of ngrams. + continue + else: + pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] + # No. of ngrams in translation that matches the reference. + m = p_i.numerator + # No. of ngrams in translation. + l = sum(1 for _ in ngrams(hypothesis, i + 1)) + # Calculates the interpolated precision. 
+ p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) + return p_n + + def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): + """ + Smoothing method 7: + Interpolates methods 4 and 5. + """ + hyp_len = hyp_len if hyp_len else len(hypothesis) + p_n = self.method4(p_n, references, hypothesis, hyp_len) + p_n = self.method5(p_n, references, hypothesis, hyp_len) + return p_n diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm3.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm3.py new file mode 100644 index 0000000000000000000000000000000000000000..f295dee0b563bbcb9a5b9557c8d1602942a75bc3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm3.py @@ -0,0 +1,346 @@ +# Natural Language Toolkit: IBM Model 3 +# +# Copyright (C) 2001-2013 NLTK Project +# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim +# URL: +# For license information, see LICENSE.TXT + +""" +Translation model that considers how a word can be aligned to +multiple words in another language. + +IBM Model 3 improves on Model 2 by directly modeling the phenomenon +where a word in one language may be translated into zero or more words +in another. This is expressed by the fertility probability, +n(phi | source word). + +If a source word translates into more than one word, it is possible to +generate sentences that have the same alignment in multiple ways. This +is modeled by a distortion step. The distortion probability, d(j|i,l,m), +predicts a target word position, given its aligned source word's +position. The distortion probability replaces the alignment probability +of Model 2. + +The fertility probability is not applicable for NULL. Target words that +align to NULL are assumed to be distributed uniformly in the target +sentence. The existence of these words is modeled by p1, the probability +that a target word produced by a real source word requires another +target word that is produced by NULL. 
+ +The EM algorithm used in Model 3 is: + +:E step: In the training data, collect counts, weighted by prior + probabilities. + + - (a) count how many times a source language word is translated + into a target language word + - (b) count how many times a particular position in the target + sentence is aligned to a particular position in the source + sentence + - (c) count how many times a source word is aligned to phi number + of target words + - (d) count how many times NULL is aligned to a target word + +:M step: Estimate new probabilities based on the counts from the E step + +Because there are too many possible alignments, only the most probable +ones are considered. First, the best alignment is determined using prior +probabilities. Then, a hill climbing approach is used to find other good +candidates. + +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 + +References +---------- + +Philipp Koehn. 2010. Statistical Machine Translation. +Cambridge University Press, New York. + +Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and +Robert L. Mercer. 1993. The Mathematics of Statistical Machine +Translation: Parameter Estimation. Computational Linguistics, 19 (2), +263-311.
+""" + +import warnings +from collections import defaultdict +from math import factorial + +from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2 +from nltk.translate.ibm_model import Counts + + +class IBMModel3(IBMModel): + """ + Translation model that considers how a word can be aligned to + multiple words in another language + + >>> bitext = [] + >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) + >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) + >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) + >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) + >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) + >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) + >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) + + >>> ibm3 = IBMModel3(bitext, 5) + + >>> print(round(ibm3.translation_table['buch']['book'], 3)) + 1.0 + >>> print(round(ibm3.translation_table['das']['book'], 3)) + 0.0 + >>> print(round(ibm3.translation_table['ja'][None], 3)) + 1.0 + + >>> print(round(ibm3.distortion_table[1][1][2][2], 3)) + 1.0 + >>> print(round(ibm3.distortion_table[1][2][2][2], 3)) + 0.0 + >>> print(round(ibm3.distortion_table[2][2][4][5], 3)) + 0.75 + + >>> print(round(ibm3.fertility_table[2]['summarize'], 3)) + 1.0 + >>> print(round(ibm3.fertility_table[1]['book'], 3)) + 1.0 + + >>> print(round(ibm3.p1, 3)) + 0.054 + + >>> test_sentence = bitext[2] + >>> test_sentence.words + ['das', 'buch', 'ist', 'ja', 'klein'] + >>> test_sentence.mots + ['the', 'book', 'is', 'small'] + >>> test_sentence.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) + 
+ """ + + def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): + """ + Train on ``sentence_aligned_corpus`` and create a lexical + translation model, a distortion model, a fertility model, and a + model for generating NULL-aligned words. + + Translation direction is from ``AlignedSent.mots`` to + ``AlignedSent.words``. + + :param sentence_aligned_corpus: Sentence-aligned parallel corpus + :type sentence_aligned_corpus: list(AlignedSent) + + :param iterations: Number of iterations to run training algorithm + :type iterations: int + + :param probability_tables: Optional. Use this to pass in custom + probability values. If not specified, probabilities will be + set to a uniform distribution, or some other sensible value. + If specified, all the following entries must be present: + ``translation_table``, ``alignment_table``, + ``fertility_table``, ``p1``, ``distortion_table``. + See ``IBMModel`` for the type and purpose of these tables. + :type probability_tables: dict[str]: object + """ + super().__init__(sentence_aligned_corpus) + self.reset_probabilities() + + if probability_tables is None: + # Get translation and alignment probabilities from IBM Model 2 + ibm2 = IBMModel2(sentence_aligned_corpus, iterations) + self.translation_table = ibm2.translation_table + self.alignment_table = ibm2.alignment_table + self.set_uniform_probabilities(sentence_aligned_corpus) + else: + # Set user-defined probabilities + self.translation_table = probability_tables["translation_table"] + self.alignment_table = probability_tables["alignment_table"] + self.fertility_table = probability_tables["fertility_table"] + self.p1 = probability_tables["p1"] + self.distortion_table = probability_tables["distortion_table"] + + for n in range(0, iterations): + self.train(sentence_aligned_corpus) + + def reset_probabilities(self): + super().reset_probabilities() + self.distortion_table = defaultdict( + lambda: defaultdict( + lambda: defaultdict(lambda: 
defaultdict(lambda: self.MIN_PROB)) + ) + ) + """ + dict[int][int][int][int]: float. Probability(j | i,l,m). + Values accessed as ``distortion_table[j][i][l][m]``. + """ + + def set_uniform_probabilities(self, sentence_aligned_corpus): + # d(j | i,l,m) = 1 / m for all i, j, l, m + l_m_combinations = set() + for aligned_sentence in sentence_aligned_corpus: + l = len(aligned_sentence.mots) + m = len(aligned_sentence.words) + if (l, m) not in l_m_combinations: + l_m_combinations.add((l, m)) + initial_prob = 1 / m + if initial_prob < IBMModel.MIN_PROB: + warnings.warn( + "A target sentence is too long (" + + str(m) + + " words). Results may be less accurate." + ) + for j in range(1, m + 1): + for i in range(0, l + 1): + self.distortion_table[j][i][l][m] = initial_prob + + # simple initialization, taken from GIZA++ + self.fertility_table[0] = defaultdict(lambda: 0.2) + self.fertility_table[1] = defaultdict(lambda: 0.65) + self.fertility_table[2] = defaultdict(lambda: 0.1) + self.fertility_table[3] = defaultdict(lambda: 0.04) + MAX_FERTILITY = 10 + initial_fert_prob = 0.01 / (MAX_FERTILITY - 4) + for phi in range(4, MAX_FERTILITY): + self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob) + + self.p1 = 0.5 + + def train(self, parallel_corpus): + counts = Model3Counts() + for aligned_sentence in parallel_corpus: + l = len(aligned_sentence.mots) + m = len(aligned_sentence.words) + + # Sample the alignment space + sampled_alignments, best_alignment = self.sample(aligned_sentence) + # Record the most probable alignment + aligned_sentence.alignment = Alignment( + best_alignment.zero_indexed_alignment() + ) + + # E step (a): Compute normalization factors to weigh counts + total_count = self.prob_of_alignments(sampled_alignments) + + # E step (b): Collect counts + for alignment_info in sampled_alignments: + count = self.prob_t_a_given_s(alignment_info) + normalized_count = count / total_count + + for j in range(1, m + 1): + counts.update_lexical_translation( + 
normalized_count, alignment_info, j + ) + counts.update_distortion(normalized_count, alignment_info, j, l, m) + + counts.update_null_generation(normalized_count, alignment_info) + counts.update_fertility(normalized_count, alignment_info) + + # M step: Update probabilities with maximum likelihood estimates + # If any probability is less than MIN_PROB, clamp it to MIN_PROB + existing_alignment_table = self.alignment_table + self.reset_probabilities() + self.alignment_table = existing_alignment_table # don't retrain + + self.maximize_lexical_translation_probabilities(counts) + self.maximize_distortion_probabilities(counts) + self.maximize_fertility_probabilities(counts) + self.maximize_null_generation_probabilities(counts) + + def maximize_distortion_probabilities(self, counts): + MIN_PROB = IBMModel.MIN_PROB + for j, i_s in counts.distortion.items(): + for i, src_sentence_lengths in i_s.items(): + for l, trg_sentence_lengths in src_sentence_lengths.items(): + for m in trg_sentence_lengths: + estimate = ( + counts.distortion[j][i][l][m] + / counts.distortion_for_any_j[i][l][m] + ) + self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB) + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + """ + src_sentence = alignment_info.src_sentence + trg_sentence = alignment_info.trg_sentence + l = len(src_sentence) - 1 # exclude NULL + m = len(trg_sentence) - 1 + p1 = self.p1 + p0 = 1 - p1 + + probability = 1.0 + MIN_PROB = IBMModel.MIN_PROB + + # Combine NULL insertion probability + null_fertility = alignment_info.fertility_of_i(0) + probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) + if probability < MIN_PROB: + return MIN_PROB + + # Compute combination (m - null_fertility) choose null_fertility + for i in range(1, null_fertility + 1): + probability *= (m - null_fertility - i + 1) / i + if probability < MIN_PROB: + return MIN_PROB + + # Combine fertility probabilities + 
for i in range(1, l + 1): + fertility = alignment_info.fertility_of_i(i) + probability *= ( + factorial(fertility) * self.fertility_table[fertility][src_sentence[i]] + ) + if probability < MIN_PROB: + return MIN_PROB + + # Combine lexical and distortion probabilities + for j in range(1, m + 1): + t = trg_sentence[j] + i = alignment_info.alignment[j] + s = src_sentence[i] + + probability *= ( + self.translation_table[t][s] * self.distortion_table[j][i][l][m] + ) + if probability < MIN_PROB: + return MIN_PROB + + return probability + + +class Model3Counts(Counts): + """ + Data object to store counts of various parameters during training. + Includes counts for distortion. + """ + + def __init__(self): + super().__init__() + self.distortion = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + ) + self.distortion_for_any_j = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) + ) + + def update_distortion(self, count, alignment_info, j, l, m): + i = alignment_info.alignment[j] + self.distortion[j][i][l][m] += count + self.distortion_for_any_j[i][l][m] += count diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm5.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm5.py new file mode 100644 index 0000000000000000000000000000000000000000..8cbc40ef12db4f33d0ff4332c997fbf30e15819f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm5.py @@ -0,0 +1,663 @@ +# Natural Language Toolkit: IBM Model 5 +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Tah Wei Hoon +# URL: +# For license information, see LICENSE.TXT + +""" +Translation model that keeps track of vacant positions in the target +sentence to decide where to place translated words. + +Translation can be viewed as a process where each word in the source +sentence is stepped through sequentially, generating translated words +for each source word. 
The target sentence can be viewed as being made +up of ``m`` empty slots initially, which gradually fill up as generated +words are placed in them. + +Models 3 and 4 use distortion probabilities to decide how to place +translated words. For simplicity, these models ignore the history of +which slots have already been occupied with translated words. +Consider the placement of the last translated word: there is only one +empty slot left in the target sentence, so the distortion probability +should be 1.0 for that position and 0.0 everywhere else. However, the +distortion probabilities for Models 3 and 4 are set up such that all +positions are under consideration. + +IBM Model 5 fixes this deficiency by accounting for occupied slots +during translation. It introduces the vacancy function v(j), the number +of vacancies up to, and including, position j in the target sentence. + +Terminology +----------- + +:Maximum vacancy: + The number of valid slots that a word can be placed in. + This is not necessarily the same as the number of vacant slots. + For example, if a tablet contains more than one word, the head word + cannot be placed at the last vacant slot because there will be no + space for the other words in the tablet. The number of valid slots + has to take into account the length of the tablet. + Non-head words cannot be placed before the head word, so vacancies + to the left of the head word are ignored. +:Vacancy difference: + For a head word: (v(j) - v(center of previous cept)) + Can be positive or negative. + For a non-head word: (v(j) - v(position of previously placed word)) + Always positive, because successive words in a tablet are assumed to + appear to the right of the previous word. + +Positioning of target words falls under three cases: + +1. Words generated by NULL are distributed uniformly +2. For a head word t, its position is modeled by the probability + v_head(dv | max_v,word_class_t(t)) +3.
For a non-head word t, its position is modeled by the probability + v_non_head(dv | max_v,word_class_t(t)) + +dv and max_v are defined differently for head and non-head words. + +The EM algorithm used in Model 5 is: + +:E step: In the training data, collect counts, weighted by prior + probabilities. + + - (a) count how many times a source language word is translated + into a target language word + - (b) for a particular word class and maximum vacancy, count how + many times a head word and the previous cept's center have + a particular difference in number of vacancies + - (c) for a particular word class and maximum vacancy, count how + many times a non-head word and the previous target word + have a particular difference in number of vacancies + - (d) count how many times a source word is aligned to phi number + of target words + - (e) count how many times NULL is aligned to a target word + +:M step: Estimate new probabilities based on the counts from the E step + +Like Model 4, there are too many possible alignments to consider. Thus, +a hill climbing approach is used to sample good candidates. In addition, +pruning is used to weed out unlikely alignments based on Model 4 scores. + +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 +:max_v: Maximum vacancy +:dv: Vacancy difference, Δv + +The definition of v_head here differs from GIZA++, section 4.7 of +[Brown et al., 1993], and [Koehn, 2010].
In the latter cases, v_head is +v_head(v(j) | v(center of previous cept),max_v,word_class(t)). + +Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with +v(center of previous cept) to obtain dv: +v_head(v(j) - v(center of previous cept) | max_v,word_class(t)). + +References +---------- + +Philipp Koehn. 2010. Statistical Machine Translation. +Cambridge University Press, New York. + +Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and +Robert L. Mercer. 1993. The Mathematics of Statistical Machine +Translation: Parameter Estimation. Computational Linguistics, 19 (2), +263-311. +""" + +import warnings +from collections import defaultdict +from math import factorial + +from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4 +from nltk.translate.ibm_model import Counts, longest_target_sentence_length + + +class IBMModel5(IBMModel): + """ + Translation model that keeps track of vacant positions in the target + sentence to decide where to place translated words + + >>> bitext = [] + >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) + >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) + >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) + >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) + >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) + >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) + >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) + >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } + >>> trg_classes = {'das': 0, 'ein': 0, 
'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } + + >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes) + + >>> print(round(ibm5.head_vacancy_table[1][1][1], 3)) + 1.0 + >>> print(round(ibm5.head_vacancy_table[2][1][1], 3)) + 0.0 + >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3)) + 1.0 + + >>> print(round(ibm5.fertility_table[2]['summarize'], 3)) + 1.0 + >>> print(round(ibm5.fertility_table[1]['book'], 3)) + 1.0 + + >>> print(round(ibm5.p1, 3)) + 0.033 + + >>> test_sentence = bitext[2] + >>> test_sentence.words + ['das', 'buch', 'ist', 'ja', 'klein'] + >>> test_sentence.mots + ['the', 'book', 'is', 'small'] + >>> test_sentence.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) + + """ + + MIN_SCORE_FACTOR = 0.2 + """ + Alignments with scores below this factor are pruned during sampling + """ + + def __init__( + self, + sentence_aligned_corpus, + iterations, + source_word_classes, + target_word_classes, + probability_tables=None, + ): + """ + Train on ``sentence_aligned_corpus`` and create a lexical + translation model, vacancy models, a fertility model, and a + model for generating NULL-aligned words. + + Translation direction is from ``AlignedSent.mots`` to + ``AlignedSent.words``. + + :param sentence_aligned_corpus: Sentence-aligned parallel corpus + :type sentence_aligned_corpus: list(AlignedSent) + + :param iterations: Number of iterations to run training algorithm + :type iterations: int + + :param source_word_classes: Lookup table that maps a source word + to its word class, the latter represented by an integer id + :type source_word_classes: dict[str]: int + + :param target_word_classes: Lookup table that maps a target word + to its word class, the latter represented by an integer id + :type target_word_classes: dict[str]: int + + :param probability_tables: Optional. Use this to pass in custom + probability values. 
If not specified, probabilities will be + set to a uniform distribution, or some other sensible value. + If specified, all the following entries must be present: + ``translation_table``, ``alignment_table``, + ``fertility_table``, ``p1``, ``head_distortion_table``, + ``non_head_distortion_table``, ``head_vacancy_table``, + ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``, + and ``IBMModel5`` for the type and purpose of these tables. + :type probability_tables: dict[str]: object + """ + super().__init__(sentence_aligned_corpus) + self.reset_probabilities() + self.src_classes = source_word_classes + self.trg_classes = target_word_classes + + if probability_tables is None: + # Get probabilities from IBM model 4 + ibm4 = IBMModel4( + sentence_aligned_corpus, + iterations, + source_word_classes, + target_word_classes, + ) + self.translation_table = ibm4.translation_table + self.alignment_table = ibm4.alignment_table + self.fertility_table = ibm4.fertility_table + self.p1 = ibm4.p1 + self.head_distortion_table = ibm4.head_distortion_table + self.non_head_distortion_table = ibm4.non_head_distortion_table + self.set_uniform_probabilities(sentence_aligned_corpus) + else: + # Set user-defined probabilities + self.translation_table = probability_tables["translation_table"] + self.alignment_table = probability_tables["alignment_table"] + self.fertility_table = probability_tables["fertility_table"] + self.p1 = probability_tables["p1"] + self.head_distortion_table = probability_tables["head_distortion_table"] + self.non_head_distortion_table = probability_tables[ + "non_head_distortion_table" + ] + self.head_vacancy_table = probability_tables["head_vacancy_table"] + self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"] + + for n in range(0, iterations): + self.train(sentence_aligned_corpus) + + def reset_probabilities(self): + super().reset_probabilities() + self.head_vacancy_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 
self.MIN_PROB)) + ) + """ + dict[int][int][int]: float. Probability(vacancy difference | + number of remaining valid positions,target word class). + Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``. + """ + + self.non_head_vacancy_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) + ) + """ + dict[int][int][int]: float. Probability(vacancy difference | + number of remaining valid positions,target word class). + Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``. + """ + + def set_uniform_probabilities(self, sentence_aligned_corpus): + """ + Set vacancy probabilities uniformly to + 1 / cardinality of vacancy difference values + """ + max_m = longest_target_sentence_length(sentence_aligned_corpus) + + # The maximum vacancy difference occurs when a word is placed in + # the last available position m of the target sentence and the + # previous word position has no vacancies. + # The minimum is 1-max_v, when a word is placed in the first + # available position and the previous word is placed beyond the + # last available position. + # Thus, the number of possible vacancy difference values is + # (max_v) - (1-max_v) + 1 = 2 * max_v. + if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB: + warnings.warn( + "A target sentence is too long (" + + str(max_m) + + " words). Results may be less accurate." 
+ ) + + for max_v in range(1, max_m + 1): + for dv in range(1, max_m + 1): + initial_prob = 1 / (2 * max_v) + self.head_vacancy_table[dv][max_v] = defaultdict(lambda: initial_prob) + self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict( + lambda: initial_prob + ) + self.non_head_vacancy_table[dv][max_v] = defaultdict( + lambda: initial_prob + ) + self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict( + lambda: initial_prob + ) + + def train(self, parallel_corpus): + counts = Model5Counts() + for aligned_sentence in parallel_corpus: + l = len(aligned_sentence.mots) + m = len(aligned_sentence.words) + + # Sample the alignment space + sampled_alignments, best_alignment = self.sample(aligned_sentence) + # Record the most probable alignment + aligned_sentence.alignment = Alignment( + best_alignment.zero_indexed_alignment() + ) + + # E step (a): Compute normalization factors to weigh counts + total_count = self.prob_of_alignments(sampled_alignments) + + # E step (b): Collect counts + for alignment_info in sampled_alignments: + count = self.prob_t_a_given_s(alignment_info) + normalized_count = count / total_count + + for j in range(1, m + 1): + counts.update_lexical_translation( + normalized_count, alignment_info, j + ) + + slots = Slots(m) + for i in range(1, l + 1): + counts.update_vacancy( + normalized_count, alignment_info, i, self.trg_classes, slots + ) + + counts.update_null_generation(normalized_count, alignment_info) + counts.update_fertility(normalized_count, alignment_info) + + # M step: Update probabilities with maximum likelihood estimates + # If any probability is less than MIN_PROB, clamp it to MIN_PROB + existing_alignment_table = self.alignment_table + self.reset_probabilities() + self.alignment_table = existing_alignment_table # don't retrain + + self.maximize_lexical_translation_probabilities(counts) + self.maximize_vacancy_probabilities(counts) + self.maximize_fertility_probabilities(counts) + self.maximize_null_generation_probabilities(counts) 
+ + def sample(self, sentence_pair): + """ + Sample the most probable alignments from the entire alignment + space according to Model 4 + + Note that Model 4 scoring is used instead of Model 5 because the + latter is too expensive to compute. + + First, determine the best alignment according to IBM Model 2. + With this initial alignment, use hill climbing to determine the + best alignment according to a IBM Model 4. Add this + alignment and its neighbors to the sample set. Repeat this + process with other initial alignments obtained by pegging an + alignment point. Finally, prune alignments that have + substantially lower Model 4 scores than the best alignment. + + :param sentence_pair: Source and target language sentence pair + to generate a sample of alignments from + :type sentence_pair: AlignedSent + + :return: A set of best alignments represented by their ``AlignmentInfo`` + and the best alignment of the set for convenience + :rtype: set(AlignmentInfo), AlignmentInfo + """ + sampled_alignments, best_alignment = super().sample(sentence_pair) + return self.prune(sampled_alignments), best_alignment + + def prune(self, alignment_infos): + """ + Removes alignments from ``alignment_infos`` that have + substantially lower Model 4 scores than the best alignment + + :return: Pruned alignments + :rtype: set(AlignmentInfo) + """ + alignments = [] + best_score = 0 + + for alignment_info in alignment_infos: + score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self) + best_score = max(score, best_score) + alignments.append((alignment_info, score)) + + threshold = IBMModel5.MIN_SCORE_FACTOR * best_score + alignments = [a[0] for a in alignments if a[1] > threshold] + return set(alignments) + + def hillclimb(self, alignment_info, j_pegged=None): + """ + Starting from the alignment in ``alignment_info``, look at + neighboring alignments iteratively for the best one, according + to Model 4 + + Note that Model 4 scoring is used instead of Model 5 because the + latter is 
too expensive to compute. + + There is no guarantee that the best alignment in the alignment + space will be found, because the algorithm might be stuck in a + local maximum. + + :param j_pegged: If specified, the search will be constrained to + alignments where ``j_pegged`` remains unchanged + :type j_pegged: int + + :return: The best alignment found from hill climbing + :rtype: AlignmentInfo + """ + alignment = alignment_info # alias with shorter name + max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self) + + while True: + old_alignment = alignment + for neighbor_alignment in self.neighboring(alignment, j_pegged): + neighbor_probability = IBMModel4.model4_prob_t_a_given_s( + neighbor_alignment, self + ) + + if neighbor_probability > max_probability: + alignment = neighbor_alignment + max_probability = neighbor_probability + + if alignment == old_alignment: + # Until there are no better alignments + break + + alignment.score = max_probability + return alignment + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + """ + probability = 1.0 + MIN_PROB = IBMModel.MIN_PROB + slots = Slots(len(alignment_info.trg_sentence) - 1) + + def null_generation_term(): + # Binomial distribution: B(m - null_fertility, p1) + value = 1.0 + p1 = self.p1 + p0 = 1 - p1 + null_fertility = alignment_info.fertility_of_i(0) + m = len(alignment_info.trg_sentence) - 1 + value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) + if value < MIN_PROB: + return MIN_PROB + + # Combination: (m - null_fertility) choose null_fertility + for i in range(1, null_fertility + 1): + value *= (m - null_fertility - i + 1) / i + return value + + def fertility_term(): + value = 1.0 + src_sentence = alignment_info.src_sentence + for i in range(1, len(src_sentence)): + fertility = alignment_info.fertility_of_i(i) + value *= ( + factorial(fertility) + * self.fertility_table[fertility][src_sentence[i]] + ) + 
if value < MIN_PROB: + return MIN_PROB + return value + + def lexical_translation_term(j): + t = alignment_info.trg_sentence[j] + i = alignment_info.alignment[j] + s = alignment_info.src_sentence[i] + return self.translation_table[t][s] + + def vacancy_term(i): + value = 1.0 + tablet = alignment_info.cepts[i] + tablet_length = len(tablet) + total_vacancies = slots.vacancies_at(len(slots)) + + # case 1: NULL-aligned words + if tablet_length == 0: + return value + + # case 2: head word + j = tablet[0] + previous_cept = alignment_info.previous_cept(j) + previous_center = alignment_info.center_of_cept(previous_cept) + dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) + max_v = total_vacancies - tablet_length + 1 + trg_class = self.trg_classes[alignment_info.trg_sentence[j]] + value *= self.head_vacancy_table[dv][max_v][trg_class] + slots.occupy(j) # mark position as occupied + total_vacancies -= 1 + if value < MIN_PROB: + return MIN_PROB + + # case 3: non-head words + for k in range(1, tablet_length): + previous_position = tablet[k - 1] + previous_vacancies = slots.vacancies_at(previous_position) + j = tablet[k] + dv = slots.vacancies_at(j) - previous_vacancies + max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies + trg_class = self.trg_classes[alignment_info.trg_sentence[j]] + value *= self.non_head_vacancy_table[dv][max_v][trg_class] + slots.occupy(j) # mark position as occupied + total_vacancies -= 1 + if value < MIN_PROB: + return MIN_PROB + + return value + + # end nested functions + + # Abort computation whenever probability falls below MIN_PROB at + # any point, since MIN_PROB can be considered as zero + probability *= null_generation_term() + if probability < MIN_PROB: + return MIN_PROB + + probability *= fertility_term() + if probability < MIN_PROB: + return MIN_PROB + + for j in range(1, len(alignment_info.trg_sentence)): + probability *= lexical_translation_term(j) + if probability < MIN_PROB: + return MIN_PROB + + for i in 
range(1, len(alignment_info.src_sentence)): + probability *= vacancy_term(i) + if probability < MIN_PROB: + return MIN_PROB + + return probability + + def maximize_vacancy_probabilities(self, counts): + MIN_PROB = IBMModel.MIN_PROB + head_vacancy_table = self.head_vacancy_table + for dv, max_vs in counts.head_vacancy.items(): + for max_v, trg_classes in max_vs.items(): + for t_cls in trg_classes: + estimate = ( + counts.head_vacancy[dv][max_v][t_cls] + / counts.head_vacancy_for_any_dv[max_v][t_cls] + ) + head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) + + non_head_vacancy_table = self.non_head_vacancy_table + for dv, max_vs in counts.non_head_vacancy.items(): + for max_v, trg_classes in max_vs.items(): + for t_cls in trg_classes: + estimate = ( + counts.non_head_vacancy[dv][max_v][t_cls] + / counts.non_head_vacancy_for_any_dv[max_v][t_cls] + ) + non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) + + +class Model5Counts(Counts): + """ + Data object to store counts of various parameters during training. + Includes counts for vacancies. + """ + + def __init__(self): + super().__init__() + self.head_vacancy = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) + ) + self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) + self.non_head_vacancy = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) + ) + self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) + + def update_vacancy(self, count, alignment_info, i, trg_classes, slots): + """ + :param count: Value to add to the vacancy counts + :param alignment_info: Alignment under consideration + :param i: Source word position under consideration + :param trg_classes: Target word classes + :param slots: Vacancy states of the slots in the target sentence. + Output parameter that will be modified as new words are placed + in the target sentence. 
+ """ + tablet = alignment_info.cepts[i] + tablet_length = len(tablet) + total_vacancies = slots.vacancies_at(len(slots)) + + # case 1: NULL aligned words + if tablet_length == 0: + return # ignore zero fertility words + + # case 2: head word + j = tablet[0] + previous_cept = alignment_info.previous_cept(j) + previous_center = alignment_info.center_of_cept(previous_cept) + dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) + max_v = total_vacancies - tablet_length + 1 + trg_class = trg_classes[alignment_info.trg_sentence[j]] + self.head_vacancy[dv][max_v][trg_class] += count + self.head_vacancy_for_any_dv[max_v][trg_class] += count + slots.occupy(j) # mark position as occupied + total_vacancies -= 1 + + # case 3: non-head words + for k in range(1, tablet_length): + previous_position = tablet[k - 1] + previous_vacancies = slots.vacancies_at(previous_position) + j = tablet[k] + dv = slots.vacancies_at(j) - previous_vacancies + max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies + trg_class = trg_classes[alignment_info.trg_sentence[j]] + self.non_head_vacancy[dv][max_v][trg_class] += count + self.non_head_vacancy_for_any_dv[max_v][trg_class] += count + slots.occupy(j) # mark position as occupied + total_vacancies -= 1 + + +class Slots: + """ + Represents positions in a target sentence. Used to keep track of + which slot (position) is occupied. 
+ """ + + def __init__(self, target_sentence_length): + self._slots = [False] * (target_sentence_length + 1) # 1-indexed + + def occupy(self, position): + """ + :return: Mark slot at ``position`` as occupied + """ + self._slots[position] = True + + def vacancies_at(self, position): + """ + :return: Number of vacant slots up to, and including, ``position`` + """ + vacancies = 0 + for k in range(1, position + 1): + if not self._slots[k]: + vacancies += 1 + return vacancies + + def __len__(self): + return len(self._slots) - 1 # exclude dummy zeroeth element diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm_model.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4be8d5462cc16b53b847d2a705206b5536425d6f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm_model.py @@ -0,0 +1,549 @@ +# Natural Language Toolkit: IBM Model Core +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Tah Wei Hoon +# URL: +# For license information, see LICENSE.TXT + +""" +Common methods and classes for all IBM models. See ``IBMModel1``, +``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5`` +for specific implementations. + +The IBM models are a series of generative models that learn lexical +translation probabilities, p(target language word|source language word), +given a sentence-aligned parallel corpus. + +The models increase in sophistication from model 1 to 5. Typically, the +output of lower models is used to seed the higher models. All models +use the Expectation-Maximization (EM) algorithm to learn various +probability tables. + +Words in a sentence are one-indexed. The first word of a sentence has +position 1, not 0. Index 0 is reserved in the source sentence for the +NULL token. The concept of position does not apply to NULL, but it is +indexed at 0 by convention. + +Each target word is aligned to exactly one source word or the NULL +token. + +References: +Philipp Koehn. 2010. 
Statistical Machine Translation. +Cambridge University Press, New York. + +Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and +Robert L. Mercer. 1993. The Mathematics of Statistical Machine +Translation: Parameter Estimation. Computational Linguistics, 19 (2), +263-311. +""" + +from bisect import insort_left +from collections import defaultdict +from copy import deepcopy +from math import ceil + + +def longest_target_sentence_length(sentence_aligned_corpus): + """ + :param sentence_aligned_corpus: Parallel corpus under consideration + :type sentence_aligned_corpus: list(AlignedSent) + :return: Number of words in the longest target language sentence + of ``sentence_aligned_corpus`` + """ + max_m = 0 + for aligned_sentence in sentence_aligned_corpus: + m = len(aligned_sentence.words) + max_m = max(m, max_m) + return max_m + + +class IBMModel: + """ + Abstract base class for all IBM models + """ + + # Avoid division by zero and precision errors by imposing a minimum + # value for probabilities. Note that this approach is theoretically + # incorrect, since it may create probabilities that sum to more + # than 1. In practice, the contribution of probabilities with MIN_PROB + # is tiny enough that the value of MIN_PROB can be treated as zero. + MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7 + + def __init__(self, sentence_aligned_corpus): + self.init_vocab(sentence_aligned_corpus) + self.reset_probabilities() + + def reset_probabilities(self): + self.translation_table = defaultdict( + lambda: defaultdict(lambda: IBMModel.MIN_PROB) + ) + """ + dict[str][str]: float. Probability(target word | source word). + Values accessed as ``translation_table[target_word][source_word]``. + """ + + self.alignment_table = defaultdict( + lambda: defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB)) + ) + ) + """ + dict[int][int][int][int]: float. Probability(i | j,l,m). + Values accessed as ``alignment_table[i][j][l][m]``. 
+ Used in model 2 and hill climbing in models 3 and above + """ + + self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) + """ + dict[int][str]: float. Probability(fertility | source word). + Values accessed as ``fertility_table[fertility][source_word]``. + Used in model 3 and higher. + """ + + self.p1 = 0.5 + """ + Probability that a generated word requires another target word + that is aligned to NULL. + Used in model 3 and higher. + """ + + def set_uniform_probabilities(self, sentence_aligned_corpus): + """ + Initialize probability tables to a uniform distribution + + Derived classes should implement this accordingly. + """ + pass + + def init_vocab(self, sentence_aligned_corpus): + src_vocab = set() + trg_vocab = set() + for aligned_sentence in sentence_aligned_corpus: + trg_vocab.update(aligned_sentence.words) + src_vocab.update(aligned_sentence.mots) + # Add the NULL token + src_vocab.add(None) + + self.src_vocab = src_vocab + """ + set(str): All source language words used in training + """ + + self.trg_vocab = trg_vocab + """ + set(str): All target language words used in training + """ + + def sample(self, sentence_pair): + """ + Sample the most probable alignments from the entire alignment + space + + First, determine the best alignment according to IBM Model 2. + With this initial alignment, use hill climbing to determine the + best alignment according to a higher IBM Model. Add this + alignment and its neighbors to the sample set. Repeat this + process with other initial alignments obtained by pegging an + alignment point. + + Hill climbing may be stuck in a local maxima, hence the pegging + and trying out of different alignments. 
+ + :param sentence_pair: Source and target language sentence pair + to generate a sample of alignments from + :type sentence_pair: AlignedSent + + :return: A set of best alignments represented by their ``AlignmentInfo`` + and the best alignment of the set for convenience + :rtype: set(AlignmentInfo), AlignmentInfo + """ + sampled_alignments = set() + l = len(sentence_pair.mots) + m = len(sentence_pair.words) + + # Start from the best model 2 alignment + initial_alignment = self.best_model2_alignment(sentence_pair) + potential_alignment = self.hillclimb(initial_alignment) + sampled_alignments.update(self.neighboring(potential_alignment)) + best_alignment = potential_alignment + + # Start from other model 2 alignments, + # with the constraint that j is aligned (pegged) to i + for j in range(1, m + 1): + for i in range(0, l + 1): + initial_alignment = self.best_model2_alignment(sentence_pair, j, i) + potential_alignment = self.hillclimb(initial_alignment, j) + neighbors = self.neighboring(potential_alignment, j) + sampled_alignments.update(neighbors) + if potential_alignment.score > best_alignment.score: + best_alignment = potential_alignment + + return sampled_alignments, best_alignment + + def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0): + """ + Finds the best alignment according to IBM Model 2 + + Used as a starting point for hill climbing in Models 3 and + above, because it is easier to compute than the best alignments + in higher models + + :param sentence_pair: Source and target language sentence pair + to be word-aligned + :type sentence_pair: AlignedSent + + :param j_pegged: If specified, the alignment point of j_pegged + will be fixed to i_pegged + :type j_pegged: int + + :param i_pegged: Alignment point to j_pegged + :type i_pegged: int + """ + src_sentence = [None] + sentence_pair.mots + trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed + + l = len(src_sentence) - 1 # exclude NULL + m = len(trg_sentence) - 1 + + 
alignment = [0] * (m + 1) # init all alignments to NULL + cepts = [[] for i in range(l + 1)] # init all cepts to empty list + + for j in range(1, m + 1): + if j == j_pegged: + # use the pegged alignment instead of searching for best one + best_i = i_pegged + else: + best_i = 0 + max_alignment_prob = IBMModel.MIN_PROB + t = trg_sentence[j] + + for i in range(0, l + 1): + s = src_sentence[i] + alignment_prob = ( + self.translation_table[t][s] * self.alignment_table[i][j][l][m] + ) + + if alignment_prob >= max_alignment_prob: + max_alignment_prob = alignment_prob + best_i = i + + alignment[j] = best_i + cepts[best_i].append(j) + + return AlignmentInfo( + tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts + ) + + def hillclimb(self, alignment_info, j_pegged=None): + """ + Starting from the alignment in ``alignment_info``, look at + neighboring alignments iteratively for the best one + + There is no guarantee that the best alignment in the alignment + space will be found, because the algorithm might be stuck in a + local maximum. 
+ + :param j_pegged: If specified, the search will be constrained to + alignments where ``j_pegged`` remains unchanged + :type j_pegged: int + + :return: The best alignment found from hill climbing + :rtype: AlignmentInfo + """ + alignment = alignment_info # alias with shorter name + max_probability = self.prob_t_a_given_s(alignment) + + while True: + old_alignment = alignment + for neighbor_alignment in self.neighboring(alignment, j_pegged): + neighbor_probability = self.prob_t_a_given_s(neighbor_alignment) + + if neighbor_probability > max_probability: + alignment = neighbor_alignment + max_probability = neighbor_probability + + if alignment == old_alignment: + # Until there are no better alignments + break + + alignment.score = max_probability + return alignment + + def neighboring(self, alignment_info, j_pegged=None): + """ + Determine the neighbors of ``alignment_info``, obtained by + moving or swapping one alignment point + + :param j_pegged: If specified, neighbors that have a different + alignment point from j_pegged will not be considered + :type j_pegged: int + + :return: A set neighboring alignments represented by their + ``AlignmentInfo`` + :rtype: set(AlignmentInfo) + """ + neighbors = set() + + l = len(alignment_info.src_sentence) - 1 # exclude NULL + m = len(alignment_info.trg_sentence) - 1 + original_alignment = alignment_info.alignment + original_cepts = alignment_info.cepts + + for j in range(1, m + 1): + if j != j_pegged: + # Add alignments that differ by one alignment point + for i in range(0, l + 1): + new_alignment = list(original_alignment) + new_cepts = deepcopy(original_cepts) + old_i = original_alignment[j] + + # update alignment + new_alignment[j] = i + + # update cepts + insort_left(new_cepts[i], j) + new_cepts[old_i].remove(j) + + new_alignment_info = AlignmentInfo( + tuple(new_alignment), + alignment_info.src_sentence, + alignment_info.trg_sentence, + new_cepts, + ) + neighbors.add(new_alignment_info) + + for j in range(1, m + 1): + if 
j != j_pegged: + # Add alignments that have two alignment points swapped + for other_j in range(1, m + 1): + if other_j != j_pegged and other_j != j: + new_alignment = list(original_alignment) + new_cepts = deepcopy(original_cepts) + other_i = original_alignment[other_j] + i = original_alignment[j] + + # update alignments + new_alignment[j] = other_i + new_alignment[other_j] = i + + # update cepts + new_cepts[other_i].remove(other_j) + insort_left(new_cepts[other_i], j) + new_cepts[i].remove(j) + insort_left(new_cepts[i], other_j) + + new_alignment_info = AlignmentInfo( + tuple(new_alignment), + alignment_info.src_sentence, + alignment_info.trg_sentence, + new_cepts, + ) + neighbors.add(new_alignment_info) + + return neighbors + + def maximize_lexical_translation_probabilities(self, counts): + for t, src_words in counts.t_given_s.items(): + for s in src_words: + estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s] + self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB) + + def maximize_fertility_probabilities(self, counts): + for phi, src_words in counts.fertility.items(): + for s in src_words: + estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s] + self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB) + + def maximize_null_generation_probabilities(self, counts): + p1_estimate = counts.p1 / (counts.p1 + counts.p0) + p1_estimate = max(p1_estimate, IBMModel.MIN_PROB) + # Clip p1 if it is too large, because p0 = 1 - p1 should not be + # smaller than MIN_PROB + self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB) + + def prob_of_alignments(self, alignments): + probability = 0 + for alignment_info in alignments: + probability += self.prob_t_a_given_s(alignment_info) + return probability + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + + All required information is assumed to be in ``alignment_info`` + and self. 
+ + Derived classes should override this method + """ + return 0.0 + + +class AlignmentInfo: + """ + Helper data object for training IBM Models 3 and up + + Read-only. For a source sentence and its counterpart in the target + language, this class holds information about the sentence pair's + alignment, cepts, and fertility. + + Warning: Alignments are one-indexed here, in contrast to + nltk.translate.Alignment and AlignedSent, which are zero-indexed + This class is not meant to be used outside of IBM models. + """ + + def __init__(self, alignment, src_sentence, trg_sentence, cepts): + if not isinstance(alignment, tuple): + raise TypeError( + "The alignment must be a tuple because it is used " + "to uniquely identify AlignmentInfo objects." + ) + + self.alignment = alignment + """ + tuple(int): Alignment function. ``alignment[j]`` is the position + in the source sentence that is aligned to the position j in the + target sentence. + """ + + self.src_sentence = src_sentence + """ + tuple(str): Source sentence referred to by this object. + Should include NULL token (None) in index 0. + """ + + self.trg_sentence = trg_sentence + """ + tuple(str): Target sentence referred to by this object. + Should have a dummy element in index 0 so that the first word + starts from index 1. + """ + + self.cepts = cepts + """ + list(list(int)): The positions of the target words, in + ascending order, aligned to a source word position. For example, + cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7 + of the target sentence are aligned to the word in position 4 of + the source sentence + """ + + self.score = None + """ + float: Optional. 
Probability of alignment, as defined by the + IBM model that assesses this alignment + """ + + def fertility_of_i(self, i): + """ + Fertility of word in position ``i`` of the source sentence + """ + return len(self.cepts[i]) + + def is_head_word(self, j): + """ + :return: Whether the word in position ``j`` of the target + sentence is a head word + """ + i = self.alignment[j] + return self.cepts[i][0] == j + + def center_of_cept(self, i): + """ + :return: The ceiling of the average positions of the words in + the tablet of cept ``i``, or 0 if ``i`` is None + """ + if i is None: + return 0 + + average_position = sum(self.cepts[i]) / len(self.cepts[i]) + return int(ceil(average_position)) + + def previous_cept(self, j): + """ + :return: The previous cept of ``j``, or None if ``j`` belongs to + the first cept + """ + i = self.alignment[j] + if i == 0: + raise ValueError( + "Words aligned to NULL cannot have a previous " + "cept because NULL has no position" + ) + previous_cept = i - 1 + while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0: + previous_cept -= 1 + + if previous_cept <= 0: + previous_cept = None + return previous_cept + + def previous_in_tablet(self, j): + """ + :return: The position of the previous word that is in the same + tablet as ``j``, or None if ``j`` is the first word of the + tablet + """ + i = self.alignment[j] + tablet_position = self.cepts[i].index(j) + if tablet_position == 0: + return None + return self.cepts[i][tablet_position - 1] + + def zero_indexed_alignment(self): + """ + :return: Zero-indexed alignment, suitable for use in external + ``nltk.translate`` modules like ``nltk.translate.Alignment`` + :rtype: list(tuple) + """ + zero_indexed_alignment = [] + for j in range(1, len(self.trg_sentence)): + i = self.alignment[j] - 1 + if i < 0: + i = None # alignment to NULL token + zero_indexed_alignment.append((j - 1, i)) + return zero_indexed_alignment + + def __eq__(self, other): + return self.alignment == other.alignment + + 
class Counts:
    """
    Accumulator for the expected counts collected during one training
    pass. Every table yields 0.0 for keys that have not been seen yet.
    """

    def __init__(self):
        # t_given_s[t][s]: count for target word t paired with source word s
        self.t_given_s = defaultdict(lambda: defaultdict(float))
        # any_t_given_s[s]: the same counts summed over all target words
        self.any_t_given_s = defaultdict(float)
        self.p0 = 0.0
        self.p1 = 0.0
        # fertility[phi][s]: count for source word s having fertility phi
        self.fertility = defaultdict(lambda: defaultdict(float))
        self.fertility_for_any_phi = defaultdict(float)

    def update_lexical_translation(self, count, alignment_info, j):
        """
        Add ``count`` for the word pair linked at target position ``j``.
        """
        src_position = alignment_info.alignment[j]
        trg_word = alignment_info.trg_sentence[j]
        src_word = alignment_info.src_sentence[src_position]
        self.t_given_s[trg_word][src_word] += count
        self.any_t_given_s[src_word] += count

    def update_null_generation(self, count, alignment_info):
        """
        Add ``count``, weighted by the fertility of position 0, to the
        ``p1`` and ``p0`` totals.
        """
        # The target sentence carries one extra slot at index 0.
        trg_length = len(alignment_info.trg_sentence) - 1
        null_fertility = alignment_info.fertility_of_i(0)
        self.p1 += null_fertility * count
        self.p0 += (trg_length - 2 * null_fertility) * count

    def update_fertility(self, count, alignment_info):
        """
        Add ``count`` to the fertility tables for every source position,
        including position 0.
        """
        for position, src_word in enumerate(alignment_info.src_sentence):
            phi = alignment_info.fertility_of_i(position)
            self.fertility[phi][src_word] += count
            self.fertility_for_any_phi[src_word] += count
def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
    """
    The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
    Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
    Distant Language Pairs". In Proceedings of EMNLP.
    https://www.aclweb.org/anthology/D/D10/D10-1092.pdf

    The generic RIBES scores used in shared task, e.g. Workshop for
    Asian Translation (WAT) uses the following RIBES calculations:

        RIBES = kendall_tau * (p1**alpha) * (bp**beta)

    where ``p1`` is the unigram precision and ``bp`` the brevity penalty.

    Please note that this re-implementation differs from the official
    RIBES implementation and though it emulates the results as describe
    in the original paper, there are further optimization implemented
    in the official RIBES script.

    Users are encouraged to use the official RIBES script instead of this
    implementation when evaluating your machine translation system. Refer
    to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The best ribes score from one of the references. An empty
        hypothesis scores 0.0 against every reference; ``-1.0`` is
        returned when ``references`` is empty.
    :rtype: float
    """
    best_ribes = -1.0
    hyp_len = len(hypothesis)
    # Calculates RIBES for each reference and returns the best score.
    for reference in references:
        if hyp_len == 0:
            # Both the brevity penalty and the unigram precision divide by
            # the hypothesis length; with no words there is nothing to
            # align, so the score is 0.0 rather than a ZeroDivisionError.
            _ribes = 0.0
        else:
            # Collects the *worder* from the ranked correlation alignments.
            worder = word_rank_alignment(reference, hypothesis)
            nkt = kendall_tau(worder)

            # Calculates the brevity penalty
            bp = min(1.0, math.exp(1.0 - len(reference) / hyp_len))

            # Calculates the unigram precision, *p1*
            p1 = len(worder) / hyp_len

            _ribes = nkt * (p1**alpha) * (bp**beta)

        if _ribes > best_ribes:  # Keeps the best score.
            best_ribes = _ribes

    return best_ribes
def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
    """
    This function "calculates RIBES for a system output (hypothesis) with
    multiple references, and returns "best" score among multi-references and
    individual scores. The scores are corpus-wise, i.e., averaged by the number
    of sentences." (c.f. RIBES version 1.03.1 code).

    Different from BLEU's micro-average precision, RIBES calculates the
    macro-average precision by averaging the best RIBES score for each pair of
    hypothesis and its corresponding references

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> round(corpus_ribes(list_of_references, hypotheses),4)
    0.3597

    :param list_of_references: a corpus of lists of reference sentences,
        w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The per-sentence best ribes scores averaged over the number
        of hypotheses, or 0.0 when ``hypotheses`` is empty.
    :rtype: float
    """
    # An empty corpus has nothing to average; return 0.0 instead of
    # dividing by zero.
    if not hypotheses:
        return 0.0

    corpus_best_ribes = 0.0
    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
    return corpus_best_ribes / len(hypotheses)
def position_of_ngram(ngram, sentence):
    """
    Locate the first occurrence of ``ngram`` inside ``sentence``.

    The sentence is scanned with a sliding window as wide as the ngram,
    and the index of the first window equal to the ngram is returned.

    :param ngram: The ngram that needs to be searched
    :type ngram: tuple
    :param sentence: The list of tokens to search from.
    :type sentence: list(str)
    :return: Index of the first matching window, or None when no window
        matches.
    """
    width = len(ngram)
    matching_indices = (
        index
        for index, window in enumerate(ngrams(sentence, width))
        if ngram == window
    )
    # The default keeps the "not found" result as None.
    return next(matching_indices, None)
def word_rank_alignment(reference, hypothesis, character_based=False):
    """
    This is the word rank alignment algorithm described in the paper to produce
    the *worder* list, i.e. a list of word indices of the hypothesis word orders
    w.r.t. the list of reference words.

    Below is (H0, R0) example from the Isozaki et al. 2010 paper,
    note the examples are indexed from 1 but the results here are indexed from 0:

        >>> ref = str('he was interested in world history because he '
        ... 'read the book').split()
        >>> hyp = str('he read the book because he was interested in world '
        ... 'history').split()
        >>> word_rank_alignment(ref, hyp)
        [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]

    The (H1, R1) example from the paper, note the 0th index:

        >>> ref = 'John hit Bob yesterday'.split()
        >>> hyp = 'Bob hit John yesterday'.split()
        >>> word_rank_alignment(ref, hyp)
        [2, 1, 0, 3]

    Here is the (H2, R2) example from the paper, note the 0th index here too:

        >>> ref = 'the boy read the book'.split()
        >>> hyp = 'the book was read by the boy'.split()
        >>> word_rank_alignment(ref, hyp)
        [3, 4, 2, 0, 1]

    :param reference: a reference sentence
    :type reference: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param character_based: accepted for interface compatibility; it is
        not read by this function.
    :type character_based: bool
    :return: reference positions of the hypothesis words that could be
        aligned unambiguously, in hypothesis order
    :rtype: list(int)
    """
    # Function-scope import so the block does not rely on a module-level
    # import that the file does not have.
    from collections import Counter

    worder = []
    hyp_len = len(hypothesis)
    # Occurrence counts of every ngram (orders 1..len(reference)) in the
    # reference and in the hypothesis. Counting once up front replaces a
    # linear ``list.count`` scan per lookup. Tokens must be hashable.
    ref_ngram_counts = Counter()
    hyp_ngram_counts = Counter()
    for n in range(1, len(reference) + 1):
        ref_ngram_counts.update(ngrams(reference, n))
        hyp_ngram_counts.update(ngrams(hypothesis, n))

    for i, h_word in enumerate(hypothesis):
        unigram = (h_word,)
        ref_occurrences = ref_ngram_counts[unigram]
        # If word is not in the reference, continue.
        if ref_occurrences == 0:
            continue
        # If we can determine one-to-one word correspondence for unigrams that
        # only appear once in both the reference and hypothesis.
        if ref_occurrences == hyp_ngram_counts[unigram] == 1:
            worder.append(reference.index(h_word))
            continue

        # Otherwise widen the context window until an ngram containing the
        # word is unique in both sentences. A word whose context never
        # becomes unique is left out of *worder*.
        max_window_size = max(i, hyp_len - i + 1)
        for window in range(1, max_window_size):
            if i + window < hyp_len:  # If searching the right context is possible.
                # Retrieve the right context window.
                right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
                num_times_in_ref = ref_ngram_counts[right_context_ngram]
                num_times_in_hyp = hyp_ngram_counts[right_context_ngram]
                # If ngram appears only once in both ref and hyp.
                if num_times_in_ref == num_times_in_hyp == 1:
                    # Find the position of ngram that matched the reference.
                    pos = position_of_ngram(right_context_ngram, reference)
                    worder.append(pos)  # The word is the first token of the ngram.
                    break
            if window <= i:  # If searching the left context is possible.
                # Retrieve the left context window.
                left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
                num_times_in_ref = ref_ngram_counts[left_context_ngram]
                num_times_in_hyp = hyp_ngram_counts[left_context_ngram]
                if num_times_in_ref == num_times_in_hyp == 1:
                    # Find the position of ngram that matched the reference.
                    pos = position_of_ngram(left_context_ngram, reference)
                    # The word is the last token of the ngram.
                    worder.append(pos + len(left_context_ngram) - 1)
                    break
    return worder
def find_increasing_sequences(worder):
    """
    Yield each maximal run of consecutive integers (every item exactly
    one more than its predecessor) found in the *worder* list. Runs of
    length one are not reported.

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> list(find_increasing_sequences(worder))
        [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    """
    run = []
    for value in worder:
        # A None entry acts as an end-of-input marker.
        if value is None:
            break
        if run and run[-1] + 1 == value:
            run.append(value)
            continue
        # The current run is broken: report it if long enough, then
        # start a new run at this value.
        if len(run) > 1:
            yield tuple(run)
        run = [value]
    if len(run) > 1:
        yield tuple(run)
def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs -1

    Note that the no. of increasing pairs can be discontinuous in the *worder*
    list and each increasing sequence can be tabulated as choose(len(seq), 2)
    no. of increasing pairs, e.g.

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> number_possible_pairs = choose(len(worder), 2)
        >>> round(kendall_tau(worder, normalize=False),3)
        -0.236
        >>> round(kendall_tau(worder),3)
        0.382

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    n_aligned = len(worder)
    if n_aligned < 2:
        # With fewer than two items there are no pairs, so the division
        # below would fail; use the lowest possible score instead.
        tau = -1
    else:
        # Each run of k consecutive indices contributes choose(k, 2)
        # increasing pairs.
        increasing_pairs = 0
        for run in find_increasing_sequences(worder):
            increasing_pairs += choose(len(run), 2)
        possible_pairs = choose(n_aligned, 2)
        tau = 2 * increasing_pairs / possible_pairs - 1

    # Normalized output lies in [0.0, 1.0]; raw tau lies in [-1.0, +1.0].
    return (tau + 1) / 2 if normalize else tau
def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignment from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    Given that d is the sum of difference between the *worder* list of indices
    and the original word indices from the reference sentence.

    Using the (H0,R0) and (H5, R5) example from the paper

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> round(spearman_rho(worder, normalize=False), 3)
        -0.591
        >>> round(spearman_rho(worder), 3)
        0.205

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Spearman's Rho correlation coefficient; the lowest
        possible score when ``worder`` has fewer than two items.
    :rtype: float
    """
    worder_len = len(worder)
    # With worder_len < 2, `choose(worder_len + 1, 3)` is 0 and dividing by
    # it raises ZeroDivisionError. Return the lowest possible score, the
    # same convention `kendall_tau` uses for this case.
    if worder_len < 2:
        rho = -1.0
    else:
        sum_d_square = sum((wi - i) ** 2 for i, wi in enumerate(worder))
        rho = 1 - sum_d_square / choose(worder_len + 1, 3)

    if normalize:  # If normalized, the rho output falls between 0.0 to 1.0
        return (rho + 1) / 2
    else:  # Otherwise, the rho outputs falls between -1.0 to +1.0
        return rho
+ +In phrase-based translation, the source sentence is segmented into +phrases of one or more words, and translations for those phrases are +used to build the target sentence. + +Hypothesis data structures are used to keep track of the source words +translated so far and the partial output. A hypothesis can be expanded +by selecting an untranslated phrase, looking up its translation in a +phrase table, and appending that translation to the partial output. +Translation is complete when a hypothesis covers all source words. + +The search space is huge because the source sentence can be segmented +in different ways, the source phrases can be selected in any order, +and there could be multiple translations for the same source phrase in +the phrase table. To make decoding tractable, stacks are used to limit +the number of candidate hypotheses by doing histogram and/or threshold +pruning. + +Hypotheses with the same number of words translated are placed in the +same stack. In histogram pruning, each stack has a size limit, and +the hypothesis with the lowest score is removed when the stack is full. +In threshold pruning, hypotheses that score below a certain threshold +of the best hypothesis in that stack are removed. + +Hypothesis scoring can include various factors such as phrase +translation probability, language model probability, length of +translation, cost of remaining words to be translated, and so on. + + +References: +Philipp Koehn. 2010. Statistical Machine Translation. +Cambridge University Press, New York. 
class StackDecoder:
    """
    Phrase-based stack decoder for machine translation

    >>> from nltk.translate import PhraseTable
    >>> phrase_table = PhraseTable()
    >>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
    >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
    >>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
    >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
    >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
    >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
    >>> phrase_table.add(('!',), ('!',), log(0.8))

    >>> # nltk.model should be used here once it is implemented
    >>> from collections import defaultdict
    >>> language_prob = defaultdict(lambda: -999.0)
    >>> language_prob[('nobody',)] = log(0.5)
    >>> language_prob[('expects',)] = log(0.4)
    >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
    >>> language_prob[('!',)] = log(0.1)
    >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()

    >>> stack_decoder = StackDecoder(phrase_table, language_model)

    >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
    ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']

    """

    def __init__(self, phrase_table, language_model):
        """
        :param phrase_table: Table of translations for source language
            phrases and the log probabilities for those translations.
        :type phrase_table: PhraseTable

        :param language_model: Target language model. Must define a
            ``probability_change`` method that calculates the change in
            log probability of a sentence, if a given string is appended
            to it. A ``probability`` method taking a phrase is also
            called by ``compute_future_scores``.
            This interface is experimental and will likely be replaced
            with nltk.model once it is implemented.
        :type language_model: object
        """
        self.phrase_table = phrase_table
        self.language_model = language_model

        self.word_penalty = 0.0
        """
        float: Influences the translation length exponentially.
            If positive, shorter translations are preferred.
            If negative, longer translations are preferred.
            If zero, no penalty is applied.
        """

        self.beam_threshold = 0.0
        """
        float: Hypotheses that score below this factor of the best
            hypothesis in a stack are dropped from consideration.
            Value between 0.0 and 1.0.
        """

        self.stack_size = 100
        """
        int: Maximum number of hypotheses to consider in a stack.
            Higher values increase the likelihood of a good translation,
            but increases processing time.
        """

        # Stored privately so that the cached logarithm is refreshed every
        # time the public ``distortion_factor`` property is assigned.
        self.__distortion_factor = 0.5
        self.__compute_log_distortion()

    @property
    def distortion_factor(self):
        """
        float: Amount of reordering of source phrases.
            Lower values favour monotone translation, suitable when
            word order is similar for both source and target languages.
            Value between 0.0 and 1.0. Default 0.5.
        """
        return self.__distortion_factor

    @distortion_factor.setter
    def distortion_factor(self, d):
        self.__distortion_factor = d
        self.__compute_log_distortion()

    def __compute_log_distortion(self):
        # cache log(distortion_factor) so we don't have to recompute it
        # when scoring hypotheses
        if self.__distortion_factor == 0.0:
            # log(0) is undefined, so a tiny positive stand-in is used
            self.__log_distortion_factor = log(1e-9)  # 1e-9 is almost zero
        else:
            self.__log_distortion_factor = log(self.__distortion_factor)

    def translate(self, src_sentence):
        """
        :param src_sentence: Sentence to be translated
        :type src_sentence: list(str)

        :return: Translated sentence; an empty list (after emitting a
            warning) when no hypothesis covers every source word
        :rtype: list(str)
        """
        sentence = tuple(src_sentence)  # prevent accidental modification
        sentence_length = len(sentence)
        # stacks[k] collects the hypotheses that have translated exactly
        # k source words, for k = 0 .. sentence_length
        stacks = [
            _Stack(self.stack_size, self.beam_threshold)
            for _ in range(0, sentence_length + 1)
        ]
        empty_hypothesis = _Hypothesis()
        stacks[0].push(empty_hypothesis)

        all_phrases = self.find_all_src_phrases(sentence)
        future_score_table = self.compute_future_scores(sentence)
        # Stacks are visited in order of increasing coverage; each
        # expansion is pushed onto the stack indexed by its own number of
        # translated words.
        for stack in stacks:
            for hypothesis in stack:
                possible_expansions = StackDecoder.valid_phrases(
                    all_phrases, hypothesis
                )
                for src_phrase_span in possible_expansions:
                    src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]]
                    for translation_option in self.phrase_table.translations_for(
                        src_phrase
                    ):
                        raw_score = self.expansion_score(
                            hypothesis, translation_option, src_phrase_span
                        )
                        new_hypothesis = _Hypothesis(
                            raw_score=raw_score,
                            src_phrase_span=src_phrase_span,
                            trg_phrase=translation_option.trg_phrase,
                            previous=hypothesis,
                        )
                        new_hypothesis.future_score = self.future_score(
                            new_hypothesis, future_score_table, sentence_length
                        )
                        total_words = new_hypothesis.total_translated_words()
                        stacks[total_words].push(new_hypothesis)

        if not stacks[sentence_length]:
            warnings.warn(
                "Unable to translate all words. "
                "The source sentence contains words not in "
                "the phrase table"
            )
            # Instead of returning empty output, perhaps a partial
            # translation could be returned
            return []

        best_hypothesis = stacks[sentence_length].best()
        return best_hypothesis.translation_so_far()

    def find_all_src_phrases(self, src_sentence):
        """
        Finds all subsequences in src_sentence that have a phrase
        translation in the translation table

        :type src_sentence: tuple(str)

        :return: Subsequences that have a phrase translation,
            represented as a table of lists of end positions.
            For example, if result[2] is [5, 6, 9], then there are
            three phrases starting from position 2 in ``src_sentence``,
            ending at positions 5, 6, and 9 exclusive. The list of
            ending positions are in ascending order.
        :rtype: list(list(int))
        """
        sentence_length = len(src_sentence)
        phrase_indices = [[] for _ in src_sentence]
        for start in range(0, sentence_length):
            # ``end`` grows monotonically, which is what keeps each list
            # of end positions in ascending order
            for end in range(start + 1, sentence_length + 1):
                potential_phrase = src_sentence[start:end]
                if potential_phrase in self.phrase_table:
                    phrase_indices[start].append(end)
        return phrase_indices

    def compute_future_scores(self, src_sentence):
        """
        Determines the approximate scores for translating every
        subsequence in ``src_sentence``

        Future scores can be used a look-ahead to determine the
        difficulty of translating the remaining parts of a src_sentence.

        :type src_sentence: tuple(str)

        :return: Scores of subsequences referenced by their start and
            end positions. For example, result[2][5] is the score of the
            subsequence covering positions 2, 3, and 4. Spans that were
            never assigned a score read as negative infinity.
        :rtype: dict(int: (dict(int): float))
        """
        scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        # Shorter spans are scored first, so both halves of every split
        # considered below already hold their final value.
        for seq_length in range(1, len(src_sentence) + 1):
            for start in range(0, len(src_sentence) - seq_length + 1):
                end = start + seq_length
                phrase = src_sentence[start:end]
                if phrase in self.phrase_table:
                    score = self.phrase_table.translations_for(phrase)[
                        0
                    ].log_prob  # pick best (first) translation
                    # Warning: API of language_model is subject to change
                    # NOTE(review): ``phrase`` here is the source-language
                    # span, not its translation - confirm this is intended.
                    score += self.language_model.probability(phrase)
                    scores[start][end] = score

                # check if a better score can be obtained by combining
                # two child subsequences
                for mid in range(start + 1, end):
                    combined_score = scores[start][mid] + scores[mid][end]
                    if combined_score > scores[start][end]:
                        scores[start][end] = combined_score
        return scores

    def future_score(self, hypothesis, future_score_table, sentence_length):
        """
        Determines the approximate score for translating the
        untranslated words in ``hypothesis``

        :return: Sum of the ``future_score_table`` entries for every
            untranslated span of ``hypothesis``
        :rtype: float
        """
        score = 0.0
        for span in hypothesis.untranslated_spans(sentence_length):
            score += future_score_table[span[0]][span[1]]
        return score

    def expansion_score(self, hypothesis, translation_option, src_phrase_span):
        """
        Calculate the score of expanding ``hypothesis`` with
        ``translation_option``

        :param hypothesis: Hypothesis being expanded
        :type hypothesis: _Hypothesis

        :param translation_option: Information about the proposed expansion
        :type translation_option: PhraseTableEntry

        :param src_phrase_span: Word position span of the source phrase
        :type src_phrase_span: tuple(int, int)

        :return: ``hypothesis.raw_score`` plus the translation log
            probability, the language model change and the distortion
            score, minus ``word_penalty`` per target word
        :rtype: float
        """
        score = hypothesis.raw_score
        score += translation_option.log_prob
        # The API of language_model is subject to change; it could accept
        # a string, a list of words, and/or some other type
        score += self.language_model.probability_change(
            hypothesis, translation_option.trg_phrase
        )
        score += self.distortion_score(hypothesis, src_phrase_span)
        score -= self.word_penalty * len(translation_option.trg_phrase)
        return score

    def distortion_score(self, hypothesis, next_src_phrase_span):
        """
        Score the jump from the end of the phrase last translated by
        ``hypothesis`` to the start of ``next_src_phrase_span``.

        :return: 0.0 when ``hypothesis`` has an empty ``src_phrase_span``;
            otherwise the absolute jump distance multiplied by the cached
            log distortion factor
        :rtype: float
        """
        if not hypothesis.src_phrase_span:
            return 0.0
        next_src_phrase_start = next_src_phrase_span[0]
        prev_src_phrase_end = hypothesis.src_phrase_span[1]
        distortion_distance = next_src_phrase_start - prev_src_phrase_end
        return abs(distortion_distance) * self.__log_distortion_factor

    @staticmethod
    def valid_phrases(all_phrases_from, hypothesis):
        """
        Extract phrases from ``all_phrases_from`` that contains words
        that have not been translated by ``hypothesis``

        :param all_phrases_from: Phrases represented by their spans, in
            the same format as the return value of
            ``find_all_src_phrases``
        :type all_phrases_from: list(list(int))

        :type hypothesis: _Hypothesis

        :return: A list of phrases, represented by their spans, that
            cover untranslated positions.
        :rtype: list(tuple(int, int))
        """
        # ``all_phrases_from`` has one entry per source position, so its
        # length doubles as the sentence length
        untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from))
        valid_phrases = []
        for available_span in untranslated_spans:
            start = available_span[0]
            available_end = available_span[1]
            while start < available_end:
                for phrase_end in all_phrases_from[start]:
                    if phrase_end > available_end:
                        # Subsequent elements in all_phrases_from[start]
                        # will also be > available_end, since the
                        # elements are in ascending order
                        break
                    valid_phrases.append((start, phrase_end))
                start += 1
        return valid_phrases
-= self.word_penalty * len(translation_option.trg_phrase) + return score + + def distortion_score(self, hypothesis, next_src_phrase_span): + if not hypothesis.src_phrase_span: + return 0.0 + next_src_phrase_start = next_src_phrase_span[0] + prev_src_phrase_end = hypothesis.src_phrase_span[1] + distortion_distance = next_src_phrase_start - prev_src_phrase_end + return abs(distortion_distance) * self.__log_distortion_factor + + @staticmethod + def valid_phrases(all_phrases_from, hypothesis): + """ + Extract phrases from ``all_phrases_from`` that contains words + that have not been translated by ``hypothesis`` + + :param all_phrases_from: Phrases represented by their spans, in + the same format as the return value of + ``find_all_src_phrases`` + :type all_phrases_from: list(list(int)) + + :type hypothesis: _Hypothesis + + :return: A list of phrases, represented by their spans, that + cover untranslated positions. + :rtype: list(tuple(int, int)) + """ + untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from)) + valid_phrases = [] + for available_span in untranslated_spans: + start = available_span[0] + available_end = available_span[1] + while start < available_end: + for phrase_end in all_phrases_from[start]: + if phrase_end > available_end: + # Subsequent elements in all_phrases_from[start] + # will also be > available_end, since the + # elements are in ascending order + break + valid_phrases.append((start, phrase_end)) + start += 1 + return valid_phrases + + +class _Hypothesis: + """ + Partial solution to a translation. + + Records the word positions of the phrase being translated, its + translation, raw score, and the cost of the untranslated parts of + the sentence. When the next phrase is selected to build upon the + partial solution, a new _Hypothesis object is created, with a back + pointer to the previous hypothesis. + + To find out which words have been translated so far, look at the + ``src_phrase_span`` in the hypothesis chain. 
Similarly, the + translation output can be found by traversing up the chain. + """ + + def __init__( + self, + raw_score=0.0, + src_phrase_span=(), + trg_phrase=(), + previous=None, + future_score=0.0, + ): + """ + :param raw_score: Likelihood of hypothesis so far. + Higher is better. Does not account for untranslated words. + :type raw_score: float + + :param src_phrase_span: Span of word positions covered by the + source phrase in this hypothesis expansion. For example, + (2, 5) means that the phrase is from the second word up to, + but not including the fifth word in the source sentence. + :type src_phrase_span: tuple(int) + + :param trg_phrase: Translation of the source phrase in this + hypothesis expansion + :type trg_phrase: tuple(str) + + :param previous: Previous hypothesis before expansion to this one + :type previous: _Hypothesis + + :param future_score: Approximate score for translating the + remaining words not covered by this hypothesis. Higher means + that the remaining words are easier to translate. 
+ :type future_score: float + """ + self.raw_score = raw_score + self.src_phrase_span = src_phrase_span + self.trg_phrase = trg_phrase + self.previous = previous + self.future_score = future_score + + def score(self): + """ + Overall score of hypothesis after accounting for local and + global features + """ + return self.raw_score + self.future_score + + def untranslated_spans(self, sentence_length): + """ + Starting from each untranslated word, find the longest + continuous span of untranslated positions + + :param sentence_length: Length of source sentence being + translated by the hypothesis + :type sentence_length: int + + :rtype: list(tuple(int, int)) + """ + translated_positions = self.translated_positions() + translated_positions.sort() + translated_positions.append(sentence_length) # add sentinel position + + untranslated_spans = [] + start = 0 + # each untranslated span must end in one of the translated_positions + for end in translated_positions: + if start < end: + untranslated_spans.append((start, end)) + start = end + 1 + + return untranslated_spans + + def translated_positions(self): + """ + List of positions in the source sentence of words already + translated. The list is not sorted. 
+ + :rtype: list(int) + """ + translated_positions = [] + current_hypothesis = self + while current_hypothesis.previous is not None: + translated_span = current_hypothesis.src_phrase_span + translated_positions.extend(range(translated_span[0], translated_span[1])) + current_hypothesis = current_hypothesis.previous + return translated_positions + + def total_translated_words(self): + return len(self.translated_positions()) + + def translation_so_far(self): + translation = [] + self.__build_translation(self, translation) + return translation + + def __build_translation(self, hypothesis, output): + if hypothesis.previous is None: + return + self.__build_translation(hypothesis.previous, output) + output.extend(hypothesis.trg_phrase) + + +class _Stack: + """ + Collection of _Hypothesis objects + """ + + def __init__(self, max_size=100, beam_threshold=0.0): + """ + :param beam_threshold: Hypotheses that score less than this + factor of the best hypothesis are discarded from the stack. + Value must be between 0.0 and 1.0. + :type beam_threshold: float + """ + self.max_size = max_size + self.items = [] + + if beam_threshold == 0.0: + self.__log_beam_threshold = float("-inf") + else: + self.__log_beam_threshold = log(beam_threshold) + + def push(self, hypothesis): + """ + Add ``hypothesis`` to the stack. + Removes lowest scoring hypothesis if the stack is full. + After insertion, hypotheses that score less than + ``beam_threshold`` times the score of the best hypothesis + are removed. 
+ """ + self.items.append(hypothesis) + self.items.sort(key=lambda h: h.score(), reverse=True) + while len(self.items) > self.max_size: + self.items.pop() + self.threshold_prune() + + def threshold_prune(self): + if not self.items: + return + # log(score * beam_threshold) = log(score) + log(beam_threshold) + threshold = self.items[0].score() + self.__log_beam_threshold + for hypothesis in reversed(self.items): + if hypothesis.score() < threshold: + self.items.pop() + else: + break + + def best(self): + """ + :return: Hypothesis with the highest score in the stack + :rtype: _Hypothesis + """ + if self.items: + return self.items[0] + return None + + def __iter__(self): + return iter(self.items) + + def __contains__(self, hypothesis): + return hypothesis in self.items + + def __bool__(self): + return len(self.items) != 0 + + __nonzero__ = __bool__ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/treeprettyprinter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/treeprettyprinter.py new file mode 100644 index 0000000000000000000000000000000000000000..b14e1c30e0b115ec07058fa452a5b189b95559b0 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/treeprettyprinter.py @@ -0,0 +1,28 @@ +# Natural Language Toolkit: ASCII visualization of NLTK trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Andreas van Cranenburgh +# Peter Ljunglöf +# URL: +# For license information, see LICENSE.TXT + +""" +Pretty-printing of discontinuous trees. +Adapted from the disco-dop project, by Andreas van Cranenburgh. +https://github.com/andreasvc/disco-dop + +Interesting reference (not used for this code): +T. Eschbach et al., Orth. Hypergraph Drawing, Journal of +Graph Algorithms and Applications, 10(2) 141--157 (2006)149. 
+https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf +""" + +from nltk.internals import Deprecated +from nltk.tree.prettyprinter import TreePrettyPrinter as TPP + + +class TreePrettyPrinter(Deprecated, TPP): + """Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead.""" + + +__all__ = ["TreePrettyPrinter"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/treetransforms.py b/.eggs/nltk-3.8-py3.10.egg/nltk/treetransforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6ebc061f321c701c7851370cd00cacb4499a256c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/treetransforms.py @@ -0,0 +1,126 @@ +# Natural Language Toolkit: Tree Transformations +# +# Copyright (C) 2005-2007 Oregon Graduate Institute +# Author: Nathan Bodenstab +# URL: +# For license information, see LICENSE.TXT + +r""" +A collection of methods for tree (grammar) transformations used +in parsing natural language. + +Although many of these methods are technically grammar transformations +(ie. Chomsky Norm Form), when working with treebanks it is much more +natural to visualize these modifications in a tree structure. Hence, +we will do all transformation directly to the tree itself. +Transforming the tree directly also allows us to do parent annotation. +A grammar can then be simply induced from the modified tree. + +The following is a short tutorial on the available transformations. + + 1. Chomsky Normal Form (binarization) + + It is well known that any grammar has a Chomsky Normal Form (CNF) + equivalent grammar where CNF is defined by every production having + either two non-terminals or one terminal on its right hand side. + When we have hierarchically structured data (ie. a treebank), it is + natural to view this in terms of productions where the root of every + subtree is the head (left hand side) of the production and all of + its children are the right hand side constituents. 
In order to + convert a tree into CNF, we simply need to ensure that every subtree + has either two subtrees as children (binarization), or one leaf node + (non-terminal). In order to binarize a subtree with more than two + children, we must introduce artificial nodes. + + There are two popular methods to convert a tree into CNF: left + factoring and right factoring. The following example demonstrates + the difference between them. Example:: + + Original Right-Factored Left-Factored + + A A A + / | \ / \ / \ + B C D ==> B A| OR A| D + / \ / \ + C D B C + + 2. Parent Annotation + + In addition to binarizing the tree, there are two standard + modifications to node labels we can do in the same traversal: parent + annotation and Markov order-N smoothing (or sibling smoothing). + + The purpose of parent annotation is to refine the probabilities of + productions by adding a small amount of context. With this simple + addition, a CYK (inside-outside, dynamic programming chart parse) + can improve from 74% to 79% accuracy. A natural generalization from + parent annotation is to grandparent annotation and beyond. The + tradeoff becomes accuracy gain vs. computational complexity. We + must also keep in mind data sparcity issues. Example:: + + Original Parent Annotation + + A A^ + / | \ / \ + B C D ==> B^
    A|^ where ? is the + / \ parent of A + C^ D^ + + + 3. Markov order-N smoothing + + Markov smoothing combats data sparcity issues as well as decreasing + computational requirements by limiting the number of children + included in artificial nodes. In practice, most people use an order + 2 grammar. Example:: + + Original No Smoothing Markov order 1 Markov order 2 etc. + + __A__ A A A + / /|\ \ / \ / \ / \ + B C D E F ==> B A| ==> B A| ==> B A| + / \ / \ / \ + C ... C ... C ... + + + + Annotation decisions can be thought about in the vertical direction + (parent, grandparent, etc) and the horizontal direction (number of + siblings to keep). Parameters to the following functions specify + these values. For more information see: + + Dan Klein and Chris Manning (2003) "Accurate Unlexicalized + Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054 + + 4. Unary Collapsing + + Collapse unary productions (ie. subtrees with a single child) into a + new non-terminal (Tree node). This is useful when working with + algorithms that do not allow unary productions, yet you do not wish + to lose the parent information. Example:: + + A + | + B ==> A+B + / \ / \ + C D C D + +""" + +from nltk.internals import deprecated +from nltk.tree.transforms import chomsky_normal_form as cnf +from nltk.tree.transforms import collapse_unary as cu +from nltk.tree.transforms import un_chomsky_normal_form as ucnf + +chomsky_normal_form = deprecated( + "Import using `from nltk.tree import chomsky_normal_form` instead." +)(cnf) +un_chomsky_normal_form = deprecated( + "Import using `from nltk.tree import un_chomsky_normal_form` instead." +)(ucnf) +collapse_unary = deprecated( + "Import using `from nltk.tree import collapse_unary` instead." 
+)(cu) + + +__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/wsd.py b/.eggs/nltk-3.8-py3.10.egg/nltk/wsd.py new file mode 100644 index 0000000000000000000000000000000000000000..a87b8f37b43033aef5a28441a443b1b857131345 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/wsd.py @@ -0,0 +1,51 @@ +# Natural Language Toolkit: Word Sense Disambiguation Algorithms +# +# Authors: Liling Tan , +# Dmitrijs Milajevs +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +from nltk.corpus import wordnet + + +def lesk(context_sentence, ambiguous_word, pos=None, synsets=None): + """Return a synset for an ambiguous word in a context. + + :param iter context_sentence: The context sentence where the ambiguous word + occurs, passed as an iterable of words. + :param str ambiguous_word: The ambiguous word that requires WSD. + :param str pos: A specified Part-of-Speech (POS). + :param iter synsets: Possible synsets of the ambiguous word. + :return: ``lesk_sense`` The Synset() object with the highest signature overlaps. + + This function is an implementation of the original Lesk algorithm (1986) [1]. + + Usage example:: + + >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n') + Synset('savings_bank.n.02') + + [1] Lesk, Michael. "Automatic sense disambiguation using machine + readable dictionaries: how to tell a pine cone from an ice cream + cone." Proceedings of the 5th Annual International Conference on + Systems Documentation. ACM, 1986. 
+ https://dl.acm.org/citation.cfm?id=318728 + """ + + context = set(context_sentence) + if synsets is None: + synsets = wordnet.synsets(ambiguous_word) + + if pos: + synsets = [ss for ss in synsets if str(ss.pos()) == pos] + + if not synsets: + return None + + _, sense = max( + (len(context.intersection(ss.definition().split())), ss) for ss in synsets + ) + + return sense diff --git a/tmp/b8b96b4b-c407-4329-ba45-13a64f0b2d82_params.py b/tmp/b8b96b4b-c407-4329-ba45-13a64f0b2d82_params.py new file mode 100644 index 0000000000000000000000000000000000000000..642f8c4ba87f2e497cd41bb7758109db75afa248 --- /dev/null +++ b/tmp/b8b96b4b-c407-4329-ba45-13a64f0b2d82_params.py @@ -0,0 +1,1420 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( 
+ template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B', + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_163453' diff --git a/tmp/b9dd5066-f2c8-4da9-bead-389c5dd78e80_params.py b/tmp/b9dd5066-f2c8-4da9-bead-389c5dd78e80_params.py new file mode 100644 index 
0000000000000000000000000000000000000000..a114b84596145276110d2606ac28a77424e69476 --- /dev/null +++ b/tmp/b9dd5066-f2c8-4da9-bead-389c5dd78e80_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git 
a/tmp/bdc80472-7031-4225-9be3-871d5dd312b3_params.py b/tmp/bdc80472-7031-4225-9be3-871d5dd312b3_params.py new file mode 100644 index 0000000000000000000000000000000000000000..59098e41927650abef57a3b45904cf0b3445d481 --- /dev/null +++ b/tmp/bdc80472-7031-4225-9be3-871d5dd312b3_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[57:76]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[57:76]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git 
a/tmp/c10b22f7-dcd1-4a16-89cb-e4657327207a_params.py b/tmp/c10b22f7-dcd1-4a16-89cb-e4657327207a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..8127f4f79a1d09065ae423c85b2bd993235cdb24 --- /dev/null +++ b/tmp/c10b22f7-dcd1-4a16-89cb-e4657327207a_params.py @@ -0,0 +1,56 @@ +datasets = [ + [ + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='transformer', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/transformer-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/transformer-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251218_164105' diff --git a/tmp/c1ca6a08-c3af-4b66-8939-8fe606bf9105_params.py b/tmp/c1ca6a08-c3af-4b66-8939-8fe606bf9105_params.py new file mode 100644 index 
0000000000000000000000000000000000000000..167689255a240ea6b25085f3ef15b2d5006404fb --- /dev/null +++ b/tmp/c1ca6a08-c3af-4b66-8939-8fe606bf9105_params.py @@ -0,0 +1,1421 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164744' diff --git a/tmp/c1d4a42b-4544-4ba0-aa3d-18bc81c72f13_params.py b/tmp/c1d4a42b-4544-4ba0-aa3d-18bc81c72f13_params.py new file mode 100644 
index 0000000000000000000000000000000000000000..f8797792b9089a9bc98c0d06a112b55f8c0a9416 --- /dev/null +++ b/tmp/c1d4a42b-4544-4ba0-aa3d-18bc81c72f13_params.py @@ -0,0 +1,50 @@ +datasets = [ + [ + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164548' diff --git a/tmp/c2544b6b-398d-4550-ad23-863db663cf5a_params.py b/tmp/c2544b6b-398d-4550-ad23-863db663cf5a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/c267665e-1dae-4a20-bcab-7366cdf6c96d_params.py 
b/tmp/c267665e-1dae-4a20-bcab-7366cdf6c96d_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d0d333d8d180625d5251363f8ecd2b1f6751ea --- /dev/null +++ b/tmp/c267665e-1dae-4a20-bcab-7366cdf6c96d_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[57:76]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[57:76]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[189:252]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_3', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_3', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[75:100]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_202918' diff --git 
a/tmp/c291ac81-b920-4eb7-b298-8ab5fd740228_params.py b/tmp/c291ac81-b920-4eb7-b298-8ab5fd740228_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/c313c9b0-fe20-45d8-838b-2941f016bfb7_params.py b/tmp/c313c9b0-fe20-45d8-838b-2941f016bfb7_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/cce5370b-0185-4621-ac96-a9b6466f31e0_params.py b/tmp/cce5370b-0185-4621-ac96-a9b6466f31e0_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/ce381378-2028-4fd0-bb12-b51d4fb95b3b_params.py b/tmp/ce381378-2028-4fd0-bb12-b51d4fb95b3b_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/ce7a0b2d-1334-4fd9-9e3b-25aad14b7f2a_params.py b/tmp/ce7a0b2d-1334-4fd9-9e3b-25aad14b7f2a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..41a110c3d67964e9acc951f011971364c0d54035 --- /dev/null +++ b/tmp/ce7a0b2d-1334-4fd9-9e3b-25aad14b7f2a_params.py @@ -0,0 +1,53 @@ +datasets = [ + [ + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='retnet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/retnet-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/retnet-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_223306' diff --git a/tmp/ceb20666-0578-46a1-9917-1d9f2335de15_params.py b/tmp/ceb20666-0578-46a1-9917-1d9f2335de15_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/cf27836d-26a4-4305-a3bf-cbed2902f1cc_params.py b/tmp/cf27836d-26a4-4305-a3bf-cbed2902f1cc_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d1149155-4c4e-40d8-9b4d-87575bf04dac_params.py b/tmp/d1149155-4c4e-40d8-9b4d-87575bf04dac_params.py new file mode 100644 index 0000000000000000000000000000000000000000..68943b57736baa64e55b8f63da52b2061e2ce989 --- /dev/null +++ b/tmp/d1149155-4c4e-40d8-9b4d-87575bf04dac_params.py @@ -0,0 +1,61 @@ +datasets = [ + [ + dict( + abbr='triviaqa_wiki_1shot_3', + eval_cfg=dict( + evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( 
+ template='Q: {question}\nA: {answer}.\n', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=50, + stopping_criteria=[ + 'Q:', + '\n', + ], + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + ice_token='', + template='Q: {question}\nA: ', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + fix_id_list=[ + 0, + ], + type='opencompass.openicl.icl_retriever.FixKRetriever')), + path='opencompass/trivia_qa', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[3000:4000]', + test_split='validation', + train_split='train'), + type='opencompass.datasets.TriviaQADatasetV2'), + ], +] +models = [ + dict( + abbr='mask_gdn_1B_hrr-rank4_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_190244' diff --git a/tmp/d1308f02-18cf-4c2b-8027-d2aada837b6e_params.py b/tmp/d1308f02-18cf-4c2b-8027-d2aada837b6e_params.py new file mode 100644 index 0000000000000000000000000000000000000000..0a84cc8e8bd9a1b6c5537eb7bbf6d1c2b715dc6a --- /dev/null +++ b/tmp/d1308f02-18cf-4c2b-8027-d2aada837b6e_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_230930' diff --git 
a/tmp/d2591feb-65f5-43b8-a7db-0abcac9af57b_params.py b/tmp/d2591feb-65f5-43b8-a7db-0abcac9af57b_params.py new file mode 100644 index 0000000000000000000000000000000000000000..2390ba1f8bf5848d9b4f0338e019d6a3c9777b1a --- /dev/null +++ b/tmp/d2591feb-65f5-43b8-a7db-0abcac9af57b_params.py @@ -0,0 +1,1382 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 
'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + 
name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. 
In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. 
Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + 
name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. 
In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_204122' diff --git a/tmp/d2bae0ef-a7e6-42aa-9856-a6639c0fb912_params.py 
b/tmp/d2bae0ef-a7e6-42aa-9856-a6639c0fb912_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d4a5bc1e-f880-4ef9-8c83-2dbc459cb325_params.py b/tmp/d4a5bc1e-f880-4ef9-8c83-2dbc459cb325_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d5c3967d-9a23-46fb-aa91-6ab5a2933f4a_params.py b/tmp/d5c3967d-9a23-46fb-aa91-6ab5a2933f4a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d5cc69f6-0c22-4d6f-b303-d74dffa055a6_params.py b/tmp/d5cc69f6-0c22-4d6f-b303-d74dffa055a6_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d5f56564-568c-4cf9-9238-8b4cbd6ab00b_params.py b/tmp/d5f56564-568c-4cf9-9238-8b4cbd6ab00b_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d7849725-e6a1-4b93-b9d1-48666205ccad_params.py b/tmp/d7849725-e6a1-4b93-b9d1-48666205ccad_params.py new file mode 100644 index 0000000000000000000000000000000000000000..423b566143efd875eadc66af6f537363b14a924d --- /dev/null +++ b/tmp/d7849725-e6a1-4b93-b9d1-48666205ccad_params.py @@ -0,0 +1,1421 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( 
+ template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164744' diff --git a/tmp/d7e9a345-e53f-453b-9c5a-038cfbed0863_params.py b/tmp/d7e9a345-e53f-453b-9c5a-038cfbed0863_params.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/d8d86631-8d1b-4a7f-84d5-db2c8a8b8313_params.py b/tmp/d8d86631-8d1b-4a7f-84d5-db2c8a8b8313_params.py new file mode 100644 index 0000000000000000000000000000000000000000..5dba92df8e30d004ca0fd5796298f70477007b96 --- /dev/null +++ b/tmp/d8d86631-8d1b-4a7f-84d5-db2c8a8b8313_params.py @@ -0,0 +1,1420 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B', + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_163453' diff --git a/tmp/d976b9a1-d72c-4640-9115-21c52107e964_params.py b/tmp/d976b9a1-d72c-4640-9115-21c52107e964_params.py new file mode 100644 index 
0000000000000000000000000000000000000000..ba7350f19acc5ab50c6a364423df0ae22ff19011 --- /dev/null +++ b/tmp/d976b9a1-d72c-4640-9115-21c52107e964_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='gated_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/download_model/hgrn2-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/download_model/hgrn2-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251219_084834' diff --git 
a/tmp/d9c79b01-bdcc-46e9-b1c5-2c9be6c14c49_params.py b/tmp/d9c79b01-bdcc-46e9-b1c5-2c9be6c14c49_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/db5fc239-a9a7-4b03-b5ab-64f75cb4eb5a_params.py b/tmp/db5fc239-a9a7-4b03-b5ab-64f75cb4eb5a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/dc336bf4-3105-480f-800d-7b56013809a4_params.py b/tmp/dc336bf4-3105-480f-800d-7b56013809a4_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/dd81ccae-4515-4dca-b714-5e8c3c23d5b1_params.py b/tmp/dd81ccae-4515-4dca-b714-5e8c3c23d5b1_params.py new file mode 100644 index 0000000000000000000000000000000000000000..ef87563c6cd5508046457912d59569167e9de173 --- /dev/null +++ b/tmp/dd81ccae-4515-4dca-b714-5e8c3c23d5b1_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), 
+ type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git 
a/tmp/de43d72f-9d10-422c-8536-2751a6bfc105_params.py b/tmp/de43d72f-9d10-422c-8536-2751a6bfc105_params.py new file mode 100644 index 0000000000000000000000000000000000000000..1586828dc4c7b0ab1dc09d95e3f182cb031bbc81 --- /dev/null +++ b/tmp/de43d72f-9d10-422c-8536-2751a6bfc105_params.py @@ -0,0 +1,55 @@ +datasets = [ + [ + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/train_exp/mask_deltanet_1B_rank4', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251128_162747' diff --git a/tmp/df5ba41f-efd1-4b92-9fcd-32016b530642_params.py b/tmp/df5ba41f-efd1-4b92-9fcd-32016b530642_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/tmp/e12880bc-bd6e-4d44-9da3-d54f62bdf26d_params.py b/tmp/e12880bc-bd6e-4d44-9da3-d54f62bdf26d_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b78ad958d1b38966896da1b3f59fe01639ddebb8 --- /dev/null +++ b/tmp/e12880bc-bd6e-4d44-9da3-d54f62bdf26d_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_223020' diff --git 
a/tmp/e2f9873d-8535-41ab-a68e-87460e6faaa1_params.py b/tmp/e2f9873d-8535-41ab-a68e-87460e6faaa1_params.py new file mode 100644 index 0000000000000000000000000000000000000000..bce51acd6df6c7c7e88070c8b305cba744e10fb6 --- /dev/null +++ b/tmp/e2f9873d-8535-41ab-a68e-87460e6faaa1_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + 
type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='gated_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/download_model/hgrn2-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/download_model/hgrn2-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251219_084834' diff --git 
a/tmp/e78a4c5d-75df-4837-aecc-1dacc997ae27_params.py b/tmp/e78a4c5d-75df-4837-aecc-1dacc997ae27_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/e8e3c789-cf8b-47a1-9f61-7b16047db7b6_params.py b/tmp/e8e3c789-cf8b-47a1-9f61-7b16047db7b6_params.py new file mode 100644 index 0000000000000000000000000000000000000000..a99a2b30cb315923687ec71b6c08ebfcb7aa63eb --- /dev/null +++ b/tmp/e8e3c789-cf8b-47a1-9f61-7b16047db7b6_params.py @@ -0,0 +1,50 @@ +datasets = [ + [ + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164548' diff --git a/tmp/e8ec7ae1-4c04-4a53-901a-c8c0a3b16775_params.py b/tmp/e8ec7ae1-4c04-4a53-901a-c8c0a3b16775_params.py new file mode 100644 index 0000000000000000000000000000000000000000..47586c6797ef78f65c852f7a34fd50fc4b5660e1 --- /dev/null +++ b/tmp/e8ec7ae1-4c04-4a53-901a-c8c0a3b16775_params.py @@ -0,0 +1,53 @@ +datasets = [ + [ + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + 
name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='gated_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='download_model/hgrn2-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='download_model/hgrn2-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251219_163447' diff --git a/tmp/e91e98a9-f384-40a1-9879-0ebcb61c1344_params.py b/tmp/e91e98a9-f384-40a1-9879-0ebcb61c1344_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/eaf4d883-2a1c-4476-9b87-7bcff045f490_params.py b/tmp/eaf4d883-2a1c-4476-9b87-7bcff045f490_params.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1b57a2c82efe1c83f7516acf5a265a2ecf4b64 --- /dev/null +++ b/tmp/eaf4d883-2a1c-4476-9b87-7bcff045f490_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_221150' diff --git 
a/tmp/ebe35c77-113a-40f7-b856-c3c83cd54332_params.py b/tmp/ebe35c77-113a-40f7-b856-c3c83cd54332_params.py new file mode 100644 index 0000000000000000000000000000000000000000..5049a9084297190c0c5c3892ff097669e5a6d18d --- /dev/null +++ b/tmp/ebe35c77-113a-40f7-b856-c3c83cd54332_params.py @@ -0,0 +1,61 @@ +datasets = [ + [ + dict( + abbr='triviaqa_wiki_1shot_2', + eval_cfg=dict( + evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( + template='Q: {question}\nA: {answer}.\n', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=50, + stopping_criteria=[ + 'Q:', + '\n', + ], + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + ice_token='', + template='Q: {question}\nA: ', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + fix_id_list=[ + 0, + ], + type='opencompass.openicl.icl_retriever.FixKRetriever')), + path='opencompass/trivia_qa', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[2000:3000]', + test_split='validation', + train_split='train'), + type='opencompass.datasets.TriviaQADatasetV2'), + ], +] +models = [ + dict( + abbr='mask_gdn_1B_hrr-rank4_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_193336' diff --git a/tmp/fa9e69b8-4678-4ce0-bba9-46851a640553_params.py b/tmp/fa9e69b8-4678-4ce0-bba9-46851a640553_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/tmp/fd522826-23ee-40e2-88eb-5bade409917e_params.py b/tmp/fd522826-23ee-40e2-88eb-5bade409917e_params.py new file mode 100644 index 0000000000000000000000000000000000000000..f2cfb5440972958de8181a691f3cfb0f839aad07 --- /dev/null +++ b/tmp/fd522826-23ee-40e2-88eb-5bade409917e_params.py @@ -0,0 +1,1379 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 
'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + 
name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. 
In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. 
Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + 
name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. 
In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B', + batch_padding=True, + batch_size=16, + max_out_len=100, + max_seq_len=2048, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1, num_procs=1), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_161037' diff --git a/tmp/fe176f2e-aa41-462d-a3d4-e1edafe4ecf4_params.py b/tmp/fe176f2e-aa41-462d-a3d4-e1edafe4ecf4_params.py new file mode 100644 index 
0000000000000000000000000000000000000000..da9e53eeb215d9c6d9db680577522d08cfd44f62 --- /dev/null +++ b/tmp/fe176f2e-aa41-462d-a3d4-e1edafe4ecf4_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + 
]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + 
type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. 
Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_202918' diff --git 
a/tmp/fef903a4-2454-425c-a265-2b87f8ac3d68_params.py b/tmp/fef903a4-2454-425c-a265-2b87f8ac3d68_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/ff710be5-ad0d-4c04-a992-028dc5a0d21f_params.py b/tmp/ff710be5-ad0d-4c04-a992-028dc5a0d21f_params.py new file mode 100644 index 0000000000000000000000000000000000000000..15a48bf64696ef1323ec1398cf0de20188dc30ed --- /dev/null +++ b/tmp/ff710be5-ad0d-4c04-a992-028dc5a0d21f_params.py @@ -0,0 +1,53 @@ +datasets = [ + [ + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='retnet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/retnet-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/retnet-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_223306'