diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e51ff408a721a42d25c3e00a9c387149ead8193 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py @@ -0,0 +1,186 @@ +# Natural Language Toolkit: Corpus Readers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +NLTK corpus readers. The modules in this package provide functions +that can be used to read corpus fileids in a variety of formats. These +functions can be used to read both the corpus fileids that are +distributed in the NLTK corpus package, and corpus fileids that are part +of external corpora. + +Corpus Reader Functions +======================= +Each corpus module defines one or more "corpus reader functions", +which can be used to read documents from that corpus. These functions +take an argument, ``item``, which is used to indicate which document +should be read from the corpus: + +- If ``item`` is one of the unique identifiers listed in the corpus + module's ``items`` variable, then the corresponding document will + be loaded from the NLTK corpus package. +- If ``item`` is a fileid, then that file will be read. + +Additionally, corpus reader functions can be given lists of item +names; in which case, they will return a concatenation of the +corresponding documents. + +Corpus reader functions are named based on the type of information +they return. Some common examples, and their return types, are: + +- words(): list of str +- sents(): list of (list of str) +- paras(): list of (list of (list of str)) +- tagged_words(): list of (str,str) tuple +- tagged_sents(): list of (list of (str,str)) +- tagged_paras(): list of (list of (list of (str,str))) +- chunked_sents(): list of (Tree w/ (str,str) leaves) +- parsed_sents(): list of (Tree with str leaves) +- parsed_paras(): list of (list of (Tree with str leaves)) +- xml(): A single xml ElementTree +- raw(): unprocessed corpus contents + +For example, to read a list of the words in the Brown Corpus, use +``nltk.corpus.brown.words()``: + + >>> from nltk.corpus import brown + >>> print(", ".join(brown.words()[:6])) # only first 6 words + The, Fulton, County, Grand, Jury, said + +isort:skip_file +""" + +from nltk.corpus.reader.plaintext import * +from nltk.corpus.reader.util import * +from nltk.corpus.reader.api import * +from nltk.corpus.reader.tagged import * +from nltk.corpus.reader.cmudict import * +from nltk.corpus.reader.conll import * +from nltk.corpus.reader.chunked import * +from nltk.corpus.reader.wordlist import * +from nltk.corpus.reader.xmldocs import * +from nltk.corpus.reader.ppattach import * +from nltk.corpus.reader.senseval import * +from nltk.corpus.reader.ieer import * +from nltk.corpus.reader.sinica_treebank import * +from nltk.corpus.reader.bracket_parse import * +from nltk.corpus.reader.indian import * +from nltk.corpus.reader.toolbox import * +from nltk.corpus.reader.timit import * +from nltk.corpus.reader.ycoe import * +from nltk.corpus.reader.rte import * +from nltk.corpus.reader.string_category import * +from nltk.corpus.reader.propbank import * +from nltk.corpus.reader.verbnet import * +from nltk.corpus.reader.bnc import * +from nltk.corpus.reader.nps_chat import * +from nltk.corpus.reader.wordnet import * +from nltk.corpus.reader.switchboard import * +from nltk.corpus.reader.dependency import * +from 
nltk.corpus.reader.nombank import * +from nltk.corpus.reader.ipipan import * +from nltk.corpus.reader.pl196x import * +from nltk.corpus.reader.knbc import * +from nltk.corpus.reader.chasen import * +from nltk.corpus.reader.childes import * +from nltk.corpus.reader.aligned import * +from nltk.corpus.reader.lin import * +from nltk.corpus.reader.semcor import * +from nltk.corpus.reader.framenet import * +from nltk.corpus.reader.udhr import * +from nltk.corpus.reader.bnc import * +from nltk.corpus.reader.sentiwordnet import * +from nltk.corpus.reader.twitter import * +from nltk.corpus.reader.nkjp import * +from nltk.corpus.reader.crubadan import * +from nltk.corpus.reader.mte import * +from nltk.corpus.reader.reviews import * +from nltk.corpus.reader.opinion_lexicon import * +from nltk.corpus.reader.pros_cons import * +from nltk.corpus.reader.categorized_sents import * +from nltk.corpus.reader.comparative_sents import * +from nltk.corpus.reader.panlex_lite import * +from nltk.corpus.reader.panlex_swadesh import * +from nltk.corpus.reader.bcp47 import * + +# Make sure that nltk.corpus.reader.bracket_parse gives the module, not +# the function bracket_parse() defined in nltk.tree: +from nltk.corpus.reader import bracket_parse + +__all__ = [ + "CorpusReader", + "CategorizedCorpusReader", + "PlaintextCorpusReader", + "find_corpus_fileids", + "TaggedCorpusReader", + "CMUDictCorpusReader", + "ConllChunkCorpusReader", + "WordListCorpusReader", + "PPAttachmentCorpusReader", + "SensevalCorpusReader", + "IEERCorpusReader", + "ChunkedCorpusReader", + "SinicaTreebankCorpusReader", + "BracketParseCorpusReader", + "IndianCorpusReader", + "ToolboxCorpusReader", + "TimitCorpusReader", + "YCOECorpusReader", + "MacMorphoCorpusReader", + "SyntaxCorpusReader", + "AlpinoCorpusReader", + "RTECorpusReader", + "StringCategoryCorpusReader", + "EuroparlCorpusReader", + "CategorizedBracketParseCorpusReader", + "CategorizedTaggedCorpusReader", + "CategorizedPlaintextCorpusReader", + "PortugueseCategorizedPlaintextCorpusReader", + "tagged_treebank_para_block_reader", + "PropbankCorpusReader", + "VerbnetCorpusReader", + "BNCCorpusReader", + "ConllCorpusReader", + "XMLCorpusReader", + "NPSChatCorpusReader", + "SwadeshCorpusReader", + "WordNetCorpusReader", + "WordNetICCorpusReader", + "SwitchboardCorpusReader", + "DependencyCorpusReader", + "NombankCorpusReader", + "IPIPANCorpusReader", + "Pl196xCorpusReader", + "TEICorpusView", + "KNBCorpusReader", + "ChasenCorpusReader", + "CHILDESCorpusReader", + "AlignedCorpusReader", + "TimitTaggedCorpusReader", + "LinThesaurusCorpusReader", + "SemcorCorpusReader", + "FramenetCorpusReader", + "UdhrCorpusReader", + "BNCCorpusReader", + "SentiWordNetCorpusReader", + "SentiSynset", + "TwitterCorpusReader", + "NKJPCorpusReader", + "CrubadanCorpusReader", + "MTECorpusReader", + "ReviewsCorpusReader", + "OpinionLexiconCorpusReader", + "ProsConsCorpusReader", + "CategorizedSentencesCorpusReader", + "ComparativeSentencesCorpusReader", + "PanLexLiteCorpusReader", + "NonbreakingPrefixesCorpusReader", + "UnicharsCorpusReader", + "MWAPPDBCorpusReader", + "PanlexSwadeshCorpusReader", + "BCP47CorpusReader", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py new file mode 100644 index 0000000000000000000000000000000000000000..51ebaa7597bb05d95ce2e62720799759b78ee05b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py @@ -0,0 +1,154 @@ +# Natural Language Toolkit: Aligned Corpus Reader +# +# 
Copyright (C) 2001-2022 NLTK Project +# URL: +# Author: Steven Bird +# For license information, see LICENSE.TXT + +from nltk.corpus.reader.api import CorpusReader +from nltk.corpus.reader.util import ( + StreamBackedCorpusView, + concat, + read_alignedsent_block, +) +from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer +from nltk.translate import AlignedSent, Alignment + + +class AlignedCorpusReader(CorpusReader): + """ + Reader for corpora of word-aligned sentences. Tokens are assumed + to be separated by whitespace. Sentences begin on separate lines. + """ + + def __init__( + self, + root, + fileids, + sep="/", + word_tokenizer=WhitespaceTokenizer(), + sent_tokenizer=RegexpTokenizer("\n", gaps=True), + alignedsent_block_reader=read_alignedsent_block, + encoding="latin1", + ): + """ + Construct a new Aligned Corpus reader for a set of documents + located at the given root directory. Example usage: + + >>> root = '/...path to corpus.../' + >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP + + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. + """ + CorpusReader.__init__(self, root, fileids, encoding) + self._sep = sep + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._alignedsent_block_reader = alignedsent_block_reader + + def words(self, fileids=None): + """ + :return: the given file(s) as a list of words + and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + AlignedSentCorpusView( + fileid, + enc, + False, + False, + self._word_tokenizer, + self._sent_tokenizer, + self._alignedsent_block_reader, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + """ + :return: the given file(s) as a list of + sentences or utterances, each encoded as a list of word + strings. + :rtype: list(list(str)) + """ + return concat( + [ + AlignedSentCorpusView( + fileid, + enc, + False, + True, + self._word_tokenizer, + self._sent_tokenizer, + self._alignedsent_block_reader, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def aligned_sents(self, fileids=None): + """ + :return: the given file(s) as a list of AlignedSent objects. + :rtype: list(AlignedSent) + """ + return concat( + [ + AlignedSentCorpusView( + fileid, + enc, + True, + True, + self._word_tokenizer, + self._sent_tokenizer, + self._alignedsent_block_reader, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + +class AlignedSentCorpusView(StreamBackedCorpusView): + """ + A specialized corpus view for aligned sentences. + ``AlignedSentCorpusView`` objects are typically created by + ``AlignedCorpusReader`` (not directly by nltk users). 
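+ A hedged illustration of the indirect route (the ``comtrans`` aligned corpus, available via the NLTK downloader, is used purely as an example): + + >>> from nltk.corpus import comtrans # doctest: +SKIP + >>> comtrans.aligned_sents()[0] # doctest: +SKIP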
+ """ + + def __init__( + self, + corpus_file, + encoding, + aligned, + group_by_sent, + word_tokenizer, + sent_tokenizer, + alignedsent_block_reader, + ): + self._aligned = aligned + self._group_by_sent = group_by_sent + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._alignedsent_block_reader = alignedsent_block_reader + StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) + + def read_block(self, stream): + block = [ + self._word_tokenizer.tokenize(sent_str) + for alignedsent_str in self._alignedsent_block_reader(stream) + for sent_str in self._sent_tokenizer.tokenize(alignedsent_str) + ] + if self._aligned: + block[2] = Alignment.fromstring( + " ".join(block[2]) + ) # kludge; we shouldn't have tokenized the alignment string + block = [AlignedSent(*block)] + elif self._group_by_sent: + block = [block[0]] + else: + block = block[0] + + return block diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py new file mode 100644 index 0000000000000000000000000000000000000000..d1aa37bdc98d5111216af64fe7ffaa5f64644bb9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py @@ -0,0 +1,516 @@ +# Natural Language Toolkit: API for Corpus Readers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +API for corpus readers. +""" + +import os +import re +from collections import defaultdict +from itertools import chain + +from nltk.corpus.reader.util import * +from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer + + +class CorpusReader: + """ + A base class for "corpus reader" classes, each of which can be + used to read a specific corpus format. Each individual corpus + reader instance is used to read a specific corpus, consisting of + one or more files under a common root directory. Each file is + identified by its ``file identifier``, which is the relative path + to the file from the root directory. + + A separate subclass is defined for each corpus format. These + subclasses define one or more methods that provide 'views' on the + corpus contents, such as ``words()`` (for a list of words) and + ``parsed_sents()`` (for a list of parsed sentences). Called with + no arguments, these methods will return the contents of the entire + corpus. For most corpora, these methods define one or more + selection arguments, such as ``fileids`` or ``categories``, which can + be used to select which portion of the corpus should be returned. + """ + + def __init__(self, root, fileids, encoding="utf8", tagset=None): + """ + :type root: PathPointer or str + :param root: A path pointer identifying the root directory for + this corpus. If a string is specified, then it will be + converted to a ``PathPointer`` automatically. + :param fileids: A list of the files that make up this corpus. + This list can either be specified explicitly, as a list of + strings; or implicitly, as a regular expression over file + paths. The absolute path for each file will be constructed + by joining the reader's root to each file name. + :param encoding: The default unicode encoding for the files + that make up the corpus. The value of ``encoding`` can be any + of the following: + + - A string: ``encoding`` is the encoding name for all files. + - A dictionary: ``encoding[file_id]`` is the encoding + name for the file whose identifier is ``file_id``. 
If + ``file_id`` is not in ``encoding``, then the file + contents will be processed using non-unicode byte strings. + - A list: ``encoding`` should be a list of ``(regexp, encoding)`` + tuples. The encoding for a file whose identifier is ``file_id`` + will be the ``encoding`` value for the first tuple whose + ``regexp`` matches the ``file_id``. If no tuple's ``regexp`` + matches the ``file_id``, the file contents will be processed + using non-unicode byte strings. + - None: the file contents of all files will be + processed using non-unicode byte strings. + :param tagset: The name of the tagset used by this corpus, to be used + for normalizing or converting the POS tags returned by the + ``tagged_...()`` methods. + """ + # Convert the root to a path pointer, if necessary. + if isinstance(root, str) and not isinstance(root, PathPointer): + m = re.match(r"(.*\.zip)/?(.*)$|", root) + zipfile, zipentry = m.groups() + if zipfile: + root = ZipFilePathPointer(zipfile, zipentry) + else: + root = FileSystemPathPointer(root) + elif not isinstance(root, PathPointer): + raise TypeError("CorpusReader: expected a string or a PathPointer") + + # If `fileids` is a regexp, then expand it. + if isinstance(fileids, str): + fileids = find_corpus_fileids(root, fileids) + + self._fileids = fileids + """A list of the relative paths for the fileids that make up + this corpus.""" + + self._root = root + """The root directory for this corpus.""" + + self._readme = "README" + self._license = "LICENSE" + self._citation = "citation.bib" + + # If encoding was specified as a list of regexps, then convert + # it to a dictionary. + if isinstance(encoding, list): + encoding_dict = {} + for fileid in self._fileids: + for x in encoding: + (regexp, enc) = x + if re.match(regexp, fileid): + encoding_dict[fileid] = enc + break + encoding = encoding_dict + + self._encoding = encoding + """The default unicode encoding for the fileids that make up + this corpus. If ``encoding`` is None, then the file + contents are processed using byte strings.""" + self._tagset = tagset + + def __repr__(self): + if isinstance(self._root, ZipFilePathPointer): + path = f"{self._root.zipfile.filename}/{self._root.entry}" + else: + path = "%s" % self._root.path + return f"<{self.__class__.__name__} in {path!r}>" + + def ensure_loaded(self): + """ + Load this corpus (if it has not already been loaded). This is + used by LazyCorpusLoader as a simple method that can be used to + make sure a corpus is loaded -- e.g., in case a user wants to + do help(some_corpus). + """ + pass # no need to actually do anything. + + def readme(self): + """ + Return the contents of the corpus README file, if it exists. + """ + with self.open(self._readme) as f: + return f.read() + + def license(self): + """ + Return the contents of the corpus LICENSE file, if it exists. + """ + with self.open(self._license) as f: + return f.read() + + def citation(self): + """ + Return the contents of the corpus citation.bib file, if it exists. + """ + with self.open(self._citation) as f: + return f.read() + + def fileids(self): + """ + Return a list of file identifiers for the fileids that make up + this corpus. + """ + return self._fileids + + def abspath(self, fileid): + """ + Return the absolute path for the given file. + + :type fileid: str + :param fileid: The file identifier for the file whose path + should be returned. 
+ :rtype: PathPointer + """ + return self._root.join(fileid) + + def abspaths(self, fileids=None, include_encoding=False, include_fileid=False): + """ + Return a list of the absolute paths for all fileids in this corpus; + or for the given list of fileids, if specified. + + :type fileids: None or str or list + :param fileids: Specifies the set of fileids for which paths should + be returned. Can be None, for all fileids; a list of + file identifiers, for a specified set of fileids; or a single + file identifier, for a single file. Note that the return + value is always a list of paths, even if ``fileids`` is a + single file identifier. + + :param include_encoding: If true, then return a list of + ``(path_pointer, encoding)`` tuples. + + :rtype: list(PathPointer) + """ + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + paths = [self._root.join(f) for f in fileids] + + if include_encoding and include_fileid: + return list(zip(paths, [self.encoding(f) for f in fileids], fileids)) + elif include_fileid: + return list(zip(paths, fileids)) + elif include_encoding: + return list(zip(paths, [self.encoding(f) for f in fileids])) + else: + return paths + + def raw(self, fileids=None): + """ + :param fileids: A list specifying the fileids that should be used. + :return: the given file(s) as a single string. + :rtype: str + """ + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + contents = [] + for f in fileids: + with self.open(f) as fp: + contents.append(fp.read()) + return concat(contents) + + def open(self, file): + """ + Return an open stream that can be used to read the given file. + If the file's encoding is not None, then the stream will + automatically decode the file's contents into unicode. + + :param file: The file identifier of the file to read. + """ + encoding = self.encoding(file) + stream = self._root.join(file).open(encoding) + return stream + + def encoding(self, file): + """ + Return the unicode encoding for the given corpus file, if known. + If the encoding is unknown, or if the given file should be + processed using byte strings (str), then return None. + """ + if isinstance(self._encoding, dict): + return self._encoding.get(file) + else: + return self._encoding + + def _get_root(self): + return self._root + + root = property( + _get_root, + doc=""" + The directory where this corpus is stored. + + :type: PathPointer""", + ) + + +###################################################################### +# { Corpora containing categorized items +###################################################################### + + +class CategorizedCorpusReader: + """ + A mixin class used to aid in the implementation of corpus readers + for categorized corpora. This class defines the method + ``categories()``, which returns a list of the categories for the + corpus or for a specified set of fileids; and overrides ``fileids()`` + to take a ``categories`` argument, restricting the set of fileids to + be returned. + + Subclasses are expected to: + + - Call ``__init__()`` to set up the mapping. + + - Override all view methods to accept a ``categories`` parameter, + which can be used *instead* of the ``fileids`` parameter, to + select which fileids should be included in the returned view. 
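+ + A minimal sketch of that wiring, mirroring how the categorized readers in this package are built (``MyReader`` is illustrative, not a class defined here): + + >>> class MyReader(CategorizedCorpusReader, PlaintextCorpusReader): # doctest: +SKIP + ... def __init__(self, *args, **kwargs): + ... CategorizedCorpusReader.__init__(self, kwargs) + ... PlaintextCorpusReader.__init__(self, *args, **kwargs)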
+ """ + + def __init__(self, kwargs): + """ + Initialize this mapping based on keyword arguments, as + follows: + + - cat_pattern: A regular expression pattern used to find the + category for each file identifier. The pattern will be + applied to each file identifier, and the first matching + group will be used as the category label for that file. + + - cat_map: A dictionary, mapping from file identifiers to + category labels. + + - cat_file: The name of a file that contains the mapping + from file identifiers to categories. The argument + ``cat_delimiter`` can be used to specify a delimiter. + + The corresponding argument will be deleted from ``kwargs``. If + more than one argument is specified, an exception will be + raised. + """ + self._f2c = None #: file-to-category mapping + self._c2f = None #: category-to-file mapping + + self._pattern = None #: regexp specifying the mapping + self._map = None #: dict specifying the mapping + self._file = None #: fileid of file containing the mapping + self._delimiter = None #: delimiter for ``self._file`` + + if "cat_pattern" in kwargs: + self._pattern = kwargs["cat_pattern"] + del kwargs["cat_pattern"] + elif "cat_map" in kwargs: + self._map = kwargs["cat_map"] + del kwargs["cat_map"] + elif "cat_file" in kwargs: + self._file = kwargs["cat_file"] + del kwargs["cat_file"] + if "cat_delimiter" in kwargs: + self._delimiter = kwargs["cat_delimiter"] + del kwargs["cat_delimiter"] + else: + raise ValueError( + "Expected keyword argument cat_pattern or " "cat_map or cat_file." + ) + + if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs: + raise ValueError( + "Specify exactly one of: cat_pattern, " "cat_map, cat_file." + ) + + def _init(self): + self._f2c = defaultdict(set) + self._c2f = defaultdict(set) + + if self._pattern is not None: + for file_id in self._fileids: + category = re.match(self._pattern, file_id).group(1) + self._add(file_id, category) + + elif self._map is not None: + for (file_id, categories) in self._map.items(): + for category in categories: + self._add(file_id, category) + + elif self._file is not None: + with self.open(self._file) as f: + for line in f.readlines(): + line = line.strip() + file_id, categories = line.split(self._delimiter, 1) + if file_id not in self.fileids(): + raise ValueError( + "In category mapping file %s: %s " + "not found" % (self._file, file_id) + ) + for category in categories.split(self._delimiter): + self._add(file_id, category) + + def _add(self, file_id, category): + self._f2c[file_id].add(category) + self._c2f[category].add(file_id) + + def categories(self, fileids=None): + """ + Return a list of the categories that are defined for this corpus, + or for the file(s) if it is given. + """ + if self._f2c is None: + self._init() + if fileids is None: + return sorted(self._c2f) + if isinstance(fileids, str): + fileids = [fileids] + return sorted(set.union(*(self._f2c[d] for d in fileids))) + + def fileids(self, categories=None): + """ + Return a list of file identifiers for the files that make up + this corpus, or that make up the given category(s) if specified. 
+ """ + if categories is None: + return super().fileids() + elif isinstance(categories, str): + if self._f2c is None: + self._init() + if categories in self._c2f: + return sorted(self._c2f[categories]) + else: + raise ValueError("Category %s not found" % categories) + else: + if self._f2c is None: + self._init() + return sorted(set.union(*(self._c2f[c] for c in categories))) + + def _resolve(self, fileids, categories): + if fileids is not None and categories is not None: + raise ValueError("Specify fileids or categories, not both") + if categories is not None: + return self.fileids(categories) + else: + return fileids + + def raw(self, fileids=None, categories=None): + return super().raw(self._resolve(fileids, categories)) + + def words(self, fileids=None, categories=None): + return super().words(self._resolve(fileids, categories)) + + def sents(self, fileids=None, categories=None): + return super().sents(self._resolve(fileids, categories)) + + def paras(self, fileids=None, categories=None): + return super().paras(self._resolve(fileids, categories)) + + +###################################################################### +# { Treebank readers +###################################################################### + +# [xx] is it worth it to factor this out? +class SyntaxCorpusReader(CorpusReader): + """ + An abstract base class for reading corpora consisting of + syntactically parsed text. Subclasses should define: + + - ``__init__``, which specifies the location of the corpus + and a method for detecting the sentence blocks in corpus files. + - ``_read_block``, which reads a block from the input stream. + - ``_word``, which takes a block and returns a list of list of words. + - ``_tag``, which takes a block and returns a list of list of tagged + words. + - ``_parse``, which takes a block and returns a list of parsed + sentences. 
+ """ + + def _parse(self, s): + raise NotImplementedError() + + def _word(self, s): + raise NotImplementedError() + + def _tag(self, s): + raise NotImplementedError() + + def _read_block(self, stream): + raise NotImplementedError() + + def parsed_sents(self, fileids=None): + reader = self._read_parsed_sent_block + return concat( + [ + StreamBackedCorpusView(fileid, reader, encoding=enc) + for fileid, enc in self.abspaths(fileids, True) + ] + ) + + def tagged_sents(self, fileids=None, tagset=None): + def reader(stream): + return self._read_tagged_sent_block(stream, tagset) + + return concat( + [ + StreamBackedCorpusView(fileid, reader, encoding=enc) + for fileid, enc in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + reader = self._read_sent_block + return concat( + [ + StreamBackedCorpusView(fileid, reader, encoding=enc) + for fileid, enc in self.abspaths(fileids, True) + ] + ) + + def tagged_words(self, fileids=None, tagset=None): + def reader(stream): + return self._read_tagged_word_block(stream, tagset) + + return concat( + [ + StreamBackedCorpusView(fileid, reader, encoding=enc) + for fileid, enc in self.abspaths(fileids, True) + ] + ) + + def words(self, fileids=None): + return concat( + [ + StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc) + for fileid, enc in self.abspaths(fileids, True) + ] + ) + + # ------------------------------------------------------------ + # { Block Readers + + def _read_word_block(self, stream): + return list(chain.from_iterable(self._read_sent_block(stream))) + + def _read_tagged_word_block(self, stream, tagset=None): + return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset))) + + def _read_sent_block(self, stream): + return list(filter(None, [self._word(t) for t in self._read_block(stream)])) + + def _read_tagged_sent_block(self, stream, tagset=None): + return list( + filter(None, [self._tag(t, tagset) for t in self._read_block(stream)]) + ) + + def _read_parsed_sent_block(self, stream): + return list(filter(None, [self._parse(t) for t in self._read_block(stream)])) + + # } End of Block Readers + # ------------------------------------------------------------ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py new file mode 100644 index 0000000000000000000000000000000000000000..eedae2742ce2ed6725c80bddda13515ae2577cca --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py @@ -0,0 +1,218 @@ +# Natural Language Toolkit: BCP-47 language tags +# +# Copyright (C) 2022 NLTK Project +# Author: Eric Kafe +# URL: +# For license information, see LICENSE.TXT + +import re +from warnings import warn +from xml.etree import ElementTree as et + +from nltk.corpus.reader import CorpusReader + + +class BCP47CorpusReader(CorpusReader): + """ + Parse BCP-47 composite language tags + + Supports all the main subtags, and the 'u-sd' extension: + + >>> from nltk.corpus import bcp47 + >>> bcp47.name('oc-gascon-u-sd-fr64') + 'Occitan (post 1500): Gascon: Pyrénées-Atlantiques' + + Can load a conversion table to Wikidata Q-codes: + >>> bcp47.load_wiki_q() + >>> bcp47.wiki_q['en-GI-spanglis'] + 'Q79388' + + """ + + def __init__(self, root, fileids): + """Read the BCP-47 database""" + super().__init__(root, fileids) + self.langcode = {} + with self.open("iana/language-subtag-registry.txt") as fp: + self.db = self.data_dict(fp.read().split("%%\n")) + with self.open("cldr/common-subdivisions-en.xml") as fp: + self.subdiv = 
self.subdiv_dict( + et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision") + ) + self.morphology() + + def load_wiki_q(self): + """Load conversion table to Wikidata Q-codes (only if needed)""" + with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp: + self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:]) + + def wiki_dict(self, lines): + """Convert Wikidata list of Q-codes to a BCP-47 dictionary""" + return { + pair[1]: pair[0].split("/")[-1] + for pair in [line.strip().split("\t") for line in lines] + } + + def subdiv_dict(self, subdivs): + """Convert the CLDR subdivisions list to a dictionary""" + return {sub.attrib["type"]: sub.text for sub in subdivs} + + def morphology(self): + self.casing = { + "language": str.lower, + "extlang": str.lower, + "script": str.title, + "region": str.upper, + "variant": str.lower, + } + dig = "[0-9]" + low = "[a-z]" + up = "[A-Z]" + alnum = "[a-zA-Z0-9]" + self.format = { + "language": re.compile(f"{low*3}?"), + "extlang": re.compile(f"{low*3}"), + "script": re.compile(f"{up}{low*3}"), + "region": re.compile(f"({up*2})|({dig*3})"), + "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"), + "singleton": re.compile(f"{low}"), + } + + def data_dict(self, records): + """Convert the BCP-47 language subtag registry to a dictionary""" + self.version = records[0].replace("File-Date:", "").strip() + dic = {} + dic["deprecated"] = {} + for label in [ + "language", + "extlang", + "script", + "region", + "variant", + "redundant", + "grandfathered", + ]: + dic["deprecated"][label] = {} + for record in records[1:]: + fields = [field.split(": ") for field in record.strip().split("\n")] + typ = fields[0][1] + tag = fields[1][1] + if typ not in dic: + dic[typ] = {} + subfields = {} + for field in fields[2:]: + if len(field) == 2: + [key, val] = field + if key not in subfields: + subfields[key] = [val] + else: # multiple value + subfields[key].append(val) + else: # multiline field + subfields[key][-1] += " " + field[0].strip() + if ( + "Deprecated" not in record + and typ == "language" + and key == "Description" + ): + self.langcode[subfields[key][-1]] = tag + for key in subfields: + if len(subfields[key]) == 1: # single value + subfields[key] = subfields[key][0] + if "Deprecated" in record: + dic["deprecated"][typ][tag] = subfields + else: + dic[typ][tag] = subfields + return dic + + def val2str(self, val): + """Return only first value""" + if type(val) == list: + # val = "/".join(val) # Concatenate all values + val = val[0] + return val + + def lang2str(self, lg_record): + """Concatenate subtag values""" + name = f"{lg_record['language']}" + for label in ["extlang", "script", "region", "variant", "extension"]: + if label in lg_record: + name += f": {lg_record[label]}" + return name + + def parse_tag(self, tag): + """Convert a BCP-47 tag to a dictionary of labelled subtags""" + subtags = tag.split("-") + lang = {} + labels = ["language", "extlang", "script", "region", "variant", "variant"] + while subtags and labels: + subtag = subtags.pop(0) + found = False + while labels: + label = labels.pop(0) + subtag = self.casing[label](subtag) + if self.format[label].fullmatch(subtag): + if subtag in self.db[label]: + found = True + valstr = self.val2str(self.db[label][subtag]["Description"]) + if label == "variant" and label in lang: + lang[label] += ": " + valstr + else: + lang[label] = valstr + break + elif subtag in self.db["deprecated"][label]: + found = True + note = f"The {subtag!r} {label} code is deprecated" + if "Preferred-Value" in 
self.db["deprecated"][label][subtag]: + prefer = self.db["deprecated"][label][subtag][ + "Preferred-Value" + ] + note += f"', prefer '{self.val2str(prefer)}'" + lang[label] = self.val2str( + self.db["deprecated"][label][subtag]["Description"] + ) + warn(note) + break + if not found: + if subtag == "u" and subtags[0] == "sd": # CLDR regional subdivisions + sd = subtags[1] + if sd in self.subdiv: + ext = self.subdiv[sd] + else: + ext = f"" + else: # other extension subtags are not supported yet + ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower() + if not self.format["singleton"].fullmatch(subtag): + ext = f"" + warn(ext) + lang["extension"] = ext + subtags = [] + return lang + + def name(self, tag): + """ + Convert a BCP-47 tag to a colon-separated string of subtag names + + >>> from nltk.corpus import bcp47 + >>> bcp47.name('ca-Latn-ES-valencia') + 'Catalan: Latin: Spain: Valencian' + + """ + for label in ["redundant", "grandfathered"]: + val = None + if tag in self.db[label]: + val = f"{self.db[label][tag]['Description']}" + note = f"The {tag!r} code is {label}" + elif tag in self.db["deprecated"][label]: + val = f"{self.db['deprecated'][label][tag]['Description']}" + note = f"The {tag!r} code is {label} and deprecated" + if "Preferred-Value" in self.db["deprecated"][label][tag]: + prefer = self.db["deprecated"][label][tag]["Preferred-Value"] + note += f", prefer {self.val2str(prefer)!r}" + if val: + warn(note) + return val + try: + return self.lang2str(self.parse_tag(tag)) + except: + warn(f"Tag {tag!r} was not recognized") + return None diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py new file mode 100644 index 0000000000000000000000000000000000000000..d06fa4f71ab0d0e5089b49135b1523f3151f84ce --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py @@ -0,0 +1,265 @@ +# Natural Language Toolkit: Plaintext Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +"""Corpus reader for the XML version of the British National Corpus.""" + +from nltk.corpus.reader.util import concat +from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView + + +class BNCCorpusReader(XMLCorpusReader): + r"""Corpus reader for the XML version of the British National Corpus. + + For access to the complete XML data structure, use the ``xml()`` + method. For access to simple word lists and tagged word lists, use + ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. + + You can obtain the full version of the BNC corpus at + https://www.ota.ox.ac.uk/desc/2554 + + If you extracted the archive to a directory called `BNC`, then you can + instantiate the reader as:: + + BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml') + + """ + + def __init__(self, root, fileids, lazy=True): + XMLCorpusReader.__init__(self, root, fileids) + self._lazy = lazy + + def words(self, fileids=None, strip_space=True, stem=False): + """ + :return: the given file(s) as a list of words + and punctuation symbols. + :rtype: list(str) + + :param strip_space: If true, then strip trailing spaces from + word tokens. Otherwise, leave the spaces on the tokens. + :param stem: If true, then use word stems instead of word strings. 
+ """ + return self._views(fileids, False, None, strip_space, stem) + + def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False): + """ + :return: the given file(s) as a list of tagged + words and punctuation symbols, encoded as tuples + ``(word,tag)``. + :rtype: list(tuple(str,str)) + + :param c5: If true, then the tags used will be the more detailed + c5 tags. Otherwise, the simplified tags will be used. + :param strip_space: If true, then strip trailing spaces from + word tokens. Otherwise, leave the spaces on the tokens. + :param stem: If true, then use word stems instead of word strings. + """ + tag = "c5" if c5 else "pos" + return self._views(fileids, False, tag, strip_space, stem) + + def sents(self, fileids=None, strip_space=True, stem=False): + """ + :return: the given file(s) as a list of + sentences or utterances, each encoded as a list of word + strings. + :rtype: list(list(str)) + + :param strip_space: If true, then strip trailing spaces from + word tokens. Otherwise, leave the spaces on the tokens. + :param stem: If true, then use word stems instead of word strings. + """ + return self._views(fileids, True, None, strip_space, stem) + + def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False): + """ + :return: the given file(s) as a list of + sentences, each encoded as a list of ``(word,tag)`` tuples. + :rtype: list(list(tuple(str,str))) + + :param c5: If true, then the tags used will be the more detailed + c5 tags. Otherwise, the simplified tags will be used. + :param strip_space: If true, then strip trailing spaces from + word tokens. Otherwise, leave the spaces on the tokens. + :param stem: If true, then use word stems instead of word strings. + """ + tag = "c5" if c5 else "pos" + return self._views( + fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem + ) + + def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False): + """A helper function that instantiates BNCWordViews or the list of words/sentences.""" + f = BNCWordView if self._lazy else self._words + return concat( + [ + f(fileid, sent, tag, strip_space, stem) + for fileid in self.abspaths(fileids) + ] + ) + + def _words(self, fileid, bracket_sent, tag, strip_space, stem): + """ + Helper used to implement the view methods -- returns a list of + words or a list of sentences, optionally tagged. + + :param fileid: The name of the underlying file. + :param bracket_sent: If true, include sentence bracketing. + :param tag: The name of the tagset to use, or None for no tags. + :param strip_space: If true, strip spaces from word tokens. + :param stem: If true, then substitute stems for words. + """ + result = [] + + xmldoc = ElementTree.parse(fileid).getroot() + for xmlsent in xmldoc.findall(".//s"): + sent = [] + for xmlword in _all_xmlwords_in(xmlsent): + word = xmlword.text + if not word: + word = "" # fixes issue 337? 
+ if strip_space or stem: + word = word.strip() + if stem: + word = xmlword.get("hw", word) + if tag == "c5": + word = (word, xmlword.get("c5")) + elif tag == "pos": + word = (word, xmlword.get("pos", xmlword.get("c5"))) + sent.append(word) + if bracket_sent: + result.append(BNCSentence(xmlsent.attrib["n"], sent)) + else: + result.extend(sent) + + assert None not in result + return result + + +def _all_xmlwords_in(elt, result=None): + if result is None: + result = [] + for child in elt: + if child.tag in ("c", "w"): + result.append(child) + else: + _all_xmlwords_in(child, result) + return result + + +class BNCSentence(list): + """ + A list of words, augmented by an attribute ``num`` used to record + the sentence identifier (the ``n`` attribute from the XML). + """ + + def __init__(self, num, items): + self.num = num + list.__init__(self, items) + + +class BNCWordView(XMLCorpusView): + """ + A stream backed corpus view specialized for use with the BNC corpus. + """ + + tags_to_ignore = { + "pb", + "gap", + "vocal", + "event", + "unclear", + "shift", + "pause", + "align", + } + """These tags are ignored. For their description refer to the + technical documentation, for example, + http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html + + """ + + def __init__(self, fileid, sent, tag, strip_space, stem): + """ + :param fileid: The name of the underlying file. + :param sent: If true, include sentence bracketing. + :param tag: The name of the tagset to use, or None for no tags. + :param strip_space: If true, strip spaces from word tokens. + :param stem: If true, then substitute stems for words. + """ + if sent: + tagspec = ".*/s" + else: + tagspec = ".*/s/(.*/)?(c|w)" + self._sent = sent + self._tag = tag + self._strip_space = strip_space + self._stem = stem + + self.title = None #: Title of the document. + self.author = None #: Author of the document. + self.editor = None #: Editor + self.resps = None #: Statement of responsibility + + XMLCorpusView.__init__(self, fileid, tagspec) + + # Read in a tasty header. + self._open() + self.read_block(self._stream, ".*/teiHeader$", self.handle_header) + self.close() + + # Reset tag context. + self._tag_context = {0: ()} + + def handle_header(self, elt, context): + # Set up some metadata! + titles = elt.findall("titleStmt/title") + if titles: + self.title = "\n".join(title.text.strip() for title in titles) + + authors = elt.findall("titleStmt/author") + if authors: + self.author = "\n".join(author.text.strip() for author in authors) + + editors = elt.findall("titleStmt/editor") + if editors: + self.editor = "\n".join(editor.text.strip() for editor in editors) + + resps = elt.findall("titleStmt/respStmt") + if resps: + self.resps = "\n\n".join( + "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps + ) + + def handle_elt(self, elt, context): + if self._sent: + return self.handle_sent(elt) + else: + return self.handle_word(elt) + + def handle_word(self, elt): + word = elt.text + if not word: + word = "" # fixes issue 337? 
+ if self._strip_space or self._stem: + word = word.strip() + if self._stem: + word = elt.get("hw", word) + if self._tag == "c5": + word = (word, elt.get("c5")) + elif self._tag == "pos": + word = (word, elt.get("pos", elt.get("c5"))) + return word + + def handle_sent(self, elt): + sent = [] + for child in elt: + if child.tag in ("mw", "hi", "corr", "trunc"): + sent += [self.handle_word(w) for w in child] + elif child.tag in ("w", "c"): + sent.append(self.handle_word(child)) + elif child.tag not in self.tags_to_ignore: + raise ValueError("Unexpected element %s" % child.tag) + return BNCSentence(elt.attrib["n"], sent) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py new file mode 100644 index 0000000000000000000000000000000000000000..5ef0cbcff3f5a746434563226d73ae4f82ee8b59 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py @@ -0,0 +1,237 @@ +# Natural Language Toolkit: Penn Treebank Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT +""" +Corpus reader for corpora that consist of parenthesis-delineated parse trees. +""" + +import sys + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tag import map_tag +from nltk.tree import Tree + +# we use [^\s()]+ instead of \S+? to avoid matching () +SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)") +TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)") +WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)") +EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(") + + +class BracketParseCorpusReader(SyntaxCorpusReader): + """ + Reader for corpora that consist of parenthesis-delineated parse trees, + like those found in the "combined" section of the Penn Treebank, + e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))". + + """ + + def __init__( + self, + root, + fileids, + comment_char=None, + detect_blocks="unindented_paren", + encoding="utf8", + tagset=None, + ): + """ + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. + :param comment_char: The character which can appear at the start of + a line to indicate that the rest of the line is a comment. + :param detect_blocks: The method that is used to find blocks + in the corpus; can be 'unindented_paren' (every unindented + parenthesis starts a new parse) or 'sexpr' (brackets are + matched). + :param tagset: The name of the tagset used by this corpus, to be used + for normalizing or converting the POS tags returned by the + ``tagged_...()`` methods. + """ + SyntaxCorpusReader.__init__(self, root, fileids, encoding) + self._comment_char = comment_char + self._detect_blocks = detect_blocks + self._tagset = tagset + + def _read_block(self, stream): + if self._detect_blocks == "sexpr": + return read_sexpr_block(stream, comment_char=self._comment_char) + elif self._detect_blocks == "blankline": + return read_blankline_block(stream) + elif self._detect_blocks == "unindented_paren": + # Tokens start with unindented left parens. + toks = read_regexp_block(stream, start_re=r"^\(") + # Strip any comments out of the tokens. + if self._comment_char: + toks = [ + re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok) + for tok in toks + ] + return toks + else: + assert 0, "bad block type" + + def _normalize(self, t): + # Replace leaves of the form (!), (,), with (! 
!), (, ,) + t = re.sub(r"\((.)\)", r"(\1 \1)", t) + # Replace leaves of the form (tag word root) with (tag word) + t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t) + return t + + def _parse(self, t): + try: + tree = Tree.fromstring(self._normalize(t)) + # If there's an empty node at the top, strip it off + if tree.label() == "" and len(tree) == 1: + return tree[0] + else: + return tree + + except ValueError as e: + sys.stderr.write("Bad tree detected; trying to recover...\n") + # Try to recover, if we can: + if e.args == ("mismatched parens",): + for n in range(1, 5): + try: + v = Tree(self._normalize(t + ")" * n)) + sys.stderr.write( + " Recovered by adding %d close " "paren(s)\n" % n + ) + return v + except ValueError: + pass + # Try something else: + sys.stderr.write(" Recovered by returning a flat parse.\n") + # sys.stderr.write(' '.join(t.split())+'\n') + return Tree("S", self._tag(t)) + + def _tag(self, t, tagset=None): + tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))] + if tagset and tagset != self._tagset: + tagged_sent = [ + (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent + ] + return tagged_sent + + def _word(self, t): + return WORD.findall(self._normalize(t)) + + +class CategorizedBracketParseCorpusReader( + CategorizedCorpusReader, BracketParseCorpusReader +): + """ + A reader for parsed corpora whose documents are + divided into categories based on their file identifiers. + @author: Nathan Schneider + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the corpus reader. Categorization arguments + (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to + the L{CategorizedCorpusReader constructor + }. The remaining arguments + are passed to the L{BracketParseCorpusReader constructor + }. + """ + CategorizedCorpusReader.__init__(self, kwargs) + BracketParseCorpusReader.__init__(self, *args, **kwargs) + + def tagged_words(self, fileids=None, categories=None, tagset=None): + return super().tagged_words(self._resolve(fileids, categories), tagset) + + def tagged_sents(self, fileids=None, categories=None, tagset=None): + return super().tagged_sents(self._resolve(fileids, categories), tagset) + + def tagged_paras(self, fileids=None, categories=None, tagset=None): + return super().tagged_paras(self._resolve(fileids, categories), tagset) + + def parsed_words(self, fileids=None, categories=None): + return super().parsed_words(self._resolve(fileids, categories)) + + def parsed_sents(self, fileids=None, categories=None): + return super().parsed_sents(self._resolve(fileids, categories)) + + def parsed_paras(self, fileids=None, categories=None): + return super().parsed_paras(self._resolve(fileids, categories)) + + +class AlpinoCorpusReader(BracketParseCorpusReader): + """ + Reader for the Alpino Dutch Treebank. + This corpus has a lexical breakdown structure embedded, as read by `_parse` + Unfortunately this puts punctuation and some other words out of the sentence + order in the xml element tree. This is no good for `tag_` and `word_` + `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered' + to the overridden _normalize function. The _parse function can then remain + untouched. + """ + + def __init__(self, root, encoding="ISO-8859-1", tagset=None): + BracketParseCorpusReader.__init__( + self, + root, + r"alpino\.xml", + detect_blocks="blankline", + encoding=encoding, + tagset=tagset, + ) + + def _normalize(self, t, ordered=False): + """Normalize the xml sentence element in t. 
+ The sentence elements <alpino_ds>, although embedded in a few overall + xml elements, are separated by blank lines. That's how the reader can + deliver them one at a time. + Each sentence has a few category subnodes that are of no use to us. + The remaining word nodes may or may not appear in the proper order. + Each word node has attributes, among which: + - begin : the position of the word in the sentence + - pos : Part of Speech: the Tag + - word : the actual word + The return value is a string with all xml elements replaced by + clauses: either a cat clause with nested clauses, or a word clause. + The order of the bracket clauses closely follows the xml. + If ordered == True, the word clauses include an order sequence number. + If ordered == False, the word clauses only have pos and word parts. + """ + if t[:10] != "<alpino_ds": + return "" + # convert XML to sexpr notation + t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t) + if ordered: + t = re.sub( + r'  <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', + r"(\1 \2 \3)", + t, + ) + else: + t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t) + t = re.sub(r"  </node>", r")", t) + t = re.sub(r"<sentence>.*</sentence>", r"", t) + t = re.sub(r"</?alpino_ds.*?>", r"", t) + return t + + def _tag(self, t, tagset=None): + tagged_sent = [ + (int(o), w, p) + for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True)) + ] + tagged_sent.sort() + if tagset and tagset != self._tagset: + tagged_sent = [ + (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent + ] + else: + tagged_sent = [(w, p) for (o, w, p) in tagged_sent] + return tagged_sent + + def _word(self, t): + """Return a correctly ordered list of words""" + tagged_sent = self._tag(t) + return [w for (w, p) in tagged_sent] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py new file mode 100644 index 0000000000000000000000000000000000000000..3e84e3e5ec17ecdfa3ffc80f16957061e669aa4a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py @@ -0,0 +1,168 @@ +# Natural Language Toolkit: Categorized Sentences Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Pierpaolo Pantone <24alsecondo@gmail.com> +# URL: +# For license information, see LICENSE.TXT + +""" +CorpusReader structured for corpora that contain one instance on each row. +This CorpusReader is specifically used for the Subjectivity Dataset and the +Sentence Polarity Dataset. + +- Subjectivity Dataset information - + +Authors: Bo Pang and Lillian Lee. +Url: https://www.cs.cornell.edu/people/pabo/movie-review-data + +Distributed with permission. + +Related papers: + +- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using + Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL, + 2004. + +- Sentence Polarity Dataset information - + +Authors: Bo Pang and Lillian Lee. +Url: https://www.cs.cornell.edu/people/pabo/movie-review-data + +Related papers: + +- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for + sentiment categorization with respect to rating scales". Proceedings of the + ACL, 2005. +""" + +from nltk.corpus.reader.api import * +from nltk.tokenize import * + + +class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader): + """ + A reader for corpora in which each row represents a single instance, mainly + a sentence. Instances are divided into categories based on their file identifiers + (see CategorizedCorpusReader). + Since many corpora allow rows that contain more than one sentence, it is + possible to specify a sentence tokenizer to retrieve all sentences instead + of all rows. 
+ + Examples using the Subjectivity Dataset: + + >>> from nltk.corpus import subjectivity + >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE + ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', + 'happened', 'off', 'screen', '.'] + >>> subjectivity.categories() + ['obj', 'subj'] + >>> subjectivity.words(categories='subj') + ['smart', 'and', 'alert', ',', 'thirteen', ...] + + Examples using the Sentence Polarity Dataset: + + >>> from nltk.corpus import sentence_polarity + >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE + [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', + 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', + 'it', 'funny', '.'], ...] + >>> sentence_polarity.categories() + ['neg', 'pos'] + """ + + CorpusView = StreamBackedCorpusView + + def __init__( + self, + root, + fileids, + word_tokenizer=WhitespaceTokenizer(), + sent_tokenizer=None, + encoding="utf8", + **kwargs + ): + """ + :param root: The root directory for the corpus. + :param fileids: a list or regexp specifying the fileids in the corpus. + :param word_tokenizer: a tokenizer for breaking sentences or paragraphs + into words. Default: `WhitespaceTokenizer` + :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences. + :param encoding: the encoding that should be used to read the corpus. + :param kwargs: additional parameters passed to CategorizedCorpusReader. + """ + + CorpusReader.__init__(self, root, fileids, encoding) + CategorizedCorpusReader.__init__(self, kwargs) + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + + def sents(self, fileids=None, categories=None): + """ + Return all sentences in the corpus or in the specified file(s). + + :param fileids: a list or regexp specifying the ids of the files whose + sentences have to be returned. + :param categories: a list specifying the categories whose sentences have + to be returned. + :return: the given file(s) as a list of sentences. + Each sentence is tokenized using the specified word_tokenizer. + :rtype: list(list(str)) + """ + fileids = self._resolve(fileids, categories) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(path, self._read_sent_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def words(self, fileids=None, categories=None): + """ + Return all words and punctuation symbols in the corpus or in the specified + file(s). + + :param fileids: a list or regexp specifying the ids of the files whose + words have to be returned. + :param categories: a list specifying the categories whose words have to + be returned. + :return: the given file(s) as a list of words and punctuation symbols. + :rtype: list(str) + """ + fileids = self._resolve(fileids, categories) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(path, self._read_word_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def _read_sent_block(self, stream): + sents = [] + for i in range(20): # Read 20 lines at a time. 
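+ # Block readers are invoked repeatedly by StreamBackedCorpusView (the class attribute CorpusView above), so each call tokenizes at most 20 lines and the corpus is materialized lazily, block by block.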
+ line = stream.readline() + if not line: + continue + if self._sent_tokenizer: + sents.extend( + [ + self._word_tokenizer.tokenize(sent) + for sent in self._sent_tokenizer.tokenize(line) + ] + ) + else: + sents.append(self._word_tokenizer.tokenize(line)) + return sents + + def _read_word_block(self, stream): + words = [] + for sent in self._read_sent_block(stream): + words.extend(sent) + return words diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py new file mode 100644 index 0000000000000000000000000000000000000000..3360e7d6a8980abf09cecfa56d50ac28f5d7f6da --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py @@ -0,0 +1,158 @@ +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Masato Hagiwara +# URL: +# For license information, see LICENSE.TXT + +import sys + +from nltk.corpus.reader import util +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * + + +class ChasenCorpusReader(CorpusReader): + def __init__(self, root, fileids, encoding="utf8", sent_splitter=None): + self._sent_splitter = sent_splitter + CorpusReader.__init__(self, root, fileids, encoding) + + def words(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_words(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_sents(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def paras(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_paras(self, fileids=None): + return concat( + [ + ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + +class ChasenCorpusView(StreamBackedCorpusView): + """ + A specialized corpus view for ChasenCorpusReader. Similar to ``TaggedCorpusView``, + but it uses a fixed word tokenizer and sentence splitter. 
+ """ + + def __init__( + self, + corpus_file, + encoding, + tagged, + group_by_sent, + group_by_para, + sent_splitter=None, + ): + self._tagged = tagged + self._group_by_sent = group_by_sent + self._group_by_para = group_by_para + self._sent_splitter = sent_splitter + StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) + + def read_block(self, stream): + """Reads one paragraph at a time.""" + block = [] + for para_str in read_regexp_block(stream, r".", r"^EOS\n"): + + para = [] + + sent = [] + for line in para_str.splitlines(): + + _eos = line.strip() == "EOS" + _cells = line.split("\t") + w = (_cells[0], "\t".join(_cells[1:])) + if not _eos: + sent.append(w) + + if _eos or (self._sent_splitter and self._sent_splitter(w)): + if not self._tagged: + sent = [w for (w, t) in sent] + if self._group_by_sent: + para.append(sent) + else: + para.extend(sent) + sent = [] + + if len(sent) > 0: + if not self._tagged: + sent = [w for (w, t) in sent] + + if self._group_by_sent: + para.append(sent) + else: + para.extend(sent) + + if self._group_by_para: + block.append(para) + else: + block.extend(para) + + return block + + +def demo(): + + import nltk + from nltk.corpus.util import LazyCorpusLoader + + jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") + print("/".join(jeita.words()[22100:22140])) + + print( + "\nEOS\n".join( + "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent) + for sent in jeita.tagged_sents()[2170:2173] + ) + ) + + +def test(): + + from nltk.corpus.util import LazyCorpusLoader + + jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") + + assert isinstance(jeita.tagged_words()[0][1], str) + + +if __name__ == "__main__": + demo() + test() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c5669a375884b49755bcaf9da46412ee45f25d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py @@ -0,0 +1,174 @@ +# Natural Language Toolkit: PanLex Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: David Kamholz +# URL: +# For license information, see LICENSE.TXT + +""" +CorpusReader for PanLex Lite, a stripped down version of PanLex distributed +as an SQLite database. See the README.txt in the panlex_lite corpus directory +for more information on PanLex Lite. +""" + +import os +import sqlite3 + +from nltk.corpus.reader.api import CorpusReader + + +class PanLexLiteCorpusReader(CorpusReader): + MEANING_Q = """ + SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv + FROM dnx + JOIN ex ON (ex.ex = dnx.ex) + JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) + JOIN ex ex2 ON (ex2.ex = dnx2.ex) + WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ? + ORDER BY dnx2.uq DESC + """ + + TRANSLATION_Q = """ + SELECT s.tt, sum(s.uq) AS trq FROM ( + SELECT ex2.tt, max(dnx.uq) AS uq + FROM dnx + JOIN ex ON (ex.ex = dnx.ex) + JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) + JOIN ex ex2 ON (ex2.ex = dnx2.ex) + WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ? 
+ GROUP BY ex2.tt, dnx.ui + ) s + GROUP BY s.tt + ORDER BY trq DESC, s.tt + """ + + def __init__(self, root): + self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor() + + self._uid_lv = {} + self._lv_uid = {} + + for row in self._c.execute("SELECT uid, lv FROM lv"): + self._uid_lv[row[0]] = row[1] + self._lv_uid[row[1]] = row[0] + + def language_varieties(self, lc=None): + """ + Return a list of PanLex language varieties. + + :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties + by this code. If unspecified, all varieties are returned. + :return: the specified language varieties as a list of tuples. The first + element is the language variety's seven-character uniform identifier, + and the second element is its default name. + :rtype: list(tuple) + """ + + if lc is None: + return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall() + else: + return self._c.execute( + "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,) + ).fetchall() + + def meanings(self, expr_uid, expr_tt): + """ + Return a list of meanings for an expression. + + :param expr_uid: the expression's language variety, as a seven-character + uniform identifier. + :param expr_tt: the expression's text. + :return: a list of Meaning objects. + :rtype: list(Meaning) + """ + + expr_lv = self._uid_lv[expr_uid] + + mn_info = {} + + for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)): + mn = i[0] + uid = self._lv_uid[i[5]] + + if not mn in mn_info: + mn_info[mn] = { + "uq": i[1], + "ap": i[2], + "ui": i[3], + "ex": {expr_uid: [expr_tt]}, + } + + if not uid in mn_info[mn]["ex"]: + mn_info[mn]["ex"][uid] = [] + + mn_info[mn]["ex"][uid].append(i[4]) + + return [Meaning(mn, mn_info[mn]) for mn in mn_info] + + def translations(self, from_uid, from_tt, to_uid): + """ + Return a list of translations for an expression into a single language + variety. + + :param from_uid: the source expression's language variety, as a + seven-character uniform identifier. + :param from_tt: the source expression's text. + :param to_uid: the target language variety, as a seven-character + uniform identifier. + :return: a list of translation tuples. The first element is the expression + text and the second element is the translation quality. + :rtype: list(tuple) + """ + + from_lv = self._uid_lv[from_uid] + to_lv = self._uid_lv[to_uid] + + return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall() + + +class Meaning(dict): + """ + Represents a single PanLex meaning. A meaning is a translation set derived + from a single source. + """ + + def __init__(self, mn, attr): + super().__init__(**attr) + self["mn"] = mn + + def id(self): + """ + :return: the meaning's id. + :rtype: int + """ + return self["mn"] + + def quality(self): + """ + :return: the meaning's source's quality (0=worst, 9=best). + :rtype: int + """ + return self["uq"] + + def source(self): + """ + :return: the meaning's source id. + :rtype: int + """ + return self["ap"] + + def source_group(self): + """ + :return: the meaning's source group id. + :rtype: int + """ + return self["ui"] + + def expressions(self): + """ + :return: the meaning's expressions as a dictionary whose keys are language + variety uniform identifiers and whose values are lists of expression + texts. 
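+            For example: ``{'eng-000': ['book'], 'fra-000': ['livre']}``.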
+ :rtype: dict + """ + return self["ex"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py new file mode 100644 index 0000000000000000000000000000000000000000..ddcc116948fbb0713744bdea935640a695928bad --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py @@ -0,0 +1,95 @@ +# Natural Language Toolkit: Word List Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + + +import re +from collections import defaultdict, namedtuple + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.corpus.reader.wordlist import WordListCorpusReader +from nltk.tokenize import line_tokenize + +PanlexLanguage = namedtuple( + "PanlexLanguage", + [ + "panlex_uid", # (1) PanLex UID + "iso639", # (2) ISO 639 language code + "iso639_type", # (3) ISO 639 language type, see README + "script", # (4) normal scripts of expressions + "name", # (5) PanLex default name + "langvar_uid", # (6) UID of the language variety in which the default name is an expression + ], +) + + +class PanlexSwadeshCorpusReader(WordListCorpusReader): + """ + This is a class to read the PanLex Swadesh list from + + David Kamholz, Jonathan Pool, and Susan M. Colowick (2014). + PanLex: Building a Resource for Panlingual Lexical Translation. + In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf + + License: CC0 1.0 Universal + https://creativecommons.org/publicdomain/zero/1.0/legalcode + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Find the swadesh size using the fileids' path. + self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1) + self._languages = {lang.panlex_uid: lang for lang in self.get_languages()} + self._macro_langauges = self.get_macrolanguages() + + def license(self): + return "CC0 1.0 Universal" + + def language_codes(self): + return self._languages.keys() + + def get_languages(self): + for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"): + if not line.strip(): # Skip empty lines. + continue + yield PanlexLanguage(*line.strip().split("\t")) + + def get_macrolanguages(self): + macro_langauges = defaultdict(list) + for lang in self._languages.values(): + macro_langauges[lang.iso639].append(lang.panlex_uid) + return macro_langauges + + def words_by_lang(self, lang_code): + """ + :return: a list of list(str) + """ + fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt" + return [concept.split("\t") for concept in self.words(fileid)] + + def words_by_iso639(self, iso63_code): + """ + :return: a list of list(str) + """ + fileids = [ + f"swadesh{self.swadesh_size}/{lang_code}.txt" + for lang_code in self._macro_langauges[iso63_code] + ] + return [ + concept.split("\t") for fileid in fileids for concept in self.words(fileid) + ] + + def entries(self, fileids=None): + """ + :return: a tuple of words for the specified fileids. 
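+
+        Because the wordlists are zipped together, each tuple holds the
+        entries for one Swadesh concept across all of the given fileids.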
+ """ + if not fileids: + fileids = self.fileids() + + wordlists = [self.words(f) for f in fileids] + return list(zip(*wordlists)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py new file mode 100644 index 0000000000000000000000000000000000000000..5191401cd9954b9d0f86d38297b0009a43e5d580 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py @@ -0,0 +1,375 @@ +# Natural Language Toolkit: +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Piotr Kasprzyk +# URL: +# For license information, see LICENSE.TXT + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.xmldocs import XMLCorpusReader + +PARA = re.compile(r"]*){0,1}>(.*?)

") +SENT = re.compile(r"]*){0,1}>(.*?)") + +TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)") +WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)") + +TYPE = re.compile(r'type="(.*?)"') +ANA = re.compile(r'ana="(.*?)"') + +TEXTID = re.compile(r'text id="(.*?)"') + + +class TEICorpusView(StreamBackedCorpusView): + def __init__( + self, + corpus_file, + tagged, + group_by_sent, + group_by_para, + tagset=None, + head_len=0, + textids=None, + ): + + self._tagged = tagged + self._textids = textids + + self._group_by_sent = group_by_sent + self._group_by_para = group_by_para + # WARNING -- skip header + StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len) + + _pagesize = 4096 + + def read_block(self, stream): + block = stream.readlines(self._pagesize) + block = concat(block) + while (block.count(" block.count("")) or block.count( + "") + len("") + block = block[:beg] + block[beg + end :] + + output = [] + for para_str in PARA.findall(block): + para = [] + for sent_str in SENT.findall(para_str): + if not self._tagged: + sent = WORD.findall(sent_str) + else: + sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str))) + if self._group_by_sent: + para.append(sent) + else: + para.extend(sent) + if self._group_by_para: + output.append(para) + else: + output.extend(para) + return output + + def _parse_tag(self, tag_word_tuple): + (tag, word) = tag_word_tuple + if tag.startswith("w"): + tag = ANA.search(tag).group(1) + else: # tag.startswith('c') + tag = TYPE.search(tag).group(1) + return word, tag + + +class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader): + head_len = 2770 + + def __init__(self, *args, **kwargs): + if "textid_file" in kwargs: + self._textids = kwargs["textid_file"] + else: + self._textids = None + + XMLCorpusReader.__init__(self, *args) + CategorizedCorpusReader.__init__(self, kwargs) + + self._init_textids() + + def _init_textids(self): + self._f2t = defaultdict(list) + self._t2f = defaultdict(list) + if self._textids is not None: + with open(self._textids) as fp: + for line in fp: + line = line.strip() + file_id, text_ids = line.split(" ", 1) + if file_id not in self.fileids(): + raise ValueError( + "In text_id mapping file %s: %s not found" + % (self._textids, file_id) + ) + for text_id in text_ids.split(self._delimiter): + self._add_textids(file_id, text_id) + + def _add_textids(self, file_id, text_id): + self._f2t[file_id].append(text_id) + self._t2f[text_id].append(file_id) + + def _resolve(self, fileids, categories, textids=None): + tmp = None + if ( + len( + list( + filter( + lambda accessor: accessor is None, + (fileids, categories, textids), + ) + ) + ) + != 1 + ): + + raise ValueError( + "Specify exactly one of: fileids, " "categories or textids" + ) + + if fileids is not None: + return fileids, None + + if categories is not None: + return self.fileids(categories), None + + if textids is not None: + if isinstance(textids, str): + textids = [textids] + files = sum((self._t2f[t] for t in textids), []) + tdict = dict() + for f in files: + tdict[f] = set(self._f2t[f]) & set(textids) + return files, tdict + + def decode_tag(self, tag): + # to be implemented + return tag + + def textids(self, fileids=None, categories=None): + """ + In the pl196x corpus each category is stored in single + file and thus both methods provide identical functionality. In order + to accommodate finer granularity, a non-standard textids() method was + implemented. 
All the main functions can be supplied with a list + of required chunks---giving much more control to the user. + """ + fileids, _ = self._resolve(fileids, categories) + if fileids is None: + return sorted(self._t2f) + + if isinstance(fileids, str): + fileids = [fileids] + return sorted(sum((self._f2t[d] for d in fileids), [])) + + def words(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + False, + False, + False, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + False, + False, + False, + head_len=self.head_len, + ) + for fileid in fileids + ] + ) + + def sents(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + False, + True, + False, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), False, True, False, head_len=self.head_len + ) + for fileid in fileids + ] + ) + + def paras(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + False, + True, + True, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), False, True, True, head_len=self.head_len + ) + for fileid in fileids + ] + ) + + def tagged_words(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + True, + False, + False, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), True, False, False, head_len=self.head_len + ) + for fileid in fileids + ] + ) + + def tagged_sents(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + True, + True, + False, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), True, True, False, head_len=self.head_len + ) + for fileid in fileids + ] + ) + + def tagged_paras(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + + if textids: + return concat( + [ + TEICorpusView( + self.abspath(fileid), + 
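+                        # positional flags: tagged, group_by_sent, group_by_para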
True, + True, + True, + head_len=self.head_len, + textids=textids[fileid], + ) + for fileid in fileids + ] + ) + else: + return concat( + [ + TEICorpusView( + self.abspath(fileid), True, True, True, head_len=self.head_len + ) + for fileid in fileids + ] + ) + + def xml(self, fileids=None, categories=None): + fileids, _ = self._resolve(fileids, categories) + if len(fileids) == 1: + return XMLCorpusReader.xml(self, fileids[0]) + else: + raise TypeError("Expected a single file") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py new file mode 100644 index 0000000000000000000000000000000000000000..a7ae78d6c8a025e5c38080e3056ff9d1de7bd8d5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py @@ -0,0 +1,227 @@ +# Natural Language Toolkit: Plaintext Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# Nitin Madnani +# URL: +# For license information, see LICENSE.TXT + +""" +A reader for corpora that consist of plaintext documents. +""" + +import nltk.data +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tokenize import * + + +class PlaintextCorpusReader(CorpusReader): + """ + Reader for corpora that consist of plaintext documents. Paragraphs + are assumed to be split using blank lines. Sentences and words can + be tokenized using the default tokenizers, or by custom tokenizers + specified as parameters to the constructor. + + This corpus reader can be customized (e.g., to skip preface + sections of specific document formats) by creating a subclass and + overriding the ``CorpusView`` class variable. + """ + + CorpusView = StreamBackedCorpusView + """The corpus view class used by this reader. Subclasses of + ``PlaintextCorpusReader`` may specify alternative corpus view + classes (e.g., to skip the preface sections of documents.)""" + + def __init__( + self, + root, + fileids, + word_tokenizer=WordPunctTokenizer(), + sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"), + para_block_reader=read_blankline_block, + encoding="utf8", + ): + r""" + Construct a new plaintext corpus reader for a set of documents + located at the given root directory. Example usage: + + >>> root = '/usr/local/share/nltk_data/corpora/webtext/' + >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP + + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. + :param word_tokenizer: Tokenizer for breaking sentences or + paragraphs into words. + :param sent_tokenizer: Tokenizer for breaking paragraphs + into words. + :param para_block_reader: The block reader used to divide the + corpus into paragraph blocks. + """ + CorpusReader.__init__(self, root, fileids, encoding) + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._para_block_reader = para_block_reader + + def words(self, fileids=None): + """ + :return: the given file(s) as a list of words + and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + self.CorpusView(path, self._read_word_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def sents(self, fileids=None): + """ + :return: the given file(s) as a list of + sentences or utterances, each encoded as a list of word + strings. 
+ :rtype: list(list(str)) + """ + if self._sent_tokenizer is None: + raise ValueError("No sentence tokenizer for this corpus") + + return concat( + [ + self.CorpusView(path, self._read_sent_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def paras(self, fileids=None): + """ + :return: the given file(s) as a list of + paragraphs, each encoded as a list of sentences, which are + in turn encoded as lists of word strings. + :rtype: list(list(list(str))) + """ + if self._sent_tokenizer is None: + raise ValueError("No sentence tokenizer for this corpus") + + return concat( + [ + self.CorpusView(path, self._read_para_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def _read_word_block(self, stream): + words = [] + for i in range(20): # Read 20 lines at a time. + words.extend(self._word_tokenizer.tokenize(stream.readline())) + return words + + def _read_sent_block(self, stream): + sents = [] + for para in self._para_block_reader(stream): + sents.extend( + [ + self._word_tokenizer.tokenize(sent) + for sent in self._sent_tokenizer.tokenize(para) + ] + ) + return sents + + def _read_para_block(self, stream): + paras = [] + for para in self._para_block_reader(stream): + paras.append( + [ + self._word_tokenizer.tokenize(sent) + for sent in self._sent_tokenizer.tokenize(para) + ] + ) + return paras + + +class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader): + """ + A reader for plaintext corpora whose documents are divided into + categories based on their file identifiers. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the corpus reader. Categorization arguments + (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to + the ``CategorizedCorpusReader`` constructor. The remaining arguments + are passed to the ``PlaintextCorpusReader`` constructor. + """ + CategorizedCorpusReader.__init__(self, kwargs) + PlaintextCorpusReader.__init__(self, *args, **kwargs) + + +# FIXME: Is there a better way? How to not hardcode this? +# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to +# override the `sent_tokenizer`. +class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader): + def __init__(self, *args, **kwargs): + CategorizedCorpusReader.__init__(self, kwargs) + kwargs["sent_tokenizer"] = nltk.data.LazyLoader( + "tokenizers/punkt/portuguese.pickle" + ) + PlaintextCorpusReader.__init__(self, *args, **kwargs) + + +class EuroparlCorpusReader(PlaintextCorpusReader): + + """ + Reader for Europarl corpora that consist of plaintext documents. + Documents are divided into chapters instead of paragraphs as + for regular plaintext documents. Chapters are separated using blank + lines. Everything is inherited from ``PlaintextCorpusReader`` except + that: + + - Since the corpus is pre-processed and pre-tokenized, the + word tokenizer should just split the line at whitespaces. + - For the same reason, the sentence tokenizer should just + split the paragraph at line breaks. + - There is a new 'chapters()' method that returns chapters instead + instead of paragraphs. + - The 'paras()' method inherited from PlaintextCorpusReader is + made non-functional to remove any confusion between chapters + and paragraphs for Europarl. + """ + + def _read_word_block(self, stream): + words = [] + for i in range(20): # Read 20 lines at a time. 
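+            # The corpus is pre-tokenized, so a whitespace split suffices.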
+ words.extend(stream.readline().split()) + return words + + def _read_sent_block(self, stream): + sents = [] + for para in self._para_block_reader(stream): + sents.extend([sent.split() for sent in para.splitlines()]) + return sents + + def _read_para_block(self, stream): + paras = [] + for para in self._para_block_reader(stream): + paras.append([sent.split() for sent in para.splitlines()]) + return paras + + def chapters(self, fileids=None): + """ + :return: the given file(s) as a list of + chapters, each encoded as a list of sentences, which are + in turn encoded as lists of word strings. + :rtype: list(list(list(str))) + """ + return concat( + [ + self.CorpusView(fileid, self._read_para_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def paras(self, fileids=None): + raise NotImplementedError( + "The Europarl corpus reader does not support paragraphs. Please use chapters() instead." + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py new file mode 100644 index 0000000000000000000000000000000000000000..dd12a8ff750fdded0b781d153e66bb2532c9c4df --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py @@ -0,0 +1,95 @@ +# Natural Language Toolkit: PP Attachment Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Read lines from the Prepositional Phrase Attachment Corpus. + +The PP Attachment Corpus contains several files having the format: + +sentence_id verb noun1 preposition noun2 attachment + +For example: + +42960 gives authority to administration V +46742 gives inventors of microchip N + +The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: + +(VP gives (NP authority) (PP to administration)) +(VP gives (NP inventors (PP of microchip))) + +The corpus contains the following files: + +training: training set +devset: development test set, used for algorithm development. +test: test set, used to report results +bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. + +Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional +Phrase Attachment. Proceedings of the ARPA Human Language Technology +Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] + +The PP Attachment Corpus is distributed with NLTK with the permission +of the author. 
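+
+Example usage, assuming the ppattach corpus data has been downloaded
+(shown for illustration; the doctest is skipped):
+
+    >>> from nltk.corpus import ppattach # doctest: +SKIP
+    >>> inst = ppattach.attachments('training')[0] # doctest: +SKIP
+    >>> inst.verb, inst.prep, inst.attachment # doctest: +SKIP
+    ('join', 'as', 'V')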
+""" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * + + +class PPAttachment: + def __init__(self, sent, verb, noun1, prep, noun2, attachment): + self.sent = sent + self.verb = verb + self.noun1 = noun1 + self.prep = prep + self.noun2 = noun2 + self.attachment = attachment + + def __repr__(self): + return ( + "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, " + "noun2=%r, attachment=%r)" + % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment) + ) + + +class PPAttachmentCorpusReader(CorpusReader): + """ + sentence_id verb noun1 preposition noun2 attachment + """ + + def attachments(self, fileids): + return concat( + [ + StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tuples(self, fileids): + return concat( + [ + StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def _read_tuple_block(self, stream): + line = stream.readline() + if line: + return [tuple(line.split())] + else: + return [] + + def _read_obj_block(self, stream): + line = stream.readline() + if line: + return [PPAttachment(*line.split())] + else: + return [] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py new file mode 100644 index 0000000000000000000000000000000000000000..a5de042875dd332af27877c7014be323f957c3ef --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py @@ -0,0 +1,520 @@ +# Natural Language Toolkit: PropBank Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +import re +from functools import total_ordering +from xml.etree import ElementTree + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.internals import raise_unorderable_types +from nltk.tree import Tree + + +class PropbankCorpusReader(CorpusReader): + """ + Corpus reader for the propbank corpus, which augments the Penn + Treebank with information about the predicate argument structure + of every verb instance. The corpus consists of two parts: the + predicate-argument annotations themselves, and a set of "frameset + files" which define the argument labels used by the annotations, + on a per-verb basis. Each "frameset file" contains one or more + predicates, such as ``'turn'`` or ``'turn_on'``, each of which is + divided into coarse-grained word senses called "rolesets". For + each "roleset", the frameset file provides descriptions of the + argument roles, along with examples. + """ + + def __init__( + self, + root, + propfile, + framefiles="", + verbsfile=None, + parse_fileid_xform=None, + parse_corpus=None, + encoding="utf8", + ): + """ + :param root: The root directory for this corpus. + :param propfile: The name of the file containing the predicate- + argument annotations (relative to ``root``). + :param framefiles: A list or regexp specifying the frameset + fileids for this corpus. + :param parse_fileid_xform: A transform that should be applied + to the fileids in this corpus. This should be a function + of one argument (a fileid) that returns a string (the new + fileid). + :param parse_corpus: The corpus containing the parse trees + corresponding to this corpus. These parse trees are + necessary to resolve the tree pointers used by propbank. + """ + # If framefiles is specified as a regexp, expand it. 
+ if isinstance(framefiles, str): + framefiles = find_corpus_fileids(root, framefiles) + framefiles = list(framefiles) + # Initialize the corpus reader. + CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding) + + # Record our frame fileids & prop file. + self._propfile = propfile + self._framefiles = framefiles + self._verbsfile = verbsfile + self._parse_fileid_xform = parse_fileid_xform + self._parse_corpus = parse_corpus + + def instances(self, baseform=None): + """ + :return: a corpus view that acts as a list of + ``PropBankInstance`` objects, one for each noun in the corpus. + """ + kwargs = {} + if baseform is not None: + kwargs["instance_filter"] = lambda inst: inst.baseform == baseform + return StreamBackedCorpusView( + self.abspath(self._propfile), + lambda stream: self._read_instance_block(stream, **kwargs), + encoding=self.encoding(self._propfile), + ) + + def lines(self): + """ + :return: a corpus view that acts as a list of strings, one for + each line in the predicate-argument annotation file. + """ + return StreamBackedCorpusView( + self.abspath(self._propfile), + read_line_block, + encoding=self.encoding(self._propfile), + ) + + def roleset(self, roleset_id): + """ + :return: the xml description for the given roleset. + """ + baseform = roleset_id.split(".")[0] + framefile = "frames/%s.xml" % baseform + if framefile not in self._framefiles: + raise ValueError("Frameset file for %s not found" % roleset_id) + + # n.b.: The encoding for XML fileids is specified by the file + # itself; so we ignore self._encoding here. + with self.abspath(framefile).open() as fp: + etree = ElementTree.parse(fp).getroot() + for roleset in etree.findall("predicate/roleset"): + if roleset.attrib["id"] == roleset_id: + return roleset + raise ValueError(f"Roleset {roleset_id} not found in {framefile}") + + def rolesets(self, baseform=None): + """ + :return: list of xml descriptions for rolesets. + """ + if baseform is not None: + framefile = "frames/%s.xml" % baseform + if framefile not in self._framefiles: + raise ValueError("Frameset file for %s not found" % baseform) + framefiles = [framefile] + else: + framefiles = self._framefiles + + rsets = [] + for framefile in framefiles: + # n.b.: The encoding for XML fileids is specified by the file + # itself; so we ignore self._encoding here. + with self.abspath(framefile).open() as fp: + etree = ElementTree.parse(fp).getroot() + rsets.append(etree.findall("predicate/roleset")) + return LazyConcatenation(rsets) + + def verbs(self): + """ + :return: a corpus view that acts as a list of all verb lemmas + in this corpus (from the verbs.txt file). + """ + return StreamBackedCorpusView( + self.abspath(self._verbsfile), + read_line_block, + encoding=self.encoding(self._verbsfile), + ) + + def _read_instance_block(self, stream, instance_filter=lambda inst: True): + block = [] + + # Read 100 at a time. 
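+        # Instances rejected by instance_filter are dropped, so a block
+        # may hold fewer than 100 items (or none, at end of file).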
+ for i in range(100): + line = stream.readline().strip() + if line: + inst = PropbankInstance.parse( + line, self._parse_fileid_xform, self._parse_corpus + ) + if instance_filter(inst): + block.append(inst) + + return block + + +###################################################################### +# { Propbank Instance & related datatypes +###################################################################### + + +class PropbankInstance: + def __init__( + self, + fileid, + sentnum, + wordnum, + tagger, + roleset, + inflection, + predicate, + arguments, + parse_corpus=None, + ): + + self.fileid = fileid + """The name of the file containing the parse tree for this + instance's sentence.""" + + self.sentnum = sentnum + """The sentence number of this sentence within ``fileid``. + Indexing starts from zero.""" + + self.wordnum = wordnum + """The word number of this instance's predicate within its + containing sentence. Word numbers are indexed starting from + zero, and include traces and other empty parse elements.""" + + self.tagger = tagger + """An identifier for the tagger who tagged this instance; or + ``'gold'`` if this is an adjuticated instance.""" + + self.roleset = roleset + """The name of the roleset used by this instance's predicate. + Use ``propbank.roleset() `` to + look up information about the roleset.""" + + self.inflection = inflection + """A ``PropbankInflection`` object describing the inflection of + this instance's predicate.""" + + self.predicate = predicate + """A ``PropbankTreePointer`` indicating the position of this + instance's predicate within its containing sentence.""" + + self.arguments = tuple(arguments) + """A list of tuples (argloc, argid), specifying the location + and identifier for each of the predicate's argument in the + containing sentence. Argument identifiers are strings such as + ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain + the predicate.""" + + self.parse_corpus = parse_corpus + """A corpus reader for the parse trees corresponding to the + instances in this propbank corpus.""" + + @property + def baseform(self): + """The baseform of the predicate.""" + return self.roleset.split(".")[0] + + @property + def sensenumber(self): + """The sense number of the predicate.""" + return self.roleset.split(".")[1] + + @property + def predid(self): + """Identifier of the predicate.""" + return "rel" + + def __repr__(self): + return "".format( + self.fileid, + self.sentnum, + self.wordnum, + ) + + def __str__(self): + s = "{} {} {} {} {} {}".format( + self.fileid, + self.sentnum, + self.wordnum, + self.tagger, + self.roleset, + self.inflection, + ) + items = self.arguments + ((self.predicate, "rel"),) + for (argloc, argid) in sorted(items): + s += f" {argloc}-{argid}" + return s + + def _get_tree(self): + if self.parse_corpus is None: + return None + if self.fileid not in self.parse_corpus.fileids(): + return None + return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] + + tree = property( + _get_tree, + doc=""" + The parse tree corresponding to this instance, or None if + the corresponding tree is not available.""", + ) + + @staticmethod + def parse(s, parse_fileid_xform=None, parse_corpus=None): + pieces = s.split() + if len(pieces) < 7: + raise ValueError("Badly formatted propbank line: %r" % s) + + # Divide the line into its basic pieces. 
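+        # Layout: <fileid> <sentnum> <wordnum> <tagger> <roleset>
+        # <inflection>, then pointer-label pairs such as "0:2-ARG0";
+        # exactly one pair carries the "-rel" suffix for the predicate.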
+ (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6] + rel = [p for p in pieces[6:] if p.endswith("-rel")] + args = [p for p in pieces[6:] if not p.endswith("-rel")] + if len(rel) != 1: + raise ValueError("Badly formatted propbank line: %r" % s) + + # Apply the fileid selector, if any. + if parse_fileid_xform is not None: + fileid = parse_fileid_xform(fileid) + + # Convert sentence & word numbers to ints. + sentnum = int(sentnum) + wordnum = int(wordnum) + + # Parse the inflection + inflection = PropbankInflection.parse(inflection) + + # Parse the predicate location. + predicate = PropbankTreePointer.parse(rel[0][:-4]) + + # Parse the arguments. + arguments = [] + for arg in args: + argloc, argid = arg.split("-", 1) + arguments.append((PropbankTreePointer.parse(argloc), argid)) + + # Put it all together. + return PropbankInstance( + fileid, + sentnum, + wordnum, + tagger, + roleset, + inflection, + predicate, + arguments, + parse_corpus, + ) + + +class PropbankPointer: + """ + A pointer used by propbank to identify one or more constituents in + a parse tree. ``PropbankPointer`` is an abstract base class with + three concrete subclasses: + + - ``PropbankTreePointer`` is used to point to single constituents. + - ``PropbankSplitTreePointer`` is used to point to 'split' + constituents, which consist of a sequence of two or more + ``PropbankTreePointer`` pointers. + - ``PropbankChainTreePointer`` is used to point to entire trace + chains in a tree. It consists of a sequence of pieces, which + can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers. + """ + + def __init__(self): + if self.__class__ == PropbankPointer: + raise NotImplementedError() + + +class PropbankChainTreePointer(PropbankPointer): + def __init__(self, pieces): + self.pieces = pieces + """A list of the pieces that make up this chain. Elements may + be either ``PropbankSplitTreePointer`` or + ``PropbankTreePointer`` pointers.""" + + def __str__(self): + return "*".join("%s" % p for p in self.pieces) + + def __repr__(self): + return "" % self + + def select(self, tree): + if tree is None: + raise ValueError("Parse tree not available") + return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) + + +class PropbankSplitTreePointer(PropbankPointer): + def __init__(self, pieces): + self.pieces = pieces + """A list of the pieces that make up this chain. Elements are + all ``PropbankTreePointer`` pointers.""" + + def __str__(self): + return ",".join("%s" % p for p in self.pieces) + + def __repr__(self): + return "" % self + + def select(self, tree): + if tree is None: + raise ValueError("Parse tree not available") + return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) + + +@total_ordering +class PropbankTreePointer(PropbankPointer): + """ + wordnum:height*wordnum:height*... + wordnum:height, + + """ + + def __init__(self, wordnum, height): + self.wordnum = wordnum + self.height = height + + @staticmethod + def parse(s): + # Deal with chains (xx*yy*zz) + pieces = s.split("*") + if len(pieces) > 1: + return PropbankChainTreePointer( + [PropbankTreePointer.parse(elt) for elt in pieces] + ) + + # Deal with split args (xx,yy,zz) + pieces = s.split(",") + if len(pieces) > 1: + return PropbankSplitTreePointer( + [PropbankTreePointer.parse(elt) for elt in pieces] + ) + + # Deal with normal pointers. 
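+        # A plain pointer is "wordnum:height": "8:0" selects the node
+        # immediately dominating word 8, and larger heights climb
+        # further up the tree.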
+ pieces = s.split(":") + if len(pieces) != 2: + raise ValueError("bad propbank pointer %r" % s) + return PropbankTreePointer(int(pieces[0]), int(pieces[1])) + + def __str__(self): + return f"{self.wordnum}:{self.height}" + + def __repr__(self): + return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height) + + def __eq__(self, other): + while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): + other = other.pieces[0] + + if not isinstance(other, PropbankTreePointer): + return self is other + + return self.wordnum == other.wordnum and self.height == other.height + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): + other = other.pieces[0] + + if not isinstance(other, PropbankTreePointer): + return id(self) < id(other) + + return (self.wordnum, -self.height) < (other.wordnum, -other.height) + + def select(self, tree): + if tree is None: + raise ValueError("Parse tree not available") + return tree[self.treepos(tree)] + + def treepos(self, tree): + """ + Convert this pointer to a standard 'tree position' pointer, + given that it points to the given tree. + """ + if tree is None: + raise ValueError("Parse tree not available") + stack = [tree] + treepos = [] + + wordnum = 0 + while True: + # tree node: + if isinstance(stack[-1], Tree): + # Select the next child. + if len(treepos) < len(stack): + treepos.append(0) + else: + treepos[-1] += 1 + # Update the stack. + if treepos[-1] < len(stack[-1]): + stack.append(stack[-1][treepos[-1]]) + else: + # End of node's child list: pop up a level. + stack.pop() + treepos.pop() + # word node: + else: + if wordnum == self.wordnum: + return tuple(treepos[: len(treepos) - self.height - 1]) + else: + wordnum += 1 + stack.pop() + + +class PropbankInflection: + # { Inflection Form + INFINITIVE = "i" + GERUND = "g" + PARTICIPLE = "p" + FINITE = "v" + # { Inflection Tense + FUTURE = "f" + PAST = "p" + PRESENT = "n" + # { Inflection Aspect + PERFECT = "p" + PROGRESSIVE = "o" + PERFECT_AND_PROGRESSIVE = "b" + # { Inflection Person + THIRD_PERSON = "3" + # { Inflection Voice + ACTIVE = "a" + PASSIVE = "p" + # { Inflection + NONE = "-" + # } + + def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"): + self.form = form + self.tense = tense + self.aspect = aspect + self.person = person + self.voice = voice + + def __str__(self): + return self.form + self.tense + self.aspect + self.person + self.voice + + def __repr__(self): + return "" % self + + _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$") + + @staticmethod + def parse(s): + if not isinstance(s, str): + raise TypeError("expected a string") + if len(s) != 5 or not PropbankInflection._VALIDATE.match(s): + raise ValueError("Bad propbank inflection string %r" % s) + return PropbankInflection(*s) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py new file mode 100644 index 0000000000000000000000000000000000000000..bcabe8936491a5f5428c7df540f08e310eafe6aa --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py @@ -0,0 +1,133 @@ +# Natural Language Toolkit: Pros and Cons Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Pierpaolo Pantone <24alsecondo@gmail.com> +# URL: +# For license information, see LICENSE.TXT + +""" +CorpusReader for the Pros and Cons dataset. 
+ +- Pros and Cons dataset information - + +Contact: Bing Liu, liub@cs.uic.edu + https://www.cs.uic.edu/~liub + +Distributed with permission. + +Related papers: + +- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". + Proceedings of the 22nd International Conference on Computational Linguistics + (Coling-2008), Manchester, 18-22 August, 2008. + +- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing + Opinions on the Web". Proceedings of the 14th international World Wide Web + conference (WWW-2005), May 10-14, 2005, in Chiba, Japan. +""" +import re + +from nltk.corpus.reader.api import * +from nltk.tokenize import * + + +class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader): + """ + Reader for the Pros and Cons sentence dataset. + + >>> from nltk.corpus import pros_cons + >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE + [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', + 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], + ...] + >>> pros_cons.words('IntegratedPros.txt') + ['Easy', 'to', 'use', ',', 'economical', '!', ...] + """ + + CorpusView = StreamBackedCorpusView + + def __init__( + self, + root, + fileids, + word_tokenizer=WordPunctTokenizer(), + encoding="utf8", + **kwargs + ): + """ + :param root: The root directory for the corpus. + :param fileids: a list or regexp specifying the fileids in the corpus. + :param word_tokenizer: a tokenizer for breaking sentences or paragraphs + into words. Default: `WhitespaceTokenizer` + :param encoding: the encoding that should be used to read the corpus. + :param kwargs: additional parameters passed to CategorizedCorpusReader. + """ + + CorpusReader.__init__(self, root, fileids, encoding) + CategorizedCorpusReader.__init__(self, kwargs) + self._word_tokenizer = word_tokenizer + + def sents(self, fileids=None, categories=None): + """ + Return all sentences in the corpus or in the specified files/categories. + + :param fileids: a list or regexp specifying the ids of the files whose + sentences have to be returned. + :param categories: a list specifying the categories whose sentences + have to be returned. + :return: the given file(s) as a list of sentences. Each sentence is + tokenized using the specified word_tokenizer. + :rtype: list(list(str)) + """ + fileids = self._resolve(fileids, categories) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(path, self._read_sent_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def words(self, fileids=None, categories=None): + """ + Return all words and punctuation symbols in the corpus or in the specified + files/categories. + + :param fileids: a list or regexp specifying the ids of the files whose + words have to be returned. + :param categories: a list specifying the categories whose words have + to be returned. + :return: the given file(s) as a list of words and punctuation symbols. + :rtype: list(str) + """ + fileids = self._resolve(fileids, categories) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(path, self._read_word_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def _read_sent_block(self, stream): + sents = [] + for i in range(20): # Read 20 lines at a time. 
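+            # Data lines look like "<Pros>great battery life</Pros>";
+            # the regexp captures the tag name and the sentence text.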
+ line = stream.readline() + if not line: + continue + sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)", line) + if sent: + sents.append(self._word_tokenizer.tokenize(sent.group(2).strip())) + return sents + + def _read_word_block(self, stream): + words = [] + for sent in self._read_sent_block(stream): + words.extend(sent) + return words diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py new file mode 100644 index 0000000000000000000000000000000000000000..626c89dedf42e93c0d5fca786877a39e50550a98 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py @@ -0,0 +1,331 @@ +# Natural Language Toolkit: Product Reviews Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Pierpaolo Pantone <24alsecondo@gmail.com> +# URL: +# For license information, see LICENSE.TXT + +""" +CorpusReader for reviews corpora (syntax based on Customer Review Corpus). + +Customer Review Corpus information +================================== + +Annotated by: Minqing Hu and Bing Liu, 2004. + Department of Computer Science + University of Illinois at Chicago + +Contact: Bing Liu, liub@cs.uic.edu + https://www.cs.uic.edu/~liub + +Distributed with permission. + +The "product_reviews_1" and "product_reviews_2" datasets respectively contain +annotated customer reviews of 5 and 9 products from amazon.com. + +Related papers: + +- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". + Proceedings of the ACM SIGKDD International Conference on Knowledge + Discovery & Data Mining (KDD-04), 2004. + +- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews". + Proceedings of Nineteeth National Conference on Artificial Intelligence + (AAAI-2004), 2004. + +- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to + Opinion Mining." Proceedings of First ACM International Conference on Web + Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University, + Stanford, California, USA. + +Symbols used in the annotated reviews: + + :[t]: the title of the review: Each [t] tag starts a review. + :xxxx[+|-n]: xxxx is a product feature. + :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. + Note that the strength is quite subjective. + You may want ignore it, but only considering + and - + :[-n]: Negative opinion + :##: start of each sentence. Each line is a sentence. + :[u]: feature not appeared in the sentence. + :[p]: feature not appeared in the sentence. Pronoun resolution is needed. + :[s]: suggestion or recommendation. + :[cc]: comparison with a competing product from a different brand. + :[cs]: comparison with a competing product from the same brand. + +Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not + provide separation between different reviews. This is due to the fact that + the dataset was specifically designed for aspect/feature-based sentiment + analysis, for which sentence-level annotation is sufficient. For document- + level classification and analysis, this peculiarity should be taken into + consideration. 
+""" + +import re + +from nltk.corpus.reader.api import * +from nltk.tokenize import * + +TITLE = re.compile(r"^\[t\](.*)$") # [t] Title +FEATURES = re.compile( + r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]" +) # find 'feature' in feature[+3] +NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]") # find 'p' in camera[+2][p] +SENT = re.compile(r"##(.*)$") # find tokenized sentence + + +class Review: + """ + A Review is the main block of a ReviewsCorpusReader. + """ + + def __init__(self, title=None, review_lines=None): + """ + :param title: the title of the review. + :param review_lines: the list of the ReviewLines that belong to the Review. + """ + self.title = title + if review_lines is None: + self.review_lines = [] + else: + self.review_lines = review_lines + + def add_line(self, review_line): + """ + Add a line (ReviewLine) to the review. + + :param review_line: a ReviewLine instance that belongs to the Review. + """ + assert isinstance(review_line, ReviewLine) + self.review_lines.append(review_line) + + def features(self): + """ + Return a list of features in the review. Each feature is a tuple made of + the specific item feature and the opinion strength about that feature. + + :return: all features of the review as a list of tuples (feat, score). + :rtype: list(tuple) + """ + features = [] + for review_line in self.review_lines: + features.extend(review_line.features) + return features + + def sents(self): + """ + Return all tokenized sentences in the review. + + :return: all sentences of the review as lists of tokens. + :rtype: list(list(str)) + """ + return [review_line.sent for review_line in self.review_lines] + + def __repr__(self): + return 'Review(title="{}", review_lines={})'.format( + self.title, self.review_lines + ) + + +class ReviewLine: + """ + A ReviewLine represents a sentence of the review, together with (optional) + annotations of its features and notes about the reviewed item. + """ + + def __init__(self, sent, features=None, notes=None): + self.sent = sent + if features is None: + self.features = [] + else: + self.features = features + + if notes is None: + self.notes = [] + else: + self.notes = notes + + def __repr__(self): + return "ReviewLine(features={}, notes={}, sent={})".format( + self.features, self.notes, self.sent + ) + + +class ReviewsCorpusReader(CorpusReader): + """ + Reader for the Customer Review Data dataset by Hu, Liu (2004). + Note: we are not applying any sentence tokenization at the moment, just word + tokenization. + + >>> from nltk.corpus import product_reviews_1 + >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') + >>> review = camera_reviews[0] + >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE + ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', + 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] + >>> review.features() # doctest: +NORMALIZE_WHITESPACE + [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), + ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), + ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), + ('option', '+1')] + + We can also reach the same information directly from the stream: + + >>> product_reviews_1.features('Canon_G3.txt') + [('canon powershot g3', '+3'), ('use', '+2'), ...] 
+ + We can compute stats for specific product features: + + >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) + >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) + >>> mean = tot / n_reviews + >>> print(n_reviews, tot, mean) + 15 24 1.6 + """ + + CorpusView = StreamBackedCorpusView + + def __init__( + self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8" + ): + """ + :param root: The root directory for the corpus. + :param fileids: a list or regexp specifying the fileids in the corpus. + :param word_tokenizer: a tokenizer for breaking sentences or paragraphs + into words. Default: `WordPunctTokenizer` + :param encoding: the encoding that should be used to read the corpus. + """ + + CorpusReader.__init__(self, root, fileids, encoding) + self._word_tokenizer = word_tokenizer + self._readme = "README.txt" + + def features(self, fileids=None): + """ + Return a list of features. Each feature is a tuple made of the specific + item feature and the opinion strength about that feature. + + :param fileids: a list or regexp specifying the ids of the files whose + features have to be returned. + :return: all features for the item(s) in the given file(s). + :rtype: list(tuple) + """ + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + self.CorpusView(fileid, self._read_features, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def reviews(self, fileids=None): + """ + Return all the reviews as a list of Review objects. If `fileids` is + specified, return all the reviews from each of the specified files. + + :param fileids: a list or regexp specifying the ids of the files whose + reviews have to be returned. + :return: the given file(s) as a list of reviews. + """ + if fileids is None: + fileids = self._fileids + return concat( + [ + self.CorpusView(fileid, self._read_review_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + """ + Return all sentences in the corpus or in the specified files. + + :param fileids: a list or regexp specifying the ids of the files whose + sentences have to be returned. + :return: the given file(s) as a list of sentences, each encoded as a + list of word strings. + :rtype: list(list(str)) + """ + return concat( + [ + self.CorpusView(path, self._read_sent_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def words(self, fileids=None): + """ + Return all words and punctuation symbols in the corpus or in the specified + files. + + :param fileids: a list or regexp specifying the ids of the files whose + words have to be returned. + :return: the given file(s) as a list of words and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + self.CorpusView(path, self._read_word_block, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def _read_features(self, stream): + features = [] + for i in range(20): + line = stream.readline() + if not line: + return features + features.extend(re.findall(FEATURES, line)) + return features + + def _read_review_block(self, stream): + while True: + line = stream.readline() + if not line: + return [] # end of file. 
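+            # Scan forward to the next "[t]" line, which opens a review.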
+ title_match = re.match(TITLE, line) + if title_match: + review = Review( + title=title_match.group(1).strip() + ) # We create a new review + break + + # Scan until we find another line matching the regexp, or EOF. + while True: + oldpos = stream.tell() + line = stream.readline() + # End of file: + if not line: + return [review] + # Start of a new review: backup to just before it starts, and + # return the review we've already collected. + if re.match(TITLE, line): + stream.seek(oldpos) + return [review] + # Anything else is part of the review line. + feats = re.findall(FEATURES, line) + notes = re.findall(NOTES, line) + sent = re.findall(SENT, line) + if sent: + sent = self._word_tokenizer.tokenize(sent[0]) + review_line = ReviewLine(sent=sent, features=feats, notes=notes) + review.add_line(review_line) + + def _read_sent_block(self, stream): + sents = [] + for review in self._read_review_block(stream): + sents.extend([sent for sent in review.sents()]) + return sents + + def _read_word_block(self, stream): + words = [] + for i in range(20): # Read 20 lines at a time. + line = stream.readline() + sent = re.findall(SENT, line) + if sent: + words.extend(self._word_tokenizer.tokenize(sent[0])) + return words diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py new file mode 100644 index 0000000000000000000000000000000000000000..827ecb9e2603f6a84fc4f213f628cafcddc44944 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py @@ -0,0 +1,146 @@ +# Natural Language Toolkit: RTE Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. + +The files were taken from the RTE1, RTE2 and RTE3 datasets and the files +were regularized. + +Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the +gold standard annotated files. + +Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following +example is taken from RTE3:: + + + + The sale was made to pay Yukos' US$ 27.5 billion tax bill, + Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known + company Baikalfinansgroup which was later bought by the Russian + state-owned oil company Rosneft . + + Baikalfinansgroup was sold to Rosneft. + + +In order to provide globally unique IDs for each pair, a new attribute +``challenge`` has been added to the root element ``entailment-corpus`` of each +file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the +challenge number and 'n' is the pair ID. +""" +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.corpus.reader.xmldocs import * + + +def norm(value_string): + """ + Normalize the string value in an RTE pair's ``value`` or ``entailment`` + attribute as an integer (1, 0). + + :param value_string: the label used to classify a text/hypothesis pair + :type value_string: str + :rtype: int + """ + + valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0} + return valdict[value_string.upper()] + + +class RTEPair: + """ + Container for RTE text-hypothesis pairs. + + The entailment relation is signalled by the ``value`` attribute in RTE1, and by + ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment`` + attribute of this class. 
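+    In both cases the label is normalized with ``norm()`` and stored in
+    the pair's ``value`` attribute.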
+ """ + + def __init__( + self, + pair, + challenge=None, + id=None, + text=None, + hyp=None, + value=None, + task=None, + length=None, + ): + """ + :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3) + :param id: identifier for the pair + :param text: the text component of the pair + :param hyp: the hypothesis component of the pair + :param value: classification label for the pair + :param task: attribute for the particular NLP task that the data was drawn from + :param length: attribute for the length of the text of the pair + """ + self.challenge = challenge + self.id = pair.attrib["id"] + self.gid = f"{self.challenge}-{self.id}" + self.text = pair[0].text + self.hyp = pair[1].text + + if "value" in pair.attrib: + self.value = norm(pair.attrib["value"]) + elif "entailment" in pair.attrib: + self.value = norm(pair.attrib["entailment"]) + else: + self.value = value + if "task" in pair.attrib: + self.task = pair.attrib["task"] + else: + self.task = task + if "length" in pair.attrib: + self.length = pair.attrib["length"] + else: + self.length = length + + def __repr__(self): + if self.challenge: + return f"" + else: + return "" % self.id + + +class RTECorpusReader(XMLCorpusReader): + """ + Corpus reader for corpora in RTE challenges. + + This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected + structure of input documents. + """ + + def _read_etree(self, doc): + """ + Map the XML input into an RTEPair. + + This uses the ``getiterator()`` method from the ElementTree package to + find all the ```` elements. + + :param doc: a parsed XML document + :rtype: list(RTEPair) + """ + try: + challenge = doc.attrib["challenge"] + except KeyError: + challenge = None + pairiter = doc.iter("pair") + return [RTEPair(pair, challenge=challenge) for pair in pairiter] + + def pairs(self, fileids): + """ + Build a list of RTEPairs from a RTE corpus. + + :param fileids: a list of RTE corpus fileids + :type: list + :rtype: list(RTEPair) + """ + if isinstance(fileids, str): + fileids = [fileids] + return concat([self._read_etree(self.xml(fileid)) for fileid in fileids]) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py new file mode 100644 index 0000000000000000000000000000000000000000..2a00406281a42caa13bc9b7a35b28897d7df6eda --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py @@ -0,0 +1,296 @@ +# Natural Language Toolkit: SemCor Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Nathan Schneider +# URL: +# For license information, see LICENSE.TXT + +""" +Corpus reader for the SemCor Corpus. +""" + +__docformat__ = "epytext en" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView +from nltk.tree import Tree + + +class SemcorCorpusReader(XMLCorpusReader): + """ + Corpus reader for the SemCor Corpus. + For access to the complete XML data structure, use the ``xml()`` + method. For access to simple word lists and tagged word lists, use + ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. + """ + + def __init__(self, root, fileids, wordnet, lazy=True): + XMLCorpusReader.__init__(self, root, fileids) + self._lazy = lazy + self._wordnet = wordnet + + def words(self, fileids=None): + """ + :return: the given file(s) as a list of words and punctuation symbols. 
+ :rtype: list(str) + """ + return self._items(fileids, "word", False, False, False) + + def chunks(self, fileids=None): + """ + :return: the given file(s) as a list of chunks, + each of which is a list of words and punctuation symbols + that form a unit. + :rtype: list(list(str)) + """ + return self._items(fileids, "chunk", False, False, False) + + def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")): + """ + :return: the given file(s) as a list of tagged chunks, represented + in tree form. + :rtype: list(Tree) + + :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` + to indicate the kind of tags to include. Semantic tags consist of + WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity + without a specific entry in WordNet. (Named entities of type 'other' + have no lemma. Other chunks not in WordNet have no semantic tag. + Punctuation tokens have `None` for their part of speech tag.) + """ + return self._items(fileids, "chunk", False, tag != "sem", tag != "pos") + + def sents(self, fileids=None): + """ + :return: the given file(s) as a list of sentences, each encoded + as a list of word strings. + :rtype: list(list(str)) + """ + return self._items(fileids, "word", True, False, False) + + def chunk_sents(self, fileids=None): + """ + :return: the given file(s) as a list of sentences, each encoded + as a list of chunks. + :rtype: list(list(list(str))) + """ + return self._items(fileids, "chunk", True, False, False) + + def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")): + """ + :return: the given file(s) as a list of sentences. Each sentence + is represented as a list of tagged chunks (in tree form). + :rtype: list(list(Tree)) + + :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` + to indicate the kind of tags to include. Semantic tags consist of + WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity + without a specific entry in WordNet. (Named entities of type 'other' + have no lemma. Other chunks not in WordNet have no semantic tag. + Punctuation tokens have `None` for their part of speech tag.) + """ + return self._items(fileids, "chunk", True, tag != "sem", tag != "pos") + + def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag): + if unit == "word" and not bracket_sent: + # the result of the SemcorWordView may be a multiword unit, so the + # LazyConcatenation will make sure the sentence is flattened + _ = lambda *args: LazyConcatenation( + (SemcorWordView if self._lazy else self._words)(*args) + ) + else: + _ = SemcorWordView if self._lazy else self._words + return concat( + [ + _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet) + for fileid in self.abspaths(fileids) + ] + ) + + def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag): + """ + Helper used to implement the view methods -- returns a list of + tokens, (segmented) words, chunks, or sentences. The tokens + and chunks may optionally be tagged (with POS and sense + information). + + :param fileid: The name of the underlying file. + :param unit: One of `'token'`, `'word'`, or `'chunk'`. + :param bracket_sent: If true, include sentence bracketing. + :param pos_tag: Whether to include part-of-speech tags. + :param sem_tag: Whether to include semantic tags, namely WordNet lemma + and OOV named entity status. 
+ """ + assert unit in ("token", "word", "chunk") + result = [] + + xmldoc = ElementTree.parse(fileid).getroot() + for xmlsent in xmldoc.findall(".//s"): + sent = [] + for xmlword in _all_xmlwords_in(xmlsent): + itm = SemcorCorpusReader._word( + xmlword, unit, pos_tag, sem_tag, self._wordnet + ) + if unit == "word": + sent.extend(itm) + else: + sent.append(itm) + + if bracket_sent: + result.append(SemcorSentence(xmlsent.attrib["snum"], sent)) + else: + result.extend(sent) + + assert None not in result + return result + + @staticmethod + def _word(xmlword, unit, pos_tag, sem_tag, wordnet): + tkn = xmlword.text + if not tkn: + tkn = "" # fixes issue 337? + + lemma = xmlword.get("lemma", tkn) # lemma or NE class + lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense) + if lexsn is not None: + sense_key = lemma + "%" + lexsn + wnpos = ("n", "v", "a", "r", "s")[ + int(lexsn.split(":")[0]) - 1 + ] # see http://wordnet.princeton.edu/man/senseidx.5WN.html + else: + sense_key = wnpos = None + redef = xmlword.get( + "rdf", tkn + ) # redefinition--this indicates the lookup string + # does not exactly match the enclosed string, e.g. due to typographical adjustments + # or discontinuity of a multiword expression. If a redefinition has occurred, + # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. + # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class). + sensenum = xmlword.get("wnsn") # WordNet sense number + isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet + pos = xmlword.get( + "pos" + ) # part of speech for the whole chunk (None for punctuation) + + if unit == "token": + if not pos_tag and not sem_tag: + itm = tkn + else: + itm = ( + (tkn,) + + ((pos,) if pos_tag else ()) + + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ()) + ) + return itm + else: + ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE + if unit == "word": + return ww + else: + if sensenum is not None: + try: + sense = wordnet.lemma_from_key(sense_key) # Lemma object + except Exception: + # cannot retrieve the wordnet.Lemma object. possible reasons: + # (a) the wordnet corpus is not downloaded; + # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers: + # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00' + # solution: just use the lemma name as a string + try: + sense = "%s.%s.%02d" % ( + lemma, + wnpos, + int(sensenum), + ) # e.g.: reach.v.02 + except ValueError: + sense = ( + lemma + "." + wnpos + "." + sensenum + ) # e.g. the sense number may be "2;1" + + bottom = [Tree(pos, ww)] if pos_tag else ww + + if sem_tag and isOOVEntity: + if sensenum is not None: + return Tree(sense, [Tree("NE", bottom)]) + else: # 'other' NE + return Tree("NE", bottom) + elif sem_tag and sensenum is not None: + return Tree(sense, bottom) + elif pos_tag: + return bottom[0] + else: + return bottom # chunk as a list + + +def _all_xmlwords_in(elt, result=None): + if result is None: + result = [] + for child in elt: + if child.tag in ("wf", "punc"): + result.append(child) + else: + _all_xmlwords_in(child, result) + return result + + +class SemcorSentence(list): + """ + A list of words, augmented by an attribute ``num`` used to record + the sentence identifier (the ``n`` attribute from the XML). 
+ """ + + def __init__(self, num, items): + self.num = num + list.__init__(self, items) + + +class SemcorWordView(XMLCorpusView): + """ + A stream backed corpus view specialized for use with the BNC corpus. + """ + + def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet): + """ + :param fileid: The name of the underlying file. + :param unit: One of `'token'`, `'word'`, or `'chunk'`. + :param bracket_sent: If true, include sentence bracketing. + :param pos_tag: Whether to include part-of-speech tags. + :param sem_tag: Whether to include semantic tags, namely WordNet lemma + and OOV named entity status. + """ + if bracket_sent: + tagspec = ".*/s" + else: + tagspec = ".*/s/(punc|wf)" + + self._unit = unit + self._sent = bracket_sent + self._pos_tag = pos_tag + self._sem_tag = sem_tag + self._wordnet = wordnet + + XMLCorpusView.__init__(self, fileid, tagspec) + + def handle_elt(self, elt, context): + if self._sent: + return self.handle_sent(elt) + else: + return self.handle_word(elt) + + def handle_word(self, elt): + return SemcorCorpusReader._word( + elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet + ) + + def handle_sent(self, elt): + sent = [] + for child in elt: + if child.tag in ("wf", "punc"): + itm = self.handle_word(child) + if self._unit == "word": + sent.extend(itm) + else: + sent.append(itm) + else: + raise ValueError("Unexpected element %s" % child.tag) + return SemcorSentence(elt.attrib["snum"], sent) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py new file mode 100644 index 0000000000000000000000000000000000000000..0b0e39bc74486ba59b305beb4255fa3f6f7aceb2 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py @@ -0,0 +1,196 @@ +# Natural Language Toolkit: Senseval 2 Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Trevor Cohn +# Steven Bird (modifications) +# URL: +# For license information, see LICENSE.TXT + +""" +Read from the Senseval 2 Corpus. + +SENSEVAL [http://www.senseval.org/] +Evaluation exercises for Word Sense Disambiguation. +Organized by ACL-SIGLEX [https://www.siglex.org/] + +Prepared by Ted Pedersen , University of Minnesota, +https://www.d.umn.edu/~tpederse/data.html +Distributed with permission. + +The NLTK version of the Senseval 2 files uses well-formed XML. +Each instance of the ambiguous words "hard", "interest", "line", and "serve" +is tagged with a sense identifier, and supplied with context. 
+""" + +import re +from xml.etree import ElementTree + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tokenize import * + + +class SensevalInstance: + def __init__(self, word, position, context, senses): + self.word = word + self.senses = tuple(senses) + self.position = position + self.context = context + + def __repr__(self): + return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % ( + self.word, + self.position, + self.context, + self.senses, + ) + + +class SensevalCorpusReader(CorpusReader): + def instances(self, fileids=None): + return concat( + [ + SensevalCorpusView(fileid, enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def _entry(self, tree): + elts = [] + for lexelt in tree.findall("lexelt"): + for inst in lexelt.findall("instance"): + sense = inst[0].attrib["senseid"] + context = [(w.text, w.attrib["pos"]) for w in inst[1]] + elts.append((sense, context)) + return elts + + +class SensevalCorpusView(StreamBackedCorpusView): + def __init__(self, fileid, encoding): + StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) + + self._word_tokenizer = WhitespaceTokenizer() + self._lexelt_starts = [0] # list of streampos + self._lexelts = [None] # list of lexelt names + + def read_block(self, stream): + # Decide which lexical element we're in. + lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1 + lexelt = self._lexelts[lexelt_num] + + instance_lines = [] + in_instance = False + while True: + line = stream.readline() + if line == "": + assert instance_lines == [] + return [] + + # Start of a lexical element? + if line.lstrip().startswith(" has no 'item=...' + lexelt = m.group(1)[1:-1] + if lexelt_num < len(self._lexelts): + assert lexelt == self._lexelts[lexelt_num] + else: + self._lexelts.append(lexelt) + self._lexelt_starts.append(stream.tell()) + + # Start of an instance? + if line.lstrip().startswith("" + elif cword.tag == "wf": + context.append((cword.text, cword.attrib["pos"])) + elif cword.tag == "s": + pass # Sentence boundary marker. + + else: + print("ACK", cword.tag) + assert False, "expected CDATA or or " + if cword.tail: + context += self._word_tokenizer.tokenize(cword.tail) + else: + assert False, "unexpected tag %s" % child.tag + return SensevalInstance(lexelt, position, context, senses) + + +def _fixXML(text): + """ + Fix the various issues with Senseval pseudo-XML. + """ + # <~> or <^> => ~ or ^ + text = re.sub(r"<([~\^])>", r"\1", text) + # fix lone & + text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text) + # fix """ + text = re.sub(r'"""', "'\"'", text) + # fix => + text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) + # fix foreign word tag + text = re.sub(r"<\&frasl>\s*]*>", "FRASL", text) + # remove <&I .> + text = re.sub(r"<\&I[^>]*>", "", text) + # fix <{word}> + text = re.sub(r"<{([^}]+)}>", r"\1", text) + # remove <@>,
<p>, </p>
+ text = re.sub(r"<(@|/?p)>", r"", text) + # remove <&M .> and <&T .> and <&Ms .> + text = re.sub(r"<&\w+ \.>", r"", text) + # remove lines + text = re.sub(r"]*>", r"", text) + # remove <[hi]> and <[/p]> etc + text = re.sub(r"<\[\/?[^>]+\]*>", r"", text) + # take the thing out of the brackets: <…> + text = re.sub(r"<(\&\w+;)>", r"\1", text) + # and remove the & for those patterns that aren't regular XML + text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text) + # fix 'abc ' style tags - now abc + text = re.sub( + r'[ \t]*([^<>\s]+?)[ \t]*', r' \1', text + ) + text = re.sub(r'\s*"\s*', " \"", text) + return text diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py new file mode 100644 index 0000000000000000000000000000000000000000..4734e0f04c01ff69317ba766ec8097a323eba4b8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py @@ -0,0 +1,136 @@ +# Natural Language Toolkit: SentiWordNet +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Christopher Potts +# URL: +# For license information, see LICENSE.TXT + +""" +An NLTK interface for SentiWordNet + +SentiWordNet is a lexical resource for opinion mining. +SentiWordNet assigns to each synset of WordNet three +sentiment scores: positivity, negativity, and objectivity. + +For details about SentiWordNet see: +http://sentiwordnet.isti.cnr.it/ + + >>> from nltk.corpus import sentiwordnet as swn + >>> print(swn.senti_synset('breakdown.n.03')) + + >>> list(swn.senti_synsets('slow')) + [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\ + SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\ + SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\ + SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\ + SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\ + SentiSynset('behind.r.03')] + >>> happy = swn.senti_synsets('happy', 'a') + >>> happy0 = list(happy)[0] + >>> happy0.pos_score() + 0.875 + >>> happy0.neg_score() + 0.0 + >>> happy0.obj_score() + 0.125 +""" + +import re + +from nltk.corpus.reader import CorpusReader + + +class SentiWordNetCorpusReader(CorpusReader): + def __init__(self, root, fileids, encoding="utf-8"): + """ + Construct a new SentiWordNet Corpus Reader, using data from + the specified file. 
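+
+        A construction sketch (the path and filename here are
+        hypothetical; the reader is normally accessed as the
+        pre-configured ``nltk.corpus.sentiwordnet``)::
+
+            >>> swn = SentiWordNetCorpusReader('/path/to/data', ['SentiWordNet.txt'])  # doctest: +SKIP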
+ """ + super().__init__(root, fileids, encoding=encoding) + if len(self._fileids) != 1: + raise ValueError("Exactly one file must be specified") + self._db = {} + self._parse_src_file() + + def _parse_src_file(self): + lines = self.open(self._fileids[0]).read().splitlines() + lines = filter((lambda x: not re.search(r"^\s*#", x)), lines) + for i, line in enumerate(lines): + fields = [field.strip() for field in re.split(r"\t+", line)] + try: + pos, offset, pos_score, neg_score, synset_terms, gloss = fields + except BaseException as e: + raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e + if pos and offset: + offset = int(offset) + self._db[(pos, offset)] = (float(pos_score), float(neg_score)) + + def senti_synset(self, *vals): + from nltk.corpus import wordnet as wn + + if tuple(vals) in self._db: + pos_score, neg_score = self._db[tuple(vals)] + pos, offset = vals + if pos == "s": + pos = "a" + synset = wn.synset_from_pos_and_offset(pos, offset) + return SentiSynset(pos_score, neg_score, synset) + else: + synset = wn.synset(vals[0]) + pos = synset.pos() + if pos == "s": + pos = "a" + offset = synset.offset() + if (pos, offset) in self._db: + pos_score, neg_score = self._db[(pos, offset)] + return SentiSynset(pos_score, neg_score, synset) + else: + return None + + def senti_synsets(self, string, pos=None): + from nltk.corpus import wordnet as wn + + sentis = [] + synset_list = wn.synsets(string, pos) + for synset in synset_list: + sentis.append(self.senti_synset(synset.name())) + sentis = filter(lambda x: x, sentis) + return sentis + + def all_senti_synsets(self): + from nltk.corpus import wordnet as wn + + for key, fields in self._db.items(): + pos, offset = key + pos_score, neg_score = fields + synset = wn.synset_from_pos_and_offset(pos, offset) + yield SentiSynset(pos_score, neg_score, synset) + + +class SentiSynset: + def __init__(self, pos_score, neg_score, synset): + self._pos_score = pos_score + self._neg_score = neg_score + self._obj_score = 1.0 - (self._pos_score + self._neg_score) + self.synset = synset + + def pos_score(self): + return self._pos_score + + def neg_score(self): + return self._neg_score + + def obj_score(self): + return self._obj_score + + def __str__(self): + """Prints just the Pos/Neg scores for now.""" + s = "<" + s += self.synset.name() + ": " + s += "PosScore=%s " % self._pos_score + s += "NegScore=%s" % self._neg_score + s += ">" + return s + + def __repr__(self): + return "Senti" + repr(self.synset) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py new file mode 100644 index 0000000000000000000000000000000000000000..78193b78199b0c1a5584f22ecca3dc6404d0cb5d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py @@ -0,0 +1,75 @@ +# Natural Language Toolkit: Sinica Treebank Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Sinica Treebank Corpus Sample + +http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm + +10,000 parsed sentences, drawn from the Academia Sinica Balanced +Corpus of Modern Chinese. Parse tree notation is based on +Information-based Case Grammar. 
Tagset documentation is available +at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html + +Language and Knowledge Processing Group, Institute of Information +Science, Academia Sinica + +The data is distributed with the Natural Language Toolkit under the terms of +the Creative Commons Attribution-NonCommercial-ShareAlike License +[https://creativecommons.org/licenses/by-nc-sa/2.5/]. + +References: + +Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) +The Construction of Sinica Treebank. Computational Linguistics and +Chinese Language Processing, 4, pp 87-104. + +Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming +Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria, +Annotation Guidelines, and On-line Interface. Proceedings of 2nd +Chinese Language Processing Workshop, Association for Computational +Linguistics. + +Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar +Extraction, Proceedings of IJCNLP-04, pp560-565. +""" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tag import map_tag +from nltk.tree import sinica_parse + +IDENTIFIER = re.compile(r"^#\S+\s") +APPENDIX = re.compile(r"(?<=\))#.*$") +TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)") +WORD = re.compile(r":[^:()|]+:([^:()|]+)") + + +class SinicaTreebankCorpusReader(SyntaxCorpusReader): + """ + Reader for the sinica treebank. + """ + + def _read_block(self, stream): + sent = stream.readline() + sent = IDENTIFIER.sub("", sent) + sent = APPENDIX.sub("", sent) + return [sent] + + def _parse(self, sent): + return sinica_parse(sent) + + def _tag(self, sent, tagset=None): + tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)] + if tagset and tagset != self._tagset: + tagged_sent = [ + (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent + ] + return tagged_sent + + def _word(self, sent): + return WORD.findall(sent) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py new file mode 100644 index 0000000000000000000000000000000000000000..00054234b58ee56b74f8b5fcd7ccbf2d08b2b635 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py @@ -0,0 +1,56 @@ +# Natural Language Toolkit: String Category Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Read tuples from a corpus consisting of categorized strings. +For example, from the question classification corpus: + +NUM:dist How far is it from Denver to Aspen ? +LOC:city What county is Modesto , California in ? +HUM:desc Who was Galileo ? +DESC:def What is an atom ? +NUM:date When did Hawaii become a state ? +""" + +from nltk.corpus.reader.api import * + +# based on PPAttachmentCorpusReader +from nltk.corpus.reader.util import * + + +# [xx] Should the order of the tuple be reversed -- in most other places +# in nltk, we use the form (data, tag) -- e.g., tagged words and +# labeled texts for classifiers. +class StringCategoryCorpusReader(CorpusReader): + def __init__(self, root, fileids, delimiter=" ", encoding="utf8"): + """ + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. 
+ :param delimiter: Field delimiter + """ + CorpusReader.__init__(self, root, fileids, encoding) + self._delimiter = delimiter + + def tuples(self, fileids=None): + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, str): + fileids = [fileids] + return concat( + [ + StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def _read_tuple_block(self, stream): + line = stream.readline().strip() + if line: + return [tuple(line.split(self._delimiter, 1))] + else: + return [] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad5dd03848722de22326fe5e99ebd8272c3719f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py @@ -0,0 +1,125 @@ +# Natural Language Toolkit: Switchboard Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT +import re + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tag import map_tag, str2tuple + + +class SwitchboardTurn(list): + """ + A specialized list object used to encode switchboard utterances. + The elements of the list are the words in the utterance; and two + attributes, ``speaker`` and ``id``, are provided to retrieve the + spearker identifier and utterance id. Note that utterance ids + are only unique within a given discourse. + """ + + def __init__(self, words, speaker, id): + list.__init__(self, words) + self.speaker = speaker + self.id = int(id) + + def __repr__(self): + if len(self) == 0: + text = "" + elif isinstance(self[0], tuple): + text = " ".join("%s/%s" % w for w in self) + else: + text = " ".join(self) + return f"<{self.speaker}.{self.id}: {text!r}>" + + +class SwitchboardCorpusReader(CorpusReader): + _FILES = ["tagged"] + # Use the "tagged" file even for non-tagged data methods, since + # it's tokenized. + + def __init__(self, root, tagset=None): + CorpusReader.__init__(self, root, self._FILES) + self._tagset = tagset + + def words(self): + return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader) + + def tagged_words(self, tagset=None): + def tagged_words_block_reader(stream): + return self._tagged_words_block_reader(stream, tagset) + + return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader) + + def turns(self): + return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader) + + def tagged_turns(self, tagset=None): + def tagged_turns_block_reader(stream): + return self._tagged_turns_block_reader(stream, tagset) + + return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader) + + def discourses(self): + return StreamBackedCorpusView( + self.abspath("tagged"), self._discourses_block_reader + ) + + def tagged_discourses(self, tagset=False): + def tagged_discourses_block_reader(stream): + return self._tagged_discourses_block_reader(stream, tagset) + + return StreamBackedCorpusView( + self.abspath("tagged"), tagged_discourses_block_reader + ) + + def _discourses_block_reader(self, stream): + # returns at most 1 discourse. (The other methods depend on this.) 
+ return [ + [ + self._parse_utterance(u, include_tag=False) + for b in read_blankline_block(stream) + for u in b.split("\n") + if u.strip() + ] + ] + + def _tagged_discourses_block_reader(self, stream, tagset=None): + # returns at most 1 discourse. (The other methods depend on this.) + return [ + [ + self._parse_utterance(u, include_tag=True, tagset=tagset) + for b in read_blankline_block(stream) + for u in b.split("\n") + if u.strip() + ] + ] + + def _turns_block_reader(self, stream): + return self._discourses_block_reader(stream)[0] + + def _tagged_turns_block_reader(self, stream, tagset=None): + return self._tagged_discourses_block_reader(stream, tagset)[0] + + def _words_block_reader(self, stream): + return sum(self._discourses_block_reader(stream)[0], []) + + def _tagged_words_block_reader(self, stream, tagset=None): + return sum(self._tagged_discourses_block_reader(stream, tagset)[0], []) + + _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)") + _SEP = "/" + + def _parse_utterance(self, utterance, include_tag, tagset=None): + m = self._UTTERANCE_RE.match(utterance) + if m is None: + raise ValueError("Bad utterance %r" % utterance) + speaker, id, text = m.groups() + words = [str2tuple(s, self._SEP) for s in text.split()] + if not include_tag: + words = [w for (w, t) in words] + elif tagset and tagset != self._tagset: + words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words] + return SwitchboardTurn(words, speaker, id) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py new file mode 100644 index 0000000000000000000000000000000000000000..896e1dc131944a0d3701aefaebf9eb863f995677 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py @@ -0,0 +1,354 @@ +# Natural Language Toolkit: Tagged Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Jacob Perkins +# URL: +# For license information, see LICENSE.TXT + +""" +A reader for corpora whose documents contain part-of-speech-tagged words. +""" + +import os + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.timit import read_timit_block +from nltk.corpus.reader.util import * +from nltk.tag import map_tag, str2tuple +from nltk.tokenize import * + + +class TaggedCorpusReader(CorpusReader): + """ + Reader for simple part-of-speech tagged corpora. Paragraphs are + assumed to be split using blank lines. Sentences and words can be + tokenized using the default tokenizers, or by custom tokenizers + specified as parameters to the constructor. Words are parsed + using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the + separator. I.e., words should have the form:: + + word1/tag1 word2/tag2 word3/tag3 ... + + But custom separators may be specified as parameters to the + constructor. Part of speech tags are case-normalized to upper + case. + """ + + def __init__( + self, + root, + fileids, + sep="/", + word_tokenizer=WhitespaceTokenizer(), + sent_tokenizer=RegexpTokenizer("\n", gaps=True), + para_block_reader=read_blankline_block, + encoding="utf8", + tagset=None, + ): + """ + Construct a new Tagged Corpus reader for a set of documents + located at the given root directory. Example usage: + + >>> root = '/...path to corpus.../' + >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP + + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. 
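+        :param sep: The separator between words and their tags (default ``'/'``).
+        :param word_tokenizer: Tokenizer for breaking sentences into words.
+        :param sent_tokenizer: Tokenizer for breaking paragraphs into sentences.
+        :param para_block_reader: Block reader used to divide the corpus into paragraph blocks.
+        :param encoding: The encoding used to read the corpus.
+        :param tagset: The name of the tagset used by this corpus, for conversion via ``map_tag``.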
+ """ + CorpusReader.__init__(self, root, fileids, encoding) + self._sep = sep + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._para_block_reader = para_block_reader + self._tagset = tagset + + def words(self, fileids=None): + """ + :return: the given file(s) as a list of words + and punctuation symbols. + :rtype: list(str) + """ + return concat( + [ + TaggedCorpusView( + fileid, + enc, + False, + False, + False, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + None, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def sents(self, fileids=None): + """ + :return: the given file(s) as a list of + sentences or utterances, each encoded as a list of word + strings. + :rtype: list(list(str)) + """ + return concat( + [ + TaggedCorpusView( + fileid, + enc, + False, + True, + False, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + None, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def paras(self, fileids=None): + """ + :return: the given file(s) as a list of + paragraphs, each encoded as a list of sentences, which are + in turn encoded as lists of word strings. + :rtype: list(list(list(str))) + """ + return concat( + [ + TaggedCorpusView( + fileid, + enc, + False, + True, + True, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + None, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_words(self, fileids=None, tagset=None): + """ + :return: the given file(s) as a list of tagged + words and punctuation symbols, encoded as tuples + ``(word,tag)``. + :rtype: list(tuple(str,str)) + """ + if tagset and tagset != self._tagset: + tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) + else: + tag_mapping_function = None + return concat( + [ + TaggedCorpusView( + fileid, + enc, + True, + False, + False, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + tag_mapping_function, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_sents(self, fileids=None, tagset=None): + """ + :return: the given file(s) as a list of + sentences, each encoded as a list of ``(word,tag)`` tuples. + + :rtype: list(list(tuple(str,str))) + """ + if tagset and tagset != self._tagset: + tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) + else: + tag_mapping_function = None + return concat( + [ + TaggedCorpusView( + fileid, + enc, + True, + True, + False, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + tag_mapping_function, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + def tagged_paras(self, fileids=None, tagset=None): + """ + :return: the given file(s) as a list of + paragraphs, each encoded as a list of sentences, which are + in turn encoded as lists of ``(word,tag)`` tuples. 
+ :rtype: list(list(list(tuple(str,str)))) + """ + if tagset and tagset != self._tagset: + tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) + else: + tag_mapping_function = None + return concat( + [ + TaggedCorpusView( + fileid, + enc, + True, + True, + True, + self._sep, + self._word_tokenizer, + self._sent_tokenizer, + self._para_block_reader, + tag_mapping_function, + ) + for (fileid, enc) in self.abspaths(fileids, True) + ] + ) + + +class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader): + """ + A reader for part-of-speech tagged corpora whose documents are + divided into categories based on their file identifiers. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the corpus reader. Categorization arguments + (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to + the ``CategorizedCorpusReader`` constructor. The remaining arguments + are passed to the ``TaggedCorpusReader``. + """ + CategorizedCorpusReader.__init__(self, kwargs) + TaggedCorpusReader.__init__(self, *args, **kwargs) + + def tagged_words(self, fileids=None, categories=None, tagset=None): + return super().tagged_words(self._resolve(fileids, categories), tagset) + + def tagged_sents(self, fileids=None, categories=None, tagset=None): + return super().tagged_sents(self._resolve(fileids, categories), tagset) + + def tagged_paras(self, fileids=None, categories=None, tagset=None): + return super().tagged_paras(self._resolve(fileids, categories), tagset) + + +class TaggedCorpusView(StreamBackedCorpusView): + """ + A specialized corpus view for tagged documents. It can be + customized via flags to divide the tagged corpus documents up by + sentence or paragraph, and to include or omit part of speech tags. + ``TaggedCorpusView`` objects are typically created by + ``TaggedCorpusReader`` (not directly by nltk users). + """ + + def __init__( + self, + corpus_file, + encoding, + tagged, + group_by_sent, + group_by_para, + sep, + word_tokenizer, + sent_tokenizer, + para_block_reader, + tag_mapping_function=None, + ): + self._tagged = tagged + self._group_by_sent = group_by_sent + self._group_by_para = group_by_para + self._sep = sep + self._word_tokenizer = word_tokenizer + self._sent_tokenizer = sent_tokenizer + self._para_block_reader = para_block_reader + self._tag_mapping_function = tag_mapping_function + StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) + + def read_block(self, stream): + """Reads one paragraph at a time.""" + block = [] + for para_str in self._para_block_reader(stream): + para = [] + for sent_str in self._sent_tokenizer.tokenize(para_str): + sent = [ + str2tuple(s, self._sep) + for s in self._word_tokenizer.tokenize(sent_str) + ] + if self._tag_mapping_function: + sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] + if not self._tagged: + sent = [w for (w, t) in sent] + if self._group_by_sent: + para.append(sent) + else: + para.extend(sent) + if self._group_by_para: + block.append(para) + else: + block.extend(para) + return block + + +# needs to implement simplified tags +class MacMorphoCorpusReader(TaggedCorpusReader): + """ + A corpus reader for the MAC_MORPHO corpus. Each line contains a + single tagged word, using '_' as a separator. Sentence boundaries + are based on the end-sentence tag ('_.'). Paragraph information + is not included in the corpus, so each paragraph returned by + ``self.paras()`` and ``self.tagged_paras()`` contains a single + sentence. 
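+
+    A usage sketch, assuming the corpus data is installed (e.g. via
+    ``nltk.download('mac_morpho')``)::
+
+        >>> from nltk.corpus import mac_morpho
+        >>> mac_morpho.tagged_words()  # doctest: +SKIP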
+ """ + + def __init__(self, root, fileids, encoding="utf8", tagset=None): + TaggedCorpusReader.__init__( + self, + root, + fileids, + sep="_", + word_tokenizer=LineTokenizer(), + sent_tokenizer=RegexpTokenizer(".*\n"), + para_block_reader=self._read_block, + encoding=encoding, + tagset=tagset, + ) + + def _read_block(self, stream): + return read_regexp_block(stream, r".*", r".*_\.") + + +class TimitTaggedCorpusReader(TaggedCorpusReader): + """ + A corpus reader for tagged sentences that are included in the TIMIT corpus. + """ + + def __init__(self, *args, **kwargs): + TaggedCorpusReader.__init__( + self, para_block_reader=read_timit_block, *args, **kwargs + ) + + def paras(self): + raise NotImplementedError("use sents() instead") + + def tagged_paras(self): + raise NotImplementedError("use tagged_sents() instead") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py new file mode 100644 index 0000000000000000000000000000000000000000..e399ac2ff31fd39c5dfc9ac9e9de0bc29d1f1842 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py @@ -0,0 +1,510 @@ +# Natural Language Toolkit: TIMIT Corpus Reader +# +# Copyright (C) 2001-2007 NLTK Project +# Author: Haejoong Lee +# Steven Bird +# Jacob Perkins +# URL: +# For license information, see LICENSE.TXT + +# [xx] this docstring is out-of-date: +""" +Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. + +This corpus contains selected portion of the TIMIT corpus. + + - 16 speakers from 8 dialect regions + - 1 male and 1 female from each dialect region + - total 130 sentences (10 sentences per speaker. Note that some + sentences are shared among other speakers, especially sa1 and sa2 + are spoken by all speakers.) + - total 160 recording of sentences (10 recordings per speaker) + - audio format: NIST Sphere, single channel, 16kHz sampling, + 16 bit sample, PCM encoding + + +Module contents +=============== + +The timit corpus reader provides 4 functions and 4 data items. + + - utterances + + List of utterances in the corpus. There are total 160 utterances, + each of which corresponds to a unique utterance of a speaker. + Here's an example of an utterance identifier in the list:: + + dr1-fvmh0/sx206 + - _---- _--- + | | | | | + | | | | | + | | | | `--- sentence number + | | | `----- sentence type (a:all, i:shared, x:exclusive) + | | `--------- speaker ID + | `------------ sex (m:male, f:female) + `-------------- dialect region (1..8) + + - speakers + + List of speaker IDs. An example of speaker ID:: + + dr1-fvmh0 + + Note that if you split an item ID with colon and take the first element of + the result, you will get a speaker ID. + + >>> itemid = 'dr1-fvmh0/sx206' + >>> spkrid , sentid = itemid.split('/') + >>> spkrid + 'dr1-fvmh0' + + The second element of the result is a sentence ID. + + - dictionary() + + Phonetic dictionary of words contained in this corpus. This is a Python + dictionary from words to phoneme lists. + + - spkrinfo() + + Speaker information table. It's a Python dictionary from speaker IDs to + records of 10 fields. Speaker IDs the same as the ones in timie.speakers. 
+ Each record is a dictionary from field names to values, and the fields are + as follows:: + + id speaker ID as defined in the original TIMIT speaker info table + sex speaker gender (M:male, F:female) + dr speaker dialect region (1:new england, 2:northern, + 3:north midland, 4:south midland, 5:southern, 6:new york city, + 7:western, 8:army brat (moved around)) + use corpus type (TRN:training, TST:test) + in this sample corpus only TRN is available + recdate recording date + birthdate speaker birth date + ht speaker height + race speaker race (WHT:white, BLK:black, AMR:american indian, + SPN:spanish-american, ORN:oriental,???:unknown) + edu speaker education level (HS:high school, AS:associate degree, + BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), + PHD:doctorate degree (PhD,JD,MD), ??:unknown) + comments comments by the recorder + +The 4 functions are as follows. + + - tokenized(sentences=items, offset=False) + + Given a list of items, returns an iterator of a list of word lists, + each of which corresponds to an item (sentence). If offset is set to True, + each element of the word list is a tuple of word(string), start offset and + end offset, where offset is represented as a number of 16kHz samples. + + - phonetic(sentences=items, offset=False) + + Given a list of items, returns an iterator of a list of phoneme lists, + each of which corresponds to an item (sentence). If offset is set to True, + each element of the phoneme list is a tuple of word(string), start offset + and end offset, where offset is represented as a number of 16kHz samples. + + - audiodata(item, start=0, end=None) + + Given an item, returns a chunk of audio samples formatted into a string. + When the function is called, if start and end are omitted, the entire + samples of the recording will be returned. If only end is omitted, + samples from the start offset to the end of the recording will be returned. + + - play(data) + + Play the given audio samples. The audio samples can be obtained from the + timit.audiodata function. + +""" +import sys +import time + +from nltk.corpus.reader.api import * +from nltk.internals import import_from_stdlib +from nltk.tree import Tree + + +class TimitCorpusReader(CorpusReader): + """ + Reader for the TIMIT corpus (or any other corpus with the same + file layout and use of file formats). The corpus root directory + should contain the following files: + + - timitdic.txt: dictionary of standard transcriptions + - spkrinfo.txt: table of speaker information + + In addition, the root directory should contain one subdirectory + for each speaker, containing three files for each utterance: + + - .txt: text content of utterances + - .wrd: tokenized text content of utterances + - .phn: phonetic transcription of utterances + - .wav: utterance sound file + """ + + _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt" + """A regexp matching fileids that are used by this corpus reader.""" + _UTTERANCE_RE = r"\w+-\w+/\w+\.txt" + + def __init__(self, root, encoding="utf8"): + """ + Construct a new TIMIT corpus reader in the given directory. + :param root: The root directory for this corpus. 
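+        :param encoding: The encoding used to read the corpus; ``.wav``
+            files are always read as raw bytes (encoding ``None``), as
+            arranged in the constructor body below.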
+ """ + # Ensure that wave files don't get treated as unicode data: + if isinstance(encoding, str): + encoding = [(r".*\.wav", None), (".*", encoding)] + + CorpusReader.__init__( + self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding + ) + + self._utterances = [ + name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE) + ] + """A list of the utterance identifiers for all utterances in + this corpus.""" + + self._speakerinfo = None + self._root = root + self.speakers = sorted({u.split("/")[0] for u in self._utterances}) + + def fileids(self, filetype=None): + """ + Return a list of file identifiers for the files that make up + this corpus. + + :param filetype: If specified, then ``filetype`` indicates that + only the files that have the given type should be + returned. Accepted values are: ``txt``, ``wrd``, ``phn``, + ``wav``, or ``metadata``, + """ + if filetype is None: + return CorpusReader.fileids(self) + elif filetype in ("txt", "wrd", "phn", "wav"): + return [f"{u}.{filetype}" for u in self._utterances] + elif filetype == "metadata": + return ["timitdic.txt", "spkrinfo.txt"] + else: + raise ValueError("Bad value for filetype: %r" % filetype) + + def utteranceids( + self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None + ): + """ + :return: A list of the utterance identifiers for all + utterances in this corpus, or for the given speaker, dialect + region, gender, sentence type, or sentence number, if + specified. + """ + if isinstance(dialect, str): + dialect = [dialect] + if isinstance(sex, str): + sex = [sex] + if isinstance(spkrid, str): + spkrid = [spkrid] + if isinstance(sent_type, str): + sent_type = [sent_type] + if isinstance(sentid, str): + sentid = [sentid] + + utterances = self._utterances[:] + if dialect is not None: + utterances = [u for u in utterances if u[2] in dialect] + if sex is not None: + utterances = [u for u in utterances if u[4] in sex] + if spkrid is not None: + utterances = [u for u in utterances if u[:9] in spkrid] + if sent_type is not None: + utterances = [u for u in utterances if u[11] in sent_type] + if sentid is not None: + utterances = [u for u in utterances if u[10:] in spkrid] + return utterances + + def transcription_dict(self): + """ + :return: A dictionary giving the 'standard' transcription for + each word. + """ + _transcriptions = {} + with self.open("timitdic.txt") as fp: + for line in fp: + if not line.strip() or line[0] == ";": + continue + m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line) + if not m: + raise ValueError("Bad line: %r" % line) + _transcriptions[m.group(1)] = m.group(2).split() + return _transcriptions + + def spkrid(self, utterance): + return utterance.split("/")[0] + + def sentid(self, utterance): + return utterance.split("/")[1] + + def utterance(self, spkrid, sentid): + return f"{spkrid}/{sentid}" + + def spkrutteranceids(self, speaker): + """ + :return: A list of all utterances associated with a given + speaker. + """ + return [ + utterance + for utterance in self._utterances + if utterance.startswith(speaker + "/") + ] + + def spkrinfo(self, speaker): + """ + :return: A dictionary mapping .. something. 
+ """ + if speaker in self._utterances: + speaker = self.spkrid(speaker) + + if self._speakerinfo is None: + self._speakerinfo = {} + with self.open("spkrinfo.txt") as fp: + for line in fp: + if not line.strip() or line[0] == ";": + continue + rec = line.strip().split(None, 9) + key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}" + self._speakerinfo[key] = SpeakerInfo(*rec) + + return self._speakerinfo[speaker] + + def phones(self, utterances=None): + results = [] + for fileid in self._utterance_fileids(utterances, ".phn"): + with self.open(fileid) as fp: + for line in fp: + if line.strip(): + results.append(line.split()[-1]) + return results + + def phone_times(self, utterances=None): + """ + offset is represented as a number of 16kHz samples! + """ + results = [] + for fileid in self._utterance_fileids(utterances, ".phn"): + with self.open(fileid) as fp: + for line in fp: + if line.strip(): + results.append( + ( + line.split()[2], + int(line.split()[0]), + int(line.split()[1]), + ) + ) + return results + + def words(self, utterances=None): + results = [] + for fileid in self._utterance_fileids(utterances, ".wrd"): + with self.open(fileid) as fp: + for line in fp: + if line.strip(): + results.append(line.split()[-1]) + return results + + def word_times(self, utterances=None): + results = [] + for fileid in self._utterance_fileids(utterances, ".wrd"): + with self.open(fileid) as fp: + for line in fp: + if line.strip(): + results.append( + ( + line.split()[2], + int(line.split()[0]), + int(line.split()[1]), + ) + ) + return results + + def sents(self, utterances=None): + results = [] + for fileid in self._utterance_fileids(utterances, ".wrd"): + with self.open(fileid) as fp: + results.append([line.split()[-1] for line in fp if line.strip()]) + return results + + def sent_times(self, utterances=None): + # TODO: Check this + return [ + ( + line.split(None, 2)[-1].strip(), + int(line.split()[0]), + int(line.split()[1]), + ) + for fileid in self._utterance_fileids(utterances, ".txt") + for line in self.open(fileid) + if line.strip() + ] + + def phone_trees(self, utterances=None): + if utterances is None: + utterances = self._utterances + if isinstance(utterances, str): + utterances = [utterances] + + trees = [] + for utterance in utterances: + word_times = self.word_times(utterance) + phone_times = self.phone_times(utterance) + sent_times = self.sent_times(utterance) + + while sent_times: + (sent, sent_start, sent_end) = sent_times.pop(0) + trees.append(Tree("S", [])) + while ( + word_times and phone_times and phone_times[0][2] <= word_times[0][1] + ): + trees[-1].append(phone_times.pop(0)[0]) + while word_times and word_times[0][2] <= sent_end: + (word, word_start, word_end) = word_times.pop(0) + trees[-1].append(Tree(word, [])) + while phone_times and phone_times[0][2] <= word_end: + trees[-1][-1].append(phone_times.pop(0)[0]) + while phone_times and phone_times[0][2] <= sent_end: + trees[-1].append(phone_times.pop(0)[0]) + return trees + + # [xx] NOTE: This is currently broken -- we're assuming that the + # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE + # fileids. 
+ def wav(self, utterance, start=0, end=None): + # nltk.chunk conflicts with the stdlib module 'chunk' + wave = import_from_stdlib("wave") + + w = wave.open(self.open(utterance + ".wav"), "rb") + + if end is None: + end = w.getnframes() + + # Skip past frames before start, then read the frames we want + w.readframes(start) + frames = w.readframes(end - start) + + # Open a new temporary file -- the wave module requires + # an actual file, and won't work w/ stringio. :( + tf = tempfile.TemporaryFile() + out = wave.open(tf, "w") + + # Write the parameters & data to the new file. + out.setparams(w.getparams()) + out.writeframes(frames) + out.close() + + # Read the data back from the file, and return it. The + # file will automatically be deleted when we return. + tf.seek(0) + return tf.read() + + def audiodata(self, utterance, start=0, end=None): + assert end is None or end > start + headersize = 44 + with self.open(utterance + ".wav") as fp: + if end is None: + data = fp.read() + else: + data = fp.read(headersize + end * 2) + return data[headersize + start * 2 :] + + def _utterance_fileids(self, utterances, extension): + if utterances is None: + utterances = self._utterances + if isinstance(utterances, str): + utterances = [utterances] + return [f"{u}{extension}" for u in utterances] + + def play(self, utterance, start=0, end=None): + """ + Play the given audio sample. + + :param utterance: The utterance id of the sample to play + """ + # Method 1: os audio dev. + try: + import ossaudiodev + + try: + dsp = ossaudiodev.open("w") + dsp.setfmt(ossaudiodev.AFMT_S16_LE) + dsp.channels(1) + dsp.speed(16000) + dsp.write(self.audiodata(utterance, start, end)) + dsp.close() + except OSError as e: + print( + ( + "can't acquire the audio device; please " + "activate your audio device." + ), + file=sys.stderr, + ) + print("system error message:", str(e), file=sys.stderr) + return + except ImportError: + pass + + # Method 2: pygame + try: + # FIXME: this won't work under python 3 + import pygame.mixer + import StringIO + + pygame.mixer.init(16000) + f = StringIO.StringIO(self.wav(utterance, start, end)) + pygame.mixer.Sound(f).play() + while pygame.mixer.get_busy(): + time.sleep(0.01) + return + except ImportError: + pass + + # Method 3: complain. :) + print( + ("you must install pygame or ossaudiodev " "for audio playback."), + file=sys.stderr, + ) + + +class SpeakerInfo: + def __init__( + self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None + ): + self.id = id + self.sex = sex + self.dr = dr + self.use = use + self.recdate = recdate + self.birthdate = birthdate + self.ht = ht + self.race = race + self.edu = edu + self.comments = comments + + def __repr__(self): + attribs = "id sex dr use recdate birthdate ht race edu comments" + args = [f"{attr}={getattr(self, attr)!r}" for attr in attribs.split()] + return "SpeakerInfo(%s)" % (", ".join(args)) + + +def read_timit_block(stream): + """ + Block reader for timit tagged sentences, which are preceded by a sentence + number that will be ignored. 
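+
+    For example, given a line of the form ``"0 she had your dark suit"``
+    (sentence number, a space, then the sentence), it returns the
+    one-element block ``['she had your dark suit']`` (modulo the
+    trailing newline).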
+ """ + line = stream.readline() + if not line: + return [] + n, sent = line.split(" ", 1) + return [sent] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py new file mode 100644 index 0000000000000000000000000000000000000000..7f70b2a4e363a6349256f3100873f11d276103eb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py @@ -0,0 +1,76 @@ +# Natural Language Toolkit: Toolbox Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Greg Aumann +# Stuart Robinson +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Module for reading, writing and manipulating +Toolbox databases and settings fileids. +""" + +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.toolbox import ToolboxData + + +class ToolboxCorpusReader(CorpusReader): + def xml(self, fileids, key=None): + return concat( + [ + ToolboxData(path, enc).parse(key=key) + for (path, enc) in self.abspaths(fileids, True) + ] + ) + + def fields( + self, + fileids, + strip=True, + unwrap=True, + encoding="utf8", + errors="strict", + unicode_fields=None, + ): + return concat( + [ + list( + ToolboxData(fileid, enc).fields( + strip, unwrap, encoding, errors, unicode_fields + ) + ) + for (fileid, enc) in self.abspaths(fileids, include_encoding=True) + ] + ) + + # should probably be done lazily: + def entries(self, fileids, **kwargs): + if "key" in kwargs: + key = kwargs["key"] + del kwargs["key"] + else: + key = "lx" # the default key in MDF + entries = [] + for marker, contents in self.fields(fileids, **kwargs): + if marker == key: + entries.append((contents, [])) + else: + try: + entries[-1][-1].append((marker, contents)) + except IndexError: + pass + return entries + + def words(self, fileids, key="lx"): + return [contents for marker, contents in self.fields(fileids) if marker == key] + + +def demo(): + pass + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py new file mode 100644 index 0000000000000000000000000000000000000000..7f39fa4629ec5a2cbb3b25246a2a008ccb0d01c5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py @@ -0,0 +1,136 @@ +# Natural Language Toolkit: Twitter Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +A reader for corpora that consist of Tweets. It is assumed that the Tweets +have been serialised into line-delimited JSON. +""" + +import json +import os + +from nltk.corpus.reader.api import CorpusReader +from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat +from nltk.tokenize import TweetTokenizer + + +class TwitterCorpusReader(CorpusReader): + r""" + Reader for corpora that consist of Tweets represented as a list of line-delimited JSON. + + Individual Tweets can be tokenized using the default tokenizer, or by a + custom tokenizer specified as a parameter to the constructor. + + Construct a new Tweet corpus reader for a set of documents + located at the given root directory. 
+ + If you made your own tweet collection in a directory called + `twitter-files`, then you can initialise the reader as:: + + from nltk.corpus import TwitterCorpusReader + reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json') + + However, the recommended approach is to set the relevant directory as the + value of the environmental variable `TWITTER`, and then invoke the reader + as follows:: + + root = os.environ['TWITTER'] + reader = TwitterCorpusReader(root, '.*\.json') + + If you want to work directly with the raw Tweets, the `json` library can + be used:: + + import json + for tweet in reader.docs(): + print(json.dumps(tweet, indent=1, sort_keys=True)) + + """ + + CorpusView = StreamBackedCorpusView + """ + The corpus view class used by this reader. + """ + + def __init__( + self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8" + ): + """ + :param root: The root directory for this corpus. + :param fileids: A list or regexp specifying the fileids in this corpus. + :param word_tokenizer: Tokenizer for breaking the text of Tweets into + smaller units, including but not limited to words. + """ + CorpusReader.__init__(self, root, fileids, encoding) + + for path in self.abspaths(self._fileids): + if isinstance(path, ZipFilePathPointer): + pass + elif os.path.getsize(path) == 0: + raise ValueError(f"File {path} is empty") + """Check that all user-created corpus files are non-empty.""" + + self._word_tokenizer = word_tokenizer + + def docs(self, fileids=None): + """ + Returns the full Tweet objects, as specified by `Twitter + documentation on Tweets + `_ + + :return: the given file(s) as a list of dictionaries deserialised + from JSON. + :rtype: list(dict) + """ + return concat( + [ + self.CorpusView(path, self._read_tweets, encoding=enc) + for (path, enc, fileid) in self.abspaths(fileids, True, True) + ] + ) + + def strings(self, fileids=None): + """ + Returns only the text content of Tweets in the file(s) + + :return: the given file(s) as a list of Tweets. + :rtype: list(str) + """ + fulltweets = self.docs(fileids) + tweets = [] + for jsono in fulltweets: + try: + text = jsono["text"] + if isinstance(text, bytes): + text = text.decode(self.encoding) + tweets.append(text) + except KeyError: + pass + return tweets + + def tokenized(self, fileids=None): + """ + :return: the given file(s) as a list of the text content of Tweets as + as a list of words, screenanames, hashtags, URLs and punctuation symbols. + + :rtype: list(list(str)) + """ + tweets = self.strings(fileids) + tokenizer = self._word_tokenizer + return [tokenizer.tokenize(t) for t in tweets] + + def _read_tweets(self, stream): + """ + Assumes that each line in ``stream`` is a JSON-serialised object. + """ + tweets = [] + for i in range(10): + line = stream.readline() + if not line: + return tweets + tweet = json.loads(line) + tweets.append(tweet) + return tweets diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py new file mode 100644 index 0000000000000000000000000000000000000000..e6309ff4559659ff9b97bf679b563bcb957d18f1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py @@ -0,0 +1,75 @@ +""" +UDHR corpus reader. It mostly deals with encodings. 
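+
+A quick access sketch, assuming the corpus data is installed (e.g. via
+``nltk.download('udhr')``); the fileid is illustrative::
+
+    >>> from nltk.corpus import udhr
+    >>> udhr.words('English-Latin1')  # doctest: +SKIP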
+""" + +from nltk.corpus.reader.plaintext import PlaintextCorpusReader +from nltk.corpus.reader.util import find_corpus_fileids + + +class UdhrCorpusReader(PlaintextCorpusReader): + + ENCODINGS = [ + (".*-Latin1$", "latin-1"), + (".*-Hebrew$", "hebrew"), + (".*-Arabic$", "cp1256"), + ("Czech_Cesky-UTF8", "cp1250"), # yeah + ("Polish-Latin2", "cp1250"), + ("Polish_Polski-Latin2", "cp1250"), + (".*-Cyrillic$", "cyrillic"), + (".*-SJIS$", "SJIS"), + (".*-GB2312$", "GB2312"), + (".*-Latin2$", "ISO-8859-2"), + (".*-Greek$", "greek"), + (".*-UTF8$", "utf-8"), + ("Hungarian_Magyar-Unicode", "utf-16-le"), + ("Amahuaca", "latin1"), + ("Turkish_Turkce-Turkish", "latin5"), + ("Lithuanian_Lietuviskai-Baltic", "latin4"), + ("Japanese_Nihongo-EUC", "EUC-JP"), + ("Japanese_Nihongo-JIS", "iso2022_jp"), + ("Chinese_Mandarin-HZ", "hz"), + (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"), + ] + + SKIP = { + # The following files are not fully decodable because they + # were truncated at wrong bytes: + "Burmese_Myanmar-UTF8", + "Japanese_Nihongo-JIS", + "Chinese_Mandarin-HZ", + "Chinese_Mandarin-UTF8", + "Gujarati-UTF8", + "Hungarian_Magyar-Unicode", + "Lao-UTF8", + "Magahi-UTF8", + "Marathi-UTF8", + "Tamil-UTF8", + # Unfortunately, encodings required for reading + # the following files are not supported by Python: + "Vietnamese-VPS", + "Vietnamese-VIQR", + "Vietnamese-TCVN", + "Magahi-Agra", + "Bhojpuri-Agra", + "Esperanto-T61", # latin3 raises an exception + # The following files are encoded for specific fonts: + "Burmese_Myanmar-WinResearcher", + "Armenian-DallakHelv", + "Tigrinya_Tigrigna-VG2Main", + "Amharic-Afenegus6..60375", # ? + "Navaho_Dine-Navajo-Navaho-font", + # What are these? + "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117", + "Azeri_Azerbaijani_Latin-Az.Times.Lat0117", + # The following files are unintended: + "Czech-Latin2-err", + "Russian_Russky-UTF8~", + } + + def __init__(self, root="udhr"): + fileids = find_corpus_fileids(root, r"(?!README|\.).*") + super().__init__( + root, + [fileid for fileid in fileids if fileid not in self.SKIP], + encoding=self.ENCODINGS, + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py new file mode 100644 index 0000000000000000000000000000000000000000..73bfa4607a54d98b95e98c3f1b856ba8f10a8568 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py @@ -0,0 +1,867 @@ +# Natural Language Toolkit: Corpus Reader Utilities +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +import bisect +import os +import pickle +import re +import tempfile +from functools import reduce +from xml.etree import ElementTree + +from nltk.data import ( + FileSystemPathPointer, + PathPointer, + SeekableUnicodeStreamReader, + ZipFilePathPointer, +) +from nltk.internals import slice_bounds +from nltk.tokenize import wordpunct_tokenize +from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence + +###################################################################### +# { Corpus View +###################################################################### + + +class StreamBackedCorpusView(AbstractLazySequence): + """ + A 'view' of a corpus file, which acts like a sequence of tokens: + it can be accessed by index, iterated over, etc. However, the + tokens are only constructed as-needed -- the entire corpus is + never stored in memory at once. 
+ + The constructor to ``StreamBackedCorpusView`` takes two arguments: + a corpus fileid (specified as a string or as a ``PathPointer``); + and a block reader. A "block reader" is a function that reads + zero or more tokens from a stream, and returns them as a list. A + very simple example of a block reader is: + + >>> def simple_block_reader(stream): + ... return stream.readline().split() + + This simple block reader reads a single line at a time, and + returns a single token (consisting of a string) for each + whitespace-separated substring on the line. + + When deciding how to define the block reader for a given + corpus, careful consideration should be given to the size of + blocks handled by the block reader. Smaller block sizes will + increase the memory requirements of the corpus view's internal + data structures (by 2 integers per block). On the other hand, + larger block sizes may decrease performance for random access to + the corpus. (But note that larger block sizes will *not* + decrease performance for iteration.) + + Internally, ``CorpusView`` maintains a partial mapping from token + index to file position, with one entry per block. When a token + with a given index *i* is requested, the ``CorpusView`` constructs + it as follows: + + 1. First, it searches the toknum/filepos mapping for the token + index closest to (but less than or equal to) *i*. + + 2. Then, starting at the file position corresponding to that + index, it reads one block at a time using the block reader + until it reaches the requested token. + + The toknum/filepos mapping is created lazily: it is initially + empty, but every time a new block is read, the block's + initial token is added to the mapping. (Thus, the toknum/filepos + map has one entry per block.) + + In order to increase efficiency for random access patterns that + have high degrees of locality, the corpus view may cache one or + more blocks. + + :note: Each ``CorpusView`` object internally maintains an open file + object for its underlying corpus file. This file should be + automatically closed when the ``CorpusView`` is garbage collected, + but if you wish to close it manually, use the ``close()`` + method. If you access a ``CorpusView``'s items after it has been + closed, the file object will be automatically re-opened. + + :warning: If the contents of the file are modified during the + lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior + is undefined. + + :warning: If a unicode encoding is specified when constructing a + ``CorpusView``, then the block reader may only call + ``stream.seek()`` with offsets that have been returned by + ``stream.tell()``; in particular, calling ``stream.seek()`` with + relative offsets, or with offsets based on string lengths, may + lead to incorrect behavior. + + :ivar _block_reader: The function used to read + a single block from the underlying file stream. + :ivar _toknum: A list containing the token index of each block + that has been processed. In particular, ``_toknum[i]`` is the + token index of the first token in block ``i``. Together + with ``_filepos``, this forms a partial mapping between token + indices and file positions. + :ivar _filepos: A list containing the file position of each block + that has been processed. In particular, ``_toknum[i]`` is the + file position of the first character in block ``i``. Together + with ``_toknum``, this forms a partial mapping between token + indices and file positions. + :ivar _stream: The stream used to access the underlying corpus file. 
+ :ivar _len: The total number of tokens in the corpus, if known; + or None, if the number of tokens is not yet known. + :ivar _eofpos: The character position of the last character in the + file. This is calculated when the corpus view is initialized, + and is used to decide when the end of file has been reached. + :ivar _cache: A cache of the most recently read block. It + is encoded as a tuple (start_toknum, end_toknum, tokens), where + start_toknum is the token index of the first token in the block; + end_toknum is the token index of the first token not in the + block; and tokens is a list of the tokens in the block. + """ + + def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"): + """ + Create a new corpus view, based on the file ``fileid``, and + read with ``block_reader``. See the class documentation + for more information. + + :param fileid: The path to the file that is read by this + corpus view. ``fileid`` can either be a string or a + ``PathPointer``. + + :param startpos: The file position at which the view will + start reading. This can be used to skip over preface + sections. + + :param encoding: The unicode encoding that should be used to + read the file's contents. If no encoding is specified, + then the file's contents will be read as a non-unicode + string (i.e., a str). + """ + if block_reader: + self.read_block = block_reader + # Initialize our toknum/filepos mapping. + self._toknum = [0] + self._filepos = [startpos] + self._encoding = encoding + # We don't know our length (number of tokens) yet. + self._len = None + + self._fileid = fileid + self._stream = None + + self._current_toknum = None + """This variable is set to the index of the next token that + will be read, immediately before ``self.read_block()`` is + called. This is provided for the benefit of the block + reader, which under rare circumstances may need to know + the current token number.""" + + self._current_blocknum = None + """This variable is set to the index of the next block that + will be read, immediately before ``self.read_block()`` is + called. This is provided for the benefit of the block + reader, which under rare circumstances may need to know + the current block number.""" + + # Find the length of the file. + try: + if isinstance(self._fileid, PathPointer): + self._eofpos = self._fileid.file_size() + else: + self._eofpos = os.stat(self._fileid).st_size + except Exception as exc: + raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc + + # Maintain a cache of the most recently read block, to + # increase efficiency of random access. + self._cache = (-1, -1, None) + + fileid = property( + lambda self: self._fileid, + doc=""" + The fileid of the file that is accessed by this view. + + :type: str or PathPointer""", + ) + + def read_block(self, stream): + """ + Read a block from the input stream. + + :return: a block of tokens from the input stream + :rtype: list(any) + :param stream: an input stream + :type stream: stream + """ + raise NotImplementedError("Abstract Method") + + def _open(self): + """ + Open the file stream associated with this corpus view. This + will be called performed if any value is read from the view + while its file stream is closed. 
+ """ + if isinstance(self._fileid, PathPointer): + self._stream = self._fileid.open(self._encoding) + elif self._encoding: + self._stream = SeekableUnicodeStreamReader( + open(self._fileid, "rb"), self._encoding + ) + else: + self._stream = open(self._fileid, "rb") + + def close(self): + """ + Close the file stream associated with this corpus view. This + can be useful if you are worried about running out of file + handles (although the stream should automatically be closed + upon garbage collection of the corpus view). If the corpus + view is accessed after it is closed, it will be automatically + re-opened. + """ + if self._stream is not None: + self._stream.close() + self._stream = None + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def __len__(self): + if self._len is None: + # iterate_from() sets self._len when it reaches the end + # of the file: + for tok in self.iterate_from(self._toknum[-1]): + pass + return self._len + + def __getitem__(self, i): + if isinstance(i, slice): + start, stop = slice_bounds(self, i) + # Check if it's in the cache. + offset = self._cache[0] + if offset <= start and stop <= self._cache[1]: + return self._cache[2][start - offset : stop - offset] + # Construct & return the result. + return LazySubsequence(self, start, stop) + else: + # Handle negative indices + if i < 0: + i += len(self) + if i < 0: + raise IndexError("index out of range") + # Check if it's in the cache. + offset = self._cache[0] + if offset <= i < self._cache[1]: + return self._cache[2][i - offset] + # Use iterate_from to extract it. + try: + return next(self.iterate_from(i)) + except StopIteration as e: + raise IndexError("index out of range") from e + + # If we wanted to be thread-safe, then this method would need to + # do some locking. + def iterate_from(self, start_tok): + # Start by feeding from the cache, if possible. + if self._cache[0] <= start_tok < self._cache[1]: + for tok in self._cache[2][start_tok - self._cache[0] :]: + yield tok + start_tok += 1 + + # Decide where in the file we should start. If `start` is in + # our mapping, then we can jump straight to the correct block; + # otherwise, start at the last block we've processed. + if start_tok < self._toknum[-1]: + block_index = bisect.bisect_right(self._toknum, start_tok) - 1 + toknum = self._toknum[block_index] + filepos = self._filepos[block_index] + else: + block_index = len(self._toknum) - 1 + toknum = self._toknum[-1] + filepos = self._filepos[-1] + + # Open the stream, if it's not open already. + if self._stream is None: + self._open() + + # If the file is empty, the while loop will never run. + # This *seems* to be all the state we need to set: + if self._eofpos == 0: + self._len = 0 + + # Each iteration through this loop, we read a single block + # from the stream. + while filepos < self._eofpos: + # Read the next block. + self._stream.seek(filepos) + self._current_toknum = toknum + self._current_blocknum = block_index + tokens = self.read_block(self._stream) + assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( + "block reader %s() should return list or tuple." + % self.read_block.__name__ + ) + num_toks = len(tokens) + new_filepos = self._stream.tell() + assert ( + new_filepos > filepos + ), "block reader %s() should consume at least 1 byte (filepos=%d)" % ( + self.read_block.__name__, + filepos, + ) + + # Update our cache. + self._cache = (toknum, toknum + num_toks, list(tokens)) + + # Update our mapping. 
+ assert toknum <= self._toknum[-1] + if num_toks > 0: + block_index += 1 + if toknum == self._toknum[-1]: + assert new_filepos > self._filepos[-1] # monotonic! + self._filepos.append(new_filepos) + self._toknum.append(toknum + num_toks) + else: + # Check for consistency: + assert ( + new_filepos == self._filepos[block_index] + ), "inconsistent block reader (num chars read)" + assert ( + toknum + num_toks == self._toknum[block_index] + ), "inconsistent block reader (num tokens returned)" + + # If we reached the end of the file, then update self._len + if new_filepos == self._eofpos: + self._len = toknum + num_toks + # Generate the tokens in this block (but skip any tokens + # before start_tok). Note that between yields, our state + # may be modified. + for tok in tokens[max(0, start_tok - toknum) :]: + yield tok + # If we're at the end of the file, then we're done. + assert new_filepos <= self._eofpos + if new_filepos == self._eofpos: + break + # Update our indices + toknum += num_toks + filepos = new_filepos + + # If we reach this point, then we should know our length. + assert self._len is not None + # Enforce closing of stream once we reached end of file + # We should have reached EOF once we're out of the while loop. + self.close() + + # Use concat for these, so we can use a ConcatenatedCorpusView + # when possible. + def __add__(self, other): + return concat([self, other]) + + def __radd__(self, other): + return concat([other, self]) + + def __mul__(self, count): + return concat([self] * count) + + def __rmul__(self, count): + return concat([self] * count) + + +class ConcatenatedCorpusView(AbstractLazySequence): + """ + A 'view' of a corpus file that joins together one or more + ``StreamBackedCorpusViews``. At most + one file handle is left open at any time. + """ + + def __init__(self, corpus_views): + self._pieces = corpus_views + """A list of the corpus subviews that make up this + concatenation.""" + + self._offsets = [0] + """A list of offsets, indicating the index at which each + subview begins. In particular:: + offsets[i] = sum([len(p) for p in pieces[:i]])""" + + self._open_piece = None + """The most recently accessed corpus subview (or None). + Before a new subview is accessed, this subview will be closed.""" + + def __len__(self): + if len(self._offsets) <= len(self._pieces): + # Iterate to the end of the corpus. + for tok in self.iterate_from(self._offsets[-1]): + pass + + return self._offsets[-1] + + def close(self): + for piece in self._pieces: + piece.close() + + def iterate_from(self, start_tok): + piecenum = bisect.bisect_right(self._offsets, start_tok) - 1 + + while piecenum < len(self._pieces): + offset = self._offsets[piecenum] + piece = self._pieces[piecenum] + + # If we've got another piece open, close it first. + if self._open_piece is not piece: + if self._open_piece is not None: + self._open_piece.close() + self._open_piece = piece + + # Get everything we can from this piece. + yield from piece.iterate_from(max(0, start_tok - offset)) + + # Update the offset table. + if piecenum + 1 == len(self._offsets): + self._offsets.append(self._offsets[-1] + len(piece)) + + # Move on to the next piece. + piecenum += 1 + + +def concat(docs): + """ + Concatenate together the contents of multiple documents from a + single corpus, using an appropriate concatenation function. This + utility function is used by corpus readers when the user requests + more than one document at a time. 
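+
+    For example, with plain Python types (a small illustrative sketch)::
+
+        >>> concat(['Hello, ', 'world!'])
+        'Hello, world!'
+        >>> concat([['a', 'b'], ['c']])
+        ['a', 'b', 'c']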
+ """ + if len(docs) == 1: + return docs[0] + if len(docs) == 0: + raise ValueError("concat() expects at least one object!") + + types = {d.__class__ for d in docs} + + # If they're all strings, use string concatenation. + if all(isinstance(doc, str) for doc in docs): + return "".join(docs) + + # If they're all corpus views, then use ConcatenatedCorpusView. + for typ in types: + if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)): + break + else: + return ConcatenatedCorpusView(docs) + + # If they're all lazy sequences, use a lazy concatenation + for typ in types: + if not issubclass(typ, AbstractLazySequence): + break + else: + return LazyConcatenation(docs) + + # Otherwise, see what we can do: + if len(types) == 1: + typ = list(types)[0] + + if issubclass(typ, list): + return reduce((lambda a, b: a + b), docs, []) + + if issubclass(typ, tuple): + return reduce((lambda a, b: a + b), docs, ()) + + if ElementTree.iselement(typ): + xmltree = ElementTree.Element("documents") + for doc in docs: + xmltree.append(doc) + return xmltree + + # No method found! + raise ValueError("Don't know how to concatenate types: %r" % types) + + +###################################################################### +# { Corpus View for Pickled Sequences +###################################################################### + + +class PickleCorpusView(StreamBackedCorpusView): + """ + A stream backed corpus view for corpus files that consist of + sequences of serialized Python objects (serialized using + ``pickle.dump``). One use case for this class is to store the + result of running feature detection on a corpus to disk. This can + be useful when performing feature detection is expensive (so we + don't want to repeat it); but the corpus is too large to store in + memory. The following example illustrates this technique: + + >>> from nltk.corpus.reader.util import PickleCorpusView + >>> from nltk.util import LazyMap + >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP + >>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP + >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP + """ + + BLOCK_SIZE = 100 + PROTOCOL = -1 + + def __init__(self, fileid, delete_on_gc=False): + """ + Create a new corpus view that reads the pickle corpus + ``fileid``. + + :param delete_on_gc: If true, then ``fileid`` will be deleted + whenever this object gets garbage-collected. + """ + self._delete_on_gc = delete_on_gc + StreamBackedCorpusView.__init__(self, fileid) + + def read_block(self, stream): + result = [] + for i in range(self.BLOCK_SIZE): + try: + result.append(pickle.load(stream)) + except EOFError: + break + return result + + def __del__(self): + """ + If ``delete_on_gc`` was set to true when this + ``PickleCorpusView`` was created, then delete the corpus view's + fileid. (This method is called whenever a + ``PickledCorpusView`` is garbage-collected. 
+ """ + if getattr(self, "_delete_on_gc"): + if os.path.exists(self._fileid): + try: + os.remove(self._fileid) + except OSError: + pass + self.__dict__.clear() # make the garbage collector's job easier + + @classmethod + def write(cls, sequence, output_file): + if isinstance(output_file, str): + output_file = open(output_file, "wb") + for item in sequence: + pickle.dump(item, output_file, cls.PROTOCOL) + + @classmethod + def cache_to_tempfile(cls, sequence, delete_on_gc=True): + """ + Write the given sequence to a temporary file as a pickle + corpus; and then return a ``PickleCorpusView`` view for that + temporary corpus file. + + :param delete_on_gc: If true, then the temporary file will be + deleted whenever this object gets garbage-collected. + """ + try: + fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-") + output_file = os.fdopen(fd, "wb") + cls.write(sequence, output_file) + output_file.close() + return PickleCorpusView(output_file_name, delete_on_gc) + except OSError as e: + raise ValueError("Error while creating temp file: %s" % e) from e + + +###################################################################### +# { Block Readers +###################################################################### + + +def read_whitespace_block(stream): + toks = [] + for i in range(20): # Read 20 lines at a time. + toks.extend(stream.readline().split()) + return toks + + +def read_wordpunct_block(stream): + toks = [] + for i in range(20): # Read 20 lines at a time. + toks.extend(wordpunct_tokenize(stream.readline())) + return toks + + +def read_line_block(stream): + toks = [] + for i in range(20): + line = stream.readline() + if not line: + return toks + toks.append(line.rstrip("\n")) + return toks + + +def read_blankline_block(stream): + s = "" + while True: + line = stream.readline() + # End of file: + if not line: + if s: + return [s] + else: + return [] + # Blank line: + elif line and not line.strip(): + if s: + return [s] + # Other line: + else: + s += line + + +def read_alignedsent_block(stream): + s = "" + while True: + line = stream.readline() + if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n": + continue + # End of file: + if not line: + if s: + return [s] + else: + return [] + # Other line: + else: + s += line + if re.match(r"^\d+-\d+", line) is not None: + return [s] + + +def read_regexp_block(stream, start_re, end_re=None): + """ + Read a sequence of tokens from a stream, where tokens begin with + lines that match ``start_re``. If ``end_re`` is specified, then + tokens end with lines that match ``end_re``; otherwise, tokens end + whenever the next line matching ``start_re`` or EOF is found. + """ + # Scan until we find a line matching the start regexp. + while True: + line = stream.readline() + if not line: + return [] # end of file. + if re.match(start_re, line): + break + + # Scan until we find another line matching the regexp, or EOF. + lines = [line] + while True: + oldpos = stream.tell() + line = stream.readline() + # End of file: + if not line: + return ["".join(lines)] + # End of token: + if end_re is not None and re.match(end_re, line): + return ["".join(lines)] + # Start of new token: backup to just before it starts, and + # return the token we've already collected. + if end_re is None and re.match(start_re, line): + stream.seek(oldpos) + return ["".join(lines)] + # Anything else is part of the token. 
+ lines.append(line) + + +def read_sexpr_block(stream, block_size=16384, comment_char=None): + """ + Read a sequence of s-expressions from the stream, and leave the + stream's file position at the end the last complete s-expression + read. This function will always return at least one s-expression, + unless there are no more s-expressions in the file. + + If the file ends in in the middle of an s-expression, then that + incomplete s-expression is returned when the end of the file is + reached. + + :param block_size: The default block size for reading. If an + s-expression is longer than one block, then more than one + block will be read. + :param comment_char: A character that marks comments. Any lines + that begin with this character will be stripped out. + (If spaces or tabs precede the comment character, then the + line will not be stripped.) + """ + start = stream.tell() + block = stream.read(block_size) + encoding = getattr(stream, "encoding", None) + assert encoding is not None or isinstance(block, str) + if encoding not in (None, "utf-8"): + import warnings + + warnings.warn( + "Parsing may fail, depending on the properties " + "of the %s encoding!" % encoding + ) + # (e.g., the utf-16 encoding does not work because it insists + # on adding BOMs to the beginning of encoded strings.) + + if comment_char: + COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char)) + while True: + try: + # If we're stripping comments, then make sure our block ends + # on a line boundary; and then replace any comments with + # space characters. (We can't just strip them out -- that + # would make our offset wrong.) + if comment_char: + block += stream.readline() + block = re.sub(COMMENT, _sub_space, block) + # Read the block. + tokens, offset = _parse_sexpr_block(block) + # Skip whitespace + offset = re.compile(r"\s*").search(block, offset).end() + + # Move to the end position. + if encoding is None: + stream.seek(start + offset) + else: + stream.seek(start + len(block[:offset].encode(encoding))) + + # Return the list of tokens we processed + return tokens + except ValueError as e: + if e.args[0] == "Block too small": + next_block = stream.read(block_size) + if next_block: + block += next_block + continue + else: + # The file ended mid-sexpr -- return what we got. + return [block.strip()] + else: + raise + + +def _sub_space(m): + """Helper function: given a regexp match, return a string of + spaces that's the same length as the matched string.""" + return " " * (m.end() - m.start()) + + +def _parse_sexpr_block(block): + tokens = [] + start = end = 0 + + while end < len(block): + m = re.compile(r"\S").search(block, end) + if not m: + return tokens, end + + start = m.start() + + # Case 1: sexpr is not parenthesized. + if m.group() != "(": + m2 = re.compile(r"[\s(]").search(block, start) + if m2: + end = m2.start() + else: + if tokens: + return tokens, end + raise ValueError("Block too small") + + # Case 2: parenthesized sexpr. 
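+        # Scan over the parens, tracking nesting depth: the sexpr is
+        # complete at the ")" that brings the depth back to zero.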
+ else: + nesting = 0 + for m in re.compile(r"[()]").finditer(block, start): + if m.group() == "(": + nesting += 1 + else: + nesting -= 1 + if nesting == 0: + end = m.end() + break + else: + if tokens: + return tokens, end + raise ValueError("Block too small") + + tokens.append(block[start:end]) + + return tokens, end + + +###################################################################### +# { Finding Corpus Items +###################################################################### + + +def find_corpus_fileids(root, regexp): + if not isinstance(root, PathPointer): + raise TypeError("find_corpus_fileids: expected a PathPointer") + regexp += "$" + + # Find fileids in a zipfile: scan the zipfile's namelist. Filter + # out entries that end in '/' -- they're directories. + if isinstance(root, ZipFilePathPointer): + fileids = [ + name[len(root.entry) :] + for name in root.zipfile.namelist() + if not name.endswith("/") + ] + items = [name for name in fileids if re.match(regexp, name)] + return sorted(items) + + # Find fileids in a directory: use os.walk to search all (proper + # or symlinked) subdirectories, and match paths against the regexp. + elif isinstance(root, FileSystemPathPointer): + items = [] + for dirname, subdirs, fileids in os.walk(root.path): + prefix = "".join("%s/" % p for p in _path_from(root.path, dirname)) + items += [ + prefix + fileid + for fileid in fileids + if re.match(regexp, prefix + fileid) + ] + # Don't visit svn directories: + if ".svn" in subdirs: + subdirs.remove(".svn") + return sorted(items) + + else: + raise AssertionError("Don't know how to handle %r" % root) + + +def _path_from(parent, child): + if os.path.split(parent)[1] == "": + parent = os.path.split(parent)[0] + path = [] + while parent != child: + child, dirname = os.path.split(child) + path.insert(0, dirname) + assert os.path.split(child)[0] != child + return path + + +###################################################################### +# { Paragraph structure in Treebank files +###################################################################### + + +def tagged_treebank_para_block_reader(stream): + # Read the next paragraph. + para = "" + while True: + line = stream.readline() + # End of paragraph: + if re.match(r"======+\s*$", line): + if para.strip(): + return [para] + # End of file: + elif line == "": + if para.strip(): + return [para] + else: + return [] + # Content line: + else: + para += line diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b946fcadaf2a362a1bdfb0d6726fbf8d674f4d80 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py @@ -0,0 +1,629 @@ +# Natural Language Toolkit: Verbnet Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +An NLTK interface to the VerbNet verb lexicon + +For details about VerbNet see: +https://verbs.colorado.edu/~mpalmer/projects/verbnet.html +""" + +import re +import textwrap +from collections import defaultdict + +from nltk.corpus.reader.xmldocs import XMLCorpusReader + + +class VerbnetCorpusReader(XMLCorpusReader): + """ + An NLTK interface to the VerbNet verb lexicon. + + From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest + on-line verb lexicon currently available for English. 
It is a hierarchical
+    domain-independent, broad-coverage verb lexicon with mappings to other
+    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
+    (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
+
+    For details about VerbNet see:
+    https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+    """
+
+    # No unicode encoding param, since the data files are all XML.
+    def __init__(self, root, fileids, wrap_etree=False):
+        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
+
+        self._lemma_to_class = defaultdict(list)
+        """A dictionary mapping from verb lemma strings to lists of
+        VerbNet class identifiers."""
+
+        self._wordnet_to_class = defaultdict(list)
+        """A dictionary mapping from wordnet identifier strings to
+        lists of VerbNet class identifiers."""
+
+        self._class_to_fileid = {}
+        """A dictionary mapping from class identifiers to
+        corresponding file identifiers.  The keys of this dictionary
+        provide a complete list of all classes and subclasses."""
+
+        self._shortid_to_longid = {}
+
+        # Initialize the dictionaries.  Use the quick (regexp-based)
+        # method instead of the slow (xml-based) method, because it
+        # runs 2-30 times faster.
+        self._quick_index()
+
+    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
+    """Regular expression that matches (and decomposes) longids"""
+
+    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
+    """Regular expression that matches shortids"""
+
+    _INDEX_RE = re.compile(
+        r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
+        r'<VNSUBCLASS ID="([^"]+)"/?>'
+    )
+    """Regular expression used by ``_index()`` to quickly scan the corpus
+    for basic information."""
+
+    def lemmas(self, vnclass=None):
+        """
+        Return a list of all verb lemmas that appear in any class, or
+        in the ``classid`` if specified.
+        """
+        if vnclass is None:
+            return sorted(self._lemma_to_class.keys())
+        else:
+            # [xx] should this include subclass members?
+            if isinstance(vnclass, str):
+                vnclass = self.vnclass(vnclass)
+            return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
+
+    def wordnetids(self, vnclass=None):
+        """
+        Return a list of all wordnet identifiers that appear in any
+        class, or in ``classid`` if specified.
+        """
+        if vnclass is None:
+            return sorted(self._wordnet_to_class.keys())
+        else:
+            # [xx] should this include subclass members?
+            if isinstance(vnclass, str):
+                vnclass = self.vnclass(vnclass)
+            return sum(
+                (
+                    member.get("wn", "").split()
+                    for member in vnclass.findall("MEMBERS/MEMBER")
+                ),
+                [],
+            )
+
+    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
+        """
+        Return a list of the VerbNet class identifiers.  If a file
+        identifier is specified, then return only the VerbNet class
+        identifiers for classes (and subclasses) defined by that file.
+        If a lemma is specified, then return only VerbNet class
+        identifiers for classes that contain that lemma as a member.
+        If a wordnetid is specified, then return only identifiers for
+        classes that contain that wordnetid as a member.  If a classid
+        is specified, then return only identifiers for subclasses of
+        the specified VerbNet class.
+ If nothing is specified, return all classids within VerbNet + """ + if fileid is not None: + return [c for (c, f) in self._class_to_fileid.items() if f == fileid] + elif lemma is not None: + return self._lemma_to_class[lemma] + elif wordnetid is not None: + return self._wordnet_to_class[wordnetid] + elif classid is not None: + xmltree = self.vnclass(classid) + return [ + subclass.get("ID") + for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS") + ] + else: + return sorted(self._class_to_fileid.keys()) + + def vnclass(self, fileid_or_classid): + """Returns VerbNet class ElementTree + + Return an ElementTree containing the xml for the specified + VerbNet class. + + :param fileid_or_classid: An identifier specifying which class + should be returned. Can be a file identifier (such as + ``'put-9.1.xml'``), or a VerbNet class identifier (such as + ``'put-9.1'``) or a short VerbNet class identifier (such as + ``'9.1'``). + """ + # File identifier: just return the xml. + if fileid_or_classid in self._fileids: + return self.xml(fileid_or_classid) + + # Class identifier: get the xml, and find the right elt. + classid = self.longid(fileid_or_classid) + if classid in self._class_to_fileid: + fileid = self._class_to_fileid[self.longid(classid)] + tree = self.xml(fileid) + if classid == tree.get("ID"): + return tree + else: + for subclass in tree.findall(".//VNSUBCLASS"): + if classid == subclass.get("ID"): + return subclass + else: + assert False # we saw it during _index()! + + else: + raise ValueError(f"Unknown identifier {fileid_or_classid}") + + def fileids(self, vnclass_ids=None): + """ + Return a list of fileids that make up this corpus. If + ``vnclass_ids`` is specified, then return the fileids that make + up the specified VerbNet class(es). + """ + if vnclass_ids is None: + return self._fileids + elif isinstance(vnclass_ids, str): + return [self._class_to_fileid[self.longid(vnclass_ids)]] + else: + return [ + self._class_to_fileid[self.longid(vnclass_id)] + for vnclass_id in vnclass_ids + ] + + def frames(self, vnclass): + """Given a VerbNet class, this method returns VerbNet frames + + The members returned are: + 1) Example + 2) Description + 3) Syntax + 4) Semantics + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + :return: frames - a list of frame dictionaries + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + frames = [] + vnframes = vnclass.findall("FRAMES/FRAME") + for vnframe in vnframes: + frames.append( + { + "example": self._get_example_within_frame(vnframe), + "description": self._get_description_within_frame(vnframe), + "syntax": self._get_syntactic_list_within_frame(vnframe), + "semantics": self._get_semantics_within_frame(vnframe), + } + ) + return frames + + def subclasses(self, vnclass): + """Returns subclass ids, if any exist + + Given a VerbNet class, this method returns subclass ids (if they exist) + in a list of strings. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. 
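+
+        For example (assuming the ``verbnet`` corpus has been downloaded;
+        the subclass ids shown are illustrative)::
+
+            >>> from nltk.corpus import verbnet  # doctest: +SKIP
+            >>> verbnet.subclasses('put-9.1')  # doctest: +SKIP
+            ['put-9.1-1', 'put-9.1-2']
+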
+ :return: list of subclasses + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + subclasses = [ + subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS") + ] + return subclasses + + def themroles(self, vnclass): + """Returns thematic roles participating in a VerbNet class + + Members returned as part of roles are- + 1) Type + 2) Modifiers + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + :return: themroles: A list of thematic roles in the VerbNet class + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + themroles = [] + for trole in vnclass.findall("THEMROLES/THEMROLE"): + themroles.append( + { + "type": trole.get("type"), + "modifiers": [ + {"value": restr.get("Value"), "type": restr.get("type")} + for restr in trole.findall("SELRESTRS/SELRESTR") + ], + } + ) + return themroles + + ###################################################################### + # { Index Initialization + ###################################################################### + + def _index(self): + """ + Initialize the indexes ``_lemma_to_class``, + ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning + through the corpus fileids. This is fast if ElementTree + uses the C implementation (<0.1 secs), but quite slow (>10 secs) + if only the python implementation is available. + """ + for fileid in self._fileids: + self._index_helper(self.xml(fileid), fileid) + + def _index_helper(self, xmltree, fileid): + """Helper for ``_index()``""" + vnclass = xmltree.get("ID") + self._class_to_fileid[vnclass] = fileid + self._shortid_to_longid[self.shortid(vnclass)] = vnclass + for member in xmltree.findall("MEMBERS/MEMBER"): + self._lemma_to_class[member.get("name")].append(vnclass) + for wn in member.get("wn", "").split(): + self._wordnet_to_class[wn].append(vnclass) + for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"): + self._index_helper(subclass, fileid) + + def _quick_index(self): + """ + Initialize the indexes ``_lemma_to_class``, + ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning + through the corpus fileids. This doesn't do proper xml parsing, + but is good enough to find everything in the standard VerbNet + corpus -- and it runs about 30 times faster than xml parsing + (with the python ElementTree; only 2-3 times faster + if ElementTree uses the C implementation). + """ + # nb: if we got rid of wordnet_to_class, this would run 2-3 + # times faster. + for fileid in self._fileids: + vnclass = fileid[:-4] # strip the '.xml' + self._class_to_fileid[vnclass] = fileid + self._shortid_to_longid[self.shortid(vnclass)] = vnclass + with self.open(fileid) as fp: + for m in self._INDEX_RE.finditer(fp.read()): + groups = m.groups() + if groups[0] is not None: + self._lemma_to_class[groups[0]].append(vnclass) + for wn in groups[1].split(): + self._wordnet_to_class[wn].append(vnclass) + elif groups[2] is not None: + self._class_to_fileid[groups[2]] = fileid + vnclass = groups[2] # for elts. + self._shortid_to_longid[self.shortid(vnclass)] = vnclass + else: + assert False, "unexpected match condition" + + ###################################################################### + # { Identifier conversion + ###################################################################### + + def longid(self, shortid): + """Returns longid of a VerbNet class + + Given a short VerbNet class identifier (eg '37.10'), map it + to a long id (eg 'confess-37.10'). 
If ``shortid`` is already a + long id, then return it as-is""" + if self._LONGID_RE.match(shortid): + return shortid # it's already a longid. + elif not self._SHORTID_RE.match(shortid): + raise ValueError("vnclass identifier %r not found" % shortid) + try: + return self._shortid_to_longid[shortid] + except KeyError as e: + raise ValueError("vnclass identifier %r not found" % shortid) from e + + def shortid(self, longid): + """Returns shortid of a VerbNet class + + Given a long VerbNet class identifier (eg 'confess-37.10'), + map it to a short id (eg '37.10'). If ``longid`` is already a + short id, then return it as-is.""" + if self._SHORTID_RE.match(longid): + return longid # it's already a shortid. + m = self._LONGID_RE.match(longid) + if m: + return m.group(2) + else: + raise ValueError("vnclass identifier %r not found" % longid) + + ###################################################################### + # { Frame access utility functions + ###################################################################### + + def _get_semantics_within_frame(self, vnframe): + """Returns semantics within a single frame + + A utility function to retrieve semantics within a frame in VerbNet + Members of the semantics dictionary: + 1) Predicate value + 2) Arguments + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + :return: semantics: semantics dictionary + """ + semantics_within_single_frame = [] + for pred in vnframe.findall("SEMANTICS/PRED"): + arguments = [ + {"type": arg.get("type"), "value": arg.get("value")} + for arg in pred.findall("ARGS/ARG") + ] + semantics_within_single_frame.append( + { + "predicate_value": pred.get("value"), + "arguments": arguments, + "negated": pred.get("bool") == "!", + } + ) + return semantics_within_single_frame + + def _get_example_within_frame(self, vnframe): + """Returns example within a frame + + A utility function to retrieve an example within a frame in VerbNet. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + :return: example_text: The example sentence for this particular frame + """ + example_element = vnframe.find("EXAMPLES/EXAMPLE") + if example_element is not None: + example_text = example_element.text + else: + example_text = "" + return example_text + + def _get_description_within_frame(self, vnframe): + """Returns member description within frame + + A utility function to retrieve a description of participating members + within a frame in VerbNet. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + :return: description: a description dictionary with members - primary and secondary + """ + description_element = vnframe.find("DESCRIPTION") + return { + "primary": description_element.attrib["primary"], + "secondary": description_element.get("secondary", ""), + } + + def _get_syntactic_list_within_frame(self, vnframe): + """Returns semantics within a frame + + A utility function to retrieve semantics within a frame in VerbNet. + Members of the syntactic dictionary: + 1) POS Tag + 2) Modifiers + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. 
+ :return: syntax_within_single_frame + """ + syntax_within_single_frame = [] + for elt in vnframe.find("SYNTAX"): + pos_tag = elt.tag + modifiers = dict() + modifiers["value"] = elt.get("value") if "value" in elt.attrib else "" + modifiers["selrestrs"] = [ + {"value": restr.get("Value"), "type": restr.get("type")} + for restr in elt.findall("SELRESTRS/SELRESTR") + ] + modifiers["synrestrs"] = [ + {"value": restr.get("Value"), "type": restr.get("type")} + for restr in elt.findall("SYNRESTRS/SYNRESTR") + ] + syntax_within_single_frame.append( + {"pos_tag": pos_tag, "modifiers": modifiers} + ) + return syntax_within_single_frame + + ###################################################################### + # { Pretty Printing + ###################################################################### + + def pprint(self, vnclass): + """Returns pretty printed version of a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet class. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + s = vnclass.get("ID") + "\n" + s += self.pprint_subclasses(vnclass, indent=" ") + "\n" + s += self.pprint_members(vnclass, indent=" ") + "\n" + s += " Thematic roles:\n" + s += self.pprint_themroles(vnclass, indent=" ") + "\n" + s += " Frames:\n" + s += self.pprint_frames(vnclass, indent=" ") + return s + + def pprint_subclasses(self, vnclass, indent=""): + """Returns pretty printed version of subclasses of VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet class's subclasses. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + subclasses = self.subclasses(vnclass) + if not subclasses: + subclasses = ["(none)"] + s = "Subclasses: " + " ".join(subclasses) + return textwrap.fill( + s, 70, initial_indent=indent, subsequent_indent=indent + " " + ) + + def pprint_members(self, vnclass, indent=""): + """Returns pretty printed version of members in a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet class's member verbs. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + members = self.lemmas(vnclass) + if not members: + members = ["(none)"] + s = "Members: " + " ".join(members) + return textwrap.fill( + s, 70, initial_indent=indent, subsequent_indent=indent + " " + ) + + def pprint_themroles(self, vnclass, indent=""): + """Returns pretty printed version of thematic roles in a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet class's thematic roles. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. 
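+
+        For example (assuming the ``verbnet`` corpus has been downloaded;
+        the exact roles and restrictions shown are illustrative)::
+
+            >>> from nltk.corpus import verbnet  # doctest: +SKIP
+            >>> print(verbnet.pprint_themroles('put-9.1'))  # doctest: +SKIP
+            * Agent[+animate +organization]
+            * Theme[+concrete]
+            * Destination[+loc_prep]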
+ """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + + pieces = [] + for themrole in self.themroles(vnclass): + piece = indent + "* " + themrole.get("type") + modifiers = [ + modifier["value"] + modifier["type"] + for modifier in themrole["modifiers"] + ] + if modifiers: + piece += "[{}]".format(" ".join(modifiers)) + pieces.append(piece) + return "\n".join(pieces) + + def pprint_frames(self, vnclass, indent=""): + """Returns pretty version of all frames in a VerbNet class + + Return a string containing a pretty-printed representation of + the list of frames within the VerbNet class. + + :param vnclass: A VerbNet class identifier; or an ElementTree + containing the xml contents of a VerbNet class. + """ + if isinstance(vnclass, str): + vnclass = self.vnclass(vnclass) + pieces = [] + for vnframe in self.frames(vnclass): + pieces.append(self._pprint_single_frame(vnframe, indent)) + return "\n".join(pieces) + + def _pprint_single_frame(self, vnframe, indent=""): + """Returns pretty printed version of a single frame in a VerbNet class + + Returns a string containing a pretty-printed representation of + the given frame. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + """ + frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n" + frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n" + frame_string += ( + self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n" + ) + frame_string += indent + " Semantics:\n" + frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ") + return frame_string + + def _pprint_example_within_frame(self, vnframe, indent=""): + """Returns pretty printed version of example within frame in a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet frame example. + + :param vnframe: An ElementTree containing the xml contents of + a Verbnet frame. + """ + if vnframe["example"]: + return indent + " Example: " + vnframe["example"] + + def _pprint_description_within_frame(self, vnframe, indent=""): + """Returns pretty printed version of a VerbNet frame description + + Return a string containing a pretty-printed representation of + the given VerbNet frame description. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + """ + description = indent + vnframe["description"]["primary"] + if vnframe["description"]["secondary"]: + description += " ({})".format(vnframe["description"]["secondary"]) + return description + + def _pprint_syntax_within_frame(self, vnframe, indent=""): + """Returns pretty printed version of syntax within a frame in a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet frame syntax. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. 
+ """ + pieces = [] + for element in vnframe["syntax"]: + piece = element["pos_tag"] + modifier_list = [] + if "value" in element["modifiers"] and element["modifiers"]["value"]: + modifier_list.append(element["modifiers"]["value"]) + modifier_list += [ + "{}{}".format(restr["value"], restr["type"]) + for restr in ( + element["modifiers"]["selrestrs"] + + element["modifiers"]["synrestrs"] + ) + ] + if modifier_list: + piece += "[{}]".format(" ".join(modifier_list)) + pieces.append(piece) + + return indent + " ".join(pieces) + + def _pprint_semantics_within_frame(self, vnframe, indent=""): + """Returns a pretty printed version of semantics within frame in a VerbNet class + + Return a string containing a pretty-printed representation of + the given VerbNet frame semantics. + + :param vnframe: An ElementTree containing the xml contents of + a VerbNet frame. + """ + pieces = [] + for predicate in vnframe["semantics"]: + arguments = [argument["value"] for argument in predicate["arguments"]] + pieces.append( + f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})" + ) + return "\n".join(f"{indent}* {piece}" for piece in pieces) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py new file mode 100644 index 0000000000000000000000000000000000000000..f24f87497ab4edc6e1d469acf522562f1886957d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py @@ -0,0 +1,166 @@ +# Natural Language Toolkit: Word List Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT +from nltk.corpus.reader.api import * +from nltk.corpus.reader.util import * +from nltk.tokenize import line_tokenize + + +class WordListCorpusReader(CorpusReader): + """ + List of words, one per line. Blank lines are ignored. + """ + + def words(self, fileids=None, ignore_lines_startswith="\n"): + return [ + line + for line in line_tokenize(self.raw(fileids)) + if not line.startswith(ignore_lines_startswith) + ] + + +class SwadeshCorpusReader(WordListCorpusReader): + def entries(self, fileids=None): + """ + :return: a tuple of words for the specified fileids. + """ + if not fileids: + fileids = self.fileids() + + wordlists = [self.words(f) for f in fileids] + return list(zip(*wordlists)) + + +class NonbreakingPrefixesCorpusReader(WordListCorpusReader): + """ + This is a class to read the nonbreaking prefixes textfiles from the + Moses Machine Translation toolkit. These lists are used in the Python port + of the Moses' word tokenizer. + """ + + available_langs = { + "catalan": "ca", + "czech": "cs", + "german": "de", + "greek": "el", + "english": "en", + "spanish": "es", + "finnish": "fi", + "french": "fr", + "hungarian": "hu", + "icelandic": "is", + "italian": "it", + "latvian": "lv", + "dutch": "nl", + "polish": "pl", + "portuguese": "pt", + "romanian": "ro", + "russian": "ru", + "slovak": "sk", + "slovenian": "sl", + "swedish": "sv", + "tamil": "ta", + } + # Also, add the lang IDs as the keys. + available_langs.update({v: v for v in available_langs.values()}) + + def words(self, lang=None, fileids=None, ignore_lines_startswith="#"): + """ + This module returns a list of nonbreaking prefixes for the specified + language(s). 
+ + >>> from nltk.corpus import nonbreaking_prefixes as nbp + >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J'] + True + >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89'] + True + + :return: a list words for the specified language(s). + """ + # If *lang* in list of languages available, allocate apt fileid. + # Otherwise, the function returns non-breaking prefixes for + # all languages when fileids==None. + if lang in self.available_langs: + lang = self.available_langs[lang] + fileids = ["nonbreaking_prefix." + lang] + return [ + line + for line in line_tokenize(self.raw(fileids)) + if not line.startswith(ignore_lines_startswith) + ] + + +class UnicharsCorpusReader(WordListCorpusReader): + """ + This class is used to read lists of characters from the Perl Unicode + Properties (see https://perldoc.perl.org/perluniprops.html). + The files in the perluniprop.zip are extracted using the Unicode::Tussle + module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm + """ + + # These are categories similar to the Perl Unicode Properties + available_categories = [ + "Close_Punctuation", + "Currency_Symbol", + "IsAlnum", + "IsAlpha", + "IsLower", + "IsN", + "IsSc", + "IsSo", + "IsUpper", + "Line_Separator", + "Number", + "Open_Punctuation", + "Punctuation", + "Separator", + "Symbol", + ] + + def chars(self, category=None, fileids=None): + """ + This module returns a list of characters from the Perl Unicode Properties. + They are very useful when porting Perl tokenizers to Python. + + >>> from nltk.corpus import perluniprops as pup + >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c'] + True + >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5'] + True + >>> pup.available_categories + ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol'] + + :return: a list of characters given the specific unicode character category + """ + if category in self.available_categories: + fileids = [category + ".txt"] + return list(self.raw(fileids).strip()) + + +class MWAPPDBCorpusReader(WordListCorpusReader): + """ + This class is used to read the list of word pairs from the subset of lexical + pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word + Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015): + + - http://acl2014.org/acl2014/Q14/pdf/Q14-1017 + - https://www.aclweb.org/anthology/S14-2039 + - https://www.aclweb.org/anthology/S15-2027 + + The original source of the full PPDB corpus can be found on + https://www.cis.upenn.edu/~ccb/ppdb/ + + :return: a list of tuples of similar lexical terms. + """ + + mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs" + + def entries(self, fileids=mwa_ppdb_xxxl_file): + """ + :return: a tuple of synonym word pairs. 
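+
+        A minimal sketch (assumes the corpus file is installed and exposed
+        through the standard ``nltk.corpus`` loader name ``mwa_ppdb``)::
+
+            from nltk.corpus import mwa_ppdb
+            first_pairs = mwa_ppdb.entries()[:2]  # list of (term, synonym) tuples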
+ """ + return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py new file mode 100644 index 0000000000000000000000000000000000000000..cf5004ee12380a34d04e871038a4baf201040361 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py @@ -0,0 +1,2489 @@ +# Natural Language Toolkit: WordNet +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bethard +# Steven Bird +# Edward Loper +# Nitin Madnani +# Nasruddin A’aidil Shari +# Sim Wei Ying Geraldine +# Soe Lynn +# Francis Bond +# Eric Kafe + +# URL: +# For license information, see LICENSE.TXT + +""" +An NLTK interface for WordNet + +WordNet is a lexical database of English. +Using synsets, helps find conceptual relationships between words +such as hypernyms, hyponyms, synonyms, antonyms etc. + +For details about WordNet see: +https://wordnet.princeton.edu/ + +This module also allows you to find lemmas in languages +other than English from the Open Multilingual Wordnet +http://compling.hss.ntu.edu.sg/omw/ + +""" + +import math +import os +import re +import warnings +from collections import defaultdict, deque +from functools import total_ordering +from itertools import chain, islice +from operator import itemgetter + +from nltk.corpus.reader import CorpusReader +from nltk.internals import deprecated +from nltk.probability import FreqDist +from nltk.util import binary_search_file as _binary_search_file + +###################################################################### +# Table of Contents +###################################################################### +# - Constants +# - Data Classes +# - WordNetError +# - Lemma +# - Synset +# - WordNet Corpus Reader +# - WordNet Information Content Corpus Reader +# - Similarity Metrics +# - Demo + +###################################################################### +# Constants +###################################################################### + +#: Positive infinity (for similarity functions) +_INF = 1e300 + +# { Part-of-speech constants +ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" +# } + +POS_LIST = [NOUN, VERB, ADJ, ADV] + +# A table of strings that are used to express verb frames. 
+VERB_FRAME_STRINGS = ( + None, + "Something %s", + "Somebody %s", + "It is %sing", + "Something is %sing PP", + "Something %s something Adjective/Noun", + "Something %s Adjective/Noun", + "Somebody %s Adjective", + "Somebody %s something", + "Somebody %s somebody", + "Something %s somebody", + "Something %s something", + "Something %s to somebody", + "Somebody %s on something", + "Somebody %s somebody something", + "Somebody %s something to somebody", + "Somebody %s something from somebody", + "Somebody %s somebody with something", + "Somebody %s somebody of something", + "Somebody %s something on somebody", + "Somebody %s somebody PP", + "Somebody %s something PP", + "Somebody %s PP", + "Somebody's (body part) %s", + "Somebody %s somebody to INFINITIVE", + "Somebody %s somebody INFINITIVE", + "Somebody %s that CLAUSE", + "Somebody %s to somebody", + "Somebody %s to INFINITIVE", + "Somebody %s whether INFINITIVE", + "Somebody %s somebody into V-ing something", + "Somebody %s something with something", + "Somebody %s INFINITIVE", + "Somebody %s VERB-ing", + "It %s that CLAUSE", + "Something %s INFINITIVE", + # OEWN additions: + "Somebody %s at something", + "Somebody %s for something", + "Somebody %s on somebody", + "Somebody %s out of somebody", +) + +SENSENUM_RE = re.compile(r"\.[\d]+\.") + + +###################################################################### +# Data Classes +###################################################################### + + +class WordNetError(Exception): + """An exception class for wordnet-related errors.""" + + +@total_ordering +class _WordNetObject: + """A common base class for lemmas and synsets.""" + + def hypernyms(self): + return self._related("@") + + def _hypernyms(self): + return self._related("@") + + def instance_hypernyms(self): + return self._related("@i") + + def _instance_hypernyms(self): + return self._related("@i") + + def hyponyms(self): + return self._related("~") + + def instance_hyponyms(self): + return self._related("~i") + + def member_holonyms(self): + return self._related("#m") + + def substance_holonyms(self): + return self._related("#s") + + def part_holonyms(self): + return self._related("#p") + + def member_meronyms(self): + return self._related("%m") + + def substance_meronyms(self): + return self._related("%s") + + def part_meronyms(self): + return self._related("%p") + + def topic_domains(self): + return self._related(";c") + + def in_topic_domains(self): + return self._related("-c") + + def region_domains(self): + return self._related(";r") + + def in_region_domains(self): + return self._related("-r") + + def usage_domains(self): + return self._related(";u") + + def in_usage_domains(self): + return self._related("-u") + + def attributes(self): + return self._related("=") + + def entailments(self): + return self._related("*") + + def causes(self): + return self._related(">") + + def also_sees(self): + return self._related("^") + + def verb_groups(self): + return self._related("$") + + def similar_tos(self): + return self._related("&") + + def __hash__(self): + return hash(self._name) + + def __eq__(self, other): + return self._name == other._name + + def __ne__(self, other): + return self._name != other._name + + def __lt__(self, other): + return self._name < other._name + + +class Lemma(_WordNetObject): + """ + The lexical entry for a single morphological form of a + sense-disambiguated word. + + Create a Lemma from a "..." 
string where: + is the morphological stem identifying the synset + is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB + is the sense number, counting from 0. + is the morphological form of interest + + Note that and can be different, e.g. the Synset + 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and + 'salt.n.03.salinity'. + + Lemma attributes, accessible via methods with the same name: + + - name: The canonical name of this lemma. + - synset: The synset that this lemma belongs to. + - syntactic_marker: For adjectives, the WordNet string identifying the + syntactic position relative modified noun. See: + https://wordnet.princeton.edu/documentation/wninput5wn + For all other parts of speech, this attribute is None. + - count: The frequency of this lemma in wordnet. + + Lemma methods: + + Lemmas have the following methods for retrieving related Lemmas. They + correspond to the names for the pointer symbols defined here: + https://wordnet.princeton.edu/documentation/wninput5wn + These methods all return lists of Lemmas: + + - antonyms + - hypernyms, instance_hypernyms + - hyponyms, instance_hyponyms + - member_holonyms, substance_holonyms, part_holonyms + - member_meronyms, substance_meronyms, part_meronyms + - topic_domains, region_domains, usage_domains + - attributes + - derivationally_related_forms + - entailments + - causes + - also_sees + - verb_groups + - similar_tos + - pertainyms + """ + + __slots__ = [ + "_wordnet_corpus_reader", + "_name", + "_syntactic_marker", + "_synset", + "_frame_strings", + "_frame_ids", + "_lexname_index", + "_lex_id", + "_lang", + "_key", + ] + + def __init__( + self, + wordnet_corpus_reader, + synset, + name, + lexname_index, + lex_id, + syntactic_marker, + ): + self._wordnet_corpus_reader = wordnet_corpus_reader + self._name = name + self._syntactic_marker = syntactic_marker + self._synset = synset + self._frame_strings = [] + self._frame_ids = [] + self._lexname_index = lexname_index + self._lex_id = lex_id + self._lang = "eng" + + self._key = None # gets set later. + + def name(self): + return self._name + + def syntactic_marker(self): + return self._syntactic_marker + + def synset(self): + return self._synset + + def frame_strings(self): + return self._frame_strings + + def frame_ids(self): + return self._frame_ids + + def lang(self): + return self._lang + + def key(self): + return self._key + + def __repr__(self): + tup = type(self).__name__, self._synset._name, self._name + return "%s('%s.%s')" % tup + + def _related(self, relation_symbol): + get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset + if (self._name, relation_symbol) not in self._synset._lemma_pointers: + return [] + return [ + get_synset(pos, offset)._lemmas[lemma_index] + for pos, offset, lemma_index in self._synset._lemma_pointers[ + self._name, relation_symbol + ] + ] + + def count(self): + """Return the frequency count for this Lemma""" + return self._wordnet_corpus_reader.lemma_count(self) + + def antonyms(self): + return self._related("!") + + def derivationally_related_forms(self): + return self._related("+") + + def pertainyms(self): + return self._related("\\") + + +class Synset(_WordNetObject): + """Create a Synset from a ".." string where: + is the word's morphological stem + is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB + is the sense number, counting from 0. 
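+
+ For example, with the standard English WordNet data:
+
+ >>> from nltk.corpus import wordnet as wn
+ >>> wn.synset('dog.n.01')
+ Synset('dog.n.01')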
+ + Synset attributes, accessible via methods with the same name: + + - name: The canonical name of this synset, formed using the first lemma + of this synset. Note that this may be different from the name + passed to the constructor if that string used a different lemma to + identify the synset. + - pos: The synset's part of speech, matching one of the module level + attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. + - lemmas: A list of the Lemma objects for this synset. + - definition: The definition for this synset. + - examples: A list of example strings for this synset. + - offset: The offset in the WordNet dict file of this synset. + - lexname: The name of the lexicographer file containing this synset. + + Synset methods: + + Synsets have the following methods for retrieving related Synsets. + They correspond to the names for the pointer symbols defined here: + https://wordnet.princeton.edu/documentation/wninput5wn + These methods all return lists of Synsets. + + - hypernyms, instance_hypernyms + - hyponyms, instance_hyponyms + - member_holonyms, substance_holonyms, part_holonyms + - member_meronyms, substance_meronyms, part_meronyms + - attributes + - entailments + - causes + - also_sees + - verb_groups + - similar_tos + + Additionally, Synsets support the following methods specific to the + hypernym relation: + + - root_hypernyms + - common_hypernyms + - lowest_common_hypernyms + + Note that Synsets do not support the following relations because + these are defined by WordNet as lexical relations: + + - antonyms + - derivationally_related_forms + - pertainyms + """ + + __slots__ = [ + "_pos", + "_offset", + "_name", + "_frame_ids", + "_lemmas", + "_lemma_names", + "_definition", + "_examples", + "_lexname", + "_pointers", + "_lemma_pointers", + "_max_depth", + "_min_depth", + ] + + def __init__(self, wordnet_corpus_reader): + self._wordnet_corpus_reader = wordnet_corpus_reader + # All of these attributes get initialized by + # WordNetCorpusReader._synset_from_pos_and_line() + + self._pos = None + self._offset = None + self._name = None + self._frame_ids = [] + self._lemmas = [] + self._lemma_names = [] + self._definition = None + self._examples = [] + self._lexname = None # lexicographer name + self._all_hypernyms = None + + self._pointers = defaultdict(set) + self._lemma_pointers = defaultdict(list) + + def pos(self): + return self._pos + + def offset(self): + return self._offset + + def name(self): + return self._name + + def frame_ids(self): + return self._frame_ids + + def _doc(self, doc_type, default, lang="eng"): + """Helper method for Synset.definition and Synset.examples""" + corpus = self._wordnet_corpus_reader + if lang not in corpus.langs(): + return None + elif lang == "eng": + return default + else: + corpus._load_lang_data(lang) + of = corpus.ss2of(self) + i = corpus.lg_attrs.index(doc_type) + if of in corpus._lang_data[lang][i]: + return corpus._lang_data[lang][i][of] + else: + return None + + def definition(self, lang="eng"): + """Return definition in specified language""" + return self._doc("def", self._definition, lang=lang) + + def examples(self, lang="eng"): + """Return examples in specified language""" + return self._doc("exe", self._examples, lang=lang) + + def lexname(self): + return self._lexname + + def _needs_root(self): + if self._pos == NOUN and self._wordnet_corpus_reader.get_version() != "1.6": + return False + else: + return True + + def lemma_names(self, lang="eng"): + """Return all the lemma_names associated with the synset""" + if lang == "eng": + 
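+ # English lemma names were already parsed from the WordNet data file
+ # when this synset was constructed; only other languages need the
+ # OMW tab-file data loaded below.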
return self._lemma_names + else: + reader = self._wordnet_corpus_reader + reader._load_lang_data(lang) + i = reader.ss2of(self) + if i in reader._lang_data[lang][0]: + return reader._lang_data[lang][0][i] + else: + return [] + + def lemmas(self, lang="eng"): + """Return all the lemma objects associated with the synset""" + if lang == "eng": + return self._lemmas + elif self._name: + self._wordnet_corpus_reader._load_lang_data(lang) + lemmark = [] + lemmy = self.lemma_names(lang) + for lem in lemmy: + temp = Lemma( + self._wordnet_corpus_reader, + self, + lem, + self._wordnet_corpus_reader._lexnames.index(self.lexname()), + 0, + None, + ) + temp._lang = lang + lemmark.append(temp) + return lemmark + + def root_hypernyms(self): + """Get the topmost hypernyms of this synset in WordNet.""" + + result = [] + seen = set() + todo = [self] + while todo: + next_synset = todo.pop() + if next_synset not in seen: + seen.add(next_synset) + next_hypernyms = ( + next_synset.hypernyms() + next_synset.instance_hypernyms() + ) + if not next_hypernyms: + result.append(next_synset) + else: + todo.extend(next_hypernyms) + return result + + # Simpler implementation which makes incorrect assumption that + # hypernym hierarchy is acyclic: + # + # if not self.hypernyms(): + # return [self] + # else: + # return list(set(root for h in self.hypernyms() + # for root in h.root_hypernyms())) + def max_depth(self): + """ + :return: The length of the longest hypernym path from this + synset to the root. + """ + + if "_max_depth" not in self.__dict__: + hypernyms = self.hypernyms() + self.instance_hypernyms() + if not hypernyms: + self._max_depth = 0 + else: + self._max_depth = 1 + max(h.max_depth() for h in hypernyms) + return self._max_depth + + def min_depth(self): + """ + :return: The length of the shortest hypernym path from this + synset to the root. + """ + + if "_min_depth" not in self.__dict__: + hypernyms = self.hypernyms() + self.instance_hypernyms() + if not hypernyms: + self._min_depth = 0 + else: + self._min_depth = 1 + min(h.min_depth() for h in hypernyms) + return self._min_depth + + def closure(self, rel, depth=-1): + """ + Return the transitive closure of source under the rel + relationship, breadth-first, discarding cycles: + + >>> from nltk.corpus import wordnet as wn + >>> computer = wn.synset('computer.n.01') + >>> topic = lambda s:s.topic_domains() + >>> print(list(computer.closure(topic))) + [Synset('computer_science.n.01')] + + UserWarning: Discarded redundant search for Synset('computer.n.01') at depth 2 + + + Include redundant paths (but only once), avoiding duplicate searches + (from 'animal.n.01' to 'entity.n.01'): + + >>> dog = wn.synset('dog.n.01') + >>> hyp = lambda s:s.hypernyms() + >>> print(list(dog.closure(hyp))) + [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'),\ + Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'),\ + Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'),\ + Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),\ + Synset('physical_entity.n.01'), Synset('entity.n.01')] + + UserWarning: Discarded redundant search for Synset('animal.n.01') at depth 7 + """ + + from nltk.util import acyclic_breadth_first + + for synset in acyclic_breadth_first(self, rel, depth): + if synset != self: + yield synset + + from nltk.util import acyclic_depth_first as acyclic_tree + from nltk.util import unweighted_minimum_spanning_tree as mst + + # Also add this shortcut? 
+ # from nltk.util import unweighted_minimum_spanning_digraph as umsd + + def tree(self, rel, depth=-1, cut_mark=None): + """ + Return the full relation tree, including self, + discarding cycles: + + >>> from nltk.corpus import wordnet as wn + >>> from pprint import pprint + >>> computer = wn.synset('computer.n.01') + >>> topic = lambda s:s.topic_domains() + >>> pprint(computer.tree(topic)) + [Synset('computer.n.01'), [Synset('computer_science.n.01')]] + + UserWarning: Discarded redundant search for Synset('computer.n.01') at depth -3 + + + But keep duplicate branches (from 'animal.n.01' to 'entity.n.01'): + + >>> dog = wn.synset('dog.n.01') + >>> hyp = lambda s:s.hypernyms() + >>> pprint(dog.tree(hyp)) + [Synset('dog.n.01'), + [Synset('canine.n.02'), + [Synset('carnivore.n.01'), + [Synset('placental.n.01'), + [Synset('mammal.n.01'), + [Synset('vertebrate.n.01'), + [Synset('chordate.n.01'), + [Synset('animal.n.01'), + [Synset('organism.n.01'), + [Synset('living_thing.n.01'), + [Synset('whole.n.02'), + [Synset('object.n.01'), + [Synset('physical_entity.n.01'), + [Synset('entity.n.01')]]]]]]]]]]]]], + [Synset('domestic_animal.n.01'), + [Synset('animal.n.01'), + [Synset('organism.n.01'), + [Synset('living_thing.n.01'), + [Synset('whole.n.02'), + [Synset('object.n.01'), + [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]] + """ + + from nltk.util import acyclic_branches_depth_first + + return acyclic_branches_depth_first(self, rel, depth, cut_mark) + + def hypernym_paths(self): + """ + Get the path(s) from this synset to the root, where each path is a + list of the synset nodes traversed on the way to the root. + + :return: A list of lists, where each list gives the node sequence + connecting the initial ``Synset`` node and a root node. + """ + paths = [] + + hypernyms = self.hypernyms() + self.instance_hypernyms() + if len(hypernyms) == 0: + paths = [[self]] + + for hypernym in hypernyms: + for ancestor_list in hypernym.hypernym_paths(): + ancestor_list.append(self) + paths.append(ancestor_list) + return paths + + def common_hypernyms(self, other): + """ + Find all synsets that are hypernyms of this synset and the + other synset. + + :type other: Synset + :param other: other input synset. + :return: The synsets that are hypernyms of both synsets. + """ + if not self._all_hypernyms: + self._all_hypernyms = { + self_synset + for self_synsets in self._iter_hypernym_lists() + for self_synset in self_synsets + } + if not other._all_hypernyms: + other._all_hypernyms = { + other_synset + for other_synsets in other._iter_hypernym_lists() + for other_synset in other_synsets + } + return list(self._all_hypernyms.intersection(other._all_hypernyms)) + + def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False): + """ + Get a list of lowest synset(s) that both synsets have as a hypernym. + When `use_min_depth == False` this means that the synset which appears + as a hypernym of both `self` and `other` with the lowest maximum depth + is returned or if there are multiple such synsets at the same depth + they are all returned + + However, if `use_min_depth == True` then the synset(s) which has/have + the lowest minimum depth and appear(s) in both paths is/are returned. + + By setting the use_min_depth flag to True, the behavior of NLTK2 can be + preserved. This was changed in NLTK3 to give more accurate results in a + small set of cases, generally with synsets concerning people. (eg: + 'chef.n.01', 'fireman.n.01', etc.) 
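+
+ For example, with WordNet 3.0 data:
+
+ >>> from nltk.corpus import wordnet as wn
+ >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))
+ [Synset('carnivore.n.01')]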
+ + This method is an implementation of Ted Pedersen's "Lowest Common + Subsumer" method from the Perl Wordnet module. It can return either + "self" or "other" if they are a hypernym of the other. + + :type other: Synset + :param other: other input synset + :type simulate_root: bool + :param simulate_root: The various verb taxonomies do not + share a single root which disallows this metric from working for + synsets that are not connected. This flag (False by default) + creates a fake root that connects all the taxonomies. Set it + to True to enable this behavior. For the noun taxonomy, + there is usually a default root except for WordNet version 1.6. + If you are using wordnet 1.6, a fake root will need to be added + for nouns as well. + :type use_min_depth: bool + :param use_min_depth: This setting mimics older (v2) behavior of NLTK + wordnet If True, will use the min_depth function to calculate the + lowest common hypernyms. This is known to give strange results for + some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained + for backwards compatibility + :return: The synsets that are the lowest common hypernyms of both + synsets + """ + synsets = self.common_hypernyms(other) + if simulate_root: + fake_synset = Synset(None) + fake_synset._name = "*ROOT*" + fake_synset.hypernyms = lambda: [] + fake_synset.instance_hypernyms = lambda: [] + synsets.append(fake_synset) + + try: + if use_min_depth: + max_depth = max(s.min_depth() for s in synsets) + unsorted_lch = [s for s in synsets if s.min_depth() == max_depth] + else: + max_depth = max(s.max_depth() for s in synsets) + unsorted_lch = [s for s in synsets if s.max_depth() == max_depth] + return sorted(unsorted_lch) + except ValueError: + return [] + + def hypernym_distances(self, distance=0, simulate_root=False): + """ + Get the path(s) from this synset to the root, counting the distance + of each node from the initial node on the way. A set of + (synset, distance) tuples is returned. + + :type distance: int + :param distance: the distance (number of edges) from this hypernym to + the original hypernym ``Synset`` on which this method was called. + :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is + a hypernym of the first ``Synset``. + """ + distances = {(self, distance)} + for hypernym in self._hypernyms() + self._instance_hypernyms(): + distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False) + if simulate_root: + fake_synset = Synset(None) + fake_synset._name = "*ROOT*" + fake_synset_distance = max(distances, key=itemgetter(1))[1] + distances.add((fake_synset, fake_synset_distance + 1)) + return distances + + def _shortest_hypernym_paths(self, simulate_root): + if self._name == "*ROOT*": + return {self: 0} + + queue = deque([(self, 0)]) + path = {} + + while queue: + s, depth = queue.popleft() + if s in path: + continue + path[s] = depth + + depth += 1 + queue.extend((hyp, depth) for hyp in s._hypernyms()) + queue.extend((hyp, depth) for hyp in s._instance_hypernyms()) + + if simulate_root: + fake_synset = Synset(None) + fake_synset._name = "*ROOT*" + path[fake_synset] = max(path.values()) + 1 + + return path + + def shortest_path_distance(self, other, simulate_root=False): + """ + Returns the distance of the shortest path linking the two synsets (if + one exists). For each synset, all the ancestor nodes and their + distances are recorded and compared. The ancestor node common to both + synsets that can be reached with the minimum number of traversals is + used. 
If no ancestor nodes are common, None is returned. If a node is + compared with itself 0 is returned. + + :type other: Synset + :param other: The Synset to which the shortest path will be found. + :return: The number of edges in the shortest path connecting the two + nodes, or None if no path exists. + """ + + if self == other: + return 0 + + dist_dict1 = self._shortest_hypernym_paths(simulate_root) + dist_dict2 = other._shortest_hypernym_paths(simulate_root) + + # For each ancestor synset common to both subject synsets, find the + # connecting path length. Return the shortest of these. + + inf = float("inf") + path_distance = inf + for synset, d1 in dist_dict1.items(): + d2 = dist_dict2.get(synset, inf) + path_distance = min(path_distance, d1 + d2) + + return None if math.isinf(path_distance) else path_distance + + # interface to similarity methods + def path_similarity(self, other, verbose=False, simulate_root=True): + """ + Path Distance Similarity: + Return a score denoting how similar two word senses are, based on the + shortest path that connects the senses in the is-a (hypernym/hypnoym) + taxonomy. The score is in the range 0 to 1, except in those cases where + a path cannot be found (will only be true for verbs as there are many + distinct verb taxonomies), in which case None is returned. A score of + 1 represents identity i.e. comparing a sense with itself will return 1. + + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type simulate_root: bool + :param simulate_root: The various verb taxonomies do not + share a single root which disallows this metric from working for + synsets that are not connected. This flag (True by default) + creates a fake root that connects all the taxonomies. Set it + to false to disable this behavior. For the noun taxonomy, + there is usually a default root except for WordNet version 1.6. + If you are using wordnet 1.6, a fake root will be added for nouns + as well. + :return: A score denoting the similarity of the two ``Synset`` objects, + normally between 0 and 1. None is returned if no connecting path + could be found. 1 is returned if a ``Synset`` is compared with + itself. + """ + + distance = self.shortest_path_distance( + other, + simulate_root=simulate_root and (self._needs_root() or other._needs_root()), + ) + if distance is None or distance < 0: + return None + return 1.0 / (distance + 1) + + def lch_similarity(self, other, verbose=False, simulate_root=True): + """ + Leacock Chodorow Similarity: + Return a score denoting how similar two word senses are, based on the + shortest path that connects the senses (as above) and the maximum depth + of the taxonomy in which the senses occur. The relationship is given as + -log(p/2d) where p is the shortest path length and d is the taxonomy + depth. + + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type simulate_root: bool + :param simulate_root: The various verb taxonomies do not + share a single root which disallows this metric from working for + synsets that are not connected. This flag (True by default) + creates a fake root that connects all the taxonomies. Set it + to false to disable this behavior. For the noun taxonomy, + there is usually a default root except for WordNet version 1.6. + If you are using wordnet 1.6, a fake root will be added for nouns + as well. + :return: A score denoting the similarity of the two ``Synset`` objects, + normally greater than 0. 
None is returned if no connecting path + could be found. If a ``Synset`` is compared with itself, the + maximum score is returned, which varies depending on the taxonomy + depth. + """ + + if self._pos != other._pos: + raise WordNetError( + "Computing the lch similarity requires " + "%s and %s to have the same part of speech." % (self, other) + ) + + need_root = self._needs_root() + + if self._pos not in self._wordnet_corpus_reader._max_depth: + self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root) + + depth = self._wordnet_corpus_reader._max_depth[self._pos] + + distance = self.shortest_path_distance( + other, simulate_root=simulate_root and need_root + ) + + if distance is None or distance < 0 or depth == 0: + return None + return -math.log((distance + 1) / (2.0 * depth)) + + def wup_similarity(self, other, verbose=False, simulate_root=True): + """ + Wu-Palmer Similarity: + Return a score denoting how similar two word senses are, based on the + depth of the two senses in the taxonomy and that of their Least Common + Subsumer (most specific ancestor node). Previously, the scores computed + by this implementation did _not_ always agree with those given by + Pedersen's Perl implementation of WordNet Similarity. However, with + the addition of the simulate_root flag (see below), the score for + verbs now almost always agree but not always for nouns. + + The LCS does not necessarily feature in the shortest path connecting + the two senses, as it is by definition the common ancestor deepest in + the taxonomy, not closest to the two senses. Typically, however, it + will so feature. Where multiple candidates for the LCS exist, that + whose shortest path to the root node is the longest will be selected. + Where the LCS has multiple paths to the root, the longer path is used + for the purposes of the calculation. + + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type simulate_root: bool + :param simulate_root: The various verb taxonomies do not + share a single root which disallows this metric from working for + synsets that are not connected. This flag (True by default) + creates a fake root that connects all the taxonomies. Set it + to false to disable this behavior. For the noun taxonomy, + there is usually a default root except for WordNet version 1.6. + If you are using wordnet 1.6, a fake root will be added for nouns + as well. + :return: A float score denoting the similarity of the two ``Synset`` + objects, normally greater than zero. If no connecting path between + the two senses can be found, None is returned. 
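+
+ For example (WordNet 3.0 data; exact scores may differ in other
+ WordNet versions):
+
+ >>> from nltk.corpus import wordnet as wn
+ >>> wn.synset('dog.n.01').wup_similarity(wn.synset('cat.n.01'))
+ 0.8571428571428571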
+ + """ + need_root = self._needs_root() or other._needs_root() + + # Note that to preserve behavior from NLTK2 we set use_min_depth=True + # It is possible that more accurate results could be obtained by + # removing this setting and it should be tested later on + subsumers = self.lowest_common_hypernyms( + other, simulate_root=simulate_root and need_root, use_min_depth=True + ) + + # If no LCS was found return None + if len(subsumers) == 0: + return None + + subsumer = self if self in subsumers else subsumers[0] + + # Get the longest path from the LCS to the root, + # including a correction: + # - add one because the calculations include both the start and end + # nodes + depth = subsumer.max_depth() + 1 + + # Note: No need for an additional add-one correction for non-nouns + # to account for an imaginary root node because that is now + # automatically handled by simulate_root + # if subsumer._pos != NOUN: + # depth += 1 + + # Get the shortest path from the LCS to each of the synsets it is + # subsuming. Add this to the LCS path length to get the path + # length from each synset to the root. + len1 = self.shortest_path_distance( + subsumer, simulate_root=simulate_root and need_root + ) + len2 = other.shortest_path_distance( + subsumer, simulate_root=simulate_root and need_root + ) + if len1 is None or len2 is None: + return None + len1 += depth + len2 += depth + return (2.0 * depth) / (len1 + len2) + + def res_similarity(self, other, ic, verbose=False): + """ + Resnik Similarity: + Return a score denoting how similar two word senses are, based on the + Information Content (IC) of the Least Common Subsumer (most specific + ancestor node). + + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type ic: dict + :param ic: an information content object (as returned by + ``nltk.corpus.wordnet_ic.ic()``). + :return: A float score denoting the similarity of the two ``Synset`` + objects. Synsets whose LCS is the root node of the taxonomy will + have a score of 0 (e.g. N['dog'][0] and N['table'][0]). + """ + + ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) + return lcs_ic + + def jcn_similarity(self, other, ic, verbose=False): + """ + Jiang-Conrath Similarity: + Return a score denoting how similar two word senses are, based on the + Information Content (IC) of the Least Common Subsumer (most specific + ancestor node) and that of the two input Synsets. The relationship is + given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). + + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type ic: dict + :param ic: an information content object (as returned by + ``nltk.corpus.wordnet_ic.ic()``). + :return: A float score denoting the similarity of the two ``Synset`` + objects. + """ + + if self == other: + return _INF + + ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) + + # If either of the input synsets are the root synset, or have a + # frequency of 0 (sparse data problem), return 0. + if ic1 == 0 or ic2 == 0: + return 0 + + ic_difference = ic1 + ic2 - 2 * lcs_ic + + if ic_difference == 0: + return _INF + + return 1 / ic_difference + + def lin_similarity(self, other, ic, verbose=False): + """ + Lin Similarity: + Return a score denoting how similar two word senses are, based on the + Information Content (IC) of the Least Common Subsumer (most specific + ancestor node) and that of the two input Synsets. The relationship is + given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). 
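+
+ Illustrative example (the score depends on which information content
+ corpus is used; the value shown here is approximate, for the Brown IC
+ file, so the test is skipped rather than checked):
+
+ >>> from nltk.corpus import wordnet as wn
+ >>> from nltk.corpus import wordnet_ic
+ >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
+ >>> dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
+ >>> dog.lin_similarity(cat, brown_ic) # doctest: +SKIP
+ 0.8768...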
+ + :type other: Synset + :param other: The ``Synset`` that this ``Synset`` is being compared to. + :type ic: dict + :param ic: an information content object (as returned by + ``nltk.corpus.wordnet_ic.ic()``). + :return: A float score denoting the similarity of the two ``Synset`` + objects, in the range 0 to 1. + """ + + ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) + return (2.0 * lcs_ic) / (ic1 + ic2) + + def _iter_hypernym_lists(self): + """ + :return: An iterator over ``Synset`` objects that are either proper + hypernyms or instance of hypernyms of the synset. + """ + todo = [self] + seen = set() + while todo: + for synset in todo: + seen.add(synset) + yield todo + todo = [ + hypernym + for synset in todo + for hypernym in (synset.hypernyms() + synset.instance_hypernyms()) + if hypernym not in seen + ] + + def __repr__(self): + return f"{type(self).__name__}('{self._name}')" + + def _related(self, relation_symbol, sort=True): + get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset + if relation_symbol not in self._pointers: + return [] + pointer_tuples = self._pointers[relation_symbol] + r = [get_synset(pos, offset) for pos, offset in pointer_tuples] + if sort: + r.sort() + return r + + +###################################################################### +# WordNet Corpus Reader +###################################################################### + + +class WordNetCorpusReader(CorpusReader): + """ + A corpus reader used to access wordnet or its variants. + """ + + _ENCODING = "utf8" + + # { Part-of-speech constants + ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" + # } + + # { Filename constants + _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"} + # } + + # { Part of speech constants + _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} + _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) + # } + + #: A list of file identifiers for all the fileids used by this + #: corpus reader. + _FILES = ( + "cntlist.rev", + "lexnames", + "index.sense", + "index.adj", + "index.adv", + "index.noun", + "index.verb", + "data.adj", + "data.adv", + "data.noun", + "data.verb", + "adj.exc", + "adv.exc", + "noun.exc", + "verb.exc", + ) + + def __init__(self, root, omw_reader): + """ + Construct a new wordnet corpus reader, with the given root + directory. + """ + + super().__init__(root, self._FILES, encoding=self._ENCODING) + + # A index that provides the file offset + # Map from lemma -> pos -> synset_index -> offset + self._lemma_pos_offset_map = defaultdict(dict) + + # A cache so we don't have to reconstruct synsets + # Map from pos -> offset -> synset + self._synset_offset_cache = defaultdict(dict) + + # A lookup for the maximum depth of each part of speech. Useful for + # the lch similarity metric. + self._max_depth = defaultdict(dict) + + # Corpus reader containing omw data. + self._omw_reader = omw_reader + + # Corpus reader containing extended_omw data. 
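+ # (None until add_exomw() is called.)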
+ self._exomw_reader = None + + self.provenances = defaultdict(str) + self.provenances["eng"] = "" + + if self._omw_reader is None: + warnings.warn( + "The multilingual functions are not available with this Wordnet version" + ) + + self.omw_langs = set() + + # A cache to store the wordnet data of multiple languages + self._lang_data = defaultdict(list) + + self._data_file_map = {} + self._exception_map = {} + self._lexnames = [] + self._key_count_file = None + self._key_synset_file = None + + # Load the lexnames + with self.open("lexnames") as fp: + for i, line in enumerate(fp): + index, lexname, _ = line.split() + assert int(index) == i + self._lexnames.append(lexname) + + # Load the indices for lemmas and synset offsets + self._load_lemma_pos_offset_map() + + # load the exception file data into memory + self._load_exception_map() + + self.nomap = [] + self.splits = {} + + # map from WordNet 3.0 for OMW data + self.map30 = self.map_wn30() + + # Language data attributes + self.lg_attrs = ["lemma", "none", "def", "exe"] + + def index_sense(self, version=None): + """Read sense key to synset id mapping from index.sense file in corpus directory""" + fn = "index.sense" + if version: + from nltk.corpus import CorpusReader, LazyCorpusLoader + + ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn) + else: + ixreader = self + with ixreader.open(fn) as fp: + sensekey_map = {} + for line in fp: + fields = line.strip().split() + sensekey = fields[0] + pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])] + sensekey_map[sensekey] = f"{fields[1]}-{pos}" + return sensekey_map + + def map_to_many(self): + sensekey_map1 = self.index_sense("wordnet") + sensekey_map2 = self.index_sense() + synset_to_many = {} + for synsetid in set(sensekey_map1.values()): + synset_to_many[synsetid] = [] + for sensekey in set(sensekey_map1.keys()).intersection( + set(sensekey_map2.keys()) + ): + source = sensekey_map1[sensekey] + target = sensekey_map2[sensekey] + synset_to_many[source].append(target) + return synset_to_many + + def map_to_one(self): + synset_to_many = self.map_to_many() + synset_to_one = {} + for source in synset_to_many: + candidates_bag = synset_to_many[source] + if candidates_bag: + candidates_set = set(candidates_bag) + if len(candidates_set) == 1: + target = candidates_bag[0] + else: + counts = [] + for candidate in candidates_set: + counts.append((candidates_bag.count(candidate), candidate)) + self.splits[source] = counts + target = max(counts)[1] + synset_to_one[source] = target + if source[-1] == "s": + # Add a mapping from "a" to target for applications like omw, + # where only Lithuanian and Slovak use the "s" ss_type. 
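+ # For example, an id like "00001740-s" (illustrative) would also
+ # get an entry under "00001740-a".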
+ synset_to_one[f"{source[:-1]}a"] = target + else: + self.nomap.append(source) + return synset_to_one + + def map_wn30(self): + """Mapping from Wordnet 3.0 to currently loaded Wordnet version""" + if self.get_version() == "3.0": + return None + else: + return self.map_to_one() + + # Open Multilingual WordNet functions, contributed by + # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn + + def of2ss(self, of): + """take an id and return the synsets""" + return self.synset_from_pos_and_offset(of[-1], int(of[:8])) + + def ss2of(self, ss): + """return the ID of the synset""" + if ss: + return f"{ss.offset():08d}-{ss.pos()}" + + def _load_lang_data(self, lang): + """load the wordnet data of the requested language from the file to + the cache, _lang_data""" + + if lang in self._lang_data: + return + + if self._omw_reader and not self.omw_langs: + self.add_omw() + + if lang not in self.langs(): + raise WordNetError("Language is not supported.") + + if self._exomw_reader and lang not in self.omw_langs: + reader = self._exomw_reader + else: + reader = self._omw_reader + + prov = self.provenances[lang] + if prov in ["cldr", "wikt"]: + prov2 = prov + else: + prov2 = "data" + + with reader.open(f"{prov}/wn-{prov2}-{lang.split('_')[0]}.tab") as fp: + self.custom_lemmas(fp, lang) + self.disable_custom_lemmas(lang) + + def add_provs(self, reader): + """Add languages from Multilingual Wordnet to the provenance dictionary""" + fileids = reader.fileids() + for fileid in fileids: + prov, langfile = os.path.split(fileid) + file_name, file_extension = os.path.splitext(langfile) + if file_extension == ".tab": + lang = file_name.split("-")[-1] + if lang in self.provenances or prov in ["cldr", "wikt"]: + # We already have another resource for this lang, + # so we need to further specify the lang id: + lang = f"{lang}_{prov}" + self.provenances[lang] = prov + + def add_omw(self): + self.add_provs(self._omw_reader) + self.omw_langs = set(self.provenances.keys()) + + def add_exomw(self): + """ + Add languages from Extended OMW + + >>> import nltk + >>> from nltk.corpus import wordnet as wn + >>> wn.add_exomw() + >>> print(wn.synset('intrinsically.r.01').lemmas(lang="eng_wikt")) + [Lemma('intrinsically.r.01.per_se'), Lemma('intrinsically.r.01.as_such')] + """ + from nltk.corpus import extended_omw + + self.add_omw() + self._exomw_reader = extended_omw + self.add_provs(self._exomw_reader) + + def langs(self): + """return a list of languages supported by Multilingual Wordnet""" + return list(self.provenances.keys()) + + def _load_lemma_pos_offset_map(self): + for suffix in self._FILEMAP.values(): + + # parse each line of the file (ignoring comment lines) + with self.open("index.%s" % suffix) as fp: + for i, line in enumerate(fp): + if line.startswith(" "): + continue + + _iter = iter(line.split()) + + def _next_token(): + return next(_iter) + + try: + + # get the lemma and part-of-speech + lemma = _next_token() + pos = _next_token() + + # get the number of synsets for this lemma + n_synsets = int(_next_token()) + assert n_synsets > 0 + + # get and ignore the pointer symbols for all synsets of + # this lemma + n_pointers = int(_next_token()) + [_next_token() for _ in range(n_pointers)] + + # same as number of synsets + n_senses = int(_next_token()) + assert n_synsets == n_senses + + # get and ignore number of senses ranked according to + # frequency + _next_token() + + # get synset offsets + synset_offsets = [int(_next_token()) for _ in range(n_synsets)] + + # raise more informative error with file name 
and line number + except (AssertionError, ValueError) as e: + tup = ("index.%s" % suffix), (i + 1), e + raise WordNetError("file %s, line %i: %s" % tup) from e + + # map lemmas and parts of speech to synsets + self._lemma_pos_offset_map[lemma][pos] = synset_offsets + if pos == ADJ: + self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets + + def _load_exception_map(self): + # load the exception file data into memory + for pos, suffix in self._FILEMAP.items(): + self._exception_map[pos] = {} + with self.open("%s.exc" % suffix) as fp: + for line in fp: + terms = line.split() + self._exception_map[pos][terms[0]] = terms[1:] + self._exception_map[ADJ_SAT] = self._exception_map[ADJ] + + def _compute_max_depth(self, pos, simulate_root): + """ + Compute the max depth for the given part of speech. This is + used by the lch similarity metric. + """ + depth = 0 + for ii in self.all_synsets(pos): + try: + depth = max(depth, ii.max_depth()) + except RuntimeError: + print(ii) + if simulate_root: + depth += 1 + self._max_depth[pos] = depth + + def get_version(self): + fh = self._data_file(ADJ) + fh.seek(0) + for line in fh: + match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line) + if match is not None: + version = match.group(1) + fh.seek(0) + return version + + ############################################################# + # Loading Lemmas + ############################################################# + + def lemma(self, name, lang="eng"): + """Return lemma object that matches the name""" + # cannot simply split on first '.', + # e.g.: '.45_caliber.a.01..45_caliber' + separator = SENSENUM_RE.search(name).end() + + synset_name, lemma_name = name[: separator - 1], name[separator:] + + synset = self.synset(synset_name) + for lemma in synset.lemmas(lang): + if lemma._name == lemma_name: + return lemma + raise WordNetError(f"No lemma {lemma_name!r} in {synset_name!r}") + + def lemma_from_key(self, key): + # Keys are case sensitive and always lower-case + key = key.lower() + + lemma_name, lex_sense = key.split("%") + pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":") + pos = self._pos_names[int(pos_number)] + + # open the key -> synset file if necessary + if self._key_synset_file is None: + self._key_synset_file = self.open("index.sense") + + # Find the synset for the lemma. 
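+ # index.sense is sorted by sense key, so a binary search over the
+ # open file is sufficient here.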
+ synset_line = _binary_search_file(self._key_synset_file, key) + if not synset_line: + raise WordNetError("No synset found for key %r" % key) + offset = int(synset_line.split()[1]) + synset = self.synset_from_pos_and_offset(pos, offset) + # return the corresponding lemma + for lemma in synset._lemmas: + if lemma._key == key: + return lemma + raise WordNetError("No lemma found for for key %r" % key) + + ############################################################# + # Loading Synsets + ############################################################# + def synset(self, name): + # split name into lemma, part of speech and synset number + lemma, pos, synset_index_str = name.lower().rsplit(".", 2) + synset_index = int(synset_index_str) - 1 + + # get the offset for this synset + try: + offset = self._lemma_pos_offset_map[lemma][pos][synset_index] + except KeyError as e: + raise WordNetError(f"No lemma {lemma!r} with part of speech {pos!r}") from e + except IndexError as e: + n_senses = len(self._lemma_pos_offset_map[lemma][pos]) + raise WordNetError( + f"Lemma {lemma!r} with part of speech {pos!r} only " + f"has {n_senses} {'sense' if n_senses == 1 else 'senses'}" + ) from e + + # load synset information from the appropriate file + synset = self.synset_from_pos_and_offset(pos, offset) + + # some basic sanity checks on loaded attributes + if pos == "s" and synset._pos == "a": + message = ( + "Adjective satellite requested but only plain " + "adjective found for lemma %r" + ) + raise WordNetError(message % lemma) + assert synset._pos == pos or (pos == "a" and synset._pos == "s") + + # Return the synset object. + return synset + + def _data_file(self, pos): + """ + Return an open file pointer for the data file for the given + part of speech. + """ + if pos == ADJ_SAT: + pos = ADJ + if self._data_file_map.get(pos) is None: + fileid = "data.%s" % self._FILEMAP[pos] + self._data_file_map[pos] = self.open(fileid) + return self._data_file_map[pos] + + def synset_from_pos_and_offset(self, pos, offset): + """ + - pos: The synset's part of speech, matching one of the module level + attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v'). + - offset: The byte offset of this synset in the WordNet dict file + for this pos. 
+ + >>> from nltk.corpus import wordnet as wn + >>> print(wn.synset_from_pos_and_offset('n', 1740)) + Synset('entity.n.01') + """ + # Check to see if the synset is in the cache + if offset in self._synset_offset_cache[pos]: + return self._synset_offset_cache[pos][offset] + + data_file = self._data_file(pos) + data_file.seek(offset) + data_file_line = data_file.readline() + # If valid, the offset equals the 8-digit 0-padded integer found at the start of the line: + line_offset = data_file_line[:8] + if ( + line_offset.isalnum() + and line_offset == f"{'0'*(8-len(str(offset)))}{str(offset)}" + ): + synset = self._synset_from_pos_and_line(pos, data_file_line) + assert synset._offset == offset + self._synset_offset_cache[pos][offset] = synset + else: + synset = None + warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.") + data_file.seek(0) + return synset + + @deprecated("Use public method synset_from_pos_and_offset() instead") + def _synset_from_pos_and_offset(self, *args, **kwargs): + """ + Hack to help people like the readers of + https://stackoverflow.com/a/27145655/1709587 + who were using this function before it was officially a public method + """ + return self.synset_from_pos_and_offset(*args, **kwargs) + + def _synset_from_pos_and_line(self, pos, data_file_line): + # Construct a new (empty) synset. + synset = Synset(self) + + # parse the entry for this synset + try: + + # parse out the definitions and examples from the gloss + columns_str, gloss = data_file_line.strip().split("|") + definition = re.sub(r"[\"].*?[\"]", "", gloss).strip() + examples = re.findall(r'"([^"]*)"', gloss) + for example in examples: + synset._examples.append(example) + + synset._definition = definition.strip("; ") + + # split the other info into fields + _iter = iter(columns_str.split()) + + def _next_token(): + return next(_iter) + + # get the offset + synset._offset = int(_next_token()) + + # determine the lexicographer file name + lexname_index = int(_next_token()) + synset._lexname = self._lexnames[lexname_index] + + # get the part of speech + synset._pos = _next_token() + + # create Lemma objects for each lemma + n_lemmas = int(_next_token(), 16) + for _ in range(n_lemmas): + # get the lemma name + lemma_name = _next_token() + # get the lex_id (used for sense_keys) + lex_id = int(_next_token(), 16) + # If the lemma has a syntactic marker, extract it. 
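+ # e.g. a data-file lemma like "galore(ip)" splits into "galore" and
+ # the marker "(ip)" (immediately postnominal position).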
+ m = re.match(r"(.*?)(\(.*\))?$", lemma_name) + lemma_name, syn_mark = m.groups() + # create the lemma object + lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark) + synset._lemmas.append(lemma) + synset._lemma_names.append(lemma._name) + + # collect the pointer tuples + n_pointers = int(_next_token()) + for _ in range(n_pointers): + symbol = _next_token() + offset = int(_next_token()) + pos = _next_token() + lemma_ids_str = _next_token() + if lemma_ids_str == "0000": + synset._pointers[symbol].add((pos, offset)) + else: + source_index = int(lemma_ids_str[:2], 16) - 1 + target_index = int(lemma_ids_str[2:], 16) - 1 + source_lemma_name = synset._lemmas[source_index]._name + lemma_pointers = synset._lemma_pointers + tups = lemma_pointers[source_lemma_name, symbol] + tups.append((pos, offset, target_index)) + + # read the verb frames + try: + frame_count = int(_next_token()) + except StopIteration: + pass + else: + for _ in range(frame_count): + # read the plus sign + plus = _next_token() + assert plus == "+" + # read the frame and lemma number + frame_number = int(_next_token()) + frame_string_fmt = VERB_FRAME_STRINGS[frame_number] + lemma_number = int(_next_token(), 16) + # lemma number of 00 means all words in the synset + if lemma_number == 0: + synset._frame_ids.append(frame_number) + for lemma in synset._lemmas: + lemma._frame_ids.append(frame_number) + lemma._frame_strings.append(frame_string_fmt % lemma._name) + # only a specific word in the synset + else: + lemma = synset._lemmas[lemma_number - 1] + lemma._frame_ids.append(frame_number) + lemma._frame_strings.append(frame_string_fmt % lemma._name) + + # raise a more informative error with line text + except ValueError as e: + raise WordNetError(f"line {data_file_line!r}: {e}") from e + + # set sense keys for Lemma objects - note that this has to be + # done afterwards so that the relations are available + for lemma in synset._lemmas: + if synset._pos == ADJ_SAT: + head_lemma = synset.similar_tos()[0]._lemmas[0] + head_name = head_lemma._name + head_id = "%02d" % head_lemma._lex_id + else: + head_name = head_id = "" + tup = ( + lemma._name, + WordNetCorpusReader._pos_numbers[synset._pos], + lemma._lexname_index, + lemma._lex_id, + head_name, + head_id, + ) + lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower() + + # the canonical name is based on the first lemma + lemma_name = synset._lemmas[0]._name.lower() + offsets = self._lemma_pos_offset_map[lemma_name][synset._pos] + sense_index = offsets.index(synset._offset) + tup = lemma_name, synset._pos, sense_index + 1 + synset._name = "%s.%s.%02i" % tup + + return synset + + def synset_from_sense_key(self, sense_key): + """ + Retrieves synset based on a given sense_key. Sense keys can be + obtained from lemma.key() + + From https://wordnet.princeton.edu/documentation/senseidx5wn: + A sense_key is represented as:: + + lemma % lex_sense (e.g. 
'dog%1:18:01::') + + where lex_sense is encoded as:: + + ss_type:lex_filenum:lex_id:head_word:head_id + + :lemma: ASCII text of word/collocation, in lower case + :ss_type: synset type for the sense (1 digit int) + The synset type is encoded as follows:: + + 1 NOUN + 2 VERB + 3 ADJECTIVE + 4 ADVERB + 5 ADJECTIVE SATELLITE + :lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) + :lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) + :head_word: lemma of the first word in satellite's head synset + Only used if sense is in an adjective satellite synset + :head_id: uniquely identifies sense in a lexicographer file when paired with head_word + Only used if head_word is present (2 digit int) + + >>> import nltk + >>> from nltk.corpus import wordnet as wn + >>> print(wn.synset_from_sense_key("drive%1:04:03::")) + Synset('drive.n.06') + + >>> print(wn.synset_from_sense_key("driving%1:04:03::")) + Synset('drive.n.06') + """ + return self.lemma_from_key(sense_key).synset() + + ############################################################# + # Retrieve synsets and lemmas. + ############################################################# + + def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True): + """Load all synsets with a given lemma and part of speech tag. + If no pos is specified, all synsets for all parts of speech + will be loaded. + If lang is specified, all the synsets associated with the lemma name + of that language will be returned. + """ + lemma = lemma.lower() + + if lang == "eng": + get_synset = self.synset_from_pos_and_offset + index = self._lemma_pos_offset_map + if pos is None: + pos = POS_LIST + return [ + get_synset(p, offset) + for p in pos + for form in self._morphy(lemma, p, check_exceptions) + for offset in index[form].get(p, []) + ] + + else: + self._load_lang_data(lang) + synset_list = [] + if lemma in self._lang_data[lang][1]: + for l in self._lang_data[lang][1][lemma]: + if pos is not None and l[-1] != pos: + continue + synset_list.append(self.of2ss(l)) + return synset_list + + def lemmas(self, lemma, pos=None, lang="eng"): + """Return all Lemma objects with a name matching the specified lemma + name and part of speech tag. Matches any part of speech tag if none is + specified.""" + + lemma = lemma.lower() + if lang == "eng": + return [ + lemma_obj + for synset in self.synsets(lemma, pos) + for lemma_obj in synset.lemmas() + if lemma_obj.name().lower() == lemma + ] + + else: + self._load_lang_data(lang) + lemmas = [] + syn = self.synsets(lemma, lang=lang) + for s in syn: + if pos is not None and s.pos() != pos: + continue + for lemma_obj in s.lemmas(lang=lang): + if lemma_obj.name().lower() == lemma: + lemmas.append(lemma_obj) + return lemmas + + def all_lemma_names(self, pos=None, lang="eng"): + """Return all lemma names for all synsets for the given + part of speech tag and language or languages. 
If pos is + not specified, all synsets for all parts of speech will + be used.""" + + if lang == "eng": + if pos is None: + return iter(self._lemma_pos_offset_map) + else: + return ( + lemma + for lemma in self._lemma_pos_offset_map + if pos in self._lemma_pos_offset_map[lemma] + ) + else: + self._load_lang_data(lang) + lemma = [] + for i in self._lang_data[lang][0]: + if pos is not None and i[-1] != pos: + continue + lemma.extend(self._lang_data[lang][0][i]) + + lemma = iter(set(lemma)) + return lemma + + def all_omw_synsets(self, pos=None, lang=None): + if lang not in self.langs(): + return None + self._load_lang_data(lang) + for of in self._lang_data[lang][0]: + if not pos or of[-1] == pos: + ss = self.of2ss(of) + if ss: + yield ss + + # else: + # A few OMW offsets don't exist in Wordnet 3.0. + # warnings.warn(f"Language {lang}: no synset found for {of}") + + def all_synsets(self, pos=None, lang="eng"): + """Iterate over all synsets with a given part of speech tag. + If no pos is specified, all synsets for all parts of speech + will be loaded. + """ + if lang == "eng": + return self.all_eng_synsets(pos=pos) + else: + return self.all_omw_synsets(pos=pos, lang=lang) + + def all_eng_synsets(self, pos=None): + if pos is None: + pos_tags = self._FILEMAP.keys() + else: + pos_tags = [pos] + + cache = self._synset_offset_cache + from_pos_and_line = self._synset_from_pos_and_line + + # generate all synsets for each part of speech + for pos_tag in pos_tags: + # Open the file for reading. Note that we can not re-use + # the file pointers from self._data_file_map here, because + # we're defining an iterator, and those file pointers might + # be moved while we're not looking. + if pos_tag == ADJ_SAT: + pos_file = ADJ + else: + pos_file = pos_tag + fileid = "data.%s" % self._FILEMAP[pos_file] + data_file = self.open(fileid) + + try: + # generate synsets for each line in the POS file + offset = data_file.tell() + line = data_file.readline() + while line: + if not line[0].isspace(): + if offset in cache[pos_tag]: + # See if the synset is cached + synset = cache[pos_tag][offset] + else: + # Otherwise, parse the line + synset = from_pos_and_line(pos_tag, line) + cache[pos_tag][offset] = synset + + # adjective satellites are in the same file as + # adjectives so only yield the synset if it's actually + # a satellite + if pos_tag == ADJ_SAT and synset._pos == ADJ_SAT: + yield synset + # for all other POS tags, yield all synsets (this means + # that adjectives also include adjective satellites) + elif pos_tag != ADJ_SAT: + yield synset + offset = data_file.tell() + line = data_file.readline() + + # close the extra file handle we opened + except: + data_file.close() + raise + else: + data_file.close() + + def words(self, lang="eng"): + """return lemmas of the given language as list of words""" + return self.all_lemma_names(lang=lang) + + def synonyms(self, word, lang="eng"): + """return nested list with the synonyms of the different senses of word in the given language""" + return [ + sorted(list(set(ss.lemma_names(lang=lang)) - {word})) + for ss in self.synsets(word, lang=lang) + ] + + def doc(self, file="README", lang="eng"): + """Return the contents of readme, license or citation file + use lang=lang to get the file for an individual language""" + if lang == "eng": + reader = self + else: + reader = self._omw_reader + if lang in self.langs(): + file = f"{os.path.join(self.provenances[lang],file)}" + try: + with reader.open(file) as fp: + return fp.read() + except: + if lang in self._lang_data: + return 
f"Cannot determine {file} for {lang}" + else: + return f"Language {lang} is not supported." + + def license(self, lang="eng"): + """Return the contents of LICENSE (for omw) + use lang=lang to get the license for an individual language""" + return self.doc(file="LICENSE", lang=lang) + + def readme(self, lang="eng"): + """Return the contents of README (for omw) + use lang=lang to get the readme for an individual language""" + return self.doc(file="README", lang=lang) + + def citation(self, lang="eng"): + """Return the contents of citation.bib file (for omw) + use lang=lang to get the citation for an individual language""" + return self.doc(file="citation.bib", lang=lang) + + ############################################################# + # Misc + ############################################################# + def lemma_count(self, lemma): + """Return the frequency count for this Lemma""" + # Currently, count is only work for English + if lemma._lang != "eng": + return 0 + # open the count file if we haven't already + if self._key_count_file is None: + self._key_count_file = self.open("cntlist.rev") + # find the key in the counts file and return the count + line = _binary_search_file(self._key_count_file, lemma._key) + if line: + return int(line.rsplit(" ", 1)[-1]) + else: + return 0 + + def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True): + return synset1.path_similarity(synset2, verbose, simulate_root) + + path_similarity.__doc__ = Synset.path_similarity.__doc__ + + def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True): + return synset1.lch_similarity(synset2, verbose, simulate_root) + + lch_similarity.__doc__ = Synset.lch_similarity.__doc__ + + def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True): + return synset1.wup_similarity(synset2, verbose, simulate_root) + + wup_similarity.__doc__ = Synset.wup_similarity.__doc__ + + def res_similarity(self, synset1, synset2, ic, verbose=False): + return synset1.res_similarity(synset2, ic, verbose) + + res_similarity.__doc__ = Synset.res_similarity.__doc__ + + def jcn_similarity(self, synset1, synset2, ic, verbose=False): + return synset1.jcn_similarity(synset2, ic, verbose) + + jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ + + def lin_similarity(self, synset1, synset2, ic, verbose=False): + return synset1.lin_similarity(synset2, ic, verbose) + + lin_similarity.__doc__ = Synset.lin_similarity.__doc__ + + ############################################################# + # Morphy + ############################################################# + # Morphy, adapted from Oliver Steele's pywordnet + def morphy(self, form, pos=None, check_exceptions=True): + """ + Find a possible base form for the given form, with the given + part of speech, by checking WordNet's list of exceptional + forms, and by recursively stripping affixes for this part of + speech until a form in WordNet is found. 
+ + >>> from nltk.corpus import wordnet as wn + >>> print(wn.morphy('dogs')) + dog + >>> print(wn.morphy('churches')) + church + >>> print(wn.morphy('aardwolves')) + aardwolf + >>> print(wn.morphy('abaci')) + abacus + >>> wn.morphy('hardrock', wn.ADV) + >>> print(wn.morphy('book', wn.NOUN)) + book + >>> wn.morphy('book', wn.ADJ) + """ + + if pos is None: + morphy = self._morphy + analyses = chain(a for p in POS_LIST for a in morphy(form, p)) + else: + analyses = self._morphy(form, pos, check_exceptions) + + # get the first one we find + first = list(islice(analyses, 1)) + if len(first) == 1: + return first[0] + else: + return None + + MORPHOLOGICAL_SUBSTITUTIONS = { + NOUN: [ + ("s", ""), + ("ses", "s"), + ("ves", "f"), + ("xes", "x"), + ("zes", "z"), + ("ches", "ch"), + ("shes", "sh"), + ("men", "man"), + ("ies", "y"), + ], + VERB: [ + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", ""), + ], + ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")], + ADV: [], + } + + MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ] + + def _morphy(self, form, pos, check_exceptions=True): + # from jordanbg: + # Given an original string x + # 1. Apply rules once to the input to get y1, y2, y3, etc. + # 2. Return all that are in the database + # 3. If there are no matches, keep applying rules until you either + # find a match or you can't go any further + + exceptions = self._exception_map[pos] + substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] + + def apply_rules(forms): + return [ + form[: -len(old)] + new + for form in forms + for old, new in substitutions + if form.endswith(old) + ] + + def filter_forms(forms): + result = [] + seen = set() + for form in forms: + if form in self._lemma_pos_offset_map: + if pos in self._lemma_pos_offset_map[form]: + if form not in seen: + result.append(form) + seen.add(form) + return result + + # 0. Check the exception lists + if check_exceptions: + if form in exceptions: + return filter_forms([form] + exceptions[form]) + + # 1. Apply rules once to the input to get y1, y2, y3, etc. + forms = apply_rules([form]) + + # 2. Return all that are in the database (and check the original too) + results = filter_forms([form] + forms) + if results: + return results + + # 3. If there are no matches, keep applying rules until we find a match + while forms: + forms = apply_rules(forms) + results = filter_forms(forms) + if results: + return results + + # Return an empty list if we can't find anything + return [] + + ############################################################# + # Create information content from corpus + ############################################################# + def ic(self, corpus, weight_senses_equally=False, smoothing=1.0): + """ + Creates an information content lookup dictionary from a corpus. + + :type corpus: CorpusReader + :param corpus: The corpus from which we create an information + content dictionary. + :type weight_senses_equally: bool + :param weight_senses_equally: If this is True, gives all + possible senses equal weight rather than dividing by the + number of possible senses. (If a word has 3 synses, each + sense gets 0.3333 per appearance when this is False, 1.0 when + it is true.) 
+ :param smoothing: How much do we smooth synset counts (default is 1.0) + :type smoothing: float + :return: An information content dictionary + """ + counts = FreqDist() + for ww in corpus.words(): + counts[ww] += 1 + + ic = {} + for pp in POS_LIST: + ic[pp] = defaultdict(float) + + # Initialize the counts with the smoothing value + if smoothing > 0.0: + for pp in POS_LIST: + ic[pp][0] = smoothing + for ss in self.all_synsets(): + pos = ss._pos + if pos == ADJ_SAT: + pos = ADJ + ic[pos][ss._offset] = smoothing + + for ww in counts: + possible_synsets = self.synsets(ww) + if len(possible_synsets) == 0: + continue + + # Distribute weight among possible synsets + weight = float(counts[ww]) + if not weight_senses_equally: + weight /= float(len(possible_synsets)) + + for ss in possible_synsets: + pos = ss._pos + if pos == ADJ_SAT: + pos = ADJ + for level in ss._iter_hypernym_lists(): + for hh in level: + ic[pos][hh._offset] += weight + # Add the weight to the root + ic[pos][0] += weight + return ic + + def custom_lemmas(self, tab_file, lang): + """ + Reads a custom tab file containing mappings of lemmas in the given + language to Princeton WordNet 3.0 synset offsets, allowing NLTK's + WordNet functions to then be used with that language. + + See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for + documentation on the Multilingual WordNet tab file format. + + :param tab_file: Tab file as a file or file-like object + :type: lang str + :param: lang ISO 639-3 code of the language of the tab file + """ + lg = lang.split("_")[0] + if len(lg) != 3: + raise ValueError("lang should be a (3 character) ISO 639-3 code") + self._lang_data[lang] = [ + defaultdict(list), + defaultdict(list), + defaultdict(list), + defaultdict(list), + ] + for line in tab_file.readlines(): + if isinstance(line, bytes): + # Support byte-stream files (e.g. 
as returned by Python 2's + # open() function) as well as text-stream ones + line = line.decode("utf-8") + if not line.startswith("#"): + triple = line.strip().split("\t") + if len(triple) < 3: + continue + offset_pos, label = triple[:2] + val = triple[-1] + if self.map30: + if offset_pos in self.map30: + # Map offset_pos to current Wordnet version: + offset_pos = self.map30[offset_pos] + else: + # Some OMW offsets were never in Wordnet: + if ( + offset_pos not in self.nomap + and offset_pos.replace("a", "s") not in self.nomap + ): + warnings.warn( + f"{lang}: invalid offset {offset_pos} in '{line}'" + ) + continue + elif offset_pos[-1] == "a": + wnss = self.of2ss(offset_pos) + if wnss and wnss.pos() == "s": # Wordnet pos is "s" + # Label OMW adjective satellites back to their Wordnet pos ("s") + offset_pos = self.ss2of(wnss) + pair = label.split(":") + attr = pair[-1] + if len(pair) == 1 or pair[0] == lg: + if attr == "lemma": + val = val.strip().replace(" ", "_") + self._lang_data[lang][1][val.lower()].append(offset_pos) + if attr in self.lg_attrs: + self._lang_data[lang][self.lg_attrs.index(attr)][ + offset_pos + ].append(val) + + def disable_custom_lemmas(self, lang): + """prevent synsets from being mistakenly added""" + for n in range(len(self.lg_attrs)): + self._lang_data[lang][n].default_factory = None + + ###################################################################### + # Visualize WordNet relation graphs using Graphviz + ###################################################################### + + def digraph( + self, + inputs, + rel=lambda s: s.hypernyms(), + pos=None, + maxdepth=-1, + shapes=None, + attr=None, + verbose=False, + ): + """ + Produce a graphical representation from 'inputs' (a list of + start nodes, which can be a mix of Synsets, Lemmas and/or words), + and a synset relation, for drawing with the 'dot' graph visualisation + program from the Graphviz package. + + Return a string in the DOT graph file language, which can then be + converted to an image by nltk.parse.dependencygraph.dot2img(dot_string). 
+ + Optional Parameters: + :rel: Wordnet synset relation + :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r' + :maxdepth: limit the longest path + :shapes: dictionary of strings that trigger a specified shape + :attr: dictionary with global graph attributes + :verbose: warn about cycles + + >>> from nltk.corpus import wordnet as wn + >>> print(wn.digraph([wn.synset('dog.n.01')])) + digraph G { + "Synset('animal.n.01')" -> "Synset('organism.n.01')"; + "Synset('canine.n.02')" -> "Synset('carnivore.n.01')"; + "Synset('carnivore.n.01')" -> "Synset('placental.n.01')"; + "Synset('chordate.n.01')" -> "Synset('animal.n.01')"; + "Synset('dog.n.01')" -> "Synset('canine.n.02')"; + "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')"; + "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')"; + "Synset('living_thing.n.01')" -> "Synset('whole.n.02')"; + "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')"; + "Synset('object.n.01')" -> "Synset('physical_entity.n.01')"; + "Synset('organism.n.01')" -> "Synset('living_thing.n.01')"; + "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')"; + "Synset('placental.n.01')" -> "Synset('mammal.n.01')"; + "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')"; + "Synset('whole.n.02')" -> "Synset('object.n.01')"; + } + + """ + from nltk.util import edge_closure, edges2dot + + synsets = set() + edges = set() + if not shapes: + shapes = dict() + if not attr: + attr = dict() + + def add_lemma(lem): + ss = lem.synset() + synsets.add(ss) + edges.add((lem, ss)) + + for node in inputs: + typ = type(node) + if typ == Synset: + synsets.add(node) + elif typ == Lemma: + add_lemma(node) + elif typ == str: + for lemma in self.lemmas(node, pos): + add_lemma(lemma) + + for ss in synsets: + edges = edges.union(edge_closure(ss, rel, maxdepth, verbose)) + dot_string = edges2dot(sorted(list(edges)), shapes=shapes, attr=attr) + return dot_string + + +###################################################################### +# WordNet Information Content Corpus Reader +###################################################################### + + +class WordNetICCorpusReader(CorpusReader): + """ + A corpus reader for the WordNet information content corpus. + """ + + def __init__(self, root, fileids): + CorpusReader.__init__(self, root, fileids, encoding="utf8") + + # this load function would be more efficient if the data was pickled + # Note that we can't use NLTK's frequency distributions because + # synsets are overlapping (each instance of a synset also counts + # as an instance of its hypernyms) + def ic(self, icfile): + """ + Load an information content file from the wordnet_ic corpus + and return a dictionary. This dictionary has just two keys, + NOUN and VERB, whose values are dictionaries that map from + synsets to information content values. + + :type icfile: str + :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") + :return: An information content dictionary + """ + ic = {} + ic[NOUN] = defaultdict(float) + ic[VERB] = defaultdict(float) + with self.open(icfile) as fp: + for num, line in enumerate(fp): + if num == 0: # skip the header + continue + fields = line.split() + offset = int(fields[0][:-1]) + value = float(fields[1]) + pos = _get_pos(fields[0]) + if len(fields) == 3 and fields[2] == "ROOT": + # Store root count. 
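+                    # Each data line has the form "<offset><p> <value> [ROOT]",
+                    # where <p> is "n" or "v".  ROOT lines accumulate in
+                    # ic[pos][0], which information_content() later uses as
+                    # the normalizing total.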
+ ic[pos][0] += value + if value != 0: + ic[pos][offset] = value + return ic + + +###################################################################### +# Similarity metrics +###################################################################### + +# TODO: Add in the option to manually add a new root node; this will be +# useful for verb similarity as there exist multiple verb taxonomies. + +# More information about the metrics is available at +# http://marimba.d.umn.edu/similarity/measures.html + + +def path_similarity(synset1, synset2, verbose=False, simulate_root=True): + return synset1.path_similarity( + synset2, verbose=verbose, simulate_root=simulate_root + ) + + +def lch_similarity(synset1, synset2, verbose=False, simulate_root=True): + return synset1.lch_similarity(synset2, verbose=verbose, simulate_root=simulate_root) + + +def wup_similarity(synset1, synset2, verbose=False, simulate_root=True): + return synset1.wup_similarity(synset2, verbose=verbose, simulate_root=simulate_root) + + +def res_similarity(synset1, synset2, ic, verbose=False): + return synset1.res_similarity(synset2, ic, verbose=verbose) + + +def jcn_similarity(synset1, synset2, ic, verbose=False): + return synset1.jcn_similarity(synset2, ic, verbose=verbose) + + +def lin_similarity(synset1, synset2, ic, verbose=False): + return synset1.lin_similarity(synset2, ic, verbose=verbose) + + +path_similarity.__doc__ = Synset.path_similarity.__doc__ +lch_similarity.__doc__ = Synset.lch_similarity.__doc__ +wup_similarity.__doc__ = Synset.wup_similarity.__doc__ +res_similarity.__doc__ = Synset.res_similarity.__doc__ +jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ +lin_similarity.__doc__ = Synset.lin_similarity.__doc__ + + +def _lcs_ic(synset1, synset2, ic, verbose=False): + """ + Get the information content of the least common subsumer that has + the highest information content value. If two nodes have no + explicit common subsumer, assume that they share an artificial + root node that is the hypernym of all explicit roots. + + :type synset1: Synset + :param synset1: First input synset. + :type synset2: Synset + :param synset2: Second input synset. Must be the same part of + speech as the first synset. + :type ic: dict + :param ic: an information content object (as returned by ``load_ic()``). + :return: The information content of the two synsets and their most + informative subsumer + """ + if synset1._pos != synset2._pos: + raise WordNetError( + "Computing the least common subsumer requires " + "%s and %s to have the same part of speech." 
% (synset1, synset2) + ) + + ic1 = information_content(synset1, ic) + ic2 = information_content(synset2, ic) + subsumers = synset1.common_hypernyms(synset2) + if len(subsumers) == 0: + subsumer_ic = 0 + else: + subsumer_ic = max(information_content(s, ic) for s in subsumers) + + if verbose: + print("> LCS Subsumer by content:", subsumer_ic) + + return ic1, ic2, subsumer_ic + + +# Utility functions + + +def information_content(synset, ic): + pos = synset._pos + if pos == ADJ_SAT: + pos = ADJ + try: + icpos = ic[pos] + except KeyError as e: + msg = "Information content file has no entries for part-of-speech: %s" + raise WordNetError(msg % pos) from e + + counts = icpos[synset._offset] + if counts == 0: + return _INF + else: + return -math.log(counts / icpos[0]) + + +# get the part of speech (NOUN or VERB) from the information content record +# (each identifier has a 'n' or 'v' suffix) + + +def _get_pos(field): + if field[-1] == "n": + return NOUN + elif field[-1] == "v": + return VERB + else: + msg = ( + "Unidentified part of speech in WordNet Information Content file " + "for field %s" % field + ) + raise ValueError(msg) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py new file mode 100644 index 0000000000000000000000000000000000000000..c1479b9b1aec19148e446be5824cda5bd7ca9fe6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py @@ -0,0 +1,397 @@ +# Natural Language Toolkit: XML Corpus Reader +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Corpus reader for corpora whose documents are xml files. + +(note -- not named 'xml' to avoid conflicting w/ standard xml package) +""" + +import codecs +from xml.etree import ElementTree + +from nltk.corpus.reader.api import CorpusReader +from nltk.corpus.reader.util import * +from nltk.data import SeekableUnicodeStreamReader +from nltk.internals import ElementWrapper +from nltk.tokenize import WordPunctTokenizer + + +class XMLCorpusReader(CorpusReader): + """ + Corpus reader for corpora whose documents are xml files. + + Note that the ``XMLCorpusReader`` constructor does not take an + ``encoding`` argument, because the unicode encoding is specified by + the XML files themselves. See the XML specs for more info. + """ + + def __init__(self, root, fileids, wrap_etree=False): + self._wrap_etree = wrap_etree + CorpusReader.__init__(self, root, fileids) + + def xml(self, fileid=None): + # Make sure we have exactly one file -- no concatenating XML. + if fileid is None and len(self._fileids) == 1: + fileid = self._fileids[0] + if not isinstance(fileid, str): + raise TypeError("Expected a single file identifier string") + # Read the XML in using ElementTree. + with self.abspath(fileid).open() as fp: + elt = ElementTree.parse(fp).getroot() + # If requested, wrap it. + if self._wrap_etree: + elt = ElementWrapper(elt) + # Return the ElementTree element. + return elt + + def words(self, fileid=None): + """ + Returns all of the words and punctuation symbols in the specified file + that were in text nodes -- ie, tags are ignored. Like the xml() method, + fileid can only specify one file. 
+
+        :return: the given file's text nodes as a list of words and punctuation symbols
+        :rtype: list(str)
+        """
+
+        elt = self.xml(fileid)
+        encoding = self.encoding(fileid)
+        word_tokenizer = WordPunctTokenizer()
+        try:
+            iterator = elt.getiterator()
+        except AttributeError:
+            # Element.getiterator() was removed in Python 3.9; fall back
+            # to the modern iter() API.
+            iterator = elt.iter()
+        out = []
+
+        for node in iterator:
+            text = node.text
+            if text is not None:
+                if isinstance(text, bytes):
+                    text = text.decode(encoding)
+                toks = word_tokenizer.tokenize(text)
+                out.extend(toks)
+        return out
+
+
+class XMLCorpusView(StreamBackedCorpusView):
+    """
+    A corpus view that selects out specified elements from an XML
+    file, and provides a flat list-like interface for accessing them.
+    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
+    but may be used by subclasses of ``XMLCorpusReader``.)
+
+    Every XML corpus view has a "tag specification", indicating what
+    XML elements should be included in the view; and each (non-nested)
+    element that matches this specification corresponds to one item in
+    the view.  Tag specifications are regular expressions over tag
+    paths, where a tag path is a list of element tag names, separated
+    by '/', indicating the ancestry of the element.  Some examples:
+
+    - ``'foo'``: A top-level element whose tag is ``foo``.
+    - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
+      is a top-level element whose tag is ``foo``.
+    - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
+      in the xml tree.
+    - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
+      appearing anywhere in the xml tree.
+
+    The view items are generated from the selected XML elements via
+    the method ``handle_elt()``.  By default, this method returns the
+    element as-is (i.e., as an ElementTree object); but it can be
+    overridden, either via subclassing or via the ``elt_handler``
+    constructor parameter.
+    """
+
+    #: If true, then display debugging output to stdout when reading
+    #: blocks.
+    _DEBUG = False
+
+    #: The number of characters read at a time by this corpus reader.
+    _BLOCK_SIZE = 1024
+
+    def __init__(self, fileid, tagspec, elt_handler=None):
+        """
+        Create a new corpus view based on a specified XML file.
+
+        Note that the ``XMLCorpusView`` constructor does not take an
+        ``encoding`` argument, because the unicode encoding is
+        specified by the XML files themselves.
+
+        :type tagspec: str
+        :param tagspec: A tag specification, indicating what XML
+            elements should be included in the view.  Each non-nested
+            element that matches this specification corresponds to one
+            item in the view.
+
+        :param elt_handler: A function used to transform each element
+            to a value for the view.  If no handler is specified, then
+            ``self.handle_elt()`` is called, which returns the element
+            as an ElementTree object.  The signature of elt_handler is::
+
+                elt_handler(elt, tagspec) -> value
+        """
+        if elt_handler:
+            self.handle_elt = elt_handler
+
+        self._tagspec = re.compile(tagspec + r"\Z")
+        """The tag specification for this corpus view."""
+
+        self._tag_context = {0: ()}
+        """A dictionary mapping from file positions (as returned by
+        ``stream.seek()``) to XML contexts.
+        An XML context is a tuple of XML tag names, indicating which
+        tags have not yet been closed."""
+
+        encoding = self._detect_encoding(fileid)
+        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
+
+    def _detect_encoding(self, fileid):
+        if isinstance(fileid, PathPointer):
+            try:
+                infile = fileid.open()
+                s = infile.readline()
+            finally:
+                infile.close()
+        else:
+            with open(fileid, "rb") as infile:
+                s = infile.readline()
+        if s.startswith(codecs.BOM_UTF16_BE):
+            return "utf-16-be"
+        if s.startswith(codecs.BOM_UTF16_LE):
+            return "utf-16-le"
+        if s.startswith(codecs.BOM_UTF32_BE):
+            return "utf-32-be"
+        if s.startswith(codecs.BOM_UTF32_LE):
+            return "utf-32-le"
+        if s.startswith(codecs.BOM_UTF8):
+            return "utf-8"
+        m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
+        if m:
+            return m.group(1).decode()
+        m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
+        if m:
+            return m.group(1).decode()
+        # No encoding found -- what should the default be?
+        return "utf-8"
+
+    def handle_elt(self, elt, context):
+        """
+        Convert an element into an appropriate value for inclusion in
+        the view.  Unless overridden by a subclass or by the
+        ``elt_handler`` constructor argument, this method simply
+        returns ``elt``.
+
+        :return: The view value corresponding to ``elt``.
+
+        :type elt: ElementTree
+        :param elt: The element that should be converted.
+
+        :type context: str
+        :param context: A string composed of element tags separated by
+            forward slashes, indicating the XML context of the given
+            element.  For example, the string ``'foo/bar/baz'``
+            indicates that the element is a ``baz`` element whose
+            parent is a ``bar`` element and whose grandparent is a
+            top-level ``foo`` element.
+        """
+        return elt
+
+    #: A regular expression that matches XML fragments that do not
+    #: contain any un-closed tags.
+    _VALID_XML_RE = re.compile(
+        r"""
+        [^<]*
+        (
+          ((<!--.*?-->)                         |  # comment
+           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
+           (<[^!>][^>]*>))                         # tag or PI
+          [^<]*)*
+        \Z""",
+        re.DOTALL | re.VERBOSE,
+    )
+
+    #: A regular expression used to extract the tag name from a start tag,
+    #: end tag, or empty-elt tag string.
+    _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")
+
+    #: A regular expression used to find all start-tags, end-tags, and
+    #: empty-elt tags in an XML file.  This regexp is more lenient than
+    #: the XML spec -- e.g., it allows spaces in some places where the
+    #: spec does not.
+    _XML_PIECE = re.compile(
+        r"""
+        # Include these so we can skip them:
+        (?P<COMMENT>        <!--.*?-->                          )|
+        (?P<CDATA>          <![CDATA[.*?]]>                     )|
+        (?P<PI>             <\?.*?\?>                           )|
+        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
+        # These are the ones we actually care about:
+        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
+        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
+        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
+        re.DOTALL | re.VERBOSE,
+    )
+
+    def _read_xml_fragment(self, stream):
+        """
+        Read a string from the given stream that does not contain any
+        un-closed tags.  In particular, this function first reads a
+        block from the stream of size ``self._BLOCK_SIZE``.  It then
+        checks if that block contains an un-closed tag.  If it does,
+        then this function either backtracks to the last '<', or reads
+        another block.
+        """
+        fragment = ""
+
+        if isinstance(stream, SeekableUnicodeStreamReader):
+            startpos = stream.tell()
+        while True:
+            # Read a block and add it to the fragment.
+            xml_block = stream.read(self._BLOCK_SIZE)
+            fragment += xml_block
+
+            # Do we have a well-formed xml fragment?
+            if self._VALID_XML_RE.match(fragment):
+                return fragment
+
+            # Do we have a fragment that will never be well-formed?
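+            # (If the first angle bracket in the fragment is a stray ">",
+            # no amount of further reading can balance it.)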
+ if re.search("[<>]", fragment).group(0) == ">": + pos = stream.tell() - ( + len(fragment) - re.search("[<>]", fragment).end() + ) + raise ValueError('Unexpected ">" near char %s' % pos) + + # End of file? + if not xml_block: + raise ValueError("Unexpected end of file: tag not closed") + + # If not, then we must be in the middle of a <..tag..>. + # If appropriate, backtrack to the most recent '<' + # character. + last_open_bracket = fragment.rfind("<") + if last_open_bracket > 0: + if self._VALID_XML_RE.match(fragment[:last_open_bracket]): + if isinstance(stream, SeekableUnicodeStreamReader): + stream.seek(startpos) + stream.char_seek_forward(last_open_bracket) + else: + stream.seek(-(len(fragment) - last_open_bracket), 1) + return fragment[:last_open_bracket] + + # Otherwise, read another block. (i.e., return to the + # top of the loop.) + + def read_block(self, stream, tagspec=None, elt_handler=None): + """ + Read from ``stream`` until we find at least one element that + matches ``tagspec``, and return the result of applying + ``elt_handler`` to each element found. + """ + if tagspec is None: + tagspec = self._tagspec + if elt_handler is None: + elt_handler = self.handle_elt + + # Use a stack of strings to keep track of our context: + context = list(self._tag_context.get(stream.tell())) + assert context is not None # check this -- could it ever happen? + + elts = [] + + elt_start = None # where does the elt start + elt_depth = None # what context depth + elt_text = "" + + while elts == [] or elt_start is not None: + if isinstance(stream, SeekableUnicodeStreamReader): + startpos = stream.tell() + xml_fragment = self._read_xml_fragment(stream) + + # End of file. + if not xml_fragment: + if elt_start is None: + break + else: + raise ValueError("Unexpected end of file") + + # Process each in the xml fragment. + for piece in self._XML_PIECE.finditer(xml_fragment): + if self._DEBUG: + print("{:>25} {}".format("/".join(context)[-20:], piece.group())) + + if piece.group("START_TAG"): + name = self._XML_TAG_NAME.match(piece.group()).group(1) + # Keep context up-to-date. + context.append(name) + # Is this one of the elts we're looking for? + if elt_start is None: + if re.match(tagspec, "/".join(context)): + elt_start = piece.start() + elt_depth = len(context) + + elif piece.group("END_TAG"): + name = self._XML_TAG_NAME.match(piece.group()).group(1) + # sanity checks: + if not context: + raise ValueError("Unmatched tag " % name) + if name != context[-1]: + raise ValueError(f"Unmatched tag <{context[-1]}>...") + # Is this the end of an element? + if elt_start is not None and elt_depth == len(context): + elt_text += xml_fragment[elt_start : piece.end()] + elts.append((elt_text, "/".join(context))) + elt_start = elt_depth = None + elt_text = "" + # Keep context up-to-date + context.pop() + + elif piece.group("EMPTY_ELT_TAG"): + name = self._XML_TAG_NAME.match(piece.group()).group(1) + if elt_start is None: + if re.match(tagspec, "/".join(context) + "/" + name): + elts.append((piece.group(), "/".join(context) + "/" + name)) + + if elt_start is not None: + # If we haven't found any elements yet, then keep + # looping until we do. + if elts == []: + elt_text += xml_fragment[elt_start:] + elt_start = 0 + + # If we've found at least one element, then try + # backtracking to the start of the element that we're + # inside of. + else: + # take back the last start-tag, and return what + # we've gotten so far (elts is non-empty). 
+                    if self._DEBUG:
+                        print(" " * 36 + "(backtrack)")
+                    if isinstance(stream, SeekableUnicodeStreamReader):
+                        stream.seek(startpos)
+                        stream.char_seek_forward(elt_start)
+                    else:
+                        stream.seek(-(len(xml_fragment) - elt_start), 1)
+                    context = context[: elt_depth - 1]
+                    elt_start = elt_depth = None
+                    elt_text = ""
+
+            # Update the _tag_context dict.
+            pos = stream.tell()
+            if pos in self._tag_context:
+                assert tuple(context) == self._tag_context[pos]
+            else:
+                self._tag_context[pos] = tuple(context)
+
+        return [
+            elt_handler(
+                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
+                context,
+            )
+            for (elt, context) in elts
+        ]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..35bafdfef4f12f934de8e5e4617341fb2ba7b7a8
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py
@@ -0,0 +1,256 @@
+# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Selina Dennis
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
+English Prose (YCOE), a 1.5 million word syntactically-annotated
+corpus of Old English prose texts. The corpus is distributed by the
+Oxford Text Archive (http://www.ota.ahds.ac.uk/) and is not included
+with NLTK.
+
+The YCOE corpus is divided into 100 files, each representing
+an Old English prose text. Tags used within each text comply
+with the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
+"""
+
+import os
+import re
+
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
+from nltk.corpus.reader.tagged import TaggedCorpusReader
+from nltk.corpus.reader.util import *
+from nltk.tokenize import RegexpTokenizer
+
+
+class YCOECorpusReader(CorpusReader):
+    """
+    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
+    English Prose (YCOE), a 1.5 million word syntactically-annotated
+    corpus of Old English prose texts.
+    """
+
+    def __init__(self, root, encoding="utf8"):
+        CorpusReader.__init__(self, root, [], encoding)
+
+        self._psd_reader = YCOEParseCorpusReader(
+            self.root.join("psd"), ".*", ".psd", encoding=encoding
+        )
+        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")
+
+        # Make sure we have a consistent set of items:
+        documents = {f[:-4] for f in self._psd_reader.fileids()}
+        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
+            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')
+
+        fileids = sorted(
+            ["%s.psd" % doc for doc in documents]
+            + ["%s.pos" % doc for doc in documents]
+        )
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._documents = sorted(documents)
+
+    def documents(self, fileids=None):
+        """
+        Return a list of document identifiers for all documents in
+        this corpus, or for the documents with the given file(s) if
+        specified.
+        """
+        if fileids is None:
+            return self._documents
+        if isinstance(fileids, str):
+            fileids = [fileids]
+        for f in fileids:
+            if f not in self._fileids:
+                raise KeyError("File id %s not found" % f)
+        # Strip off the '.pos' and '.psd' extensions.
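+        # e.g. "cobede.o2.psd" -> "cobede.o2"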
+ return sorted({f[:-4] for f in fileids}) + + def fileids(self, documents=None): + """ + Return a list of file identifiers for the files that make up + this corpus, or that store the given document(s) if specified. + """ + if documents is None: + return self._fileids + elif isinstance(documents, str): + documents = [documents] + return sorted( + set( + ["%s.pos" % doc for doc in documents] + + ["%s.psd" % doc for doc in documents] + ) + ) + + def _getfileids(self, documents, subcorpus): + """ + Helper that selects the appropriate fileids for a given set of + documents from a given subcorpus (pos or psd). + """ + if documents is None: + documents = self._documents + else: + if isinstance(documents, str): + documents = [documents] + for document in documents: + if document not in self._documents: + if document[-4:] in (".pos", ".psd"): + raise ValueError( + "Expected a document identifier, not a file " + "identifier. (Use corpus.documents() to get " + "a list of document identifiers." + ) + else: + raise ValueError("Document identifier %s not found" % document) + return [f"{d}.{subcorpus}" for d in documents] + + # Delegate to one of our two sub-readers: + def words(self, documents=None): + return self._pos_reader.words(self._getfileids(documents, "pos")) + + def sents(self, documents=None): + return self._pos_reader.sents(self._getfileids(documents, "pos")) + + def paras(self, documents=None): + return self._pos_reader.paras(self._getfileids(documents, "pos")) + + def tagged_words(self, documents=None): + return self._pos_reader.tagged_words(self._getfileids(documents, "pos")) + + def tagged_sents(self, documents=None): + return self._pos_reader.tagged_sents(self._getfileids(documents, "pos")) + + def tagged_paras(self, documents=None): + return self._pos_reader.tagged_paras(self._getfileids(documents, "pos")) + + def parsed_sents(self, documents=None): + return self._psd_reader.parsed_sents(self._getfileids(documents, "psd")) + + +class YCOEParseCorpusReader(BracketParseCorpusReader): + """Specialized version of the standard bracket parse corpus reader + that strips out (CODE ...) and (ID ...) nodes.""" + + def _parse(self, t): + t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t) + if re.match(r"\s*\(\s*\)\s*$", t): + return None + return BracketParseCorpusReader._parse(self, t) + + +class YCOETaggedCorpusReader(TaggedCorpusReader): + def __init__(self, root, items, encoding="utf8"): + gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*" + sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True) + TaggedCorpusReader.__init__( + self, root, items, sep="_", sent_tokenizer=sent_tokenizer + ) + + +#: A list of all documents and their titles in ycoe. 
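+#: e.g. ``documents["coadrian.o34"]`` -> ``"Adrian and Ritheus"``.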
+documents = { + "coadrian.o34": "Adrian and Ritheus", + "coaelhom.o3": "Ælfric, Supplemental Homilies", + "coaelive.o3": "Ælfric's Lives of Saints", + "coalcuin": "Alcuin De virtutibus et vitiis", + "coalex.o23": "Alexander's Letter to Aristotle", + "coapollo.o3": "Apollonius of Tyre", + "coaugust": "Augustine", + "cobede.o2": "Bede's History of the English Church", + "cobenrul.o3": "Benedictine Rule", + "coblick.o23": "Blickling Homilies", + "coboeth.o2": "Boethius' Consolation of Philosophy", + "cobyrhtf.o3": "Byrhtferth's Manual", + "cocanedgD": "Canons of Edgar (D)", + "cocanedgX": "Canons of Edgar (X)", + "cocathom1.o3": "Ælfric's Catholic Homilies I", + "cocathom2.o3": "Ælfric's Catholic Homilies II", + "cochad.o24": "Saint Chad", + "cochdrul": "Chrodegang of Metz, Rule", + "cochristoph": "Saint Christopher", + "cochronA.o23": "Anglo-Saxon Chronicle A", + "cochronC": "Anglo-Saxon Chronicle C", + "cochronD": "Anglo-Saxon Chronicle D", + "cochronE.o34": "Anglo-Saxon Chronicle E", + "cocura.o2": "Cura Pastoralis", + "cocuraC": "Cura Pastoralis (Cotton)", + "codicts.o34": "Dicts of Cato", + "codocu1.o1": "Documents 1 (O1)", + "codocu2.o12": "Documents 2 (O1/O2)", + "codocu2.o2": "Documents 2 (O2)", + "codocu3.o23": "Documents 3 (O2/O3)", + "codocu3.o3": "Documents 3 (O3)", + "codocu4.o24": "Documents 4 (O2/O4)", + "coeluc1": "Honorius of Autun, Elucidarium 1", + "coeluc2": "Honorius of Autun, Elucidarium 1", + "coepigen.o3": "Ælfric's Epilogue to Genesis", + "coeuphr": "Saint Euphrosyne", + "coeust": "Saint Eustace and his companions", + "coexodusP": "Exodus (P)", + "cogenesiC": "Genesis (C)", + "cogregdC.o24": "Gregory's Dialogues (C)", + "cogregdH.o23": "Gregory's Dialogues (H)", + "coherbar": "Pseudo-Apuleius, Herbarium", + "coinspolD.o34": "Wulfstan's Institute of Polity (D)", + "coinspolX": "Wulfstan's Institute of Polity (X)", + "cojames": "Saint James", + "colacnu.o23": "Lacnunga", + "colaece.o2": "Leechdoms", + "colaw1cn.o3": "Laws, Cnut I", + "colaw2cn.o3": "Laws, Cnut II", + "colaw5atr.o3": "Laws, Æthelred V", + "colaw6atr.o3": "Laws, Æthelred VI", + "colawaf.o2": "Laws, Alfred", + "colawafint.o2": "Alfred's Introduction to Laws", + "colawger.o34": "Laws, Gerefa", + "colawine.ox2": "Laws, Ine", + "colawnorthu.o3": "Northumbra Preosta Lagu", + "colawwllad.o4": "Laws, William I, Lad", + "coleofri.o4": "Leofric", + "colsigef.o3": "Ælfric's Letter to Sigefyrth", + "colsigewB": "Ælfric's Letter to Sigeweard (B)", + "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)", + "colwgeat": "Ælfric's Letter to Wulfgeat", + "colwsigeT": "Ælfric's Letter to Wulfsige (T)", + "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)", + "colwstan1.o3": "Ælfric's Letter to Wulfstan I", + "colwstan2.o3": "Ælfric's Letter to Wulfstan II", + "comargaC.o34": "Saint Margaret (C)", + "comargaT": "Saint Margaret (T)", + "comart1": "Martyrology, I", + "comart2": "Martyrology, II", + "comart3.o23": "Martyrology, III", + "comarvel.o23": "Marvels of the East", + "comary": "Mary of Egypt", + "coneot": "Saint Neot", + "conicodA": "Gospel of Nicodemus (A)", + "conicodC": "Gospel of Nicodemus (C)", + "conicodD": "Gospel of Nicodemus (D)", + "conicodE": "Gospel of Nicodemus (E)", + "coorosiu.o2": "Orosius", + "cootest.o3": "Heptateuch", + "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I", + "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II", + "coprefcura.o2": "Preface to the Cura Pastoralis", + "coprefgen.o3": "Ælfric's Preface to Genesis", + "copreflives.o3": "Ælfric's Preface to Lives of 
Saints", + "coprefsolilo": "Preface to Augustine's Soliloquies", + "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus", + "corood": "History of the Holy Rood-Tree", + "cosevensl": "Seven Sleepers", + "cosolilo": "St. Augustine's Soliloquies", + "cosolsat1.o4": "Solomon and Saturn I", + "cosolsat2": "Solomon and Saturn II", + "cotempo.o3": "Ælfric's De Temporibus Anni", + "coverhom": "Vercelli Homilies", + "coverhomE": "Vercelli Homilies (E)", + "coverhomL": "Vercelli Homilies (L)", + "covinceB": "Saint Vincent (Bodley 343)", + "covinsal": "Vindicta Salvatoris", + "cowsgosp.o3": "West-Saxon Gospels", + "cowulf.o34": "Wulfstan's Homilies", +} diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py new file mode 100644 index 0000000000000000000000000000000000000000..c122785fa855933c9c4909fe58ce226f53466527 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py @@ -0,0 +1,393 @@ +# Natural Language Toolkit: Interface to MaltParser +# +# Author: Dan Garrette +# Contributor: Liling Tan, Mustufain, osamamukhtar11 +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +import inspect +import os +import subprocess +import sys +import tempfile + +from nltk.data import ZipFilePathPointer +from nltk.internals import find_dir, find_file, find_jars_within_path +from nltk.parse.api import ParserI +from nltk.parse.dependencygraph import DependencyGraph +from nltk.parse.util import taggedsents_to_conll + + +def malt_regex_tagger(): + from nltk.tag import RegexpTagger + + _tagger = RegexpTagger( + [ + (r"\.$", "."), + (r"\,$", ","), + (r"\?$", "?"), # fullstop, comma, Qmark + (r"\($", "("), + (r"\)$", ")"), # round brackets + (r"\[$", "["), + (r"\]$", "]"), # square brackets + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers + (r"(The|the|A|a|An|an)$", "DT"), # articles + (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns + (r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive + (r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive + (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions + (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions + (r"(till|Till|until|Until)$", "IN"), # time prepopsitions + (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions + (r"(under|Under|below|Below)$", "IN"), # space prepopsitions + (r"(over|Over|above|Above)$", "IN"), # space prepopsitions + (r"(across|Across|through|Through)$", "IN"), # space prepopsitions + (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions + (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions + (r".*able$", "JJ"), # adjectives + (r".*ness$", "NN"), # nouns formed from adjectives + (r".*ly$", "RB"), # adverbs + (r".*s$", "NNS"), # plural nouns + (r".*ing$", "VBG"), # gerunds + (r".*ed$", "VBD"), # past tense verbs + (r".*", "NN"), # nouns (default) + ] + ) + return _tagger.tag + + +def find_maltparser(parser_dirname): + """ + A module to find MaltParser .jar file and its dependencies. + """ + if os.path.exists(parser_dirname): # If a full path is given. + _malt_dir = parser_dirname + else: # Try to find path to maltparser directory in environment variables. 
+ _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",)) + # Checks that that the found directory contains all the necessary .jar + malt_dependencies = ["", "", ""] + _malt_jars = set(find_jars_within_path(_malt_dir)) + _jars = {os.path.split(jar)[1] for jar in _malt_jars} + malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"} + + assert malt_dependencies.issubset(_jars) + assert any( + filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars) + ) + return list(_malt_jars) + + +def find_malt_model(model_filename): + """ + A module to find pre-trained MaltParser model. + """ + if model_filename is None: + return "malt_temp.mco" + elif os.path.exists(model_filename): # If a full path is given. + return model_filename + else: # Try to find path to malt model in environment variables. + return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False) + + +class MaltParser(ParserI): + """ + A class for dependency parsing with MaltParser. The input is the paths to: + - (optionally) a maltparser directory + - (optionally) the path to a pre-trained MaltParser .mco model file + - (optionally) the tagger to use for POS tagging before parsing + - (optionally) additional Java arguments + + Example: + >>> from nltk.parse import malt + >>> # With MALT_PARSER and MALT_MODEL environment set. + >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP + >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP + (shot I (elephant an) (in (pajamas my)) .) + >>> # Without MALT_PARSER and MALT_MODEL environment. + >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP + >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP + (shot I (elephant an) (in (pajamas my)) .) + """ + + def __init__( + self, + parser_dirname="", + model_filename=None, + tagger=None, + additional_java_args=None, + ): + """ + An interface for parsing with the Malt Parser. + + :param parser_dirname: The path to the maltparser directory that + contains the maltparser-1.x.jar + :type parser_dirname: str + :param model_filename: The name of the pre-trained model with .mco file + extension. If provided, training will not be required. + (see http://www.maltparser.org/mco/mco.html and + see http://www.patful.com/chalk/node/185) + :type model_filename: str + :param tagger: The tagger used to POS tag the raw string before + formatting to CONLL format. It should behave like `nltk.pos_tag` + :type tagger: function + :param additional_java_args: This is the additional Java arguments that + one can use when calling Maltparser, usually this is the heapsize + limits, e.g. `additional_java_args=['-Xmx1024m']` + (see https://goo.gl/mpDBvQ) + :type additional_java_args: list + """ + + # Find all the necessary jar files for MaltParser. + self.malt_jars = find_maltparser(parser_dirname) + # Initialize additional java arguments. + self.additional_java_args = ( + additional_java_args if additional_java_args is not None else [] + ) + # Initialize model. + self.model = find_malt_model(model_filename) + self._trained = self.model != "malt_temp.mco" + # Set the working_dir parameters i.e. `-w` from MaltParser's option. + self.working_dir = tempfile.gettempdir() + # Initialize POS tagger. 
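+        # Fall back on the crude regex tagger defined above; for real
+        # use, pass a proper tagger such as nltk.pos_tag (see the class
+        # docstring example).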
+ self.tagger = tagger if tagger is not None else malt_regex_tagger() + + def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"): + """ + Use MaltParser to parse multiple POS tagged sentences. Takes multiple + sentences where each sentence is a list of (word, tag) tuples. + The sentences must have already been tokenized and tagged. + + :param sentences: Input sentences to parse + :type sentence: list(list(tuple(str, str))) + :return: iter(iter(``DependencyGraph``)) the dependency graph + representation of each sentence + """ + if not self._trained: + raise Exception("Parser has not been trained. Call train() first.") + + with tempfile.NamedTemporaryFile( + prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False + ) as input_file: + with tempfile.NamedTemporaryFile( + prefix="malt_output.conll.", + dir=self.working_dir, + mode="w", + delete=False, + ) as output_file: + # Convert list of sentences to CONLL format. + for line in taggedsents_to_conll(sentences): + input_file.write(str(line)) + input_file.close() + + # Generate command to run maltparser. + cmd = self.generate_malt_command( + input_file.name, output_file.name, mode="parse" + ) + + # This is a maltparser quirk, it needs to be run + # where the model file is. otherwise it goes into an awkward + # missing .jars or strange -w working_dir problem. + _current_path = os.getcwd() # Remembers the current path. + try: # Change to modelfile path + os.chdir(os.path.split(self.model)[0]) + except: + pass + ret = self._execute(cmd, verbose) # Run command. + os.chdir(_current_path) # Change back to current path. + + if ret != 0: + raise Exception( + "MaltParser parsing (%s) failed with exit " + "code %d" % (" ".join(cmd), ret) + ) + + # Must return iter(iter(Tree)) + with open(output_file.name) as infile: + for tree_str in infile.read().split("\n\n"): + yield ( + iter( + [ + DependencyGraph( + tree_str, top_relation_label=top_relation_label + ) + ] + ) + ) + + os.remove(input_file.name) + os.remove(output_file.name) + + def parse_sents(self, sentences, verbose=False, top_relation_label="null"): + """ + Use MaltParser to parse multiple sentences. + Takes a list of sentences, where each sentence is a list of words. + Each sentence will be automatically tagged with this + MaltParser instance's tagger. + + :param sentences: Input sentences to parse + :type sentence: list(list(str)) + :return: iter(DependencyGraph) + """ + tagged_sentences = (self.tagger(sentence) for sentence in sentences) + return self.parse_tagged_sents( + tagged_sentences, verbose, top_relation_label=top_relation_label + ) + + def generate_malt_command(self, inputfilename, outputfilename=None, mode=None): + """ + This function generates the maltparser command use at the terminal. + + :param inputfilename: path to the input file + :type inputfilename: str + :param outputfilename: path to the output file + :type outputfilename: str + """ + + cmd = ["java"] + cmd += self.additional_java_args # Adds additional java arguments + # Joins classpaths with ";" if on Windows and on Linux/Mac use ":" + classpaths_separator = ";" if sys.platform.startswith("win") else ":" + cmd += [ + "-cp", + classpaths_separator.join(self.malt_jars), + ] # Adds classpaths for jars + cmd += ["org.maltparser.Malt"] # Adds the main function. + + # Adds the model file. 
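+        # When parsing, only the model's basename is passed, since
+        # parse_tagged_sents() chdirs into the model's directory before
+        # invoking MaltParser.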
+ if os.path.exists(self.model): # when parsing + cmd += ["-c", os.path.split(self.model)[-1]] + else: # when learning + cmd += ["-c", self.model] + + cmd += ["-i", inputfilename] + if mode == "parse": + cmd += ["-o", outputfilename] + cmd += ["-m", mode] # mode use to generate parses. + return cmd + + @staticmethod + def _execute(cmd, verbose=False): + output = None if verbose else subprocess.PIPE + p = subprocess.Popen(cmd, stdout=output, stderr=output) + return p.wait() + + def train(self, depgraphs, verbose=False): + """ + Train MaltParser from a list of ``DependencyGraph`` objects + + :param depgraphs: list of ``DependencyGraph`` objects for training input data + :type depgraphs: DependencyGraph + """ + + # Write the conll_str to malt_train.conll file in /tmp/ + with tempfile.NamedTemporaryFile( + prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False + ) as input_file: + input_str = "\n".join(dg.to_conll(10) for dg in depgraphs) + input_file.write(str(input_str)) + # Trains the model with the malt_train.conll + self.train_from_file(input_file.name, verbose=verbose) + # Removes the malt_train.conll once training finishes. + os.remove(input_file.name) + + def train_from_file(self, conll_file, verbose=False): + """ + Train MaltParser from a file + :param conll_file: str for the filename of the training input data + :type conll_file: str + """ + + # If conll_file is a ZipFilePathPointer, + # then we need to do some extra massaging + if isinstance(conll_file, ZipFilePathPointer): + with tempfile.NamedTemporaryFile( + prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False + ) as input_file: + with conll_file.open() as conll_input_file: + conll_str = conll_input_file.read() + input_file.write(str(conll_str)) + return self.train_from_file(input_file.name, verbose=verbose) + + # Generate command to run maltparser. + cmd = self.generate_malt_command(conll_file, mode="learn") + ret = self._execute(cmd, verbose) + if ret != 0: + raise Exception( + "MaltParser training (%s) failed with exit " + "code %d" % (" ".join(cmd), ret) + ) + self._trained = True + + +if __name__ == "__main__": + """ + A demonstration function to show how NLTK users can use the malt parser API. + + >>> from nltk import pos_tag + >>> assert 'MALT_PARSER' in os.environ, str( + ... "Please set MALT_PARSER in your global environment, e.g.:\n" + ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'") + >>> + >>> assert 'MALT_MODEL' in os.environ, str( + ... "Please set MALT_MODEL in your global environment, e.g.:\n" + ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'") + >>> + >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" + ... "2 sees _ VB _ _ 0 ROOT _ _\n" + ... "3 a _ DT _ _ 4 SPEC _ _\n" + ... "4 dog _ NN _ _ 2 OBJ _ _\n" + ... "5 . _ . _ _ 2 PUNCT _ _\n") + >>> + >>> + >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" + ... "2 walks _ VB _ _ 0 ROOT _ _\n" + ... "3 . _ . _ _ 2 PUNCT _ _\n") + >>> dg1 = DependencyGraph(_dg1_str) + >>> dg2 = DependencyGraph(_dg2_str) + >>> # Initialize a MaltParser object + >>> mp = MaltParser() + >>> + >>> # Trains a model. + >>> mp.train([dg1,dg2], verbose=False) + >>> sent1 = ['John','sees','Mary', '.'] + >>> sent2 = ['John', 'walks', 'a', 'dog', '.'] + >>> + >>> # Parse a single sentence. + >>> parsed_sent1 = mp.parse_one(sent1) + >>> parsed_sent2 = mp.parse_one(sent2) + >>> print(parsed_sent1.tree()) + (sees John Mary .) + >>> print(parsed_sent2.tree()) + (walks John (dog a) .) + >>> + >>> # Parsing multiple sentences. 
+ >>> sentences = [sent1,sent2] + >>> parsed_sents = mp.parse_sents(sentences) + >>> print(next(next(parsed_sents)).tree()) + (sees John Mary .) + >>> print(next(next(parsed_sents)).tree()) + (walks John (dog a) .) + >>> + >>> # Initialize a MaltParser object with an English pre-trained model. + >>> parser_dirname = 'maltparser-1.9.2' + >>> model_name = 'engmalt.linear-1.7.mco' + >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag) + >>> sent1 = 'I shot an elephant in my pajamas .'.split() + >>> sent2 = 'Time flies like banana .'.split() + >>> # Parse a single sentence. + >>> print(mp.parse_one(sent1).tree()) + (shot I (elephant an) (in (pajamas my)) .) + # Parsing multiple sentences + >>> sentences = [sent1,sent2] + >>> parsed_sents = mp.parse_sents(sentences) + >>> print(next(next(parsed_sents)).tree()) + (shot I (elephant an) (in (pajamas my)) .) + >>> print(next(next(parsed_sents)).tree()) + (flies Time (like banana) .) + """ + + import doctest + + doctest.testmod() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb2ce4e060b77cd8ef36346f8d52ba629f3f209 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py @@ -0,0 +1,772 @@ +# Natural Language Toolkit: Dependency Grammars +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Jason Narad +# +# URL: +# For license information, see LICENSE.TXT +# + +import logging +import math + +from nltk.parse.dependencygraph import DependencyGraph + +logger = logging.getLogger(__name__) + +################################################################# +# DependencyScorerI - Interface for Graph-Edge Weight Calculation +################################################################# + + +class DependencyScorerI: + """ + A scorer for calculated the weights on the edges of a weighted + dependency graph. This is used by a + ``ProbabilisticNonprojectiveParser`` to initialize the edge + weights of a ``DependencyGraph``. While typically this would be done + by training a binary classifier, any class that can return a + multidimensional list representation of the edge weights can + implement this interface. As such, it has no necessary + fields. + """ + + def __init__(self): + if self.__class__ == DependencyScorerI: + raise TypeError("DependencyScorerI is an abstract interface") + + def train(self, graphs): + """ + :type graphs: list(DependencyGraph) + :param graphs: A list of dependency graphs to train the scorer. + Typically the edges present in the graphs can be used as + positive training examples, and the edges not present as negative + examples. + """ + raise NotImplementedError() + + def score(self, graph): + """ + :type graph: DependencyGraph + :param graph: A dependency graph whose set of edges need to be + scored. + :rtype: A three-dimensional list of numbers. + :return: The score is returned in a multidimensional(3) list, such + that the outer-dimension refers to the head, and the + inner-dimension refers to the dependencies. For instance, + scores[0][1] would reference the list of scores corresponding to + arcs from node 0 to node 1. The node's 'address' field can be used + to determine its number identification. 
+ + For further illustration, a score list corresponding to Fig.2 of + Keith Hall's 'K-best Spanning Tree Parsing' paper:: + + scores = [[[], [5], [1], [1]], + [[], [], [11], [4]], + [[], [10], [], [5]], + [[], [8], [8], []]] + + When used in conjunction with a MaxEntClassifier, each score would + correspond to the confidence of a particular edge being classified + with the positive training examples. + """ + raise NotImplementedError() + + +################################################################# +# NaiveBayesDependencyScorer +################################################################# + + +class NaiveBayesDependencyScorer(DependencyScorerI): + """ + A dependency scorer built around a MaxEnt classifier. In this + particular class that classifier is a ``NaiveBayesClassifier``. + It uses head-word, head-tag, child-word, and child-tag features + for classification. + + >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2 + + >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry] + >>> npp = ProbabilisticNonprojectiveParser() + >>> npp.train(graphs, NaiveBayesDependencyScorer()) + >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']) + >>> len(list(parses)) + 1 + + """ + + def __init__(self): + pass # Do nothing without throwing error + + def train(self, graphs): + """ + Trains a ``NaiveBayesClassifier`` using the edges present in + graphs list as positive examples, the edges not present as + negative examples. Uses a feature vector of head-word, + head-tag, child-word, and child-tag. + + :type graphs: list(DependencyGraph) + :param graphs: A list of dependency graphs to train the scorer. + """ + + from nltk.classify import NaiveBayesClassifier + + # Create training labeled training examples + labeled_examples = [] + for graph in graphs: + for head_node in graph.nodes.values(): + for child_index, child_node in graph.nodes.items(): + if child_index in head_node["deps"]: + label = "T" + else: + label = "F" + labeled_examples.append( + ( + dict( + a=head_node["word"], + b=head_node["tag"], + c=child_node["word"], + d=child_node["tag"], + ), + label, + ) + ) + + self.classifier = NaiveBayesClassifier.train(labeled_examples) + + def score(self, graph): + """ + Converts the graph into a feature-based representation of + each edge, and then assigns a score to each based on the + confidence of the classifier in assigning it to the + positive label. Scores are returned in a multidimensional list. + + :type graph: DependencyGraph + :param graph: A dependency graph to score. + :rtype: 3 dimensional list + :return: Edge scores for the graph parameter. 
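+            (In this implementation each score is the log of the
+            classifier's probability for the "T" label, slightly
+            smoothed to avoid log(0).)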
+ """ + # Convert graph to feature representation + edges = [] + for head_node in graph.nodes.values(): + for child_node in graph.nodes.values(): + edges.append( + dict( + a=head_node["word"], + b=head_node["tag"], + c=child_node["word"], + d=child_node["tag"], + ) + ) + + # Score edges + edge_scores = [] + row = [] + count = 0 + for pdist in self.classifier.prob_classify_many(edges): + logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F")) + # smoothing in case the probability = 0 + row.append([math.log(pdist.prob("T") + 0.00000000001)]) + count += 1 + if count == len(graph.nodes): + edge_scores.append(row) + row = [] + count = 0 + return edge_scores + + +################################################################# +# A Scorer for Demo Purposes +################################################################# +# A short class necessary to show parsing example from paper +class DemoScorer(DependencyScorerI): + def train(self, graphs): + print("Training...") + + def score(self, graph): + # scores for Keith Hall 'K-best Spanning Tree Parsing' paper + return [ + [[], [5], [1], [1]], + [[], [], [11], [4]], + [[], [10], [], [5]], + [[], [8], [8], []], + ] + + +################################################################# +# Non-Projective Probabilistic Parsing +################################################################# + + +class ProbabilisticNonprojectiveParser: + """A probabilistic non-projective dependency parser. + + Nonprojective dependencies allows for "crossing branches" in the parse tree + which is necessary for representing particular linguistic phenomena, or even + typical parses in some languages. This parser follows the MST parsing + algorithm, outlined in McDonald(2005), which likens the search for the best + non-projective parse to finding the maximum spanning tree in a weighted + directed graph. + + >>> class Scorer(DependencyScorerI): + ... def train(self, graphs): + ... pass + ... + ... def score(self, graph): + ... return [ + ... [[], [5], [1], [1]], + ... [[], [], [11], [4]], + ... [[], [10], [], [5]], + ... [[], [8], [8], []], + ... ] + + + >>> npp = ProbabilisticNonprojectiveParser() + >>> npp.train([], Scorer()) + + >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None]) + >>> len(list(parses)) + 1 + + Rule based example + + >>> from nltk.grammar import DependencyGrammar + + >>> grammar = DependencyGrammar.fromstring(''' + ... 'taught' -> 'play' | 'man' + ... 'man' -> 'the' | 'in' + ... 'in' -> 'corner' + ... 'corner' -> 'the' + ... 'play' -> 'golf' | 'dachshund' | 'to' + ... 'dachshund' -> 'his' + ... ''') + + >>> ndp = NonprojectiveDependencyParser(grammar) + >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf']) + >>> len(list(parses)) + 4 + + """ + + def __init__(self): + """ + Creates a new non-projective parser. + """ + logging.debug("initializing prob. nonprojective...") + + def train(self, graphs, dependency_scorer): + """ + Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects, + and establishes this as the parser's scorer. This is used to + initialize the scores on a ``DependencyGraph`` during the parsing + procedure. + + :type graphs: list(DependencyGraph) + :param graphs: A list of dependency graphs to train the scorer. + :type dependency_scorer: DependencyScorerI + :param dependency_scorer: A scorer which implements the + ``DependencyScorerI`` interface. 
+        """
+        self._scorer = dependency_scorer
+        self._scorer.train(graphs)
+
+    def initialize_edge_scores(self, graph):
+        """
+        Assigns a score to every edge in the ``DependencyGraph`` graph.
+        These scores are generated via the parser's scorer which
+        was assigned during the training process.
+
+        :type graph: DependencyGraph
+        :param graph: A dependency graph to assign scores to.
+        """
+        self.scores = self._scorer.score(graph)
+
+    def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
+        """
+        Takes a list of nodes that have been identified as belonging to
+        a cycle, and collapses them into one larger node.  The arcs of
+        all nodes in the graph must be updated to account for this.
+
+        :type new_node: Node.
+        :param new_node: A Node (Dictionary) to collapse the cycle nodes into.
+        :type cycle_path: A list of integers.
+        :param cycle_path: A list of node addresses, each of which is in the cycle.
+        :type g_graph, b_graph, c_graph: DependencyGraph
+        :param g_graph, b_graph, c_graph: Graphs which need to be updated.
+        """
+        logger.debug("Collapsing nodes...")
+        # Collapse all cycle nodes into v_n+1 in G_Graph
+        for cycle_node_index in cycle_path:
+            g_graph.remove_by_address(cycle_node_index)
+        g_graph.add_node(new_node)
+        g_graph.redirect_arcs(cycle_path, new_node["address"])
+
+    def update_edge_scores(self, new_node, cycle_path):
+        """
+        Updates the edge scores to reflect a collapse operation into
+        new_node.
+
+        :type new_node: A Node.
+        :param new_node: The node which cycle nodes are collapsed into.
+        :type cycle_path: A list of integers.
+        :param cycle_path: A list of node addresses that belong to the cycle.
+        """
+        logger.debug("cycle %s", cycle_path)
+
+        cycle_path = self.compute_original_indexes(cycle_path)
+
+        logger.debug("old cycle %s", cycle_path)
+        logger.debug("Prior to update: %s", self.scores)
+
+        for i, row in enumerate(self.scores):
+            for j, column in enumerate(self.scores[i]):
+                logger.debug(self.scores[i][j])
+                if j in cycle_path and i not in cycle_path and self.scores[i][j]:
+                    subtract_val = self.compute_max_subtract_score(j, cycle_path)
+
+                    logger.debug("%s - %s", self.scores[i][j], subtract_val)
+
+                    new_vals = []
+                    for cur_val in self.scores[i][j]:
+                        new_vals.append(cur_val - subtract_val)
+
+                    self.scores[i][j] = new_vals
+
+        for i, row in enumerate(self.scores):
+            for j, cell in enumerate(self.scores[i]):
+                if i in cycle_path and j in cycle_path:
+                    self.scores[i][j] = []
+
+        logger.debug("After update: %s", self.scores)
+
+    def compute_original_indexes(self, new_indexes):
+        """
+        As nodes are collapsed into others, they are replaced
+        by the new node in the graph, but it's still necessary
+        to keep track of what these original nodes were.  This
+        takes a list of node addresses and replaces any collapsed
+        node addresses with their original addresses.
+
+        :type new_indexes: A list of integers.
+        :param new_indexes: A list of node addresses to check for
+            subsumed nodes.
+        """
+        swapped = True
+        while swapped:
+            originals = []
+            swapped = False
+            for new_index in new_indexes:
+                if new_index in self.inner_nodes:
+                    for old_val in self.inner_nodes[new_index]:
+                        if old_val not in originals:
+                            originals.append(old_val)
+                            swapped = True
+                else:
+                    originals.append(new_index)
+            new_indexes = originals
+        return new_indexes
+
+    def compute_max_subtract_score(self, column_index, cycle_indexes):
+        """
+        When updating scores, the score of the highest-weighted incoming
+        arc is subtracted as part of the collapse operation.  This
+        returns the correct amount to subtract from that edge.
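+
+        For example, if the cycle consists of the nodes at addresses 2
+        and 3, and their score lists for arcs into column ``j`` are
+        ``[10]`` and ``[8]``, then ``10`` is returned, as it is the
+        highest score of any arc from a cycle node to the node in
+        column ``j``.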
+ + :type column_index: integer. + :param column_index: A index representing the column of incoming arcs + to a particular node being updated + :type cycle_indexes: A list of integers. + :param cycle_indexes: Only arcs from cycle nodes are considered. This + is a list of such nodes addresses. + """ + max_score = -100000 + for row_index in cycle_indexes: + for subtract_val in self.scores[row_index][column_index]: + if subtract_val > max_score: + max_score = subtract_val + return max_score + + def best_incoming_arc(self, node_index): + """ + Returns the source of the best incoming arc to the + node with address: node_index + + :type node_index: integer. + :param node_index: The address of the 'destination' node, + the node that is arced to. + """ + originals = self.compute_original_indexes([node_index]) + logger.debug("originals: %s", originals) + + max_arc = None + max_score = None + for row_index in range(len(self.scores)): + for col_index in range(len(self.scores[row_index])): + if col_index in originals and ( + max_score is None or self.scores[row_index][col_index] > max_score + ): + max_score = self.scores[row_index][col_index] + max_arc = row_index + logger.debug("%s, %s", row_index, col_index) + + logger.debug(max_score) + + for key in self.inner_nodes: + replaced_nodes = self.inner_nodes[key] + if max_arc in replaced_nodes: + return key + + return max_arc + + def original_best_arc(self, node_index): + originals = self.compute_original_indexes([node_index]) + max_arc = None + max_score = None + max_orig = None + for row_index in range(len(self.scores)): + for col_index in range(len(self.scores[row_index])): + if col_index in originals and ( + max_score is None or self.scores[row_index][col_index] > max_score + ): + max_score = self.scores[row_index][col_index] + max_arc = row_index + max_orig = col_index + return [max_arc, max_orig] + + def parse(self, tokens, tags): + """ + Parses a list of tokens in accordance to the MST parsing algorithm + for non-projective dependency parses. Assumes that the tokens to + be parsed have already been tagged and those tags are provided. Various + scoring methods can be used by implementing the ``DependencyScorerI`` + interface and passing it to the training algorithm. + + :type tokens: list(str) + :param tokens: A list of words or punctuation to be parsed. + :type tags: list(str) + :param tags: A list of tags corresponding by index to the words in the tokens list. + :return: An iterator of non-projective parses. 
+ :rtype: iter(DependencyGraph) + """ + self.inner_nodes = {} + + # Initialize g_graph + g_graph = DependencyGraph() + for index, token in enumerate(tokens): + g_graph.nodes[index + 1].update( + {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} + ) + + # Fully connect non-root nodes in g_graph + g_graph.connect_graph() + original_graph = DependencyGraph() + for index, token in enumerate(tokens): + original_graph.nodes[index + 1].update( + {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} + ) + + b_graph = DependencyGraph() + c_graph = DependencyGraph() + + for index, token in enumerate(tokens): + c_graph.nodes[index + 1].update( + {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} + ) + + # Assign initial scores to g_graph edges + self.initialize_edge_scores(g_graph) + logger.debug(self.scores) + # Initialize a list of unvisited vertices (by node address) + unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()] + # Iterate over unvisited vertices + nr_vertices = len(tokens) + betas = {} + while unvisited_vertices: + # Mark current node as visited + current_vertex = unvisited_vertices.pop(0) + logger.debug("current_vertex: %s", current_vertex) + # Get corresponding node n_i to vertex v_i + current_node = g_graph.get_by_address(current_vertex) + logger.debug("current_node: %s", current_node) + # Get best in-edge node b for current node + best_in_edge = self.best_incoming_arc(current_vertex) + betas[current_vertex] = self.original_best_arc(current_vertex) + logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex) + # b_graph = Union(b_graph, b) + for new_vertex in [current_vertex, best_in_edge]: + b_graph.nodes[new_vertex].update( + {"word": "TEMP", "rel": "NTOP", "address": new_vertex} + ) + b_graph.add_arc(best_in_edge, current_vertex) + # Beta(current node) = b - stored for parse recovery + # If b_graph contains a cycle, collapse it + cycle_path = b_graph.contains_cycle() + if cycle_path: + # Create a new node v_n+1 with address = len(nodes) + 1 + new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1} + # c_graph = Union(c_graph, v_n+1) + c_graph.add_node(new_node) + # Collapse all nodes in cycle C into v_n+1 + self.update_edge_scores(new_node, cycle_path) + self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) + for cycle_index in cycle_path: + c_graph.add_arc(new_node["address"], cycle_index) + # self.replaced_by[cycle_index] = new_node['address'] + + self.inner_nodes[new_node["address"]] = cycle_path + + # Add v_n+1 to list of unvisited vertices + unvisited_vertices.insert(0, nr_vertices + 1) + + # increment # of nodes counter + nr_vertices += 1 + + # Remove cycle nodes from b_graph; B = B - cycle c + for cycle_node_address in cycle_path: + b_graph.remove_by_address(cycle_node_address) + + logger.debug("g_graph: %s", g_graph) + logger.debug("b_graph: %s", b_graph) + logger.debug("c_graph: %s", c_graph) + logger.debug("Betas: %s", betas) + logger.debug("replaced nodes %s", self.inner_nodes) + + # Recover parse tree + logger.debug("Final scores: %s", self.scores) + + logger.debug("Recovering parse...") + for i in range(len(tokens) + 1, nr_vertices + 1): + betas[betas[i][1]] = betas[i] + + logger.debug("Betas: %s", betas) + for node in original_graph.nodes.values(): + # TODO: It's dangerous to assume that deps it a dictionary + # because it's a default dictionary. 
Ideally, here we should not + # be concerned how dependencies are stored inside of a dependency + # graph. + node["deps"] = {} + for i in range(1, len(tokens) + 1): + original_graph.add_arc(betas[i][0], betas[i][1]) + + logger.debug("Done.") + yield original_graph + + +################################################################# +# Rule-based Non-Projective Parser +################################################################# + + +class NonprojectiveDependencyParser: + """ + A non-projective, rule-based, dependency parser. This parser + will return the set of all possible non-projective parses based on + the word-to-word relations defined in the parser's dependency + grammar, and will allow the branches of the parse tree to cross + in order to capture a variety of linguistic phenomena that a + projective parser will not. + """ + + def __init__(self, dependency_grammar): + """ + Creates a new ``NonprojectiveDependencyParser``. + + :param dependency_grammar: a grammar of word-to-word relations. + :type dependency_grammar: DependencyGrammar + """ + self._grammar = dependency_grammar + + def parse(self, tokens): + """ + Parses the input tokens with respect to the parser's grammar. Parsing + is accomplished by representing the search-space of possible parses as + a fully-connected directed graph. Arcs that would lead to ungrammatical + parses are removed and a lattice is constructed of length n, where n is + the number of input tokens, to represent all possible grammatical + traversals. All possible paths through the lattice are then enumerated + to produce the set of non-projective parses. + + param tokens: A list of tokens to parse. + type tokens: list(str) + return: An iterator of non-projective parses. + rtype: iter(DependencyGraph) + """ + # Create graph representation of tokens + self._graph = DependencyGraph() + + for index, token in enumerate(tokens): + self._graph.nodes[index] = { + "word": token, + "deps": [], + "rel": "NTOP", + "address": index, + } + + for head_node in self._graph.nodes.values(): + deps = [] + for dep_node in self._graph.nodes.values(): + if ( + self._grammar.contains(head_node["word"], dep_node["word"]) + and head_node["word"] != dep_node["word"] + ): + deps.append(dep_node["address"]) + head_node["deps"] = deps + + # Create lattice of possible heads + roots = [] + possible_heads = [] + for i, word in enumerate(tokens): + heads = [] + for j, head in enumerate(tokens): + if (i != j) and self._grammar.contains(head, word): + heads.append(j) + if len(heads) == 0: + roots.append(i) + possible_heads.append(heads) + + # Set roots to attempt + if len(roots) < 2: + if len(roots) == 0: + for i in range(len(tokens)): + roots.append(i) + + # Traverse lattice + analyses = [] + for _ in roots: + stack = [] + analysis = [[] for i in range(len(possible_heads))] + i = 0 + forward = True + while i >= 0: + if forward: + if len(possible_heads[i]) == 1: + analysis[i] = possible_heads[i][0] + elif len(possible_heads[i]) == 0: + analysis[i] = -1 + else: + head = possible_heads[i].pop() + analysis[i] = head + stack.append([i, head]) + if not forward: + index_on_stack = False + for stack_item in stack: + if stack_item[0] == i: + index_on_stack = True + orig_length = len(possible_heads[i]) + + if index_on_stack and orig_length == 0: + for j in range(len(stack) - 1, -1, -1): + stack_item = stack[j] + if stack_item[0] == i: + possible_heads[i].append(stack.pop(j)[1]) + + elif index_on_stack and orig_length > 0: + head = possible_heads[i].pop() + analysis[i] = head + stack.append([i, 
head]) + forward = True + + if i + 1 == len(possible_heads): + analyses.append(analysis[:]) + forward = False + if forward: + i += 1 + else: + i -= 1 + + # Filter parses + # ensure 1 root, every thing has 1 head + for analysis in analyses: + if analysis.count(-1) > 1: + # there are several root elements! + continue + + graph = DependencyGraph() + graph.root = graph.nodes[analysis.index(-1) + 1] + + for address, (token, head_index) in enumerate( + zip(tokens, analysis), start=1 + ): + head_address = head_index + 1 + + node = graph.nodes[address] + node.update({"word": token, "address": address}) + + if head_address == 0: + rel = "ROOT" + else: + rel = "" + graph.nodes[head_index + 1]["deps"][rel].append(address) + + # TODO: check for cycles + yield graph + + +################################################################# +# Demos +################################################################# + + +def demo(): + # hall_demo() + nonprojective_conll_parse_demo() + rule_based_demo() + + +def hall_demo(): + npp = ProbabilisticNonprojectiveParser() + npp.train([], DemoScorer()) + for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]): + print(parse_graph) + + +def nonprojective_conll_parse_demo(): + from nltk.parse.dependencygraph import conll_data2 + + graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] + npp = ProbabilisticNonprojectiveParser() + npp.train(graphs, NaiveBayesDependencyScorer()) + for parse_graph in npp.parse( + ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"] + ): + print(parse_graph) + + +def rule_based_demo(): + from nltk.grammar import DependencyGrammar + + grammar = DependencyGrammar.fromstring( + """ + 'taught' -> 'play' | 'man' + 'man' -> 'the' | 'in' + 'in' -> 'corner' + 'corner' -> 'the' + 'play' -> 'golf' | 'dachshund' | 'to' + 'dachshund' -> 'his' + """ + ) + print(grammar) + ndp = NonprojectiveDependencyParser(grammar) + graphs = ndp.parse( + [ + "the", + "man", + "in", + "the", + "corner", + "taught", + "his", + "dachshund", + "to", + "play", + "golf", + ] + ) + print("Graphs:") + for graph in graphs: + print(graph) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py new file mode 100644 index 0000000000000000000000000000000000000000..cc02f9f127309b6d018bf29d3a23569f1a89f2aa --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py @@ -0,0 +1,684 @@ +# Natural Language Toolkit: Recursive Descent Parser +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from nltk.grammar import Nonterminal +from nltk.parse.api import ParserI +from nltk.tree import ImmutableTree, Tree + + +##////////////////////////////////////////////////////// +## Recursive Descent Parser +##////////////////////////////////////////////////////// +class RecursiveDescentParser(ParserI): + """ + A simple top-down CFG parser that parses texts by recursively + expanding the fringe of a Tree, and matching it against a + text. + + ``RecursiveDescentParser`` uses a list of tree locations called a + "frontier" to remember which subtrees have not yet been expanded + and which leaves have not yet been matched against the text. 
Each + tree location consists of a list of child indices specifying the + path from the root of the tree to a subtree or a leaf; see the + reference documentation for Tree for more information + about tree locations. + + When the parser begins parsing a text, it constructs a tree + containing only the start symbol, and a frontier containing the + location of the tree's root node. It then extends the tree to + cover the text, using the following recursive procedure: + + - If the frontier is empty, and the text is covered by the tree, + then return the tree as a possible parse. + - If the frontier is empty, and the text is not covered by the + tree, then return no parses. + - If the first element of the frontier is a subtree, then + use CFG productions to "expand" it. For each applicable + production, add the expanded subtree's children to the + frontier, and recursively find all parses that can be + generated by the new tree and frontier. + - If the first element of the frontier is a token, then "match" + it against the next token from the text. Remove the token + from the frontier, and recursively find all parses that can be + generated by the new tree and frontier. + + :see: ``nltk.grammar`` + """ + + def __init__(self, grammar, trace=0): + """ + Create a new ``RecursiveDescentParser``, that uses ``grammar`` + to parse texts. + + :type grammar: CFG + :param grammar: The grammar used to parse texts. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + """ + self._grammar = grammar + self._trace = trace + + def grammar(self): + return self._grammar + + def parse(self, tokens): + # Inherit docs from ParserI + + tokens = list(tokens) + self._grammar.check_coverage(tokens) + + # Start a recursive descent parse, with an initial tree + # containing just the start symbol. + start = self._grammar.start().symbol() + initial_tree = Tree(start, []) + frontier = [()] + if self._trace: + self._trace_start(initial_tree, frontier, tokens) + return self._parse(tokens, initial_tree, frontier) + + def _parse(self, remaining_text, tree, frontier): + """ + Recursively expand and match each elements of ``tree`` + specified by ``frontier``, to cover ``remaining_text``. Return + a list of all parses found. + + :return: An iterator of all parses that can be generated by + matching and expanding the elements of ``tree`` + specified by ``frontier``. + :rtype: iter(Tree) + :type tree: Tree + :param tree: A partial structure for the text that is + currently being parsed. The elements of ``tree`` + that are specified by ``frontier`` have not yet been + expanded or matched. + :type remaining_text: list(str) + :param remaining_text: The portion of the text that is not yet + covered by ``tree``. + :type frontier: list(tuple(int)) + :param frontier: A list of the locations within ``tree`` of + all subtrees that have not yet been expanded, and all + leaves that have not yet been matched. This list sorted + in left-to-right order of location within the tree. + """ + + # If the tree covers the text, and there's nothing left to + # expand, then we've found a complete parse; return it. + if len(remaining_text) == 0 and len(frontier) == 0: + if self._trace: + self._trace_succeed(tree, frontier) + yield tree + + # If there's still text, but nothing left to expand, we failed. 
+ elif len(frontier) == 0: + if self._trace: + self._trace_backtrack(tree, frontier) + + # If the next element on the frontier is a tree, expand it. + elif isinstance(tree[frontier[0]], Tree): + yield from self._expand(remaining_text, tree, frontier) + + # If the next element on the frontier is a token, match it. + else: + yield from self._match(remaining_text, tree, frontier) + + def _match(self, rtext, tree, frontier): + """ + :rtype: iter(Tree) + :return: an iterator of all parses that can be generated by + matching the first element of ``frontier`` against the + first token in ``rtext``. In particular, if the first + element of ``frontier`` has the same type as the first + token in ``rtext``, then substitute the token into + ``tree``; and return all parses that can be generated by + matching and expanding the remaining elements of + ``frontier``. If the first element of ``frontier`` does not + have the same type as the first token in ``rtext``, then + return empty list. + + :type tree: Tree + :param tree: A partial structure for the text that is + currently being parsed. The elements of ``tree`` + that are specified by ``frontier`` have not yet been + expanded or matched. + :type rtext: list(str) + :param rtext: The portion of the text that is not yet + covered by ``tree``. + :type frontier: list of tuple of int + :param frontier: A list of the locations within ``tree`` of + all subtrees that have not yet been expanded, and all + leaves that have not yet been matched. + """ + + tree_leaf = tree[frontier[0]] + if len(rtext) > 0 and tree_leaf == rtext[0]: + # If it's a terminal that matches rtext[0], then substitute + # in the token, and continue parsing. + newtree = tree.copy(deep=True) + newtree[frontier[0]] = rtext[0] + if self._trace: + self._trace_match(newtree, frontier[1:], rtext[0]) + yield from self._parse(rtext[1:], newtree, frontier[1:]) + else: + # If it's a non-matching terminal, fail. + if self._trace: + self._trace_backtrack(tree, frontier, rtext[:1]) + + def _expand(self, remaining_text, tree, frontier, production=None): + """ + :rtype: iter(Tree) + :return: An iterator of all parses that can be generated by + expanding the first element of ``frontier`` with + ``production``. In particular, if the first element of + ``frontier`` is a subtree whose node type is equal to + ``production``'s left hand side, then add a child to that + subtree for each element of ``production``'s right hand + side; and return all parses that can be generated by + matching and expanding the remaining elements of + ``frontier``. If the first element of ``frontier`` is not a + subtree whose node type is equal to ``production``'s left + hand side, then return an empty list. If ``production`` is + not specified, then return a list of all parses that can + be generated by expanding the first element of ``frontier`` + with *any* CFG production. + + :type tree: Tree + :param tree: A partial structure for the text that is + currently being parsed. The elements of ``tree`` + that are specified by ``frontier`` have not yet been + expanded or matched. + :type remaining_text: list(str) + :param remaining_text: The portion of the text that is not yet + covered by ``tree``. + :type frontier: list(tuple(int)) + :param frontier: A list of the locations within ``tree`` of + all subtrees that have not yet been expanded, and all + leaves that have not yet been matched. 
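+
+        For example, expanding the root location ``()`` of
+        ``Tree('S', [])`` with the production ``S -> NP VP`` replaces
+        the tree with ``Tree('S', [Tree('NP', []), Tree('VP', [])])``
+        and pushes the new locations ``(0,)`` and ``(1,)`` onto the
+        front of the frontier.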
+ """ + + if production is None: + productions = self._grammar.productions() + else: + productions = [production] + + for production in productions: + lhs = production.lhs().symbol() + if lhs == tree[frontier[0]].label(): + subtree = self._production_to_tree(production) + if frontier[0] == (): + newtree = subtree + else: + newtree = tree.copy(deep=True) + newtree[frontier[0]] = subtree + new_frontier = [ + frontier[0] + (i,) for i in range(len(production.rhs())) + ] + if self._trace: + self._trace_expand(newtree, new_frontier, production) + yield from self._parse( + remaining_text, newtree, new_frontier + frontier[1:] + ) + + def _production_to_tree(self, production): + """ + :rtype: Tree + :return: The Tree that is licensed by ``production``. + In particular, given the production ``[lhs -> elt[1] ... elt[n]]`` + return a tree that has a node ``lhs.symbol``, and + ``n`` children. For each nonterminal element + ``elt[i]`` in the production, the tree token has a + childless subtree with node value ``elt[i].symbol``; and + for each terminal element ``elt[j]``, the tree token has + a leaf token with type ``elt[j]``. + + :param production: The CFG production that licenses the tree + token that should be returned. + :type production: Production + """ + children = [] + for elt in production.rhs(): + if isinstance(elt, Nonterminal): + children.append(Tree(elt.symbol(), [])) + else: + # This will be matched. + children.append(elt) + return Tree(production.lhs().symbol(), children) + + def trace(self, trace=2): + """ + Set the level of tracing output that should be generated when + parsing a text. + + :type trace: int + :param trace: The trace level. A trace level of ``0`` will + generate no tracing output; and higher trace levels will + produce more verbose tracing output. + :rtype: None + """ + self._trace = trace + + def _trace_fringe(self, tree, treeloc=None): + """ + Print trace output displaying the fringe of ``tree``. The + fringe of ``tree`` consists of all of its leaves and all of + its childless subtrees. + + :rtype: None + """ + + if treeloc == (): + print("*", end=" ") + if isinstance(tree, Tree): + if len(tree) == 0: + print(repr(Nonterminal(tree.label())), end=" ") + for i in range(len(tree)): + if treeloc is not None and i == treeloc[0]: + self._trace_fringe(tree[i], treeloc[1:]) + else: + self._trace_fringe(tree[i]) + else: + print(repr(tree), end=" ") + + def _trace_tree(self, tree, frontier, operation): + """ + Print trace output displaying the parser's current state. + + :param operation: A character identifying the operation that + generated the current state. 
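+            In this parser it is one of ``' '`` (start), ``'E'``
+            (expand), ``'M'`` (match), or ``'+'`` (a successful parse),
+            as passed in by the ``_trace_*`` helpers below.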
+        :rtype: None
+        """
+        if self._trace == 2:
+            print("  %c [" % operation, end=" ")
+        else:
+            print("    [", end=" ")
+        if len(frontier) > 0:
+            self._trace_fringe(tree, frontier[0])
+        else:
+            self._trace_fringe(tree)
+        print("]")
+
+    def _trace_start(self, tree, frontier, text):
+        print("Parsing %r" % " ".join(text))
+        if self._trace > 2:
+            print("Start:")
+        if self._trace > 1:
+            self._trace_tree(tree, frontier, " ")
+
+    def _trace_expand(self, tree, frontier, production):
+        if self._trace > 2:
+            print("Expand: %s" % production)
+        if self._trace > 1:
+            self._trace_tree(tree, frontier, "E")
+
+    def _trace_match(self, tree, frontier, tok):
+        if self._trace > 2:
+            print("Match: %r" % tok)
+        if self._trace > 1:
+            self._trace_tree(tree, frontier, "M")
+
+    def _trace_succeed(self, tree, frontier):
+        if self._trace > 2:
+            print("GOOD PARSE:")
+        if self._trace == 1:
+            print("Found a parse:\n%s" % tree)
+        if self._trace > 1:
+            self._trace_tree(tree, frontier, "+")
+
+    def _trace_backtrack(self, tree, frontier, toks=None):
+        if self._trace > 2:
+            if toks:
+                print("Backtrack: %r match failed" % toks[0])
+            else:
+                print("Backtrack")
+
+
+##//////////////////////////////////////////////////////
+##  Stepping Recursive Descent Parser
+##//////////////////////////////////////////////////////
+class SteppingRecursiveDescentParser(RecursiveDescentParser):
+    """
+    A ``RecursiveDescentParser`` that allows you to step through the
+    parsing process, performing a single operation at a time.
+
+    The ``initialize`` method is used to start parsing a text.
+    ``expand`` expands the first element on the frontier using a single
+    CFG production, and ``match`` matches the first element on the
+    frontier against the next text token.  ``backtrack`` undoes the most
+    recent expand or match operation.  ``step`` performs a single
+    expand, match, or backtrack operation.  ``parses`` returns the set
+    of parses that have been found by the parser.
+
+    :ivar _history: A list of ``(rtext, tree, frontier)`` triples,
+        containing the previous states of the parser.  This history is
+        used to implement the ``backtrack`` operation.
+    :ivar _tried_e: A record of all productions that have been tried
+        for a given tree.  This record is used by ``expand`` to perform
+        the next untried production.
+    :ivar _tried_m: A record of what tokens have been matched for a
+        given tree.  This record is used by ``step`` to decide whether
+        or not to match a token.
+    :see: ``nltk.grammar``
+    """
+
+    def __init__(self, grammar, trace=0):
+        super().__init__(grammar, trace)
+        self._rtext = None
+        self._tree = None
+        self._frontier = [()]
+        self._tried_e = {}
+        self._tried_m = {}
+        self._history = []
+        self._parses = []
+
+    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
+    # something nicer when we get the chance.
+    def _freeze(self, tree):
+        c = tree.copy()
+        # for pos in c.treepositions('leaves'):
+        #    c[pos] = c[pos].freeze()
+        return ImmutableTree.convert(c)
+
+    def parse(self, tokens):
+        tokens = list(tokens)
+        self.initialize(tokens)
+        while self.step() is not None:
+            pass
+        return self.parses()
+
+    def initialize(self, tokens):
+        """
+        Start parsing a given text.  This sets the parser's tree to
+        the start symbol, its frontier to the root node, and its
+        remaining text to ``tokens``.
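+
+        A typical stepping loop, mirroring what ``parse`` does
+        internally (a sketch; ``grammar`` and ``tokens`` are assumed to
+        be defined)::
+
+            >>> parser = SteppingRecursiveDescentParser(grammar)  # doctest: +SKIP
+            >>> parser.initialize(tokens)                         # doctest: +SKIP
+            >>> while parser.step() is not None:                  # doctest: +SKIP
+            ...     pass
+            >>> trees = list(parser.parses())                     # doctest: +SKIP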
+ """ + + self._rtext = tokens + start = self._grammar.start().symbol() + self._tree = Tree(start, []) + self._frontier = [()] + self._tried_e = {} + self._tried_m = {} + self._history = [] + self._parses = [] + if self._trace: + self._trace_start(self._tree, self._frontier, self._rtext) + + def remaining_text(self): + """ + :return: The portion of the text that is not yet covered by the + tree. + :rtype: list(str) + """ + return self._rtext + + def frontier(self): + """ + :return: A list of the tree locations of all subtrees that + have not yet been expanded, and all leaves that have not + yet been matched. + :rtype: list(tuple(int)) + """ + return self._frontier + + def tree(self): + """ + :return: A partial structure for the text that is + currently being parsed. The elements specified by the + frontier have not yet been expanded or matched. + :rtype: Tree + """ + return self._tree + + def step(self): + """ + Perform a single parsing operation. If an untried match is + possible, then perform the match, and return the matched + token. If an untried expansion is possible, then perform the + expansion, and return the production that it is based on. If + backtracking is possible, then backtrack, and return True. + Otherwise, return None. + + :return: None if no operation was performed; a token if a match + was performed; a production if an expansion was performed; + and True if a backtrack operation was performed. + :rtype: Production or String or bool + """ + # Try matching (if we haven't already) + if self.untried_match(): + token = self.match() + if token is not None: + return token + + # Try expanding. + production = self.expand() + if production is not None: + return production + + # Try backtracking + if self.backtrack(): + self._trace_backtrack(self._tree, self._frontier) + return True + + # Nothing left to do. + return None + + def expand(self, production=None): + """ + Expand the first element of the frontier. In particular, if + the first element of the frontier is a subtree whose node type + is equal to ``production``'s left hand side, then add a child + to that subtree for each element of ``production``'s right hand + side. If ``production`` is not specified, then use the first + untried expandable production. If all expandable productions + have been tried, do nothing. + + :return: The production used to expand the frontier, if an + expansion was performed. If no expansion was performed, + return None. + :rtype: Production or None + """ + + # Make sure we *can* expand. + if len(self._frontier) == 0: + return None + if not isinstance(self._tree[self._frontier[0]], Tree): + return None + + # If they didn't specify a production, check all untried ones. + if production is None: + productions = self.untried_expandable_productions() + else: + productions = [production] + + parses = [] + for prod in productions: + # Record that we've tried this production now. + self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) + + # Try expanding. + for _result in self._expand(self._rtext, self._tree, self._frontier, prod): + return prod + + # We didn't expand anything. + return None + + def match(self): + """ + Match the first element of the frontier. In particular, if + the first element of the frontier has the same type as the + next text token, then substitute the text token into the tree. + + :return: The token matched, if a match operation was + performed. If no match was performed, return None + :rtype: str or None + """ + + # Record that we've tried matching this token. 
+ tok = self._rtext[0] + self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) + + # Make sure we *can* match. + if len(self._frontier) == 0: + return None + if isinstance(self._tree[self._frontier[0]], Tree): + return None + + for _result in self._match(self._rtext, self._tree, self._frontier): + # Return the token we just matched. + return self._history[-1][0][0] + return None + + def backtrack(self): + """ + Return the parser to its state before the most recent + match or expand operation. Calling ``undo`` repeatedly return + the parser to successively earlier states. If no match or + expand operations have been performed, ``undo`` will make no + changes. + + :return: true if an operation was successfully undone. + :rtype: bool + """ + if len(self._history) == 0: + return False + (self._rtext, self._tree, self._frontier) = self._history.pop() + return True + + def expandable_productions(self): + """ + :return: A list of all the productions for which expansions + are available for the current parser state. + :rtype: list(Production) + """ + # Make sure we *can* expand. + if len(self._frontier) == 0: + return [] + frontier_child = self._tree[self._frontier[0]] + if len(self._frontier) == 0 or not isinstance(frontier_child, Tree): + return [] + + return [ + p + for p in self._grammar.productions() + if p.lhs().symbol() == frontier_child.label() + ] + + def untried_expandable_productions(self): + """ + :return: A list of all the untried productions for which + expansions are available for the current parser state. + :rtype: list(Production) + """ + + tried_expansions = self._tried_e.get(self._freeze(self._tree), []) + return [p for p in self.expandable_productions() if p not in tried_expansions] + + def untried_match(self): + """ + :return: Whether the first element of the frontier is a token + that has not yet been matched. + :rtype: bool + """ + + if len(self._rtext) == 0: + return False + tried_matches = self._tried_m.get(self._freeze(self._tree), []) + return self._rtext[0] not in tried_matches + + def currently_complete(self): + """ + :return: Whether the parser's current state represents a + complete parse. + :rtype: bool + """ + return len(self._frontier) == 0 and len(self._rtext) == 0 + + def _parse(self, remaining_text, tree, frontier): + """ + A stub version of ``_parse`` that sets the parsers current + state to the given arguments. In ``RecursiveDescentParser``, + the ``_parse`` method is used to recursively continue parsing a + text. ``SteppingRecursiveDescentParser`` overrides it to + capture these recursive calls. It records the parser's old + state in the history (to allow for backtracking), and updates + the parser's new state using the given arguments. Finally, it + returns ``[1]``, which is used by ``match`` and ``expand`` to + detect whether their operations were successful. + + :return: ``[1]`` + :rtype: list of int + """ + self._history.append((self._rtext, self._tree, self._frontier)) + self._rtext = remaining_text + self._tree = tree + self._frontier = frontier + + # Is it a good parse? If so, record it. + if len(frontier) == 0 and len(remaining_text) == 0: + self._parses.append(tree) + self._trace_succeed(self._tree, self._frontier) + + return [1] + + def parses(self): + """ + :return: An iterator of the parses that have been found by this + parser so far. + :rtype: list of Tree + """ + return iter(self._parses) + + def set_grammar(self, grammar): + """ + Change the grammar used to parse texts. + + :param grammar: The new grammar. 
+ :type grammar: CFG + """ + self._grammar = grammar + + +##////////////////////////////////////////////////////// +## Demonstration Code +##////////////////////////////////////////////////////// + + +def demo(): + """ + A demonstration of the recursive descent parser. + """ + + from nltk import CFG, parse + + grammar = CFG.fromstring( + """ + S -> NP VP + NP -> Det N | Det N PP + VP -> V NP | V NP PP + PP -> P NP + NP -> 'I' + N -> 'man' | 'park' | 'telescope' | 'dog' + Det -> 'the' | 'a' + P -> 'in' | 'with' + V -> 'saw' + """ + ) + + for prod in grammar.productions(): + print(prod) + + sent = "I saw a man in the park".split() + parser = parse.RecursiveDescentParser(grammar, trace=2) + for p in parser.parse(sent): + print(p) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..604d5cc729851755b69dd083265b9572f77d7650 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py @@ -0,0 +1,479 @@ +# Natural Language Toolkit: Shift-Reduce Parser +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from nltk.grammar import Nonterminal +from nltk.parse.api import ParserI +from nltk.tree import Tree + + +##////////////////////////////////////////////////////// +## Shift/Reduce Parser +##////////////////////////////////////////////////////// +class ShiftReduceParser(ParserI): + """ + A simple bottom-up CFG parser that uses two operations, "shift" + and "reduce", to find a single parse for a text. + + ``ShiftReduceParser`` maintains a stack, which records the + structure of a portion of the text. This stack is a list of + strings and Trees that collectively cover a portion of + the text. For example, while parsing the sentence "the dog saw + the man" with a typical grammar, ``ShiftReduceParser`` will produce + the following stack, which covers "the dog saw":: + + [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] + + ``ShiftReduceParser`` attempts to extend the stack to cover the + entire text, and to combine the stack elements into a single tree, + producing a complete parse for the sentence. + + Initially, the stack is empty. It is extended to cover the text, + from left to right, by repeatedly applying two operations: + + - "shift" moves a token from the beginning of the text to the + end of the stack. + - "reduce" uses a CFG production to combine the rightmost stack + elements into a single Tree. + + Often, more than one operation can be performed on a given stack. + In this case, ``ShiftReduceParser`` uses the following heuristics + to decide which operation to perform: + + - Only shift if no reductions are available. + - If multiple reductions are available, then apply the reduction + whose CFG production is listed earliest in the grammar. + + Note that these heuristics are not guaranteed to choose an + operation that leads to a parse of the text. Also, if multiple + parses exists, ``ShiftReduceParser`` will return at most one of + them. + + :see: ``nltk.grammar`` + """ + + def __init__(self, grammar, trace=0): + """ + Create a new ``ShiftReduceParser``, that uses ``grammar`` to + parse texts. + + :type grammar: Grammar + :param grammar: The grammar used to parse texts. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. 
``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + """ + self._grammar = grammar + self._trace = trace + self._check_grammar() + + def grammar(self): + return self._grammar + + def parse(self, tokens): + tokens = list(tokens) + self._grammar.check_coverage(tokens) + + # initialize the stack. + stack = [] + remaining_text = tokens + + # Trace output. + if self._trace: + print("Parsing %r" % " ".join(tokens)) + self._trace_stack(stack, remaining_text) + + # iterate through the text, pushing the token onto + # the stack, then reducing the stack. + while len(remaining_text) > 0: + self._shift(stack, remaining_text) + while self._reduce(stack, remaining_text): + pass + + # Did we reduce everything? + if len(stack) == 1: + # Did we end up with the right category? + if stack[0].label() == self._grammar.start().symbol(): + yield stack[0] + + def _shift(self, stack, remaining_text): + """ + Move a token from the beginning of ``remaining_text`` to the + end of ``stack``. + + :type stack: list(str and Tree) + :param stack: A list of strings and Trees, encoding + the structure of the text that has been parsed so far. + :type remaining_text: list(str) + :param remaining_text: The portion of the text that is not yet + covered by ``stack``. + :rtype: None + """ + stack.append(remaining_text[0]) + remaining_text.remove(remaining_text[0]) + if self._trace: + self._trace_shift(stack, remaining_text) + + def _match_rhs(self, rhs, rightmost_stack): + """ + :rtype: bool + :return: true if the right hand side of a CFG production + matches the rightmost elements of the stack. ``rhs`` + matches ``rightmost_stack`` if they are the same length, + and each element of ``rhs`` matches the corresponding + element of ``rightmost_stack``. A nonterminal element of + ``rhs`` matches any Tree whose node value is equal + to the nonterminal's symbol. A terminal element of ``rhs`` + matches any string whose type is equal to the terminal. + :type rhs: list(terminal and Nonterminal) + :param rhs: The right hand side of a CFG production. + :type rightmost_stack: list(string and Tree) + :param rightmost_stack: The rightmost elements of the parser's + stack. + """ + + if len(rightmost_stack) != len(rhs): + return False + for i in range(len(rightmost_stack)): + if isinstance(rightmost_stack[i], Tree): + if not isinstance(rhs[i], Nonterminal): + return False + if rightmost_stack[i].label() != rhs[i].symbol(): + return False + else: + if isinstance(rhs[i], Nonterminal): + return False + if rightmost_stack[i] != rhs[i]: + return False + return True + + def _reduce(self, stack, remaining_text, production=None): + """ + Find a CFG production whose right hand side matches the + rightmost stack elements; and combine those stack elements + into a single Tree, with the node specified by the + production's left-hand side. If more than one CFG production + matches the stack, then use the production that is listed + earliest in the grammar. The new Tree replaces the + elements in the stack. + + :rtype: Production or None + :return: If a reduction is performed, then return the CFG + production that the reduction is based on; otherwise, + return false. + :type stack: list(string and Tree) + :param stack: A list of strings and Trees, encoding + the structure of the text that has been parsed so far. + :type remaining_text: list(str) + :param remaining_text: The portion of the text that is not yet + covered by ``stack``. 
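+
+        For example, if the top of the stack holds
+        ``Tree('Det', ['the'])`` and ``Tree('N', ['dog'])``, the
+        production ``NP -> Det N`` matches, and those two elements are
+        replaced by the single tree ``(NP: (Det: 'the') (N: 'dog'))``.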
+        """
+        if production is None:
+            productions = self._grammar.productions()
+        else:
+            productions = [production]
+
+        # Try each production, in order.
+        for production in productions:
+            rhslen = len(production.rhs())
+
+            # check if the RHS of a production matches the top of the stack
+            if self._match_rhs(production.rhs(), stack[-rhslen:]):
+
+                # combine the tree to reflect the reduction
+                tree = Tree(production.lhs().symbol(), stack[-rhslen:])
+                stack[-rhslen:] = [tree]
+
+                # We reduced something
+                if self._trace:
+                    self._trace_reduce(stack, production, remaining_text)
+                return production
+
+        # We didn't reduce anything
+        return None
+
+    def trace(self, trace=2):
+        """
+        Set the level of tracing output that should be generated when
+        parsing a text.
+
+        :type trace: int
+        :param trace: The trace level.  A trace level of ``0`` will
+            generate no tracing output; and higher trace levels will
+            produce more verbose tracing output.
+        :rtype: None
+        """
+        # 1: just show shifts.
+        # 2: show shifts & reduces
+        # 3: display which tokens & productions are shifted/reduced
+        self._trace = trace
+
+    def _trace_stack(self, stack, remaining_text, marker=" "):
+        """
+        Print trace output displaying the given stack and text.
+
+        :rtype: None
+        :param marker: A character that is printed to the left of the
+            stack.  This is used with trace level 2 to print 'S'
+            before shifted stacks and 'R' before reduced stacks.
+        """
+        s = "  " + marker + " [ "
+        for elt in stack:
+            if isinstance(elt, Tree):
+                s += repr(Nonterminal(elt.label())) + " "
+            else:
+                s += repr(elt) + " "
+        s += "* " + " ".join(remaining_text) + "]"
+        print(s)
+
+    def _trace_shift(self, stack, remaining_text):
+        """
+        Print trace output displaying that a token has been shifted.
+
+        :rtype: None
+        """
+        if self._trace > 2:
+            print("Shift %r:" % stack[-1])
+        if self._trace == 2:
+            self._trace_stack(stack, remaining_text, "S")
+        elif self._trace > 0:
+            self._trace_stack(stack, remaining_text)
+
+    def _trace_reduce(self, stack, production, remaining_text):
+        """
+        Print trace output displaying that ``production`` was used to
+        reduce ``stack``.
+
+        :rtype: None
+        """
+        if self._trace > 2:
+            rhs = " ".join(production.rhs())
+            print(f"Reduce {production.lhs()!r} <- {rhs}")
+        if self._trace == 2:
+            self._trace_stack(stack, remaining_text, "R")
+        elif self._trace > 1:
+            self._trace_stack(stack, remaining_text)
+
+    def _check_grammar(self):
+        """
+        Check to make sure that all of the CFG productions are
+        potentially useful.  If any productions can never be used,
+        then print a warning.
+
+        :rtype: None
+        """
+        productions = self._grammar.productions()
+
+        # Any production whose RHS is an extension of another production's RHS
+        # will never be used.
+        for i in range(len(productions)):
+            for j in range(i + 1, len(productions)):
+                rhs1 = productions[i].rhs()
+                rhs2 = productions[j].rhs()
+                if rhs1[: len(rhs2)] == rhs2:
+                    print("Warning: %r will never be used" % productions[i])
+
+
+##//////////////////////////////////////////////////////
+##  Stepping Shift/Reduce Parser
+##//////////////////////////////////////////////////////
+class SteppingShiftReduceParser(ShiftReduceParser):
+    """
+    A ``ShiftReduceParser`` that allows you to step through the parsing
+    process, performing a single operation at a time.  It also allows
+    you to change the parser's grammar midway through parsing a text.
+
+    The ``initialize`` method is used to start parsing a text.
+    ``shift`` performs a single shift operation, and ``reduce`` performs
+    a single reduce operation.
``step`` will perform a single reduce + operation if possible; otherwise, it will perform a single shift + operation. ``parses`` returns the set of parses that have been + found by the parser. + + :ivar _history: A list of ``(stack, remaining_text)`` pairs, + containing all of the previous states of the parser. This + history is used to implement the ``undo`` operation. + :see: ``nltk.grammar`` + """ + + def __init__(self, grammar, trace=0): + super().__init__(grammar, trace) + self._stack = None + self._remaining_text = None + self._history = [] + + def parse(self, tokens): + tokens = list(tokens) + self.initialize(tokens) + while self.step(): + pass + return self.parses() + + def stack(self): + """ + :return: The parser's stack. + :rtype: list(str and Tree) + """ + return self._stack + + def remaining_text(self): + """ + :return: The portion of the text that is not yet covered by the + stack. + :rtype: list(str) + """ + return self._remaining_text + + def initialize(self, tokens): + """ + Start parsing a given text. This sets the parser's stack to + ``[]`` and sets its remaining text to ``tokens``. + """ + self._stack = [] + self._remaining_text = tokens + self._history = [] + + def step(self): + """ + Perform a single parsing operation. If a reduction is + possible, then perform that reduction, and return the + production that it is based on. Otherwise, if a shift is + possible, then perform it, and return True. Otherwise, + return False. + + :return: False if no operation was performed; True if a shift was + performed; and the CFG production used to reduce if a + reduction was performed. + :rtype: Production or bool + """ + return self.reduce() or self.shift() + + def shift(self): + """ + Move a token from the beginning of the remaining text to the + end of the stack. If there are no more tokens in the + remaining text, then do nothing. + + :return: True if the shift operation was successful. + :rtype: bool + """ + if len(self._remaining_text) == 0: + return False + self._history.append((self._stack[:], self._remaining_text[:])) + self._shift(self._stack, self._remaining_text) + return True + + def reduce(self, production=None): + """ + Use ``production`` to combine the rightmost stack elements into + a single Tree. If ``production`` does not match the + rightmost stack elements, then do nothing. + + :return: The production used to reduce the stack, if a + reduction was performed. If no reduction was performed, + return None. + + :rtype: Production or None + """ + self._history.append((self._stack[:], self._remaining_text[:])) + return_val = self._reduce(self._stack, self._remaining_text, production) + + if not return_val: + self._history.pop() + return return_val + + def undo(self): + """ + Return the parser to its state before the most recent + shift or reduce operation. Calling ``undo`` repeatedly return + the parser to successively earlier states. If no shift or + reduce operations have been performed, ``undo`` will make no + changes. + + :return: true if an operation was successfully undone. + :rtype: bool + """ + if len(self._history) == 0: + return False + (self._stack, self._remaining_text) = self._history.pop() + return True + + def reducible_productions(self): + """ + :return: A list of the productions for which reductions are + available for the current parser state. 
+ :rtype: list(Production) + """ + productions = [] + for production in self._grammar.productions(): + rhslen = len(production.rhs()) + if self._match_rhs(production.rhs(), self._stack[-rhslen:]): + productions.append(production) + return productions + + def parses(self): + """ + :return: An iterator of the parses that have been found by this + parser so far. + :rtype: iter(Tree) + """ + if ( + len(self._remaining_text) == 0 + and len(self._stack) == 1 + and self._stack[0].label() == self._grammar.start().symbol() + ): + yield self._stack[0] + + # copied from nltk.parser + + def set_grammar(self, grammar): + """ + Change the grammar used to parse texts. + + :param grammar: The new grammar. + :type grammar: CFG + """ + self._grammar = grammar + + +##////////////////////////////////////////////////////// +## Demonstration Code +##////////////////////////////////////////////////////// + + +def demo(): + """ + A demonstration of the shift-reduce parser. + """ + + from nltk import CFG, parse + + grammar = CFG.fromstring( + """ + S -> NP VP + NP -> Det N | Det N PP + VP -> V NP | V NP PP + PP -> P NP + NP -> 'I' + N -> 'man' | 'park' | 'telescope' | 'dog' + Det -> 'the' | 'a' + P -> 'in' | 'with' + V -> 'saw' + """ + ) + + sent = "I saw a man in the park".split() + + parser = parse.ShiftReduceParser(grammar, trace=2) + for p in parser.parse(sent): + print(p) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py new file mode 100644 index 0000000000000000000000000000000000000000..d1beb81238b3dda8aefb2de941c9be86ca57cf03 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py @@ -0,0 +1,470 @@ +# Natural Language Toolkit: Interface to the Stanford Parser +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Xu +# +# URL: +# For license information, see LICENSE.TXT + +import os +import tempfile +import warnings +from subprocess import PIPE + +from nltk.internals import ( + _java_options, + config_java, + find_jar_iter, + find_jars_within_path, + java, +) +from nltk.parse.api import ParserI +from nltk.parse.dependencygraph import DependencyGraph +from nltk.tree import Tree + +_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml" + + +class GenericStanfordParser(ParserI): + """Interface to the Stanford Parser""" + + _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar" + _JAR = r"stanford-parser\.jar" + _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser" + + _USE_STDIN = False + _DOUBLE_SPACED_OUTPUT = False + + def __init__( + self, + path_to_jar=None, + path_to_models_jar=None, + model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", + encoding="utf8", + verbose=False, + java_options="-mx4g", + corenlp_options="", + ): + + # find the most recent code and model jar + stanford_jar = max( + find_jar_iter( + self._JAR, + path_to_jar, + env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"), + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, + ), + key=lambda model_path: os.path.dirname(model_path), + ) + + model_jar = max( + find_jar_iter( + self._MODEL_JAR_PATTERN, + path_to_models_jar, + env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"), + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, + ), + key=lambda model_path: os.path.dirname(model_path), + ) + + # self._classpath = (stanford_jar, model_jar) + + # Adding logging jar files to classpath + stanford_dir = os.path.split(stanford_jar)[0] 
+ self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir)) + + self.model_path = model_path + self._encoding = encoding + self.corenlp_options = corenlp_options + self.java_options = java_options + + def _parse_trees_output(self, output_): + res = [] + cur_lines = [] + cur_trees = [] + blank = False + for line in output_.splitlines(False): + if line == "": + if blank: + res.append(iter(cur_trees)) + cur_trees = [] + blank = False + elif self._DOUBLE_SPACED_OUTPUT: + cur_trees.append(self._make_tree("\n".join(cur_lines))) + cur_lines = [] + blank = True + else: + res.append(iter([self._make_tree("\n".join(cur_lines))])) + cur_lines = [] + else: + cur_lines.append(line) + blank = False + return iter(res) + + def parse_sents(self, sentences, verbose=False): + """ + Use StanfordParser to parse multiple sentences. Takes multiple sentences as a + list where each sentence is a list of words. + Each sentence will be automatically tagged with this StanfordParser instance's + tagger. + If whitespaces exists inside a token, then the token will be treated as + separate tokens. + + :param sentences: Input sentences to parse + :type sentences: list(list(str)) + :rtype: iter(iter(Tree)) + """ + cmd = [ + self._MAIN_CLASS, + "-model", + self.model_path, + "-sentences", + "newline", + "-outputFormat", + self._OUTPUT_FORMAT, + "-tokenized", + "-escaper", + "edu.stanford.nlp.process.PTBEscapingProcessor", + ] + return self._parse_trees_output( + self._execute( + cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose + ) + ) + + def raw_parse(self, sentence, verbose=False): + """ + Use StanfordParser to parse a sentence. Takes a sentence as a string; + before parsing, it will be automatically tokenized and tagged by + the Stanford Parser. + + :param sentence: Input sentence to parse + :type sentence: str + :rtype: iter(Tree) + """ + return next(self.raw_parse_sents([sentence], verbose)) + + def raw_parse_sents(self, sentences, verbose=False): + """ + Use StanfordParser to parse multiple sentences. Takes multiple sentences as a + list of strings. + Each sentence will be automatically tokenized and tagged by the Stanford Parser. + + :param sentences: Input sentences to parse + :type sentences: list(str) + :rtype: iter(iter(Tree)) + """ + cmd = [ + self._MAIN_CLASS, + "-model", + self.model_path, + "-sentences", + "newline", + "-outputFormat", + self._OUTPUT_FORMAT, + ] + return self._parse_trees_output( + self._execute(cmd, "\n".join(sentences), verbose) + ) + + def tagged_parse(self, sentence, verbose=False): + """ + Use StanfordParser to parse a sentence. Takes a sentence as a list of + (word, tag) tuples; the sentence must have already been tokenized and + tagged. + + :param sentence: Input sentence to parse + :type sentence: list(tuple(str, str)) + :rtype: iter(Tree) + """ + return next(self.tagged_parse_sents([sentence], verbose)) + + def tagged_parse_sents(self, sentences, verbose=False): + """ + Use StanfordParser to parse multiple sentences. Takes multiple sentences + where each sentence is a list of (word, tag) tuples. + The sentences must have already been tokenized and tagged. 
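+
+        For example (a sketch; this requires a local Stanford Parser
+        installation, and assumes ``parser`` is a configured
+        ``StanfordParser`` instance)::
+
+            >>> trees = parser.tagged_parse_sents(
+            ...     [[("The", "DT"), ("dog", "NN"), ("barked", "VBD")]]
+            ... )  # doctest: +SKIP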
+ + :param sentences: Input sentences to parse + :type sentences: list(list(tuple(str, str))) + :rtype: iter(iter(Tree)) + """ + tag_separator = "/" + cmd = [ + self._MAIN_CLASS, + "-model", + self.model_path, + "-sentences", + "newline", + "-outputFormat", + self._OUTPUT_FORMAT, + "-tokenized", + "-tagSeparator", + tag_separator, + "-tokenizerFactory", + "edu.stanford.nlp.process.WhitespaceTokenizer", + "-tokenizerMethod", + "newCoreLabelTokenizerFactory", + ] + # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" + return self._parse_trees_output( + self._execute( + cmd, + "\n".join( + " ".join(tag_separator.join(tagged) for tagged in sentence) + for sentence in sentences + ), + verbose, + ) + ) + + def _execute(self, cmd, input_, verbose=False): + encoding = self._encoding + cmd.extend(["-encoding", encoding]) + if self.corenlp_options: + cmd.extend(self.corenlp_options.split()) + + default_options = " ".join(_java_options) + + # Configure java. + config_java(options=self.java_options, verbose=verbose) + + # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. + with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: + # Write the actual sentences to the temporary input file + if isinstance(input_, str) and encoding: + input_ = input_.encode(encoding) + input_file.write(input_) + input_file.flush() + + # Run the tagger and get the output. + if self._USE_STDIN: + input_file.seek(0) + stdout, stderr = java( + cmd, + classpath=self._classpath, + stdin=input_file, + stdout=PIPE, + stderr=PIPE, + ) + else: + cmd.append(input_file.name) + stdout, stderr = java( + cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE + ) + + stdout = stdout.replace(b"\xc2\xa0", b" ") + stdout = stdout.replace(b"\x00\xa0", b" ") + stdout = stdout.decode(encoding) + + os.unlink(input_file.name) + + # Return java configurations to their default values. + config_java(options=default_options, verbose=False) + + return stdout + + +class StanfordParser(GenericStanfordParser): + """ + >>> parser=StanfordParser( + ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" + ... ) # doctest: +SKIP + + >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), + Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( + ... "the quick brown fox jumps over the lazy dog", + ... "the quick grey wolf jumps over the lazy fox" + ... 
))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), + Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', + [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', + [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), + Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), + Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', + [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), + Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []), + Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( + ... ( + ... ("The", "DT"), + ... ("quick", "JJ"), + ... ("brown", "JJ"), + ... ("fox", "NN"), + ... ("jumped", "VBD"), + ... ("over", "IN"), + ... ("the", "DT"), + ... ("lazy", "JJ"), + ... ("dog", "NN"), + ... (".", "."), + ... ), + ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', + [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] + """ + + _OUTPUT_FORMAT = "penn" + + def __init__(self, *args, **kwargs): + warnings.warn( + "The StanfordParser will be deprecated\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.", + DeprecationWarning, + stacklevel=2, + ) + + super().__init__(*args, **kwargs) + + def _make_tree(self, result): + return Tree.fromstring(result) + + +class StanfordDependencyParser(GenericStanfordParser): + + """ + >>> dep_parser=StanfordDependencyParser( + ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" + ... ) # doctest: +SKIP + + >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] + + >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP + [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), + ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), + ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), + ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( + ... "The quick brown fox jumps over the lazy dog.", + ... 
"The quick grey wolf jumps over the lazy fox." + ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), + Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] + + >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( + ... ( + ... ("The", "DT"), + ... ("quick", "JJ"), + ... ("brown", "JJ"), + ... ("fox", "NN"), + ... ("jumped", "VBD"), + ... ("over", "IN"), + ... ("the", "DT"), + ... ("lazy", "JJ"), + ... ("dog", "NN"), + ... (".", "."), + ... ), + ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP + [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), + ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), + ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), + ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + + """ + + _OUTPUT_FORMAT = "conll2007" + + def __init__(self, *args, **kwargs): + warnings.warn( + "The StanfordDependencyParser will be deprecated\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", + DeprecationWarning, + stacklevel=2, + ) + + super().__init__(*args, **kwargs) + + def _make_tree(self, result): + return DependencyGraph(result, top_relation_label="root") + + +class StanfordNeuralDependencyParser(GenericStanfordParser): + """ + >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP + >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP + + >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])] + + >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP + [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', + (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), + u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), + ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', + (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'), + u'punct', (u'.', u'.'))]] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( + ... "The quick brown fox jumps over the lazy dog.", + ... "The quick grey wolf jumps over the lazy fox." + ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', + 'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), + Tree('fox', ['over', 'the', 'lazy']), '.'])] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( + ... "I 'm a dog".split(), + ... 
"This is my friends ' cat ( the tabby )".split(), + ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP + [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', + ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])] + """ + + _OUTPUT_FORMAT = "conll" + _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP" + _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar" + _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar" + _USE_STDIN = True + _DOUBLE_SPACED_OUTPUT = True + + def __init__(self, *args, **kwargs): + warnings.warn( + "The StanfordNeuralDependencyParser will be deprecated\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", + DeprecationWarning, + stacklevel=2, + ) + + super().__init__(*args, **kwargs) + self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse" + + def tagged_parse_sents(self, sentences, verbose=False): + """ + Currently unimplemented because the neural dependency parser (and + the StanfordCoreNLP pipeline class) doesn't support passing in pre- + tagged tokens. + """ + raise NotImplementedError( + "tagged_parse[_sents] is not supported by " + "StanfordNeuralDependencyParser; use " + "parse[_sents] or raw_parse[_sents] instead." + ) + + def _make_tree(self, result): + return DependencyGraph(result, top_relation_label="ROOT") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py new file mode 100644 index 0000000000000000000000000000000000000000..4f43bb0992fd8d08bd66f20e452ccc522face197 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py @@ -0,0 +1,794 @@ +# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers +# +# Author: Long Duong +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +import pickle +import tempfile +from copy import deepcopy +from operator import itemgetter +from os import remove + +try: + from numpy import array + from scipy import sparse + from sklearn import svm + from sklearn.datasets import load_svmlight_file +except ImportError: + pass + +from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI + + +class Configuration: + """ + Class for holding configuration which is the partial analysis of the input sentence. + The transition based parser aims at finding set of operators that transfer the initial + configuration to the terminal configuration. + + The configuration includes: + - Stack: for storing partially proceeded words + - Buffer: for storing remaining input words + - Set of arcs: for storing partially built dependency tree + + This class also provides a method to represent a configuration as list of features. + """ + + def __init__(self, dep_graph): + """ + :param dep_graph: the representation of an input in the form of dependency graph. + :type dep_graph: DependencyGraph where the dependencies are not specified. 
+ """ + # dep_graph.nodes contain list of token for a sentence + self.stack = [0] # The root element + self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer + self.arcs = [] # empty set of arc + self._tokens = dep_graph.nodes + self._max_address = len(self.buffer) + + def __str__(self): + return ( + "Stack : " + + str(self.stack) + + " Buffer : " + + str(self.buffer) + + " Arcs : " + + str(self.arcs) + ) + + def _check_informative(self, feat, flag=False): + """ + Check whether a feature is informative + The flag control whether "_" is informative or not + """ + if feat is None: + return False + if feat == "": + return False + if flag is False: + if feat == "_": + return False + return True + + def extract_features(self): + """ + Extract the set of features for the current configuration. Implement standard features as describe in + Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre. + Please note that these features are very basic. + :return: list(str) + """ + result = [] + # Todo : can come up with more complicated features set for better + # performance. + if len(self.stack) > 0: + # Stack 0 + stack_idx0 = self.stack[len(self.stack) - 1] + token = self._tokens[stack_idx0] + if self._check_informative(token["word"], True): + result.append("STK_0_FORM_" + token["word"]) + if "lemma" in token and self._check_informative(token["lemma"]): + result.append("STK_0_LEMMA_" + token["lemma"]) + if self._check_informative(token["tag"]): + result.append("STK_0_POS_" + token["tag"]) + if "feats" in token and self._check_informative(token["feats"]): + feats = token["feats"].split("|") + for feat in feats: + result.append("STK_0_FEATS_" + feat) + # Stack 1 + if len(self.stack) > 1: + stack_idx1 = self.stack[len(self.stack) - 2] + token = self._tokens[stack_idx1] + if self._check_informative(token["tag"]): + result.append("STK_1_POS_" + token["tag"]) + + # Left most, right most dependency of stack[0] + left_most = 1000000 + right_most = -1 + dep_left_most = "" + dep_right_most = "" + for (wi, r, wj) in self.arcs: + if wi == stack_idx0: + if (wj > wi) and (wj > right_most): + right_most = wj + dep_right_most = r + if (wj < wi) and (wj < left_most): + left_most = wj + dep_left_most = r + if self._check_informative(dep_left_most): + result.append("STK_0_LDEP_" + dep_left_most) + if self._check_informative(dep_right_most): + result.append("STK_0_RDEP_" + dep_right_most) + + # Check Buffered 0 + if len(self.buffer) > 0: + # Buffer 0 + buffer_idx0 = self.buffer[0] + token = self._tokens[buffer_idx0] + if self._check_informative(token["word"], True): + result.append("BUF_0_FORM_" + token["word"]) + if "lemma" in token and self._check_informative(token["lemma"]): + result.append("BUF_0_LEMMA_" + token["lemma"]) + if self._check_informative(token["tag"]): + result.append("BUF_0_POS_" + token["tag"]) + if "feats" in token and self._check_informative(token["feats"]): + feats = token["feats"].split("|") + for feat in feats: + result.append("BUF_0_FEATS_" + feat) + # Buffer 1 + if len(self.buffer) > 1: + buffer_idx1 = self.buffer[1] + token = self._tokens[buffer_idx1] + if self._check_informative(token["word"], True): + result.append("BUF_1_FORM_" + token["word"]) + if self._check_informative(token["tag"]): + result.append("BUF_1_POS_" + token["tag"]) + if len(self.buffer) > 2: + buffer_idx2 = self.buffer[2] + token = self._tokens[buffer_idx2] + if self._check_informative(token["tag"]): + result.append("BUF_2_POS_" + token["tag"]) + if len(self.buffer) 
> 3: + buffer_idx3 = self.buffer[3] + token = self._tokens[buffer_idx3] + if self._check_informative(token["tag"]): + result.append("BUF_3_POS_" + token["tag"]) + # Left most, right most dependency of stack[0] + left_most = 1000000 + right_most = -1 + dep_left_most = "" + dep_right_most = "" + for (wi, r, wj) in self.arcs: + if wi == buffer_idx0: + if (wj > wi) and (wj > right_most): + right_most = wj + dep_right_most = r + if (wj < wi) and (wj < left_most): + left_most = wj + dep_left_most = r + if self._check_informative(dep_left_most): + result.append("BUF_0_LDEP_" + dep_left_most) + if self._check_informative(dep_right_most): + result.append("BUF_0_RDEP_" + dep_right_most) + + return result + + +class Transition: + """ + This class defines a set of transition which is applied to a configuration to get another configuration + Note that for different parsing algorithm, the transition is different. + """ + + # Define set of transitions + LEFT_ARC = "LEFTARC" + RIGHT_ARC = "RIGHTARC" + SHIFT = "SHIFT" + REDUCE = "REDUCE" + + def __init__(self, alg_option): + """ + :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm + :type alg_option: str + """ + self._algo = alg_option + if alg_option not in [ + TransitionParser.ARC_STANDARD, + TransitionParser.ARC_EAGER, + ]: + raise ValueError( + " Currently we only support %s and %s " + % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER) + ) + + def left_arc(self, conf, relation): + """ + Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied + """ + if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): + return -1 + if conf.buffer[0] == 0: + # here is the Root element + return -1 + + idx_wi = conf.stack[len(conf.stack) - 1] + + flag = True + if self._algo == TransitionParser.ARC_EAGER: + for (idx_parent, r, idx_child) in conf.arcs: + if idx_child == idx_wi: + flag = False + + if flag: + conf.stack.pop() + idx_wj = conf.buffer[0] + conf.arcs.append((idx_wj, relation, idx_wi)) + else: + return -1 + + def right_arc(self, conf, relation): + """ + Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied + """ + if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): + return -1 + if self._algo == TransitionParser.ARC_STANDARD: + idx_wi = conf.stack.pop() + idx_wj = conf.buffer[0] + conf.buffer[0] = idx_wi + conf.arcs.append((idx_wi, relation, idx_wj)) + else: # arc-eager + idx_wi = conf.stack[len(conf.stack) - 1] + idx_wj = conf.buffer.pop(0) + conf.stack.append(idx_wj) + conf.arcs.append((idx_wi, relation, idx_wj)) + + def reduce(self, conf): + """ + Note that the algorithm for reduce is only available for arc-eager + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied + """ + + if self._algo != TransitionParser.ARC_EAGER: + return -1 + if len(conf.stack) <= 0: + return -1 + + idx_wi = conf.stack[len(conf.stack) - 1] + flag = False + for (idx_parent, r, idx_child) in conf.arcs: + if idx_child == idx_wi: + flag = True + if flag: + conf.stack.pop() # reduce it + else: + return -1 + + def shift(self, conf): + """ + Note that the algorithm for shift is the SAME for arc-standard 
and arc-eager + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied + """ + if len(conf.buffer) <= 0: + return -1 + idx_wi = conf.buffer.pop(0) + conf.stack.append(idx_wi) + + +class TransitionParser(ParserI): + + """ + Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager" + """ + + ARC_STANDARD = "arc-standard" + ARC_EAGER = "arc-eager" + + def __init__(self, algorithm): + """ + :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm + :type algorithm: str + """ + if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]): + raise ValueError( + " Currently we only support %s and %s " + % (self.ARC_STANDARD, self.ARC_EAGER) + ) + self._algorithm = algorithm + + self._dictionary = {} + self._transition = {} + self._match_transition = {} + + def _get_dep_relation(self, idx_parent, idx_child, depgraph): + p_node = depgraph.nodes[idx_parent] + c_node = depgraph.nodes[idx_child] + + if c_node["word"] is None: + return None # Root word + + if c_node["head"] == p_node["address"]: + return c_node["rel"] + else: + return None + + def _convert_to_binary_features(self, features): + """ + :param features: list of feature string which is needed to convert to binary features + :type features: list(str) + :return : string of binary features in libsvm format which is 'featureID:value' pairs + """ + unsorted_result = [] + for feature in features: + self._dictionary.setdefault(feature, len(self._dictionary)) + unsorted_result.append(self._dictionary[feature]) + + # Default value of each feature is 1.0 + return " ".join( + str(featureID) + ":1.0" for featureID in sorted(unsorted_result) + ) + + def _is_projective(self, depgraph): + arc_list = [] + for key in depgraph.nodes: + node = depgraph.nodes[key] + + if "head" in node: + childIdx = node["address"] + parentIdx = node["head"] + if parentIdx is not None: + arc_list.append((parentIdx, childIdx)) + + for (parentIdx, childIdx) in arc_list: + # Ensure that childIdx < parentIdx + if childIdx > parentIdx: + temp = childIdx + childIdx = parentIdx + parentIdx = temp + for k in range(childIdx + 1, parentIdx): + for m in range(len(depgraph.nodes)): + if (m < childIdx) or (m > parentIdx): + if (k, m) in arc_list: + return False + if (m, k) in arc_list: + return False + return True + + def _write_to_file(self, key, binary_features, input_file): + """ + write the binary features to input file and update the transition dictionary + """ + self._transition.setdefault(key, len(self._transition) + 1) + self._match_transition[self._transition[key]] = key + + input_str = str(self._transition[key]) + " " + binary_features + "\n" + input_file.write(input_str.encode("utf-8")) + + def _create_training_examples_arc_std(self, depgraphs, input_file): + """ + Create the training example in the libsvm format and write it to the input_file. + Reference : Page 32, Chapter 3. 
Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009) + """ + operation = Transition(self.ARC_STANDARD) + count_proj = 0 + training_seq = [] + + for depgraph in depgraphs: + if not self._is_projective(depgraph): + continue + + count_proj += 1 + conf = Configuration(depgraph) + while len(conf.buffer) > 0: + b0 = conf.buffer[0] + features = conf.extract_features() + binary_features = self._convert_to_binary_features(features) + + if len(conf.stack) > 0: + s0 = conf.stack[len(conf.stack) - 1] + # Left-arc operation + rel = self._get_dep_relation(b0, s0, depgraph) + if rel is not None: + key = Transition.LEFT_ARC + ":" + rel + self._write_to_file(key, binary_features, input_file) + operation.left_arc(conf, rel) + training_seq.append(key) + continue + + # Right-arc operation + rel = self._get_dep_relation(s0, b0, depgraph) + if rel is not None: + precondition = True + # Get the max-index of buffer + maxID = conf._max_address + + for w in range(maxID + 1): + if w != b0: + relw = self._get_dep_relation(b0, w, depgraph) + if relw is not None: + if (b0, relw, w) not in conf.arcs: + precondition = False + + if precondition: + key = Transition.RIGHT_ARC + ":" + rel + self._write_to_file(key, binary_features, input_file) + operation.right_arc(conf, rel) + training_seq.append(key) + continue + + # Shift operation as the default + key = Transition.SHIFT + self._write_to_file(key, binary_features, input_file) + operation.shift(conf) + training_seq.append(key) + + print(" Number of training examples : " + str(len(depgraphs))) + print(" Number of valid (projective) examples : " + str(count_proj)) + return training_seq + + def _create_training_examples_arc_eager(self, depgraphs, input_file): + """ + Create the training example in the libsvm format and write it to the input_file. 
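+        The oracle below tries LEFTARC first, then RIGHTARC, then REDUCE,
+        and falls back to SHIFT when the gold graph licenses none of them.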
+ Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre + """ + operation = Transition(self.ARC_EAGER) + countProj = 0 + training_seq = [] + + for depgraph in depgraphs: + if not self._is_projective(depgraph): + continue + + countProj += 1 + conf = Configuration(depgraph) + while len(conf.buffer) > 0: + b0 = conf.buffer[0] + features = conf.extract_features() + binary_features = self._convert_to_binary_features(features) + + if len(conf.stack) > 0: + s0 = conf.stack[len(conf.stack) - 1] + # Left-arc operation + rel = self._get_dep_relation(b0, s0, depgraph) + if rel is not None: + key = Transition.LEFT_ARC + ":" + rel + self._write_to_file(key, binary_features, input_file) + operation.left_arc(conf, rel) + training_seq.append(key) + continue + + # Right-arc operation + rel = self._get_dep_relation(s0, b0, depgraph) + if rel is not None: + key = Transition.RIGHT_ARC + ":" + rel + self._write_to_file(key, binary_features, input_file) + operation.right_arc(conf, rel) + training_seq.append(key) + continue + + # reduce operation + flag = False + for k in range(s0): + if self._get_dep_relation(k, b0, depgraph) is not None: + flag = True + if self._get_dep_relation(b0, k, depgraph) is not None: + flag = True + if flag: + key = Transition.REDUCE + self._write_to_file(key, binary_features, input_file) + operation.reduce(conf) + training_seq.append(key) + continue + + # Shift operation as the default + key = Transition.SHIFT + self._write_to_file(key, binary_features, input_file) + operation.shift(conf) + training_seq.append(key) + + print(" Number of training examples : " + str(len(depgraphs))) + print(" Number of valid (projective) examples : " + str(countProj)) + return training_seq + + def train(self, depgraphs, modelfile, verbose=True): + """ + :param depgraphs : list of DependencyGraph as the training data + :type depgraphs : DependencyGraph + :param modelfile : file name to save the trained model + :type modelfile : str + """ + + try: + input_file = tempfile.NamedTemporaryFile( + prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False + ) + + if self._algorithm == self.ARC_STANDARD: + self._create_training_examples_arc_std(depgraphs, input_file) + else: + self._create_training_examples_arc_eager(depgraphs, input_file) + + input_file.close() + # Using the temporary file to train the libsvm classifier + x_train, y_train = load_svmlight_file(input_file.name) + # The parameter is set according to the paper: + # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre + # Todo : because of probability = True => very slow due to + # cross-validation. 
Need to improve the speed here + model = svm.SVC( + kernel="poly", + degree=2, + coef0=0, + gamma=0.2, + C=0.5, + verbose=verbose, + probability=True, + ) + + model.fit(x_train, y_train) + # Save the model to file name (as pickle) + pickle.dump(model, open(modelfile, "wb")) + finally: + remove(input_file.name) + + def parse(self, depgraphs, modelFile): + """ + :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy + :type depgraphs: list(DependencyGraph) + :param modelfile: the model file + :type modelfile: str + :return: list (DependencyGraph) with the 'head' and 'rel' information + """ + result = [] + # First load the model + model = pickle.load(open(modelFile, "rb")) + operation = Transition(self._algorithm) + + for depgraph in depgraphs: + conf = Configuration(depgraph) + while len(conf.buffer) > 0: + features = conf.extract_features() + col = [] + row = [] + data = [] + for feature in features: + if feature in self._dictionary: + col.append(self._dictionary[feature]) + row.append(0) + data.append(1.0) + np_col = array(sorted(col)) # NB : index must be sorted + np_row = array(row) + np_data = array(data) + + x_test = sparse.csr_matrix( + (np_data, (np_row, np_col)), shape=(1, len(self._dictionary)) + ) + + # It's best to use decision function as follow BUT it's not supported yet for sparse SVM + # Using decision function to build the votes array + # dec_func = model.decision_function(x_test)[0] + # votes = {} + # k = 0 + # for i in range(len(model.classes_)): + # for j in range(i+1, len(model.classes_)): + # #if dec_func[k] > 0: + # votes.setdefault(i,0) + # votes[i] +=1 + # else: + # votes.setdefault(j,0) + # votes[j] +=1 + # k +=1 + # Sort votes according to the values + # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True) + + # We will use predict_proba instead of decision_function + prob_dict = {} + pred_prob = model.predict_proba(x_test)[0] + for i in range(len(pred_prob)): + prob_dict[i] = pred_prob[i] + sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True) + + # Note that SHIFT is always a valid operation + for (y_pred_idx, confidence) in sorted_Prob: + # y_pred = model.predict(x_test)[0] + # From the prediction match to the operation + y_pred = model.classes_[y_pred_idx] + + if y_pred in self._match_transition: + strTransition = self._match_transition[y_pred] + baseTransition = strTransition.split(":")[0] + + if baseTransition == Transition.LEFT_ARC: + if ( + operation.left_arc(conf, strTransition.split(":")[1]) + != -1 + ): + break + elif baseTransition == Transition.RIGHT_ARC: + if ( + operation.right_arc(conf, strTransition.split(":")[1]) + != -1 + ): + break + elif baseTransition == Transition.REDUCE: + if operation.reduce(conf) != -1: + break + elif baseTransition == Transition.SHIFT: + if operation.shift(conf) != -1: + break + else: + raise ValueError( + "The predicted transition is not recognized, expected errors" + ) + + # Finish with operations build the dependency graph from Conf.arcs + + new_depgraph = deepcopy(depgraph) + for key in new_depgraph.nodes: + node = new_depgraph.nodes[key] + node["rel"] = "" + # With the default, all the token depend on the Root + node["head"] = 0 + for (head, rel, child) in conf.arcs: + c_node = new_depgraph.nodes[child] + c_node["head"] = head + c_node["rel"] = rel + result.append(new_depgraph) + + return result + + +def demo(): + """ + >>> from nltk.parse import DependencyGraph, DependencyEvaluator + >>> from 
nltk.parse.transitionparser import TransitionParser, Configuration, Transition + >>> gold_sent = DependencyGraph(\""" + ... Economic JJ 2 ATT + ... news NN 3 SBJ + ... has VBD 0 ROOT + ... little JJ 5 ATT + ... effect NN 3 OBJ + ... on IN 5 ATT + ... financial JJ 8 ATT + ... markets NNS 6 PC + ... . . 3 PU + ... \""") + + >>> conf = Configuration(gold_sent) + + ###################### Check the Initial Feature ######################## + + >>> print(', '.join(conf.extract_features())) + STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ + + ###################### Check The Transition ####################### + Check the Initialized Configuration + >>> print(conf) + Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : [] + + A. Do some transition checks for ARC-STANDARD + + >>> operation = Transition('arc-standard') + >>> operation.shift(conf) + >>> operation.left_arc(conf, "ATT") + >>> operation.shift(conf) + >>> operation.left_arc(conf,"SBJ") + >>> operation.shift(conf) + >>> operation.shift(conf) + >>> operation.left_arc(conf, "ATT") + >>> operation.shift(conf) + >>> operation.shift(conf) + >>> operation.shift(conf) + >>> operation.left_arc(conf, "ATT") + + Middle Configuration and Features Check + >>> print(conf) + Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)] + + >>> print(', '.join(conf.extract_features())) + STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT + + >>> operation.right_arc(conf, "PC") + >>> operation.right_arc(conf, "ATT") + >>> operation.right_arc(conf, "OBJ") + >>> operation.shift(conf) + >>> operation.right_arc(conf, "PU") + >>> operation.right_arc(conf, "ROOT") + >>> operation.shift(conf) + + Terminated Configuration Check + >>> print(conf) + Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)] + + + B. Do some transition checks for ARC-EAGER + + >>> conf = Configuration(gold_sent) + >>> operation = Transition('arc-eager') + >>> operation.shift(conf) + >>> operation.left_arc(conf,'ATT') + >>> operation.shift(conf) + >>> operation.left_arc(conf,'SBJ') + >>> operation.right_arc(conf,'ROOT') + >>> operation.shift(conf) + >>> operation.left_arc(conf,'ATT') + >>> operation.right_arc(conf,'OBJ') + >>> operation.right_arc(conf,'ATT') + >>> operation.shift(conf) + >>> operation.left_arc(conf,'ATT') + >>> operation.right_arc(conf,'PC') + >>> operation.reduce(conf) + >>> operation.reduce(conf) + >>> operation.reduce(conf) + >>> operation.right_arc(conf,'PU') + >>> print(conf) + Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)] + + ###################### Check The Training Function ####################### + + A. 
Check the ARC-STANDARD training + >>> import tempfile + >>> import os + >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) + + >>> parser_std = TransitionParser('arc-standard') + >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file))) + Number of training examples : 1 + Number of valid (projective) examples : 1 + SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT + + >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False) + Number of training examples : 1 + Number of valid (projective) examples : 1 + >>> input_file.close() + >>> remove(input_file.name) + + B. Check the ARC-EAGER training + + >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False) + >>> parser_eager = TransitionParser('arc-eager') + >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file))) + Number of training examples : 1 + Number of valid (projective) examples : 1 + SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU + + >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False) + Number of training examples : 1 + Number of valid (projective) examples : 1 + + >>> input_file.close() + >>> remove(input_file.name) + + ###################### Check The Parsing Function ######################## + + A. Check the ARC-STANDARD parser + + >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model') + >>> de = DependencyEvaluator(result, [gold_sent]) + >>> de.eval() >= (0, 0) + True + + B. Check the ARC-EAGER parser + >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model') + >>> de = DependencyEvaluator(result, [gold_sent]) + >>> de.eval() >= (0, 0) + True + + Remove test temporary files + >>> remove('temp.arceager.model') + >>> remove('temp.arcstd.model') + + Note that result is very poor because of only one training example. + """ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e6eecf8ed64d5db8b3c5202b235a1491ef11e57e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py @@ -0,0 +1,234 @@ +# Natural Language Toolkit: Parser Utility Functions +# +# Author: Ewan Klein +# Tom Aarsen <> +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + + +""" +Utility functions for parsers. +""" + +from nltk.data import load +from nltk.grammar import CFG, PCFG, FeatureGrammar +from nltk.parse.chart import Chart, ChartParser +from nltk.parse.featurechart import FeatureChart, FeatureChartParser +from nltk.parse.pchart import InsideChartParser + + +def load_parser( + grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args +): + """ + Load a grammar from a file, and build a parser based on that grammar. + The parser depends on the grammar format, and might also depend + on properties of the grammar itself. 
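+
+    For example, a feature-based grammar shipped with the NLTK data package
+    can be loaded and used directly (a sketch, assuming the data package is
+    installed; ``feat0.fcfg`` is the grammar used in the NLTK book):
+
+    >>> cp = load_parser('grammars/book_grammars/feat0.fcfg') # doctest: +SKIP
+    >>> for tree in cp.parse('Kim likes children'.split()): # doctest: +SKIP
+    ...     print(tree)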
+ + The following grammar formats are currently supported: + - ``'cfg'`` (CFGs: ``CFG``) + - ``'pcfg'`` (probabilistic CFGs: ``PCFG``) + - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``) + + :type grammar_url: str + :param grammar_url: A URL specifying where the grammar is located. + The default protocol is ``"nltk:"``, which searches for the file + in the the NLTK data package. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing output. + :param parser: The class used for parsing; should be ``ChartParser`` + or a subclass. + If None, the class depends on the grammar format. + :param chart_class: The class used for storing the chart; + should be ``Chart`` or a subclass. + Only used for CFGs and feature CFGs. + If None, the chart class depends on the grammar format. + :type beam_size: int + :param beam_size: The maximum length for the parser's edge queue. + Only used for probabilistic CFGs. + :param load_args: Keyword parameters used when loading the grammar. + See ``data.load`` for more information. + """ + grammar = load(grammar_url, **load_args) + if not isinstance(grammar, CFG): + raise ValueError("The grammar must be a CFG, " "or a subclass thereof.") + if isinstance(grammar, PCFG): + if parser is None: + parser = InsideChartParser + return parser(grammar, trace=trace, beam_size=beam_size) + + elif isinstance(grammar, FeatureGrammar): + if parser is None: + parser = FeatureChartParser + if chart_class is None: + chart_class = FeatureChart + return parser(grammar, trace=trace, chart_class=chart_class) + + else: # Plain CFG. + if parser is None: + parser = ChartParser + if chart_class is None: + chart_class = Chart + return parser(grammar, trace=trace, chart_class=chart_class) + + +def taggedsent_to_conll(sentence): + """ + A module to convert a single POS tagged sentence into CONLL format. + + >>> from nltk import word_tokenize, pos_tag + >>> text = "This is a foobar sentence." + >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE + ... print(line, end="") + 1 This _ DT DT _ 0 a _ _ + 2 is _ VBZ VBZ _ 0 a _ _ + 3 a _ DT DT _ 0 a _ _ + 4 foobar _ JJ JJ _ 0 a _ _ + 5 sentence _ NN NN _ 0 a _ _ + 6 . _ . . _ 0 a _ _ + + :param sentence: A single input sentence to parse + :type sentence: list(tuple(str, str)) + :rtype: iter(str) + :return: a generator yielding a single sentence in CONLL format. + """ + for (i, (word, tag)) in enumerate(sentence, start=1): + input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"] + input_str = "\t".join(input_str) + "\n" + yield input_str + + +def taggedsents_to_conll(sentences): + """ + A module to convert the a POS tagged document stream + (i.e. list of list of tuples, a list of sentences) and yield lines + in CONLL format. This module yields one line per word and two newlines + for end of sentence. + + >>> from nltk import word_tokenize, sent_tokenize, pos_tag + >>> text = "This is a foobar sentence. Is that right?" + >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)] + >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE + ... if line: + ... print(line, end="") + 1 This _ DT DT _ 0 a _ _ + 2 is _ VBZ VBZ _ 0 a _ _ + 3 a _ DT DT _ 0 a _ _ + 4 foobar _ JJ JJ _ 0 a _ _ + 5 sentence _ NN NN _ 0 a _ _ + 6 . _ . . 
_   0   a   _   _
+
+
+    1   Is      _   VBZ VBZ _   0   a   _   _
+    2   that    _   IN  IN  _   0   a   _   _
+    3   right   _   NN  NN  _   0   a   _   _
+    4   ?       _   .   .   _   0   a   _   _
+
+
+
+    :param sentences: Input sentences to parse
+    :type sentences: list(list(tuple(str, str)))
+    :rtype: iter(str)
+    :return: a generator yielding sentences in CONLL format.
+    """
+    for sentence in sentences:
+        yield from taggedsent_to_conll(sentence)
+        yield "\n\n"
+
+
+######################################################################
+# { Test Suites
+######################################################################
+
+
+class TestGrammar:
+    """
+    Unit tests for CFG.
+    """
+
+    def __init__(self, grammar, suite, accept=None, reject=None):
+        self.test_grammar = grammar
+
+        self.cp = load_parser(grammar, trace=0)
+        self.suite = suite
+        self._accept = accept
+        self._reject = reject
+
+    def run(self, show_trees=False):
+        """
+        Sentences in the test suite are divided into two classes:
+
+        - grammatical (``accept``) and
+        - ungrammatical (``reject``).
+
+        If a sentence should parse according to the grammar, the value of
+        ``trees`` will be a non-empty list. If a sentence should be rejected
+        according to the grammar, then the value of ``trees`` will be empty.
+        """
+        for test in self.suite:
+            print(test["doc"] + ":", end=" ")
+            # Initialize the flags so they are always bound, even for an
+            # empty test case.
+            accepted = False
+            rejected = False
+            for key in ["accept", "reject"]:
+                for sent in test[key]:
+                    tokens = sent.split()
+                    trees = list(self.cp.parse(tokens))
+                    if show_trees and trees:
+                        print()
+                        print(sent)
+                        for tree in trees:
+                            print(tree)
+                    if key == "accept":
+                        if trees == []:
+                            raise ValueError("Sentence '%s' failed to parse" % sent)
+                        else:
+                            accepted = True
+                    else:
+                        if trees:
+                            raise ValueError("Sentence '%s' received a parse" % sent)
+                        else:
+                            rejected = True
+            if accepted and rejected:
+                print("All tests passed!")
+
+
+def extract_test_sentences(string, comment_chars="#%;", encoding=None):
+    """
+    Parses a string with one test sentence per line.
+    Lines can optionally begin with:
+
+    - a bool, saying if the sentence is grammatical or not, or
+    - an int, giving the number of parse trees it should have.
+
+    The result information is followed by a colon, and then the sentence.
+    Empty lines and lines beginning with a comment char are ignored.
+
+    :return: a list of tuple of sentences and expected results,
+        where a sentence is a list of str,
+        and a result is None, or bool, or int
+
+    :param comment_chars: ``str`` of possible comment characters.
+ :param encoding: the encoding of the string, if it is binary + """ + if encoding is not None: + string = string.decode(encoding) + sentences = [] + for sentence in string.split("\n"): + if sentence == "" or sentence[0] in comment_chars: + continue + split_info = sentence.split(":", 1) + result = None + if len(split_info) == 2: + if split_info[0] in ["True", "true", "False", "false"]: + result = split_info[0] in ["True", "true"] + sentence = split_info[1] + else: + result = int(split_info[0]) + sentence = split_info[1] + tokens = sentence.split() + if tokens == []: + continue + sentences += [(tokens, result)] + return sentences diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py new file mode 100644 index 0000000000000000000000000000000000000000..87ba7331ec358153ad485ced66140e64597ccf96 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py @@ -0,0 +1,453 @@ +# Natural Language Toolkit: Viterbi Probabilistic Parser +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from functools import reduce + +from nltk.parse.api import ParserI +from nltk.tree import ProbabilisticTree, Tree + +##////////////////////////////////////////////////////// +## Viterbi PCFG Parser +##////////////////////////////////////////////////////// + + +class ViterbiParser(ParserI): + """ + A bottom-up ``PCFG`` parser that uses dynamic programming to find + the single most likely parse for a text. The ``ViterbiParser`` parser + parses texts by filling in a "most likely constituent table". + This table records the most probable tree representation for any + given span and node value. In particular, it has an entry for + every start index, end index, and node value, recording the most + likely subtree that spans from the start index to the end index, + and has the given node value. + + The ``ViterbiParser`` parser fills in this table incrementally. It starts + by filling in all entries for constituents that span one element + of text (i.e., entries where the end index is one greater than the + start index). After it has filled in all table entries for + constituents that span one element of text, it fills in the + entries for constitutants that span two elements of text. It + continues filling in the entries for constituents spanning larger + and larger portions of the text, until the entire table has been + filled. Finally, it returns the table entry for a constituent + spanning the entire text, whose node value is the grammar's start + symbol. + + In order to find the most likely constituent with a given span and + node value, the ``ViterbiParser`` parser considers all productions that + could produce that node value. For each production, it finds all + children that collectively cover the span and have the node values + specified by the production's right hand side. If the probability + of the tree formed by applying the production to the children is + greater than the probability of the current entry in the table, + then the table is updated with this new tree. + + A pseudo-code description of the algorithm used by + ``ViterbiParser`` is: + + | Create an empty most likely constituent table, *MLC*. 
+ | For width in 1...len(text): + | For start in 1...len(text)-width: + | For prod in grammar.productions: + | For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC, + | where t[i].label()==prod.rhs[i], + | and the sequence covers [start:start+width]: + | old_p = MLC[start, start+width, prod.lhs] + | new_p = P(t[1])P(t[1])...P(t[n])P(prod) + | if new_p > old_p: + | new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n]) + | MLC[start, start+width, prod.lhs] = new_tree + | Return MLC[0, len(text), start_symbol] + + :type _grammar: PCFG + :ivar _grammar: The grammar used to parse sentences. + :type _trace: int + :ivar _trace: The level of tracing output that should be generated + when parsing a text. + """ + + def __init__(self, grammar, trace=0): + """ + Create a new ``ViterbiParser`` parser, that uses ``grammar`` to + parse texts. + + :type grammar: PCFG + :param grammar: The grammar used to parse texts. + :type trace: int + :param trace: The level of tracing that should be used when + parsing a text. ``0`` will generate no tracing output; + and higher numbers will produce more verbose tracing + output. + """ + self._grammar = grammar + self._trace = trace + + def grammar(self): + return self._grammar + + def trace(self, trace=2): + """ + Set the level of tracing output that should be generated when + parsing a text. + + :type trace: int + :param trace: The trace level. A trace level of ``0`` will + generate no tracing output; and higher trace levels will + produce more verbose tracing output. + :rtype: None + """ + self._trace = trace + + def parse(self, tokens): + # Inherit docs from ParserI + + tokens = list(tokens) + self._grammar.check_coverage(tokens) + + # The most likely constituent table. This table specifies the + # most likely constituent for a given span and type. + # Constituents can be either Trees or tokens. For Trees, + # the "type" is the Nonterminal for the tree's root node + # value. For Tokens, the "type" is the token's type. + # The table is stored as a dictionary, since it is sparse. + constituents = {} + + # Initialize the constituents dictionary with the words from + # the text. + if self._trace: + print("Inserting tokens into the most likely" + " constituents table...") + for index in range(len(tokens)): + token = tokens[index] + constituents[index, index + 1, token] = token + if self._trace > 1: + self._trace_lexical_insertion(token, index, len(tokens)) + + # Consider each span of length 1, 2, ..., n; and add any trees + # that might cover that span to the constituents dictionary. + for length in range(1, len(tokens) + 1): + if self._trace: + print( + "Finding the most likely constituents" + + " spanning %d text elements..." % length + ) + for start in range(len(tokens) - length + 1): + span = (start, start + length) + self._add_constituents_spanning(span, constituents, tokens) + + # Return the tree that spans the entire text & have the right cat + tree = constituents.get((0, len(tokens), self._grammar.start())) + if tree is not None: + yield tree + + def _add_constituents_spanning(self, span, constituents, tokens): + """ + Find any constituents that might cover ``span``, and add them + to the most likely constituents table. + + :rtype: None + :type span: tuple(int, int) + :param span: The section of the text for which we are + trying to find possible constituents. 
The span is
+            specified as a pair of integers, where the first integer
+            is the index of the first token that should be included in
+            the constituent; and the second integer is the index of
+            the first token that should not be included in the
+            constituent.  I.e., the constituent should cover
+            ``text[span[0]:span[1]]``, where ``text`` is the text
+            that we are parsing.
+
+        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
+        :param constituents: The most likely constituents table.  This
+            table records the most probable tree representation for
+            any given span and node value.  In particular,
+            ``constituents(s,e,nv)`` is the most likely
+            ``ProbabilisticTree`` that covers ``text[s:e]``
+            and has a node value ``nv.symbol()``, where ``text``
+            is the text that we are parsing.  When
+            ``_add_constituents_spanning`` is called, ``constituents``
+            should contain all possible constituents that are shorter
+            than ``span``.
+
+        :type tokens: list of tokens
+        :param tokens: The text we are parsing.  This is only used for
+            trace output.
+        """
+        # Since some of the grammar productions may be unary, we need to
+        # repeatedly try all of the productions until none of them add any
+        # new constituents.
+        changed = True
+        while changed:
+            changed = False
+
+            # Find all instantiations of the grammar productions that
+            # cover the span.
+            instantiations = self._find_instantiations(span, constituents)
+
+            # For each production instantiation, add a new
+            # ProbabilisticTree whose probability is the product
+            # of the children's probabilities and the production's
+            # probability.
+            for (production, children) in instantiations:
+                subtrees = [c for c in children if isinstance(c, Tree)]
+                p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
+                node = production.lhs().symbol()
+                tree = ProbabilisticTree(node, children, prob=p)
+
+                # If it's a new constituent, then add it to the
+                # constituents dictionary.
+                c = constituents.get((span[0], span[1], production.lhs()))
+                if self._trace > 1:
+                    if c is None or c != tree:
+                        if c is None or c.prob() < tree.prob():
+                            print("   Insert:", end=" ")
+                        else:
+                            print("  Discard:", end=" ")
+                        self._trace_production(production, p, span, len(tokens))
+                if c is None or c.prob() < tree.prob():
+                    constituents[span[0], span[1], production.lhs()] = tree
+                    changed = True
+
+    def _find_instantiations(self, span, constituents):
+        """
+        :return: a list of the production instantiations that cover a
+            given span of the text.  A "production instantiation" is
+            a tuple containing a production and a list of children,
+            where the production's right hand side matches the list of
+            children; and the children cover ``span``.
+        :rtype: list(tuple(Production, list(ProbabilisticTree or token)))
+
+        :type span: tuple(int, int)
+        :param span: The section of the text for which we are
+            trying to find production instantiations.  The span is
+            specified as a pair of integers, where the first integer
+            is the index of the first token that should be covered by
+            the production instantiation; and the second integer is
+            the index of the first token that should not be covered by
+            the production instantiation.
+        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
+        :param constituents: The most likely constituents table.  This
+            table records the most probable tree representation for
+            any given span and node value.  See the module
+            documentation for more information.
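+
+            For example (shape only), an instantiation of ``S -> NP VP``
+            over ``span=(0, 5)`` is the pair ``(S -> NP VP, [np_tree,
+            vp_tree])``, where ``np_tree`` and ``vp_tree`` are illustrative
+            names for trees already in ``constituents`` that cover
+            ``text[0:k]`` and ``text[k:5]`` for some split point ``k``.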
+ """ + rv = [] + for production in self._grammar.productions(): + childlists = self._match_rhs(production.rhs(), span, constituents) + + for childlist in childlists: + rv.append((production, childlist)) + return rv + + def _match_rhs(self, rhs, span, constituents): + """ + :return: a set of all the lists of children that cover ``span`` + and that match ``rhs``. + :rtype: list(list(ProbabilisticTree or token) + + :type rhs: list(Nonterminal or any) + :param rhs: The list specifying what kinds of children need to + cover ``span``. Each nonterminal in ``rhs`` specifies + that the corresponding child should be a tree whose node + value is that nonterminal's symbol. Each terminal in ``rhs`` + specifies that the corresponding child should be a token + whose type is that terminal. + :type span: tuple(int, int) + :param span: The section of the text for which we are + trying to find child lists. The span is specified as a + pair of integers, where the first integer is the index of + the first token that should be covered by the child list; + and the second integer is the index of the first token + that should not be covered by the child list. + :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) + :param constituents: The most likely constituents table. This + table records the most probable tree representation for + any given span and node value. See the module + documentation for more information. + """ + (start, end) = span + + # Base case + if start >= end and rhs == (): + return [[]] + if start >= end or rhs == (): + return [] + + # Find everything that matches the 1st symbol of the RHS + childlists = [] + for split in range(start, end + 1): + l = constituents.get((start, split, rhs[0])) + if l is not None: + rights = self._match_rhs(rhs[1:], (split, end), constituents) + childlists += [[l] + r for r in rights] + + return childlists + + def _trace_production(self, production, p, span, width): + """ + Print trace output indicating that a given production has been + applied at a given location. + + :param production: The production that has been applied + :type production: Production + :param p: The probability of the tree produced by the production. + :type p: float + :param span: The span of the production + :type span: tuple + :rtype: None + """ + + str = "|" + "." * span[0] + str += "=" * (span[1] - span[0]) + str += "." * (width - span[1]) + "| " + str += "%s" % production + if self._trace > 2: + str = f"{str:<40} {p:12.10f} " + + print(str) + + def _trace_lexical_insertion(self, token, index, width): + str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| " + str += f"{token}" + print(str) + + def __repr__(self): + return "" % self._grammar + + +##////////////////////////////////////////////////////// +## Test Code +##////////////////////////////////////////////////////// + + +def demo(): + """ + A demonstration of the probabilistic parsers. The user is + prompted to select which demo to run, and how many parses should + be found; and then each parser is run on the same demo, and a + summary of the results are displayed. 
+ """ + import sys + import time + + from nltk import tokenize + from nltk.grammar import PCFG + from nltk.parse import ViterbiParser + + toy_pcfg1 = PCFG.fromstring( + """ + S -> NP VP [1.0] + NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] + Det -> 'the' [0.8] | 'my' [0.2] + N -> 'man' [0.5] | 'telescope' [0.5] + VP -> VP PP [0.1] | V NP [0.7] | V [0.2] + V -> 'ate' [0.35] | 'saw' [0.65] + PP -> P NP [1.0] + P -> 'with' [0.61] | 'under' [0.39] + """ + ) + + toy_pcfg2 = PCFG.fromstring( + """ + S -> NP VP [1.0] + VP -> V NP [.59] + VP -> V [.40] + VP -> VP PP [.01] + NP -> Det N [.41] + NP -> Name [.28] + NP -> NP PP [.31] + PP -> P NP [1.0] + V -> 'saw' [.21] + V -> 'ate' [.51] + V -> 'ran' [.28] + N -> 'boy' [.11] + N -> 'cookie' [.12] + N -> 'table' [.13] + N -> 'telescope' [.14] + N -> 'hill' [.5] + Name -> 'Jack' [.52] + Name -> 'Bob' [.48] + P -> 'with' [.61] + P -> 'under' [.39] + Det -> 'the' [.41] + Det -> 'a' [.31] + Det -> 'my' [.28] + """ + ) + + # Define two demos. Each demo has a sentence and a grammar. + demos = [ + ("I saw the man with my telescope", toy_pcfg1), + ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), + ] + + # Ask the user which demo they want to use. + print() + for i in range(len(demos)): + print(f"{i + 1:>3}: {demos[i][0]}") + print(" %r" % demos[i][1]) + print() + print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") + try: + snum = int(sys.stdin.readline().strip()) - 1 + sent, grammar = demos[snum] + except: + print("Bad sentence number") + return + + # Tokenize the sentence. + tokens = sent.split() + + parser = ViterbiParser(grammar) + all_parses = {} + + print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}") + parser.trace(3) + t = time.time() + parses = parser.parse_all(tokens) + time = time.time() - t + average = ( + reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 + ) + num_parses = len(parses) + for p in parses: + all_parses[p.freeze()] = 1 + + # Print some summary statistics + print() + print("Time (secs) # Parses Average P(parse)") + print("-----------------------------------------") + print("%11.4f%11d%19.14f" % (time, num_parses, average)) + parses = all_parses.keys() + if parses: + p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) + else: + p = 0 + print("------------------------------------------") + print("%11s%11d%19.14f" % ("n/a", len(parses), p)) + + # Ask the user if we should draw the parses. + print() + print("Draw parses (y/n)? ", end=" ") + if sys.stdin.readline().strip().lower().startswith("y"): + from nltk.draw.tree import draw_trees + + print(" please wait...") + draw_trees(*parses) + + # Ask the user if we should print the parses. + print() + print("Print parses (y/n)? ", end=" ") + if sys.stdin.readline().strip().lower().startswith("y"): + for parse in parses: + print(parse) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py new file mode 100644 index 0000000000000000000000000000000000000000..9399bc859e3578cd3eee3a0cb56f30fff6c3087b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py @@ -0,0 +1,1605 @@ +# Natural Language Toolkit: Interface to Boxer +# +# +# Author: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +""" +An interface to Boxer. + +This interface relies on the latest version of the development (subversion) version of +C&C and Boxer. 
+ +Usage +===== + +Set the environment variable CANDC to the bin directory of your CandC installation. +The models directory should be in the CandC root directory. +For example:: + + /path/to/candc/ + bin/ + candc + boxer + models/ + boxer/ +""" + +import operator +import os +import re +import subprocess +import tempfile +from functools import reduce +from optparse import OptionParser + +from nltk.internals import find_binary +from nltk.sem.drt import ( + DRS, + DrtApplicationExpression, + DrtEqualityExpression, + DrtNegatedExpression, + DrtOrExpression, + DrtParser, + DrtProposition, + DrtTokens, + DrtVariableExpression, +) +from nltk.sem.logic import ( + ExpectedMoreTokensException, + LogicalExpressionException, + UnexpectedTokenException, + Variable, +) + + +class Boxer: + """ + This class is an interface to Johan Bos's program Boxer, a wide-coverage + semantic parser that produces Discourse Representation Structures (DRSs). + """ + + def __init__( + self, + boxer_drs_interpreter=None, + elimeq=False, + bin_dir=None, + verbose=False, + resolve=True, + ): + """ + :param boxer_drs_interpreter: A class that converts from the + ``AbstractBoxerDrs`` object hierarchy to a different object. The + default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK + DRT hierarchy. + :param elimeq: When set to true, Boxer removes all equalities from the + DRSs and discourse referents standing in the equality relation are + unified, but only if this can be done in a meaning-preserving manner. + :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. + Resolution follows Van der Sandt's theory of binding and accommodation. + """ + if boxer_drs_interpreter is None: + boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter() + self._boxer_drs_interpreter = boxer_drs_interpreter + + self._resolve = resolve + self._elimeq = elimeq + + self.set_bin_dir(bin_dir, verbose) + + def set_bin_dir(self, bin_dir, verbose=False): + self._candc_bin = self._find_binary("candc", bin_dir, verbose) + self._candc_models_path = os.path.normpath( + os.path.join(self._candc_bin[:-5], "../models") + ) + self._boxer_bin = self._find_binary("boxer", bin_dir, verbose) + + def interpret(self, input, discourse_id=None, question=False, verbose=False): + """ + Use Boxer to give a first order representation. + + :param input: str Input sentence to parse + :param occur_index: bool Should predicates be occurrence indexed? + :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. + :return: ``drt.DrtExpression`` + """ + discourse_ids = [discourse_id] if discourse_id is not None else None + (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose) + if not d: + raise Exception(f'Unable to interpret: "{input}"') + return d + + def interpret_multi(self, input, discourse_id=None, question=False, verbose=False): + """ + Use Boxer to give a first order representation. + + :param input: list of str Input sentences to parse as a single discourse + :param occur_index: bool Should predicates be occurrence indexed? + :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. 
+ :return: ``drt.DrtExpression`` + """ + discourse_ids = [discourse_id] if discourse_id is not None else None + (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose) + if not d: + raise Exception(f'Unable to interpret: "{input}"') + return d + + def interpret_sents( + self, inputs, discourse_ids=None, question=False, verbose=False + ): + """ + Use Boxer to give a first order representation. + + :param inputs: list of str Input sentences to parse as individual discourses + :param occur_index: bool Should predicates be occurrence indexed? + :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. + :return: list of ``drt.DrtExpression`` + """ + return self.interpret_multi_sents( + [[input] for input in inputs], discourse_ids, question, verbose + ) + + def interpret_multi_sents( + self, inputs, discourse_ids=None, question=False, verbose=False + ): + """ + Use Boxer to give a first order representation. + + :param inputs: list of list of str Input discourses to parse + :param occur_index: bool Should predicates be occurrence indexed? + :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. + :return: ``drt.DrtExpression`` + """ + if discourse_ids is not None: + assert len(inputs) == len(discourse_ids) + assert reduce(operator.and_, (id is not None for id in discourse_ids)) + use_disc_id = True + else: + discourse_ids = list(map(str, range(len(inputs)))) + use_disc_id = False + + candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose) + boxer_out = self._call_boxer(candc_out, verbose=verbose) + + # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out: + # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str) + + drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id) + return [drs_dict.get(id, None) for id in discourse_ids] + + def _call_candc(self, inputs, discourse_ids, question, verbose=False): + """ + Call the ``candc`` binary with the given input. + + :param inputs: list of list of str Input discourses to parse + :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. + :param filename: str A filename for the output file + :return: stdout + """ + args = [ + "--models", + os.path.join(self._candc_models_path, ["boxer", "questions"][question]), + "--candc-printer", + "boxer", + ] + return self._call( + "\n".join( + sum( + ([f"'{id}'"] + d for d, id in zip(inputs, discourse_ids)), + [], + ) + ), + self._candc_bin, + args, + verbose, + ) + + def _call_boxer(self, candc_out, verbose=False): + """ + Call the ``boxer`` binary with the given input. 
+ + :param candc_out: str output from C&C parser + :return: stdout + """ + f = None + try: + fd, temp_filename = tempfile.mkstemp( + prefix="boxer-", suffix=".in", text=True + ) + f = os.fdopen(fd, "w") + f.write(candc_out.decode("utf-8")) + finally: + if f: + f.close() + + args = [ + "--box", + "false", + "--semantics", + "drs", + #'--flat', 'false', # removed from boxer + "--resolve", + ["false", "true"][self._resolve], + "--elimeq", + ["false", "true"][self._elimeq], + "--format", + "prolog", + "--instantiate", + "true", + "--input", + temp_filename, + ] + stdout = self._call(None, self._boxer_bin, args, verbose) + os.remove(temp_filename) + return stdout + + def _find_binary(self, name, bin_dir, verbose=False): + return find_binary( + name, + path_to_bin=bin_dir, + env_vars=["CANDC"], + url="http://svn.ask.it.usyd.edu.au/trac/candc/", + binary_names=[name, name + ".exe"], + verbose=verbose, + ) + + def _call(self, input_str, binary, args=[], verbose=False): + """ + Call the binary with the given input. + + :param input_str: A string whose contents are used as stdin. + :param binary: The location of the binary to call + :param args: A list of command-line arguments. + :return: stdout + """ + if verbose: + print("Calling:", binary) + print("Args:", args) + print("Input:", input_str) + print("Command:", binary + " " + " ".join(args)) + + # Call via a subprocess + if input_str is None: + cmd = [binary] + args + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + cmd = 'echo "{}" | {} {}'.format(input_str, binary, " ".join(args)) + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True + ) + stdout, stderr = p.communicate() + + if verbose: + print("Return code:", p.returncode) + if stdout: + print("stdout:\n", stdout, "\n") + if stderr: + print("stderr:\n", stderr, "\n") + if p.returncode != 0: + raise Exception( + "ERROR CALLING: {} {}\nReturncode: {}\n{}".format( + binary, " ".join(args), p.returncode, stderr + ) + ) + + return stdout + + def _parse_to_drs_dict(self, boxer_out, use_disc_id): + lines = boxer_out.decode("utf-8").split("\n") + drs_dict = {} + i = 0 + while i < len(lines): + line = lines[i] + if line.startswith("id("): + comma_idx = line.index(",") + discourse_id = line[3:comma_idx] + if discourse_id[0] == "'" and discourse_id[-1] == "'": + discourse_id = discourse_id[1:-1] + drs_id = line[comma_idx + 1 : line.index(")")] + i += 1 + line = lines[i] + assert line.startswith(f"sem({drs_id},") + if line[-4:] == "').'": + line = line[:-4] + ")." 
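One caveat on the ``_call`` helper above: when ``input_str`` is supplied, it is interpolated into an ``echo "..." | binary ...`` command line under ``shell=True``, so input containing double quotes or backslashes can be mangled by the shell. A shell-free sketch of the same plumbing (an illustrative alternative, not what this module does) would hand the text to the subprocess on stdin:

    import subprocess

    def call_with_stdin(binary, args, input_str=None):
        # Feed input_str to the process directly; no shell, no quoting pitfalls.
        data = input_str.encode("utf-8") if input_str is not None else None
        result = subprocess.run(
            [binary] + list(args),
            input=data,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if result.returncode != 0:
            raise Exception(
                "ERROR CALLING: %s %s\nReturncode: %d\n%s"
                % (binary, " ".join(args), result.returncode,
                   result.stderr.decode("utf-8", "replace"))
            )
        return result.stdout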
+ assert line.endswith(")."), f"can't parse line: {line}" + + search_start = len(f"sem({drs_id},[") + brace_count = 1 + drs_start = -1 + for j, c in enumerate(line[search_start:]): + if c == "[": + brace_count += 1 + if c == "]": + brace_count -= 1 + if brace_count == 0: + drs_start = search_start + j + 1 + if line[drs_start : drs_start + 3] == "','": + drs_start = drs_start + 3 + else: + drs_start = drs_start + 1 + break + assert drs_start > -1 + + drs_input = line[drs_start:-2].strip() + parsed = self._parse_drs(drs_input, discourse_id, use_disc_id) + drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed) + i += 1 + return drs_dict + + def _parse_drs(self, drs_string, discourse_id, use_disc_id): + return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string) + + +class BoxerOutputDrsParser(DrtParser): + def __init__(self, discourse_id=None): + """ + This class is used to parse the Prolog DRS output from Boxer into a + hierarchy of python objects. + """ + DrtParser.__init__(self) + self.discourse_id = discourse_id + self.sentence_id_offset = None + self.quote_chars = [("'", "'", "\\", False)] + + def parse(self, data, signature=None): + return DrtParser.parse(self, data, signature) + + def get_all_symbols(self): + return ["(", ")", ",", "[", "]", ":"] + + def handle(self, tok, context): + return self.handle_drs(tok) + + def attempt_adjuncts(self, expression, context): + return expression + + def parse_condition(self, indices): + """ + Parse a DRS condition + + :return: list of ``DrtExpression`` + """ + tok = self.token() + accum = self.handle_condition(tok, indices) + if accum is None: + raise UnexpectedTokenException(tok) + return accum + + def handle_drs(self, tok): + if tok == "drs": + return self.parse_drs() + elif tok in ["merge", "smerge"]: + return self._handle_binary_expression(self._make_merge_expression)(None, []) + elif tok in ["alfa"]: + return self._handle_alfa(self._make_merge_expression)(None, []) + + def handle_condition(self, tok, indices): + """ + Handle a DRS condition + + :param indices: list of int + :return: list of ``DrtExpression`` + """ + if tok == "not": + return [self._handle_not()] + + if tok == "or": + conds = [self._handle_binary_expression(self._make_or_expression)] + elif tok == "imp": + conds = [self._handle_binary_expression(self._make_imp_expression)] + elif tok == "eq": + conds = [self._handle_eq()] + elif tok == "prop": + conds = [self._handle_prop()] + + elif tok == "pred": + conds = [self._handle_pred()] + elif tok == "named": + conds = [self._handle_named()] + elif tok == "rel": + conds = [self._handle_rel()] + elif tok == "timex": + conds = self._handle_timex() + elif tok == "card": + conds = [self._handle_card()] + + elif tok == "whq": + conds = [self._handle_whq()] + elif tok == "duplex": + conds = [self._handle_duplex()] + + else: + conds = [] + + return sum( + ( + [cond(sent_index, word_indices) for cond in conds] + for sent_index, word_indices in self._sent_and_word_indices(indices) + ), + [], + ) + + def _handle_not(self): + self.assertToken(self.token(), "(") + drs = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return BoxerNot(drs) + + def _handle_pred(self): + # pred(_G3943, dog, n, 0) + self.assertToken(self.token(), "(") + variable = self.parse_variable() + self.assertToken(self.token(), ",") + name = self.token() + self.assertToken(self.token(), ",") + pos = self.token() + self.assertToken(self.token(), ",") + sense = int(self.token()) + self.assertToken(self.token(), ")") 
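For orientation while reading the condition handlers that follow: each condition has the Prolog shape shown in the inline comments (e.g. ``pred(x0, dog, n, 0)``), embedded in a ``drs([...], [...])`` term whose referents and conditions each carry a ``[...]:`` token-index list. A hand-made term of that shape can be round-tripped through this module's classes (illustrative only; real Boxer output additionally arrives wrapped in ``id(...)``/``sem(...)`` lines, which ``_parse_to_drs_dict`` above strips first):

    from nltk.sem.boxer import BoxerOutputDrsParser, NltkDrtBoxerDrsInterpreter

    drs_str = "drs([[1001]:x0], [[1002]:pred(x0, dog, n, 0)])"
    boxer_drs = BoxerOutputDrsParser().parse(drs_str)             # -> BoxerDrs
    nltk_drs = NltkDrtBoxerDrsInterpreter().interpret(boxer_drs)  # -> NLTK DRT
    print(nltk_drs)   # roughly ([x0],[n_dog(x0)]) in NLTK's DRT notation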
+ + def _handle_pred_f(sent_index, word_indices): + return BoxerPred( + self.discourse_id, sent_index, word_indices, variable, name, pos, sense + ) + + return _handle_pred_f + + def _handle_duplex(self): + # duplex(whq, drs(...), var, drs(...)) + self.assertToken(self.token(), "(") + # self.assertToken(self.token(), '[') + ans_types = [] + # while self.token(0) != ']': + # cat = self.token() + # self.assertToken(self.token(), ':') + # if cat == 'des': + # ans_types.append(self.token()) + # elif cat == 'num': + # ans_types.append('number') + # typ = self.token() + # if typ == 'cou': + # ans_types.append('count') + # else: + # ans_types.append(typ) + # else: + # ans_types.append(self.token()) + # self.token() #swallow the ']' + + self.assertToken(self.token(), "whq") + self.assertToken(self.token(), ",") + d1 = self.process_next_expression(None) + self.assertToken(self.token(), ",") + ref = self.parse_variable() + self.assertToken(self.token(), ",") + d2 = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerWhq( + self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 + ) + + def _handle_named(self): + # named(x0, john, per, 0) + self.assertToken(self.token(), "(") + variable = self.parse_variable() + self.assertToken(self.token(), ",") + name = self.token() + self.assertToken(self.token(), ",") + type = self.token() + self.assertToken(self.token(), ",") + sense = self.token() # as per boxer rev 2554 + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerNamed( + self.discourse_id, sent_index, word_indices, variable, name, type, sense + ) + + def _handle_rel(self): + # rel(_G3993, _G3943, agent, 0) + self.assertToken(self.token(), "(") + var1 = self.parse_variable() + self.assertToken(self.token(), ",") + var2 = self.parse_variable() + self.assertToken(self.token(), ",") + rel = self.token() + self.assertToken(self.token(), ",") + sense = int(self.token()) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerRel( + self.discourse_id, sent_index, word_indices, var1, var2, rel, sense + ) + + def _handle_timex(self): + # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX')) + self.assertToken(self.token(), "(") + arg = self.parse_variable() + self.assertToken(self.token(), ",") + new_conds = self._handle_time_expression(arg) + self.assertToken(self.token(), ")") + return new_conds + + def _handle_time_expression(self, arg): + # date([]: (+), []:'XXXX', [1004]:'04', []:'XX') + tok = self.token() + self.assertToken(self.token(), "(") + if tok == "date": + conds = self._handle_date(arg) + elif tok == "time": + conds = self._handle_time(arg) + else: + return None + self.assertToken(self.token(), ")") + return [ + lambda sent_index, word_indices: BoxerPred( + self.discourse_id, sent_index, word_indices, arg, tok, "n", 0 + ) + ] + [lambda sent_index, word_indices: cond for cond in conds] + + def _handle_date(self, arg): + # []: (+), []:'XXXX', [1004]:'04', []:'XX' + conds = [] + ((sent_index, word_indices),) = self._sent_and_word_indices( + self._parse_index_list() + ) + self.assertToken(self.token(), "(") + pol = self.token() + self.assertToken(self.token(), ")") + conds.append( + BoxerPred( + self.discourse_id, + sent_index, + word_indices, + arg, + f"date_pol_{pol}", + "a", + 0, + ) + ) + self.assertToken(self.token(), ",") + + ((sent_index, word_indices),) = self._sent_and_word_indices( + self._parse_index_list() + ) + year = self.token() + if 
year != "XXXX": + year = year.replace(":", "_") + conds.append( + BoxerPred( + self.discourse_id, + sent_index, + word_indices, + arg, + f"date_year_{year}", + "a", + 0, + ) + ) + self.assertToken(self.token(), ",") + + ((sent_index, word_indices),) = self._sent_and_word_indices( + self._parse_index_list() + ) + month = self.token() + if month != "XX": + conds.append( + BoxerPred( + self.discourse_id, + sent_index, + word_indices, + arg, + f"date_month_{month}", + "a", + 0, + ) + ) + self.assertToken(self.token(), ",") + + ((sent_index, word_indices),) = self._sent_and_word_indices( + self._parse_index_list() + ) + day = self.token() + if day != "XX": + conds.append( + BoxerPred( + self.discourse_id, + sent_index, + word_indices, + arg, + f"date_day_{day}", + "a", + 0, + ) + ) + + return conds + + def _handle_time(self, arg): + # time([1018]:'18', []:'XX', []:'XX') + conds = [] + self._parse_index_list() + hour = self.token() + if hour != "XX": + conds.append(self._make_atom("r_hour_2", arg, hour)) + self.assertToken(self.token(), ",") + + self._parse_index_list() + min = self.token() + if min != "XX": + conds.append(self._make_atom("r_min_2", arg, min)) + self.assertToken(self.token(), ",") + + self._parse_index_list() + sec = self.token() + if sec != "XX": + conds.append(self._make_atom("r_sec_2", arg, sec)) + + return conds + + def _handle_card(self): + # card(_G18535, 28, ge) + self.assertToken(self.token(), "(") + variable = self.parse_variable() + self.assertToken(self.token(), ",") + value = self.token() + self.assertToken(self.token(), ",") + type = self.token() + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerCard( + self.discourse_id, sent_index, word_indices, variable, value, type + ) + + def _handle_prop(self): + # prop(_G15949, drs(...)) + self.assertToken(self.token(), "(") + variable = self.parse_variable() + self.assertToken(self.token(), ",") + drs = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerProp( + self.discourse_id, sent_index, word_indices, variable, drs + ) + + def _parse_index_list(self): + # [1001,1002]: + indices = [] + self.assertToken(self.token(), "[") + while self.token(0) != "]": + indices.append(self.parse_index()) + if self.token(0) == ",": + self.token() # swallow ',' + self.token() # swallow ']' + self.assertToken(self.token(), ":") + return indices + + def parse_drs(self): + # drs([[1001]:_G3943], + # [[1002]:pred(_G3943, dog, n, 0)] + # ) + self.assertToken(self.token(), "(") + self.assertToken(self.token(), "[") + refs = set() + while self.token(0) != "]": + indices = self._parse_index_list() + refs.add(self.parse_variable()) + if self.token(0) == ",": + self.token() # swallow ',' + self.token() # swallow ']' + self.assertToken(self.token(), ",") + self.assertToken(self.token(), "[") + conds = [] + while self.token(0) != "]": + indices = self._parse_index_list() + conds.extend(self.parse_condition(indices)) + if self.token(0) == ",": + self.token() # swallow ',' + self.token() # swallow ']' + self.assertToken(self.token(), ")") + return BoxerDrs(list(refs), conds) + + def _handle_binary_expression(self, make_callback): + self.assertToken(self.token(), "(") + drs1 = self.process_next_expression(None) + self.assertToken(self.token(), ",") + drs2 = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: make_callback( + sent_index, word_indices, drs1, drs2 + ) + + def _handle_alfa(self, 
make_callback): + self.assertToken(self.token(), "(") + type = self.token() + self.assertToken(self.token(), ",") + drs1 = self.process_next_expression(None) + self.assertToken(self.token(), ",") + drs2 = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: make_callback( + sent_index, word_indices, drs1, drs2 + ) + + def _handle_eq(self): + self.assertToken(self.token(), "(") + var1 = self.parse_variable() + self.assertToken(self.token(), ",") + var2 = self.parse_variable() + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerEq( + self.discourse_id, sent_index, word_indices, var1, var2 + ) + + def _handle_whq(self): + self.assertToken(self.token(), "(") + self.assertToken(self.token(), "[") + ans_types = [] + while self.token(0) != "]": + cat = self.token() + self.assertToken(self.token(), ":") + if cat == "des": + ans_types.append(self.token()) + elif cat == "num": + ans_types.append("number") + typ = self.token() + if typ == "cou": + ans_types.append("count") + else: + ans_types.append(typ) + else: + ans_types.append(self.token()) + self.token() # swallow the ']' + + self.assertToken(self.token(), ",") + d1 = self.process_next_expression(None) + self.assertToken(self.token(), ",") + ref = self.parse_variable() + self.assertToken(self.token(), ",") + d2 = self.process_next_expression(None) + self.assertToken(self.token(), ")") + return lambda sent_index, word_indices: BoxerWhq( + self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 + ) + + def _make_merge_expression(self, sent_index, word_indices, drs1, drs2): + return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds) + + def _make_or_expression(self, sent_index, word_indices, drs1, drs2): + return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2) + + def _make_imp_expression(self, sent_index, word_indices, drs1, drs2): + return BoxerDrs(drs1.refs, drs1.conds, drs2) + + def parse_variable(self): + var = self.token() + assert re.match(r"^[exps]\d+$", var), var + return var + + def parse_index(self): + return int(self.token()) + + def _sent_and_word_indices(self, indices): + """ + :return: list of (sent_index, word_indices) tuples + """ + sent_indices = {(i / 1000) - 1 for i in indices if i >= 0} + if sent_indices: + pairs = [] + for sent_index in sent_indices: + word_indices = [ + (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1 + ] + pairs.append((sent_index, word_indices)) + return pairs + else: + word_indices = [(i % 1000) - 1 for i in indices] + return [(None, word_indices)] + + +class BoxerDrsParser(DrtParser): + """ + Reparse the str form of subclasses of ``AbstractBoxerDrs`` + """ + + def __init__(self, discourse_id=None): + DrtParser.__init__(self) + self.discourse_id = discourse_id + + def get_all_symbols(self): + return [ + DrtTokens.OPEN, + DrtTokens.CLOSE, + DrtTokens.COMMA, + DrtTokens.OPEN_BRACKET, + DrtTokens.CLOSE_BRACKET, + ] + + def attempt_adjuncts(self, expression, context): + return expression + + def handle(self, tok, context): + try: + # if tok == 'drs': + # self.assertNextToken(DrtTokens.OPEN) + # label = int(self.token()) + # self.assertNextToken(DrtTokens.COMMA) + # refs = list(map(int, self.handle_refs())) + # self.assertNextToken(DrtTokens.COMMA) + # conds = self.handle_conds(None) + # self.assertNextToken(DrtTokens.CLOSE) + # return BoxerDrs(label, refs, conds) + if tok == "pred": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if 
self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = list(map(int, self.handle_refs())) + self.assertNextToken(DrtTokens.COMMA) + variable = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + name = self.token() + self.assertNextToken(DrtTokens.COMMA) + pos = self.token() + self.assertNextToken(DrtTokens.COMMA) + sense = int(self.token()) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense) + elif tok == "named": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + word_ids = map(int, self.handle_refs()) + self.assertNextToken(DrtTokens.COMMA) + variable = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + name = self.token() + self.assertNextToken(DrtTokens.COMMA) + type = self.token() + self.assertNextToken(DrtTokens.COMMA) + sense = int(self.token()) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerNamed( + disc_id, sent_id, word_ids, variable, name, type, sense + ) + elif tok == "rel": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = list(map(int, self.handle_refs())) + self.assertNextToken(DrtTokens.COMMA) + var1 = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + var2 = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + rel = self.token() + self.assertNextToken(DrtTokens.COMMA) + sense = int(self.token()) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense) + elif tok == "prop": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + word_ids = list(map(int, self.handle_refs())) + self.assertNextToken(DrtTokens.COMMA) + variable = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + drs = self.process_next_expression(None) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerProp(disc_id, sent_id, word_ids, variable, drs) + elif tok == "not": + self.assertNextToken(DrtTokens.OPEN) + drs = self.process_next_expression(None) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerNot(drs) + elif tok == "imp": + self.assertNextToken(DrtTokens.OPEN) + drs1 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.COMMA) + drs2 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerDrs(drs1.refs, drs1.conds, drs2) + elif tok == "or": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = map(int, self.handle_refs()) + self.assertNextToken(DrtTokens.COMMA) + drs1 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.COMMA) + drs2 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2) + elif tok 
== "eq": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = list(map(int, self.handle_refs())) + self.assertNextToken(DrtTokens.COMMA) + var1 = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + var2 = int(self.token()) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerEq(disc_id, sent_id, word_ids, var1, var2) + elif tok == "card": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = map(int, self.handle_refs()) + self.assertNextToken(DrtTokens.COMMA) + var = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + value = self.token() + self.assertNextToken(DrtTokens.COMMA) + type = self.token() + self.assertNextToken(DrtTokens.CLOSE) + return BoxerCard(disc_id, sent_id, word_ids, var, value, type) + elif tok == "whq": + self.assertNextToken(DrtTokens.OPEN) + disc_id = ( + self.discourse_id if self.discourse_id is not None else self.token() + ) + self.assertNextToken(DrtTokens.COMMA) + sent_id = self.nullableIntToken() + self.assertNextToken(DrtTokens.COMMA) + word_ids = list(map(int, self.handle_refs())) + self.assertNextToken(DrtTokens.COMMA) + ans_types = self.handle_refs() + self.assertNextToken(DrtTokens.COMMA) + drs1 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.COMMA) + var = int(self.token()) + self.assertNextToken(DrtTokens.COMMA) + drs2 = self.process_next_expression(None) + self.assertNextToken(DrtTokens.CLOSE) + return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2) + except Exception as e: + raise LogicalExpressionException(self._currentIndex, str(e)) from e + assert False, repr(tok) + + def nullableIntToken(self): + t = self.token() + return int(t) if t != "None" else None + + def get_next_token_variable(self, description): + try: + return self.token() + except ExpectedMoreTokensException as e: + raise ExpectedMoreTokensException(e.index, "Variable expected.") from e + + +class AbstractBoxerDrs: + def variables(self): + """ + :return: (set, set, set) + """ + variables, events, propositions = self._variables() + return (variables - (events | propositions), events, propositions - events) + + def variable_types(self): + vartypes = {} + for t, vars in zip(("z", "e", "p"), self.variables()): + for v in vars: + vartypes[v] = t + return vartypes + + def _variables(self): + """ + :return: (set, set, set) + """ + return (set(), set(), set()) + + def atoms(self): + return set() + + def clean(self): + return self + + def _clean_name(self, name): + return name.replace("-", "_").replace("'", "_") + + def renumber_sentences(self, f): + return self + + def __hash__(self): + return hash(f"{self}") + + +class BoxerDrs(AbstractBoxerDrs): + def __init__(self, refs, conds, consequent=None): + AbstractBoxerDrs.__init__(self) + self.refs = refs + self.conds = conds + self.consequent = consequent + + def _variables(self): + variables = (set(), set(), set()) + for cond in self.conds: + for s, v in zip(variables, cond._variables()): + s.update(v) + if self.consequent is not None: + for s, v in zip(variables, self.consequent._variables()): + s.update(v) + return variables + + def atoms(self): + atoms = reduce(operator.or_, (cond.atoms() for 
cond in self.conds), set()) + if self.consequent is not None: + atoms.update(self.consequent.atoms()) + return atoms + + def clean(self): + consequent = self.consequent.clean() if self.consequent else None + return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent) + + def renumber_sentences(self, f): + consequent = self.consequent.renumber_sentences(f) if self.consequent else None + return BoxerDrs( + self.refs, [c.renumber_sentences(f) for c in self.conds], consequent + ) + + def __repr__(self): + s = "drs([{}], [{}])".format( + ", ".join("%s" % r for r in self.refs), + ", ".join("%s" % c for c in self.conds), + ) + if self.consequent is not None: + s = f"imp({s}, {self.consequent})" + return s + + def __eq__(self, other): + return ( + self.__class__ == other.__class__ + and self.refs == other.refs + and len(self.conds) == len(other.conds) + and reduce( + operator.and_, (c1 == c2 for c1, c2 in zip(self.conds, other.conds)) + ) + and self.consequent == other.consequent + ) + + def __ne__(self, other): + return not self == other + + __hash__ = AbstractBoxerDrs.__hash__ + + +class BoxerNot(AbstractBoxerDrs): + def __init__(self, drs): + AbstractBoxerDrs.__init__(self) + self.drs = drs + + def _variables(self): + return self.drs._variables() + + def atoms(self): + return self.drs.atoms() + + def clean(self): + return BoxerNot(self.drs.clean()) + + def renumber_sentences(self, f): + return BoxerNot(self.drs.renumber_sentences(f)) + + def __repr__(self): + return "not(%s)" % (self.drs) + + def __eq__(self, other): + return self.__class__ == other.__class__ and self.drs == other.drs + + def __ne__(self, other): + return not self == other + + __hash__ = AbstractBoxerDrs.__hash__ + + +class BoxerIndexed(AbstractBoxerDrs): + def __init__(self, discourse_id, sent_index, word_indices): + AbstractBoxerDrs.__init__(self) + self.discourse_id = discourse_id + self.sent_index = sent_index + self.word_indices = word_indices + + def atoms(self): + return {self} + + def __eq__(self, other): + return ( + self.__class__ == other.__class__ + and self.discourse_id == other.discourse_id + and self.sent_index == other.sent_index + and self.word_indices == other.word_indices + and reduce(operator.and_, (s == o for s, o in zip(self, other))) + ) + + def __ne__(self, other): + return not self == other + + __hash__ = AbstractBoxerDrs.__hash__ + + def __repr__(self): + s = "{}({}, {}, [{}]".format( + self._pred(), + self.discourse_id, + self.sent_index, + ", ".join("%s" % wi for wi in self.word_indices), + ) + for v in self: + s += ", %s" % v + return s + ")" + + +class BoxerPred(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var = var + self.name = name + self.pos = pos + self.sense = sense + + def _variables(self): + return ({self.var}, set(), set()) + + def change_var(self, var): + return BoxerPred( + self.discourse_id, + self.sent_index, + self.word_indices, + var, + self.name, + self.pos, + self.sense, + ) + + def clean(self): + return BoxerPred( + self.discourse_id, + self.sent_index, + self.word_indices, + self.var, + self._clean_name(self.name), + self.pos, + self.sense, + ) + + def renumber_sentences(self, f): + new_sent_index = f(self.sent_index) + return BoxerPred( + self.discourse_id, + new_sent_index, + self.word_indices, + self.var, + self.name, + self.pos, + self.sense, + ) + + def __iter__(self): + return iter((self.var, self.name, self.pos, self.sense)) + + 
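An aside on the ``sent_index`` and ``word_indices`` fields threaded through these ``Boxer*`` classes: Boxer tags every token with a single integer encoding ``1000 * sentence + word`` (both 1-based), which ``_sent_and_word_indices`` above unpacks into 0-based positions. Note that the sentence part is computed there with true division (``i / 1000``), which yields floats under Python 3; floor division appears to be the intent. The decoding, worked out:

    def decode_token_index(i):
        # Boxer token index -> 0-based (sentence, word) pair.
        return (i // 1000) - 1, (i % 1000) - 1

    assert decode_token_index(1002) == (0, 1)   # sentence 1, word 2
    assert decode_token_index(2003) == (1, 2)   # sentence 2, word 3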
def _pred(self): + return "pred" + + +class BoxerNamed(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var = var + self.name = name + self.type = type + self.sense = sense + + def _variables(self): + return ({self.var}, set(), set()) + + def change_var(self, var): + return BoxerNamed( + self.discourse_id, + self.sent_index, + self.word_indices, + var, + self.name, + self.type, + self.sense, + ) + + def clean(self): + return BoxerNamed( + self.discourse_id, + self.sent_index, + self.word_indices, + self.var, + self._clean_name(self.name), + self.type, + self.sense, + ) + + def renumber_sentences(self, f): + return BoxerNamed( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.var, + self.name, + self.type, + self.sense, + ) + + def __iter__(self): + return iter((self.var, self.name, self.type, self.sense)) + + def _pred(self): + return "named" + + +class BoxerRel(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var1 = var1 + self.var2 = var2 + self.rel = rel + self.sense = sense + + def _variables(self): + return ({self.var1, self.var2}, set(), set()) + + def clean(self): + return BoxerRel( + self.discourse_id, + self.sent_index, + self.word_indices, + self.var1, + self.var2, + self._clean_name(self.rel), + self.sense, + ) + + def renumber_sentences(self, f): + return BoxerRel( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.var1, + self.var2, + self.rel, + self.sense, + ) + + def __iter__(self): + return iter((self.var1, self.var2, self.rel, self.sense)) + + def _pred(self): + return "rel" + + +class BoxerProp(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var, drs): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var = var + self.drs = drs + + def _variables(self): + return tuple( + map(operator.or_, (set(), set(), {self.var}), self.drs._variables()) + ) + + def referenced_labels(self): + return {self.drs} + + def atoms(self): + return self.drs.atoms() + + def clean(self): + return BoxerProp( + self.discourse_id, + self.sent_index, + self.word_indices, + self.var, + self.drs.clean(), + ) + + def renumber_sentences(self, f): + return BoxerProp( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.var, + self.drs.renumber_sentences(f), + ) + + def __iter__(self): + return iter((self.var, self.drs)) + + def _pred(self): + return "prop" + + +class BoxerEq(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var1, var2): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var1 = var1 + self.var2 = var2 + + def _variables(self): + return ({self.var1, self.var2}, set(), set()) + + def atoms(self): + return set() + + def renumber_sentences(self, f): + return BoxerEq( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.var1, + self.var2, + ) + + def __iter__(self): + return iter((self.var1, self.var2)) + + def _pred(self): + return "eq" + + +class BoxerCard(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, var, value, type): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.var = var + self.value = value + self.type = type + + def _variables(self): + return ({self.var}, set(), set()) + + def 
renumber_sentences(self, f): + return BoxerCard( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.var, + self.value, + self.type, + ) + + def __iter__(self): + return iter((self.var, self.value, self.type)) + + def _pred(self): + return "card" + + +class BoxerOr(BoxerIndexed): + def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.drs1 = drs1 + self.drs2 = drs2 + + def _variables(self): + return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables())) + + def atoms(self): + return self.drs1.atoms() | self.drs2.atoms() + + def clean(self): + return BoxerOr( + self.discourse_id, + self.sent_index, + self.word_indices, + self.drs1.clean(), + self.drs2.clean(), + ) + + def renumber_sentences(self, f): + return BoxerOr( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.drs1, + self.drs2, + ) + + def __iter__(self): + return iter((self.drs1, self.drs2)) + + def _pred(self): + return "or" + + +class BoxerWhq(BoxerIndexed): + def __init__( + self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2 + ): + BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) + self.ans_types = ans_types + self.drs1 = drs1 + self.variable = variable + self.drs2 = drs2 + + def _variables(self): + return tuple( + map( + operator.or_, + ({self.variable}, set(), set()), + self.drs1._variables(), + self.drs2._variables(), + ) + ) + + def atoms(self): + return self.drs1.atoms() | self.drs2.atoms() + + def clean(self): + return BoxerWhq( + self.discourse_id, + self.sent_index, + self.word_indices, + self.ans_types, + self.drs1.clean(), + self.variable, + self.drs2.clean(), + ) + + def renumber_sentences(self, f): + return BoxerWhq( + self.discourse_id, + f(self.sent_index), + self.word_indices, + self.ans_types, + self.drs1, + self.variable, + self.drs2, + ) + + def __iter__(self): + return iter( + ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2) + ) + + def _pred(self): + return "whq" + + +class PassthroughBoxerDrsInterpreter: + def interpret(self, ex): + return ex + + +class NltkDrtBoxerDrsInterpreter: + def __init__(self, occur_index=False): + self._occur_index = occur_index + + def interpret(self, ex): + """ + :param ex: ``AbstractBoxerDrs`` + :return: ``DrtExpression`` + """ + if isinstance(ex, BoxerDrs): + drs = DRS( + [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)) + ) + if ex.consequent is not None: + drs.consequent = self.interpret(ex.consequent) + return drs + elif isinstance(ex, BoxerNot): + return DrtNegatedExpression(self.interpret(ex.drs)) + elif isinstance(ex, BoxerPred): + pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex) + return self._make_atom(pred, ex.var) + elif isinstance(ex, BoxerNamed): + pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex) + return self._make_atom(pred, ex.var) + elif isinstance(ex, BoxerRel): + pred = self._add_occur_indexing("%s" % (ex.rel), ex) + return self._make_atom(pred, ex.var1, ex.var2) + elif isinstance(ex, BoxerProp): + return DrtProposition(Variable(ex.var), self.interpret(ex.drs)) + elif isinstance(ex, BoxerEq): + return DrtEqualityExpression( + DrtVariableExpression(Variable(ex.var1)), + DrtVariableExpression(Variable(ex.var2)), + ) + elif isinstance(ex, BoxerCard): + pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex) + return self._make_atom(pred, ex.var) + elif isinstance(ex, BoxerOr): + return 
DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2)) + elif isinstance(ex, BoxerWhq): + drs1 = self.interpret(ex.drs1) + drs2 = self.interpret(ex.drs2) + return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds) + assert False, f"{ex.__class__.__name__}: {ex}" + + def _make_atom(self, pred, *args): + accum = DrtVariableExpression(Variable(pred)) + for arg in args: + accum = DrtApplicationExpression( + accum, DrtVariableExpression(Variable(arg)) + ) + return accum + + def _add_occur_indexing(self, base, ex): + if self._occur_index and ex.sent_index is not None: + if ex.discourse_id: + base += "_%s" % ex.discourse_id + base += "_s%s" % ex.sent_index + base += "_w%s" % sorted(ex.word_indices)[0] + return base + + +class UnparseableInputException(Exception): + pass + + +if __name__ == "__main__": + opts = OptionParser("usage: %prog TEXT [options]") + opts.add_option( + "--verbose", + "-v", + help="display verbose logs", + action="store_true", + default=False, + dest="verbose", + ) + opts.add_option( + "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol" + ) + opts.add_option( + "--question", + "-q", + help="input is a question", + action="store_true", + default=False, + dest="question", + ) + opts.add_option( + "--occur", + "-o", + help="occurrence index", + action="store_true", + default=False, + dest="occur_index", + ) + (options, args) = opts.parse_args() + + if len(args) != 1: + opts.error("incorrect number of arguments") + + interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index) + drs = Boxer(interpreter).interpret_multi( + args[0].split(r"\n"), question=options.question, verbose=options.verbose + ) + if drs is None: + print(None) + else: + drs = drs.simplify().eliminate_equality() + if options.fol: + print(drs.fol().normalize()) + else: + drs.pretty_print() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..7d952cb8aa8b3ac8e66fd39c6a863756d3ec45c3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py @@ -0,0 +1,553 @@ +# Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse +# Representation Theory (DRT) as meaning language +# +# Author: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +try: + from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk + from tkinter.font import Font + + from nltk.draw.util import CanvasFrame, ShowText + +except ImportError: + """Ignore ImportError because tkinter might not be available.""" + +from nltk.parse import MaltParser +from nltk.sem.drt import DrsDrawer, DrtVariableExpression +from nltk.sem.glue import DrtGlue +from nltk.sem.logic import Variable +from nltk.tag import RegexpTagger +from nltk.util import in_idle + + +class DrtGlueDemo: + def __init__(self, examples): + # Set up the main window. + self._top = Tk() + self._top.title("DRT Glue Demo") + + # Set up key bindings. + self._init_bindings() + + # Initialize the fonts.self._error = None + self._init_fonts(self._top) + + self._examples = examples + self._readingCache = [None for example in examples] + + # The user can hide the grammar. 
+        self._show_grammar = IntVar(self._top)
+        self._show_grammar.set(1)
+
+        # Set the data to None
+        self._curExample = -1
+        self._readings = []
+        self._drs = None
+        self._drsWidget = None
+        self._error = None
+
+        self._init_glue()
+
+        # Create the basic frames.
+        self._init_menubar(self._top)
+        self._init_buttons(self._top)
+        self._init_exampleListbox(self._top)
+        self._init_readingListbox(self._top)
+        self._init_canvas(self._top)
+
+        # Resize callback
+        self._canvas.bind("<Configure>", self._configure)
+
+    #########################################
+    ## Initialization Helpers
+    #########################################
+
+    def _init_glue(self):
+        tagger = RegexpTagger(
+            [
+                ("^(David|Mary|John)$", "NNP"),
+                (
+                    "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
+                    "VB",
+                ),
+                ("^(go|order|vanish|find|approach)$", "VB"),
+                ("^(a)$", "ex_quant"),
+                ("^(every)$", "univ_quant"),
+                ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
+                ("^(big|gray|former)$", "JJ"),
+                ("^(him|himself)$", "PRP"),
+            ]
+        )
+
+        depparser = MaltParser(tagger=tagger)
+        self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
+
+    def _init_fonts(self, root):
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size (default=same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget("size"))
+
+        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
+        self._font = Font(family="helvetica", size=self._size.get())
+        if self._size.get() < 0:
+            big = self._size.get() - 2
+        else:
+            big = self._size.get() + 2
+        self._bigfont = Font(family="helvetica", weight="bold", size=big)
+
+    def _init_exampleListbox(self, parent):
+        self._exampleFrame = listframe = Frame(parent)
+        self._exampleFrame.pack(fill="both", side="left", padx=2)
+        self._exampleList_label = Label(
+            self._exampleFrame, font=self._boldfont, text="Examples"
+        )
+        self._exampleList_label.pack()
+        self._exampleList = Listbox(
+            self._exampleFrame,
+            selectmode="single",
+            relief="groove",
+            background="white",
+            foreground="#909090",
+            font=self._font,
+            selectforeground="#004040",
+            selectbackground="#c0f0c0",
+        )
+
+        self._exampleList.pack(side="right", fill="both", expand=1)
+
+        for example in self._examples:
+            self._exampleList.insert("end", ("  %s" % example))
+        self._exampleList.config(height=min(len(self._examples), 25), width=40)
+
+        # Add a scrollbar if there are more than 25 examples.
+        if len(self._examples) > 25:
+            listscroll = Scrollbar(self._exampleFrame, orient="vertical")
+            self._exampleList.config(yscrollcommand=listscroll.set)
+            listscroll.config(command=self._exampleList.yview)
+            listscroll.pack(side="left", fill="y")
+
+        # If they select an example, apply it.
+        self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
+
+    def _init_readingListbox(self, parent):
+        self._readingFrame = listframe = Frame(parent)
+        self._readingFrame.pack(fill="both", side="left", padx=2)
+        self._readingList_label = Label(
+            self._readingFrame, font=self._boldfont, text="Readings"
+        )
+        self._readingList_label.pack()
+        self._readingList = Listbox(
+            self._readingFrame,
+            selectmode="single",
+            relief="groove",
+            background="white",
+            foreground="#909090",
+            font=self._font,
+            selectforeground="#004040",
+            selectbackground="#c0f0c0",
+        )
+
+        self._readingList.pack(side="right", fill="both", expand=1)
+
+        # Add a scrollbar if there are more than 25 examples.
+        listscroll = Scrollbar(self._readingFrame, orient="vertical")
+        self._readingList.config(yscrollcommand=listscroll.set)
+        listscroll.config(command=self._readingList.yview)
+        listscroll.pack(side="right", fill="y")
+
+        self._populate_readingListbox()
+
+    def _populate_readingListbox(self):
+        # Populate the listbox with integers
+        self._readingList.delete(0, "end")
+        for i in range(len(self._readings)):
+            self._readingList.insert("end", ("  %s" % (i + 1)))
+        self._readingList.config(height=min(len(self._readings), 25), width=5)
+
+        # If they select a reading, apply it.
+        self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
+
+    def _init_bindings(self):
+        # Key bindings are a good thing.
+        self._top.bind("<Control-q>", self.destroy)
+        self._top.bind("<Control-x>", self.destroy)
+        self._top.bind("<Escape>", self.destroy)
+        self._top.bind("n", self.next)
+        self._top.bind("<space>", self.next)
+        self._top.bind("p", self.prev)
+        self._top.bind("<BackSpace>", self.prev)
+
+    def _init_buttons(self, parent):
+        # Set up the frames.
+        self._buttonframe = buttonframe = Frame(parent)
+        buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
+        Button(
+            buttonframe,
+            text="Prev",
+            background="#90c0d0",
+            foreground="black",
+            command=self.prev,
+        ).pack(side="left")
+        Button(
+            buttonframe,
+            text="Next",
+            background="#90c0d0",
+            foreground="black",
+            command=self.next,
+        ).pack(side="left")
+
+    def _configure(self, event):
+        self._autostep = 0
+        (x1, y1, x2, y2) = self._cframe.scrollregion()
+        y2 = event.height - 6
+        self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+        self._redraw()
+
+    def _init_canvas(self, parent):
+        self._cframe = CanvasFrame(
+            parent,
+            background="white",
+            # width=525, height=250,
+            closeenough=10,
+            border=2,
+            relief="sunken",
+        )
+        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+        canvas = self._canvas = self._cframe.canvas()
+
+        # Initially, there's no tree or text
+        self._tree = None
+        self._textwidgets = []
+        self._textline = None
+
+    def _init_menubar(self, parent):
+        menubar = Menu(parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(
+            label="Exit", underline=1, command=self.destroy, accelerator="q"
+        )
+        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+
+        actionmenu = Menu(menubar, tearoff=0)
+        actionmenu.add_command(
+            label="Next", underline=0, command=self.next, accelerator="n, Space"
+        )
+        actionmenu.add_command(
+            label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
+        )
+        menubar.add_cascade(label="Action", underline=0, menu=actionmenu)
+
+        optionmenu = Menu(menubar, tearoff=0)
+        optionmenu.add_checkbutton(
+            label="Remove Duplicates",
+            underline=0,
+            variable=self._glue.remove_duplicates,
+            command=self._toggle_remove_duplicates,
+            accelerator="r",
+        )
+        menubar.add_cascade(label="Options", underline=0, menu=optionmenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_radiobutton(
+            label="Tiny",
+            variable=self._size,
+            underline=0,
+            value=10,
+            command=self.resize,
+        )
+        viewmenu.add_radiobutton(
+            label="Small",
+            variable=self._size,
+            underline=0,
+            value=12,
+            command=self.resize,
+        )
+        viewmenu.add_radiobutton(
+            label="Medium",
+            variable=self._size,
+            underline=0,
+            value=14,
+            command=self.resize,
+        )
+        viewmenu.add_radiobutton(
+            label="Large",
+            variable=self._size,
+            underline=0,
+            value=18,
+            command=self.resize,
+        )
+        viewmenu.add_radiobutton(
+            label="Huge",
+            variable=self._size,
+            underline=0,
+            value=24,
+            command=self.resize,
+        )
+        menubar.add_cascade(label="View", underline=0,
menu=viewmenu) + + helpmenu = Menu(menubar, tearoff=0) + helpmenu.add_command(label="About", underline=0, command=self.about) + menubar.add_cascade(label="Help", underline=0, menu=helpmenu) + + parent.config(menu=menubar) + + ######################################### + ## Main draw procedure + ######################################### + + def _redraw(self): + canvas = self._canvas + + # Delete the old DRS, widgets, etc. + if self._drsWidget is not None: + self._drsWidget.clear() + + if self._drs: + self._drsWidget = DrsWidget(self._canvas, self._drs) + self._drsWidget.draw() + + if self._error: + self._drsWidget = DrsWidget(self._canvas, self._error) + self._drsWidget.draw() + + ######################################### + ## Button Callbacks + ######################################### + + def destroy(self, *e): + self._autostep = 0 + if self._top is None: + return + self._top.destroy() + self._top = None + + def prev(self, *e): + selection = self._readingList.curselection() + readingListSize = self._readingList.size() + + # there are readings + if readingListSize > 0: + # if one reading is currently selected + if len(selection) == 1: + index = int(selection[0]) + + # if it's on (or before) the first item + if index <= 0: + self._select_previous_example() + else: + self._readingList_store_selection(index - 1) + + else: + # select its first reading + self._readingList_store_selection(readingListSize - 1) + + else: + self._select_previous_example() + + def _select_previous_example(self): + # if the current example is not the first example + if self._curExample > 0: + self._exampleList_store_selection(self._curExample - 1) + else: + # go to the last example + self._exampleList_store_selection(len(self._examples) - 1) + + def next(self, *e): + selection = self._readingList.curselection() + readingListSize = self._readingList.size() + + # if there are readings + if readingListSize > 0: + # if one reading is currently selected + if len(selection) == 1: + index = int(selection[0]) + + # if it's on (or past) the last item + if index >= (readingListSize - 1): + self._select_next_example() + else: + self._readingList_store_selection(index + 1) + + else: + # select its first reading + self._readingList_store_selection(0) + + else: + self._select_next_example() + + def _select_next_example(self): + # if the current example is not the last example + if self._curExample < len(self._examples) - 1: + self._exampleList_store_selection(self._curExample + 1) + else: + # go to the first example + self._exampleList_store_selection(0) + + def about(self, *e): + ABOUT = ( + "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n" + + "Written by Daniel H. Garrette" + ) + TITLE = "About: NLTK DRT Glue Demo" + try: + from tkinter.messagebox import Message + + Message(message=ABOUT, title=TITLE).show() + except: + ShowText(self._top, TITLE, ABOUT) + + def postscript(self, *e): + self._autostep = 0 + self._cframe.print_to_file() + + def mainloop(self, *args, **kwargs): + """ + Enter the Tkinter mainloop. This function must be called if + this demo is created from a non-interactive program (e.g. + from a secript); otherwise, the demo will close as soon as + the script completes. 
+ """ + if in_idle(): + return + self._top.mainloop(*args, **kwargs) + + def resize(self, size=None): + if size is not None: + self._size.set(size) + size = self._size.get() + self._font.configure(size=-(abs(size))) + self._boldfont.configure(size=-(abs(size))) + self._sysfont.configure(size=-(abs(size))) + self._bigfont.configure(size=-(abs(size + 2))) + self._redraw() + + def _toggle_remove_duplicates(self): + self._glue.remove_duplicates = not self._glue.remove_duplicates + + self._exampleList.selection_clear(0, "end") + self._readings = [] + self._populate_readingListbox() + self._readingCache = [None for ex in self._examples] + self._curExample = -1 + self._error = None + + self._drs = None + self._redraw() + + def _exampleList_select(self, event): + selection = self._exampleList.curselection() + if len(selection) != 1: + return + self._exampleList_store_selection(int(selection[0])) + + def _exampleList_store_selection(self, index): + self._curExample = index + example = self._examples[index] + + self._exampleList.selection_clear(0, "end") + if example: + cache = self._readingCache[index] + if cache: + if isinstance(cache, list): + self._readings = cache + self._error = None + else: + self._readings = [] + self._error = cache + else: + try: + self._readings = self._glue.parse_to_meaning(example) + self._error = None + self._readingCache[index] = self._readings + except Exception as e: + self._readings = [] + self._error = DrtVariableExpression(Variable("Error: " + str(e))) + self._readingCache[index] = self._error + + # add a star to the end of the example + self._exampleList.delete(index) + self._exampleList.insert(index, (" %s *" % example)) + self._exampleList.config( + height=min(len(self._examples), 25), width=40 + ) + + self._populate_readingListbox() + + self._exampleList.selection_set(index) + + self._drs = None + self._redraw() + + def _readingList_select(self, event): + selection = self._readingList.curselection() + if len(selection) != 1: + return + self._readingList_store_selection(int(selection[0])) + + def _readingList_store_selection(self, index): + reading = self._readings[index] + + self._readingList.selection_clear(0, "end") + if reading: + self._readingList.selection_set(index) + + self._drs = reading.simplify().normalize().resolve_anaphora() + + self._redraw() + + +class DrsWidget: + def __init__(self, canvas, drs, **attribs): + self._drs = drs + self._canvas = canvas + canvas.font = Font( + font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font") + ) + canvas._BUFFER = 3 + self.bbox = (0, 0, 0, 0) + + def draw(self): + (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw() + self.bbox = (0, 0, right + 1, bottom + 1) + + def clear(self): + self._canvas.create_rectangle(self.bbox, fill="white", width="0") + + +def demo(): + examples = [ + "John walks", + "David sees Mary", + "David eats a sandwich", + "every man chases a dog", + # 'every man believes a dog yawns', + # 'John gives David a sandwich', + "John chases himself", + # 'John persuades David to order a pizza', + # 'John tries to go', + # 'John tries to find a unicorn', + # 'John seems to vanish', + # 'a unicorn seems to approach', + # 'every big cat leaves', + # 'every gray cat leaves', + # 'every big gray cat leaves', + # 'a former senator leaves', + # 'John likes a cat', + # 'John likes every cat', + # 'he walks', + # 'John walks and he leaves' + ] + DrtGlueDemo(examples).mainloop() + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py 
b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py new file mode 100644 index 0000000000000000000000000000000000000000..760ade20c46c96131b0f75c0adada0b216958e44 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py @@ -0,0 +1,835 @@ +# Natural Language Toolkit: Glue Semantics +# +# Author: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +import os +from itertools import chain + +import nltk +from nltk.internals import Counter +from nltk.sem import drt, linearlogic +from nltk.sem.logic import ( + AbstractVariableExpression, + Expression, + LambdaExpression, + Variable, + VariableExpression, +) +from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger + +SPEC_SEMTYPES = { + "a": "ex_quant", + "an": "ex_quant", + "every": "univ_quant", + "the": "def_art", + "no": "no_quant", + "default": "ex_quant", +} + +OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"] + + +class GlueFormula: + def __init__(self, meaning, glue, indices=None): + if not indices: + indices = set() + + if isinstance(meaning, str): + self.meaning = Expression.fromstring(meaning) + elif isinstance(meaning, Expression): + self.meaning = meaning + else: + raise RuntimeError( + "Meaning term neither string or expression: %s, %s" + % (meaning, meaning.__class__) + ) + + if isinstance(glue, str): + self.glue = linearlogic.LinearLogicParser().parse(glue) + elif isinstance(glue, linearlogic.Expression): + self.glue = glue + else: + raise RuntimeError( + "Glue term neither string or expression: %s, %s" + % (glue, glue.__class__) + ) + + self.indices = indices + + def applyto(self, arg): + """self = (\\x.(walk x), (subj -o f)) + arg = (john , subj) + returns ((walk john), f) + """ + if self.indices & arg.indices: # if the sets are NOT disjoint + raise linearlogic.LinearLogicApplicationException( + f"'{self}' applied to '{arg}'. Indices are not disjoint." 
+ ) + else: # if the sets ARE disjoint + return_indices = self.indices | arg.indices + + try: + return_glue = linearlogic.ApplicationExpression( + self.glue, arg.glue, arg.indices + ) + except linearlogic.LinearLogicApplicationException as e: + raise linearlogic.LinearLogicApplicationException( + f"'{self.simplify()}' applied to '{arg.simplify()}'" + ) from e + + arg_meaning_abstracted = arg.meaning + if return_indices: + for dep in self.glue.simplify().antecedent.dependencies[ + ::-1 + ]: # if self.glue is (A -o B), dep is in A.dependencies + arg_meaning_abstracted = self.make_LambdaExpression( + Variable("v%s" % dep), arg_meaning_abstracted + ) + return_meaning = self.meaning.applyto(arg_meaning_abstracted) + + return self.__class__(return_meaning, return_glue, return_indices) + + def make_VariableExpression(self, name): + return VariableExpression(name) + + def make_LambdaExpression(self, variable, term): + return LambdaExpression(variable, term) + + def lambda_abstract(self, other): + assert isinstance(other, GlueFormula) + assert isinstance(other.meaning, AbstractVariableExpression) + return self.__class__( + self.make_LambdaExpression(other.meaning.variable, self.meaning), + linearlogic.ImpExpression(other.glue, self.glue), + ) + + def compile(self, counter=None): + """From Iddo Lev's PhD Dissertation p108-109""" + if not counter: + counter = Counter() + (compiled_glue, new_forms) = self.glue.simplify().compile_pos( + counter, self.__class__ + ) + return new_forms + [ + self.__class__(self.meaning, compiled_glue, {counter.get()}) + ] + + def simplify(self): + return self.__class__( + self.meaning.simplify(), self.glue.simplify(), self.indices + ) + + def __eq__(self, other): + return ( + self.__class__ == other.__class__ + and self.meaning == other.meaning + and self.glue == other.glue + ) + + def __ne__(self, other): + return not self == other + + # sorting for use in doctests which must be deterministic + def __lt__(self, other): + return str(self) < str(other) + + def __str__(self): + assert isinstance(self.indices, set) + accum = f"{self.meaning} : {self.glue}" + if self.indices: + accum += ( + " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}" + ) + return accum + + def __repr__(self): + return "%s" % self + + +class GlueDict(dict): + def __init__(self, filename, encoding=None): + self.filename = filename + self.file_encoding = encoding + self.read_file() + + def read_file(self, empty_first=True): + if empty_first: + self.clear() + + try: + contents = nltk.data.load( + self.filename, format="text", encoding=self.file_encoding + ) + # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load() + except LookupError as e: + try: + contents = nltk.data.load( + "file:" + self.filename, format="text", encoding=self.file_encoding + ) + except LookupError: + raise e + lines = contents.splitlines() + + for line in lines: # example: 'n : (\\x.(<word> x), (v-or))' + # lambdacalc -^ linear logic -^ + line = line.strip() # remove trailing newline + if not len(line): + continue # skip empty lines + if line[0] == "#": + continue # skip commented out lines + + parts = line.split( + " : ", 2 + ) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]'] + + glue_formulas = [] + paren_count = 0 + tuple_start = 0 + tuple_comma = 0 + + relationships = None + + if len(parts) > 1: + for (i, c) in enumerate(parts[1]): + if c == "(": + if paren_count == 0: # if it's the first '(' of a tuple + tuple_start = i + 1 # then save the index + paren_count += 1 + elif c ==
")": + paren_count -= 1 + if paren_count == 0: # if it's the last ')' of a tuple + meaning_term = parts[1][ + tuple_start:tuple_comma + ] # '\\x.( x)' + glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)' + glue_formulas.append( + [meaning_term, glue_term] + ) # add the GlueFormula to the list + elif c == ",": + if ( + paren_count == 1 + ): # if it's a comma separating the parts of the tuple + tuple_comma = i # then save the index + elif c == "#": # skip comments at the ends of lines + if ( + paren_count != 0 + ): # if the line hasn't parsed correctly so far + raise RuntimeError( + "Formula syntax is incorrect for entry " + line + ) + break # break to the next line + + if len(parts) > 2: # if there is a relationship entry at the end + rel_start = parts[2].index("[") + 1 + rel_end = parts[2].index("]") + if rel_start == rel_end: + relationships = frozenset() + else: + relationships = frozenset( + r.strip() for r in parts[2][rel_start:rel_end].split(",") + ) + + try: + start_inheritance = parts[0].index("(") + end_inheritance = parts[0].index(")") + sem = parts[0][:start_inheritance].strip() + supertype = parts[0][start_inheritance + 1 : end_inheritance] + except: + sem = parts[0].strip() + supertype = None + + if sem not in self: + self[sem] = {} + + if ( + relationships is None + ): # if not specified for a specific relationship set + # add all relationship entries for parents + if supertype: + for rels in self[supertype]: + if rels not in self[sem]: + self[sem][rels] = [] + glue = self[supertype][rels] + self[sem][rels].extend(glue) + self[sem][rels].extend( + glue_formulas + ) # add the glue formulas to every rel entry + else: + if None not in self[sem]: + self[sem][None] = [] + self[sem][None].extend( + glue_formulas + ) # add the glue formulas to every rel entry + else: + if relationships not in self[sem]: + self[sem][relationships] = [] + if supertype: + self[sem][relationships].extend(self[supertype][relationships]) + self[sem][relationships].extend( + glue_formulas + ) # add the glue entry to the dictionary + + def __str__(self): + accum = "" + for pos in self: + str_pos = "%s" % pos + for relset in self[pos]: + i = 1 + for gf in self[pos][relset]: + if i == 1: + accum += str_pos + ": " + else: + accum += " " * (len(str_pos) + 2) + accum += "%s" % gf + if relset and i == len(self[pos][relset]): + accum += " : %s" % relset + accum += "\n" + i += 1 + return accum + + def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False): + if node is None: + # TODO: should it be depgraph.root? Is this code tested? 
+ top = depgraph.nodes[0] + depList = list(chain.from_iterable(top["deps"].values())) + root = depgraph.nodes[depList[0]] + + return self.to_glueformula_list(depgraph, root, Counter(), verbose) + + glueformulas = self.lookup(node, depgraph, counter) + for dep_idx in chain.from_iterable(node["deps"].values()): + dep = depgraph.nodes[dep_idx] + glueformulas.extend( + self.to_glueformula_list(depgraph, dep, counter, verbose) + ) + return glueformulas + + def lookup(self, node, depgraph, counter): + semtype_names = self.get_semtypes(node) + + semtype = None + for name in semtype_names: + if name in self: + semtype = self[name] + break + if semtype is None: + # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word) + return [] + + self.add_missing_dependencies(node, depgraph) + + lookup = self._lookup_semtype_option(semtype, node, depgraph) + + if not len(lookup): + raise KeyError( + "There is no GlueDict entry for sem type of '%s' " + "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"]) + ) + + return self.get_glueformulas_from_semtype_entry( + lookup, node["word"], node, depgraph, counter + ) + + def add_missing_dependencies(self, node, depgraph): + rel = node["rel"].lower() + + if rel == "main": + headnode = depgraph.nodes[node["head"]] + subj = self.lookup_unique("subj", headnode, depgraph) + relation = subj["rel"] + node["deps"].setdefault(relation, []) + node["deps"][relation].append(subj["address"]) + # node['deps'].append(subj['address']) + + def _lookup_semtype_option(self, semtype, node, depgraph): + relationships = frozenset( + depgraph.nodes[dep]["rel"].lower() + for dep in chain.from_iterable(node["deps"].values()) + if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS + ) + + try: + lookup = semtype[relationships] + except KeyError: + # An exact match is not found, so find the best match where + # 'best' is defined as the glue entry whose relationship set has the + # most relations of any possible relationship set that is a subset + # of the actual depgraph + best_match = frozenset() + for relset_option in set(semtype) - {None}: + if ( + len(relset_option) > len(best_match) + and relset_option < relationships + ): + best_match = relset_option + if not best_match: + if None in semtype: + best_match = None + else: + return None + lookup = semtype[best_match] + + return lookup + + def get_semtypes(self, node): + """ + Based on the node, return a list of plausible semtypes in order of + plausibility. 
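+ + For example, a 'spec' node for the word 'every' yields ['univ_quant'], + while an 'nmod' or 'vmod' node yields its POS tag followed by its + relation, so that either one can match a GlueDict entry.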
+ """ + rel = node["rel"].lower() + word = node["word"].lower() + + if rel == "spec": + if word in SPEC_SEMTYPES: + return [SPEC_SEMTYPES[word]] + else: + return [SPEC_SEMTYPES["default"]] + elif rel in ["nmod", "vmod"]: + return [node["tag"], rel] + else: + return [node["tag"]] + + def get_glueformulas_from_semtype_entry( + self, lookup, word, node, depgraph, counter + ): + glueformulas = [] + + glueFormulaFactory = self.get_GlueFormula_factory() + for meaning, glue in lookup: + gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue) + if not len(glueformulas): + gf.word = word + else: + gf.word = f"{word}{len(glueformulas) + 1}" + + gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get()) + + glueformulas.append(gf) + return glueformulas + + def get_meaning_formula(self, generic, word): + """ + :param generic: A meaning formula string containing the + parameter "" + :param word: The actual word to be replace "" + """ + word = word.replace(".", "") + return generic.replace("", word) + + def initialize_labels(self, expr, node, depgraph, unique_index): + if isinstance(expr, linearlogic.AtomicExpression): + name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index) + if name[0].isupper(): + return linearlogic.VariableExpression(name) + else: + return linearlogic.ConstantExpression(name) + else: + return linearlogic.ImpExpression( + self.initialize_labels(expr.antecedent, node, depgraph, unique_index), + self.initialize_labels(expr.consequent, node, depgraph, unique_index), + ) + + def find_label_name(self, name, node, depgraph, unique_index): + try: + dot = name.index(".") + + before_dot = name[:dot] + after_dot = name[dot + 1 :] + if before_dot == "super": + return self.find_label_name( + after_dot, depgraph.nodes[node["head"]], depgraph, unique_index + ) + else: + return self.find_label_name( + after_dot, + self.lookup_unique(before_dot, node, depgraph), + depgraph, + unique_index, + ) + except ValueError: + lbl = self.get_label(node) + if name == "f": + return lbl + elif name == "v": + return "%sv" % lbl + elif name == "r": + return "%sr" % lbl + elif name == "super": + return self.get_label(depgraph.nodes[node["head"]]) + elif name == "var": + return f"{lbl.upper()}{unique_index}" + elif name == "a": + return self.get_label(self.lookup_unique("conja", node, depgraph)) + elif name == "b": + return self.get_label(self.lookup_unique("conjb", node, depgraph)) + else: + return self.get_label(self.lookup_unique(name, node, depgraph)) + + def get_label(self, node): + """ + Pick an alphabetic character as identifier for an entity in the model. + + :param value: where to index into the list of characters + :type value: int + """ + value = node["address"] + + letter = [ + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "a", + "b", + "c", + "d", + "e", + ][value - 1] + num = int(value) // 26 + if num > 0: + return letter + str(num) + else: + return letter + + def lookup_unique(self, rel, node, depgraph): + """ + Lookup 'key'. There should be exactly one item in the associated relation. 
+ """ + deps = [ + depgraph.nodes[dep] + for dep in chain.from_iterable(node["deps"].values()) + if depgraph.nodes[dep]["rel"].lower() == rel.lower() + ] + + if len(deps) == 0: + raise KeyError( + "'{}' doesn't contain a feature '{}'".format(node["word"], rel) + ) + elif len(deps) > 1: + raise KeyError( + "'{}' should only have one feature '{}'".format(node["word"], rel) + ) + else: + return deps[0] + + def get_GlueFormula_factory(self): + return GlueFormula + + +class Glue: + def __init__( + self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False + ): + self.verbose = verbose + self.remove_duplicates = remove_duplicates + self.depparser = depparser + + from nltk import Prover9 + + self.prover = Prover9() + + if semtype_file: + self.semtype_file = semtype_file + else: + self.semtype_file = os.path.join( + "grammars", "sample_grammars", "glue.semtype" + ) + + def train_depparser(self, depgraphs=None): + if depgraphs: + self.depparser.train(depgraphs) + else: + self.depparser.train_from_file( + nltk.data.find( + os.path.join("grammars", "sample_grammars", "glue_train.conll") + ) + ) + + def parse_to_meaning(self, sentence): + readings = [] + for agenda in self.parse_to_compiled(sentence): + readings.extend(self.get_readings(agenda)) + return readings + + def get_readings(self, agenda): + readings = [] + agenda_length = len(agenda) + atomics = dict() + nonatomics = dict() + while agenda: # is not empty + cur = agenda.pop() + glue_simp = cur.glue.simplify() + if isinstance( + glue_simp, linearlogic.ImpExpression + ): # if cur.glue is non-atomic + for key in atomics: + try: + if isinstance(cur.glue, linearlogic.ApplicationExpression): + bindings = cur.glue.bindings + else: + bindings = linearlogic.BindingDict() + glue_simp.antecedent.unify(key, bindings) + for atomic in atomics[key]: + if not ( + cur.indices & atomic.indices + ): # if the sets of indices are disjoint + try: + agenda.append(cur.applyto(atomic)) + except linearlogic.LinearLogicApplicationException: + pass + except linearlogic.UnificationException: + pass + try: + nonatomics[glue_simp.antecedent].append(cur) + except KeyError: + nonatomics[glue_simp.antecedent] = [cur] + + else: # else cur.glue is atomic + for key in nonatomics: + for nonatomic in nonatomics[key]: + try: + if isinstance( + nonatomic.glue, linearlogic.ApplicationExpression + ): + bindings = nonatomic.glue.bindings + else: + bindings = linearlogic.BindingDict() + glue_simp.unify(key, bindings) + if not ( + cur.indices & nonatomic.indices + ): # if the sets of indices are disjoint + try: + agenda.append(nonatomic.applyto(cur)) + except linearlogic.LinearLogicApplicationException: + pass + except linearlogic.UnificationException: + pass + try: + atomics[glue_simp].append(cur) + except KeyError: + atomics[glue_simp] = [cur] + + for entry in atomics: + for gf in atomics[entry]: + if len(gf.indices) == agenda_length: + self._add_to_reading_list(gf, readings) + for entry in nonatomics: + for gf in nonatomics[entry]: + if len(gf.indices) == agenda_length: + self._add_to_reading_list(gf, readings) + return readings + + def _add_to_reading_list(self, glueformula, reading_list): + add_reading = True + if self.remove_duplicates: + for reading in reading_list: + try: + if reading.equiv(glueformula.meaning, self.prover): + add_reading = False + break + except Exception as e: + # if there is an exception, the syntax of the formula + # may not be understandable by the prover, so don't + # throw out the reading. 
+ print("Error when checking logical equality of statements", e) + + if add_reading: + reading_list.append(glueformula.meaning) + + def parse_to_compiled(self, sentence): + gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)] + return [self.gfl_to_compiled(gfl) for gfl in gfls] + + def dep_parse(self, sentence): + """ + Return a dependency graph for the sentence. + + :param sentence: the sentence to be parsed + :type sentence: list(str) + :rtype: DependencyGraph + """ + + # Lazy-initialize the depparser + if self.depparser is None: + from nltk.parse import MaltParser + + self.depparser = MaltParser(tagger=self.get_pos_tagger()) + if not self.depparser._trained: + self.train_depparser() + return self.depparser.parse(sentence, verbose=self.verbose) + + def depgraph_to_glue(self, depgraph): + return self.get_glue_dict().to_glueformula_list(depgraph) + + def get_glue_dict(self): + return GlueDict(self.semtype_file) + + def gfl_to_compiled(self, gfl): + index_counter = Counter() + return_list = [] + for gf in gfl: + return_list.extend(gf.compile(index_counter)) + + if self.verbose: + print("Compiled Glue Premises:") + for cgf in return_list: + print(cgf) + + return return_list + + def get_pos_tagger(self): + from nltk.corpus import brown + + regexp_tagger = RegexpTagger( + [ + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers + (r"(The|the|A|a|An|an)$", "AT"), # articles + (r".*able$", "JJ"), # adjectives + (r".*ness$", "NN"), # nouns formed from adjectives + (r".*ly$", "RB"), # adverbs + (r".*s$", "NNS"), # plural nouns + (r".*ing$", "VBG"), # gerunds + (r".*ed$", "VBD"), # past tense verbs + (r".*", "NN"), # nouns (default) + ] + ) + brown_train = brown.tagged_sents(categories="news") + unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger) + bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger) + trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger) + + # Override particular words + main_tagger = RegexpTagger( + [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")], + backoff=trigram_tagger, + ) + + return main_tagger + + +class DrtGlueFormula(GlueFormula): + def __init__(self, meaning, glue, indices=None): + if not indices: + indices = set() + + if isinstance(meaning, str): + self.meaning = drt.DrtExpression.fromstring(meaning) + elif isinstance(meaning, drt.DrtExpression): + self.meaning = meaning + else: + raise RuntimeError( + "Meaning term neither string or expression: %s, %s" + % (meaning, meaning.__class__) + ) + + if isinstance(glue, str): + self.glue = linearlogic.LinearLogicParser().parse(glue) + elif isinstance(glue, linearlogic.Expression): + self.glue = glue + else: + raise RuntimeError( + "Glue term neither string or expression: %s, %s" + % (glue, glue.__class__) + ) + + self.indices = indices + + def make_VariableExpression(self, name): + return drt.DrtVariableExpression(name) + + def make_LambdaExpression(self, variable, term): + return drt.DrtLambdaExpression(variable, term) + + +class DrtGlueDict(GlueDict): + def get_GlueFormula_factory(self): + return DrtGlueFormula + + +class DrtGlue(Glue): + def __init__( + self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False + ): + if not semtype_file: + semtype_file = os.path.join( + "grammars", "sample_grammars", "drt_glue.semtype" + ) + Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose) + + def get_glue_dict(self): + return DrtGlueDict(self.semtype_file) + + +def demo(show_example=-1): + from nltk.parse 
import MaltParser + + examples = [ + "David sees Mary", + "David eats a sandwich", + "every man chases a dog", + "every man believes a dog sleeps", + "John gives David a sandwich", + "John chases himself", + ] + # 'John persuades David to order a pizza', + # 'John tries to go', + # 'John tries to find a unicorn', + # 'John seems to vanish', + # 'a unicorn seems to approach', + # 'every big cat leaves', + # 'every gray cat leaves', + # 'every big gray cat leaves', + # 'a former senator leaves', + + print("============== DEMO ==============") + + tagger = RegexpTagger( + [ + ("^(David|Mary|John)$", "NNP"), + ( + "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$", + "VB", + ), + ("^(go|order|vanish|find|approach)$", "VB"), + ("^(a)$", "ex_quant"), + ("^(every)$", "univ_quant"), + ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"), + ("^(big|gray|former)$", "JJ"), + ("^(him|himself)$", "PRP"), + ] + ) + + depparser = MaltParser(tagger=tagger) + glue = Glue(depparser=depparser, verbose=False) + + for (i, sentence) in enumerate(examples): + if i == show_example or show_example == -1: + print(f"[[[Example {i}]]] {sentence}") + for reading in glue.parse_to_meaning(sentence.split()): + print(reading.simplify()) + print("") + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py new file mode 100644 index 0000000000000000000000000000000000000000..e703e693f61935a0470c581a5818e6d62ac19d3c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py @@ -0,0 +1,395 @@ +# Natural Language Toolkit: Logic +# +# Author: Peter Wang +# Updated by: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +""" +An implementation of the Hole Semantics model, following Blackburn and Bos, +Representation and Inference for Natural Language (CSLI, 2005). + +The semantic representations are built by the grammar hole.fcfg. +This module contains driver code to read in sentences and parse them +according to a hole semantics grammar. + +After parsing, the semantic representation is in the form of an underspecified +representation that is not easy to read. We use a "plugging" algorithm to +convert that representation into first-order logic formulas. +""" + +from functools import reduce + +from nltk.parse import load_parser +from nltk.sem.logic import ( + AllExpression, + AndExpression, + ApplicationExpression, + ExistsExpression, + IffExpression, + ImpExpression, + LambdaExpression, + NegatedExpression, + OrExpression, +) +from nltk.sem.skolemize import skolemize + +# Note that in this code there may be multiple types of trees being referred to: +# +# 1. parse trees +# 2. the underspecified representation +# 3. first-order logic formula trees +# 4. the search space when plugging (search tree) +# + + +class Constants: + ALL = "ALL" + EXISTS = "EXISTS" + NOT = "NOT" + AND = "AND" + OR = "OR" + IMP = "IMP" + IFF = "IFF" + PRED = "PRED" + LEQ = "LEQ" + HOLE = "HOLE" + LABEL = "LABEL" + + MAP = { + ALL: lambda v, e: AllExpression(v.variable, e), + EXISTS: lambda v, e: ExistsExpression(v.variable, e), + NOT: NegatedExpression, + AND: AndExpression, + OR: OrExpression, + IMP: ImpExpression, + IFF: IffExpression, + PRED: ApplicationExpression, + } + + +class HoleSemantics: + """ + This class holds the broken-down components of a hole semantics, i.e. 
it + extracts the holes, labels, logic formula fragments and constraints out of + a big conjunction of such as produced by the hole semantics grammar. It + then provides some operations on the semantics dealing with holes, labels + and finding legal ways to plug holes with labels. + """ + + def __init__(self, usr): + """ + Constructor. `usr' is a ``sem.Expression`` representing an + Underspecified Representation Structure (USR). A USR has the following + special predicates: + ALL(l,v,n), + EXISTS(l,v,n), + AND(l,n,n), + OR(l,n,n), + IMP(l,n,n), + IFF(l,n,n), + PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions, + LEQ(n,n), + HOLE(n), + LABEL(n) + where l is the label of the node described by the predicate, n is either + a label or a hole, and v is a variable. + """ + self.holes = set() + self.labels = set() + self.fragments = {} # mapping of label -> formula fragment + self.constraints = set() # set of Constraints + self._break_down(usr) + self.top_most_labels = self._find_top_most_labels() + self.top_hole = self._find_top_hole() + + def is_node(self, x): + """ + Return true if x is a node (label or hole) in this semantic + representation. + """ + return x in (self.labels | self.holes) + + def _break_down(self, usr): + """ + Extract holes, labels, formula fragments and constraints from the hole + semantics underspecified representation (USR). + """ + if isinstance(usr, AndExpression): + self._break_down(usr.first) + self._break_down(usr.second) + elif isinstance(usr, ApplicationExpression): + func, args = usr.uncurry() + if func.variable.name == Constants.LEQ: + self.constraints.add(Constraint(args[0], args[1])) + elif func.variable.name == Constants.HOLE: + self.holes.add(args[0]) + elif func.variable.name == Constants.LABEL: + self.labels.add(args[0]) + else: + label = args[0] + assert label not in self.fragments + self.fragments[label] = (func, args[1:]) + else: + raise ValueError(usr.label()) + + def _find_top_nodes(self, node_list): + top_nodes = node_list.copy() + for f in self.fragments.values(): + # the label is the first argument of the predicate + args = f[1] + for arg in args: + if arg in node_list: + top_nodes.discard(arg) + return top_nodes + + def _find_top_most_labels(self): + """ + Return the set of labels which are not referenced directly as part of + another formula fragment. These will be the top-most labels for the + subtree that they are part of. + """ + return self._find_top_nodes(self.labels) + + def _find_top_hole(self): + """ + Return the hole that will be the top of the formula tree. + """ + top_holes = self._find_top_nodes(self.holes) + assert len(top_holes) == 1 # it must be unique + return top_holes.pop() + + def pluggings(self): + """ + Calculate and return all the legal pluggings (mappings of labels to + holes) of this semantics given the constraints. + """ + record = [] + self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record) + return record + + def _plug_nodes(self, queue, potential_labels, plug_acc, record): + """ + Plug the nodes in `queue' with the labels in `potential_labels'. + + Each element of `queue' is a tuple of the node to plug and the list of + ancestor holes from the root of the graph to that node. + + `potential_labels' is a set of the labels which are still available for + plugging. + + `plug_acc' is the incomplete mapping of holes to labels made on the + current branch of the search tree so far. + + `record' is a list of all the complete pluggings that we have found in + total so far. 
It is the only parameter that is destructively updated. + """ + if queue != []: + (node, ancestors) = queue[0] + if node in self.holes: + # The node is a hole, try to plug it. + self._plug_hole( + node, ancestors, queue[1:], potential_labels, plug_acc, record + ) + else: + assert node in self.labels + # The node is a label. Replace it in the queue by the holes and + # labels in the formula fragment named by that label. + args = self.fragments[node][1] + head = [(a, ancestors) for a in args if self.is_node(a)] + self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record) + else: + raise Exception("queue empty") + + def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record): + """ + Try all possible ways of plugging a single hole. + See _plug_nodes for the meanings of the parameters. + """ + # Add the current hole we're trying to plug into the list of ancestors. + assert hole not in ancestors0 + ancestors = [hole] + ancestors0 + + # Try each potential label in this hole in turn. + for l in potential_labels0: + # Is the label valid in this hole? + if self._violates_constraints(l, ancestors): + continue + + plug_acc = plug_acc0.copy() + plug_acc[hole] = l + potential_labels = potential_labels0.copy() + potential_labels.remove(l) + + if len(potential_labels) == 0: + # No more potential labels. That must mean all the holes have + # been filled so we have found a legal plugging so remember it. + # + # Note that the queue might not be empty because there might + # be labels on there that point to formula fragments with + # no holes in them. _sanity_check_plugging will make sure + # all holes are filled. + self._sanity_check_plugging(plug_acc, self.top_hole, []) + record.append(plug_acc) + else: + # Recursively try to fill in the rest of the holes in the + # queue. The label we just plugged into the hole could have + # holes of its own so at the end of the queue. Putting it on + # the end of the queue gives us a breadth-first search, so that + # all the holes at level i of the formula tree are filled + # before filling level i+1. + # A depth-first search would work as well since the trees must + # be finite but the bookkeeping would be harder. + self._plug_nodes( + queue + [(l, ancestors)], potential_labels, plug_acc, record + ) + + def _violates_constraints(self, label, ancestors): + """ + Return True if the `label' cannot be placed underneath the holes given + by the set `ancestors' because it would violate the constraints imposed + on it. + """ + for c in self.constraints: + if c.lhs == label: + if c.rhs not in ancestors: + return True + return False + + def _sanity_check_plugging(self, plugging, node, ancestors): + """ + Make sure that a given plugging is legal. We recursively go through + each node and make sure that no constraints are violated. + We also check that all holes have been filled. + """ + if node in self.holes: + ancestors = [node] + ancestors + label = plugging[node] + else: + label = node + assert label in self.labels + for c in self.constraints: + if c.lhs == label: + assert c.rhs in ancestors + args = self.fragments[label][1] + for arg in args: + if self.is_node(arg): + self._sanity_check_plugging(plugging, arg, [label] + ancestors) + + def formula_tree(self, plugging): + """ + Return the first-order logic formula tree for this underspecified + representation using the plugging given. 
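+ + The tree is assembled bottom-up: each fragment's operator is looked up + in Constants.MAP and reduced over its recursively resolved arguments.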
+ """ + return self._formula_tree(plugging, self.top_hole) + + def _formula_tree(self, plugging, node): + if node in plugging: + return self._formula_tree(plugging, plugging[node]) + elif node in self.fragments: + pred, args = self.fragments[node] + children = [self._formula_tree(plugging, arg) for arg in args] + return reduce(Constants.MAP[pred.variable.name], children) + else: + return node + + +class Constraint: + """ + This class represents a constraint of the form (L =< N), + where L is a label and N is a node (a label or a hole). + """ + + def __init__(self, lhs, rhs): + self.lhs = lhs + self.rhs = rhs + + def __eq__(self, other): + if self.__class__ == other.__class__: + return self.lhs == other.lhs and self.rhs == other.rhs + else: + return False + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash(repr(self)) + + def __repr__(self): + return f"({self.lhs} < {self.rhs})" + + +def hole_readings(sentence, grammar_filename=None, verbose=False): + if not grammar_filename: + grammar_filename = "grammars/sample_grammars/hole.fcfg" + + if verbose: + print("Reading grammar file", grammar_filename) + + parser = load_parser(grammar_filename) + + # Parse the sentence. + tokens = sentence.split() + trees = list(parser.parse(tokens)) + if verbose: + print("Got %d different parses" % len(trees)) + + all_readings = [] + for tree in trees: + # Get the semantic feature from the top of the parse tree. + sem = tree.label()["SEM"].simplify() + + # Print the raw semantic representation. + if verbose: + print("Raw: ", sem) + + # Skolemize away all quantifiers. All variables become unique. + while isinstance(sem, LambdaExpression): + sem = sem.term + skolemized = skolemize(sem) + + if verbose: + print("Skolemized:", skolemized) + + # Break the hole semantics representation down into its components + # i.e. holes, labels, formula fragments and constraints. + hole_sem = HoleSemantics(skolemized) + + # Maybe show the details of the semantic representation. + if verbose: + print("Holes: ", hole_sem.holes) + print("Labels: ", hole_sem.labels) + print("Constraints: ", hole_sem.constraints) + print("Top hole: ", hole_sem.top_hole) + print("Top labels: ", hole_sem.top_most_labels) + print("Fragments:") + for l, f in hole_sem.fragments.items(): + print(f"\t{l}: {f}") + + # Find all the possible ways to plug the formulas together. + pluggings = hole_sem.pluggings() + + # Build FOL formula trees using the pluggings. + readings = list(map(hole_sem.formula_tree, pluggings)) + + # Print out the formulas in a textual format. + if verbose: + for i, r in enumerate(readings): + print() + print("%d. %s" % (i, r)) + print() + + all_readings.extend(readings) + + return all_readings + + +if __name__ == "__main__": + for r in hole_readings("a dog barks"): + print(r) + print() + for r in hole_readings("every girl chases a dog"): + print(r) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..474d71dc10cf299e78658d595a1603c28b7bc03d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py @@ -0,0 +1,34 @@ +# Natural Language Toolkit: Stemmers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Trevor Cohn +# Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +NLTK Stemmers + +Interfaces used to remove morphological affixes from words, leaving +only the word stem. 
Stemming algorithms aim to remove those affixes +required for, e.g., grammatical role, tense, or derivational morphology, +leaving only the stem of the word. This is a difficult problem due to +irregular words (e.g. common verbs in English), complicated +morphological rules, and part-of-speech and sense ambiguities +(e.g. ``ceil-`` is not the stem of ``ceiling``). + +StemmerI defines a standard interface for stemmers. +""" + +from nltk.stem.api import StemmerI +from nltk.stem.arlstem import ARLSTem +from nltk.stem.arlstem2 import ARLSTem2 +from nltk.stem.cistem import Cistem +from nltk.stem.isri import ISRIStemmer +from nltk.stem.lancaster import LancasterStemmer +from nltk.stem.porter import PorterStemmer +from nltk.stem.regexp import RegexpStemmer +from nltk.stem.rslp import RSLPStemmer +from nltk.stem.snowball import SnowballStemmer +from nltk.stem.wordnet import WordNetLemmatizer diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py new file mode 100644 index 0000000000000000000000000000000000000000..645debf5dfd5676bfcba33dc623e417e2ca5b54f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py @@ -0,0 +1,27 @@ +# Natural Language Toolkit: Stemmer Interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Trevor Cohn +# Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from abc import ABCMeta, abstractmethod + + +class StemmerI(metaclass=ABCMeta): + """ + A processing interface for removing morphological affixes from + words. This process is known as stemming. + + """ + + @abstractmethod + def stem(self, token): + """ + Strip affixes from the token and return the stem. + + :param token: The token that should be stemmed. + :type token: str + """ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py new file mode 100644 index 0000000000000000000000000000000000000000..ccaef92aacd554d7737a1085aa68221950406bcf --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py @@ -0,0 +1,343 @@ +# Natural Language Toolkit: Stemmers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Tomcavage +# URL: +# For license information, see LICENSE.TXT + +""" +A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm. +Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
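+ +Each rule in the stemmer's rule tuple encodes its affix reversed: "gni3>", +for example, strips a final "-ing" and lets stemming continue, while a +trailing "." stops further stemming.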
+""" +import re + +from nltk.stem.api import StemmerI + + +class LancasterStemmer(StemmerI): + """ + Lancaster Stemmer + + >>> from nltk.stem.lancaster import LancasterStemmer + >>> st = LancasterStemmer() + >>> st.stem('maximum') # Remove "-um" when word is intact + 'maxim' + >>> st.stem('presumably') # Don't remove "-um" when word is not intact + 'presum' + >>> st.stem('multiply') # No action taken if word ends with "-ply" + 'multiply' + >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules + 'provid' + >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters + 'ow' + >>> st.stem('ear') # ditto + 'ear' + >>> st.stem('saying') # Words starting with consonant must contain at least 3 + 'say' + >>> st.stem('crying') # letters and one of those letters must be a vowel + 'cry' + >>> st.stem('string') # ditto + 'string' + >>> st.stem('meant') # ditto + 'meant' + >>> st.stem('cement') # ditto + 'cem' + >>> st_pre = LancasterStemmer(strip_prefix_flag=True) + >>> st_pre.stem('kilometer') # Test Prefix + 'met' + >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t.")) + >>> st_custom.stem("ness") # Change s to t + 'nest' + """ + + # The rule list is static since it doesn't change between instances + default_rule_tuple = ( + "ai*2.", # -ia > - if intact + "a*1.", # -a > - if intact + "bb1.", # -bb > -b + "city3s.", # -ytic > -ys + "ci2>", # -ic > - + "cn1t>", # -nc > -nt + "dd1.", # -dd > -d + "dei3y>", # -ied > -y + "deec2ss.", # -ceed >", -cess + "dee1.", # -eed > -ee + "de2>", # -ed > - + "dooh4>", # -hood > - + "e1>", # -e > - + "feil1v.", # -lief > -liev + "fi2>", # -if > - + "gni3>", # -ing > - + "gai3y.", # -iag > -y + "ga2>", # -ag > - + "gg1.", # -gg > -g + "ht*2.", # -th > - if intact + "hsiug5ct.", # -guish > -ct + "hsi3>", # -ish > - + "i*1.", # -i > - if intact + "i1y>", # -i > -y + "ji1d.", # -ij > -id -- see nois4j> & vis3j> + "juf1s.", # -fuj > -fus + "ju1d.", # -uj > -ud + "jo1d.", # -oj > -od + "jeh1r.", # -hej > -her + "jrev1t.", # -verj > -vert + "jsim2t.", # -misj > -mit + "jn1d.", # -nj > -nd + "j1s.", # -j > -s + "lbaifi6.", # -ifiabl > - + "lbai4y.", # -iabl > -y + "lba3>", # -abl > - + "lbi3.", # -ibl > - + "lib2l>", # -bil > -bl + "lc1.", # -cl > c + "lufi4y.", # -iful > -y + "luf3>", # -ful > - + "lu2.", # -ul > - + "lai3>", # -ial > - + "lau3>", # -ual > - + "la2>", # -al > - + "ll1.", # -ll > -l + "mui3.", # -ium > - + "mu*2.", # -um > - if intact + "msi3>", # -ism > - + "mm1.", # -mm > -m + "nois4j>", # -sion > -j + "noix4ct.", # -xion > -ct + "noi3>", # -ion > - + "nai3>", # -ian > - + "na2>", # -an > - + "nee0.", # protect -een + "ne2>", # -en > - + "nn1.", # -nn > -n + "pihs4>", # -ship > - + "pp1.", # -pp > -p + "re2>", # -er > - + "rae0.", # protect -ear + "ra2.", # -ar > - + "ro2>", # -or > - + "ru2>", # -ur > - + "rr1.", # -rr > -r + "rt1>", # -tr > -t + "rei3y>", # -ier > -y + "sei3y>", # -ies > -y + "sis2.", # -sis > -s + "si2>", # -is > - + "ssen4>", # -ness > - + "ss0.", # protect -ss + "suo3>", # -ous > - + "su*2.", # -us > - if intact + "s*1>", # -s > - if intact + "s0.", # -s > -s + "tacilp4y.", # -plicat > -ply + "ta2>", # -at > - + "tnem4>", # -ment > - + "tne3>", # -ent > - + "tna3>", # -ant > - + "tpir2b.", # -ript > -rib + "tpro2b.", # -orpt > -orb + "tcud1.", # -duct > -duc + "tpmus2.", # -sumpt > -sum + "tpec2iv.", # -cept > -ceiv + "tulo2v.", # -olut > -olv + "tsis0.", # protect -sist + "tsi3>", # -ist > - + "tt1.", # -tt > -t + "uqi3.", # -iqu > - + "ugo1.", # -ogu > -og + "vis3j>", # -siv > 
-j + "vie0.", # protect -eiv + "vi2>", # -iv > - + "ylb1>", # -bly > -bl + "yli3y>", # -ily > -y + "ylp0.", # protect -ply + "yl2>", # -ly > - + "ygo1.", # -ogy > -og + "yhp1.", # -phy > -ph + "ymo1.", # -omy > -om + "ypo1.", # -opy > -op + "yti3>", # -ity > - + "yte3>", # -ety > - + "ytl2.", # -lty > -l + "yrtsi5.", # -istry > - + "yra3>", # -ary > - + "yro3>", # -ory > - + "yfi3.", # -ify > - + "ycn2t>", # -ncy > -nt + "yca3>", # -acy > - + "zi2>", # -iz > - + "zy1s.", # -yz > -ys + ) + + def __init__(self, rule_tuple=None, strip_prefix_flag=False): + """Create an instance of the Lancaster stemmer.""" + # Setup an empty rule dictionary - this will be filled in later + self.rule_dictionary = {} + # Check if a user wants to strip prefix + self._strip_prefix = strip_prefix_flag + # Check if a user wants to use his/her own rule tuples. + self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple + + def parseRules(self, rule_tuple=None): + """Validate the set of rules used in this stemmer. + + If this function is called as an individual method, without using stem + method, rule_tuple argument will be compiled into self.rule_dictionary. + If this function is called within stem, self._rule_tuple will be used. + + """ + # If there is no argument for the function, use class' own rule tuple. + rule_tuple = rule_tuple if rule_tuple else self._rule_tuple + valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$") + # Empty any old rules from the rule set before adding new ones + self.rule_dictionary = {} + + for rule in rule_tuple: + if not valid_rule.match(rule): + raise ValueError(f"The rule {rule} is invalid") + first_letter = rule[0:1] + if first_letter in self.rule_dictionary: + self.rule_dictionary[first_letter].append(rule) + else: + self.rule_dictionary[first_letter] = [rule] + + def stem(self, word): + """Stem a word using the Lancaster stemmer.""" + # Lower-case the word, since all the rules are lower-cased + word = word.lower() + word = self.__stripPrefix(word) if self._strip_prefix else word + + # Save a copy of the original word + intact_word = word + + # If rule dictionary is empty, parse rule tuple. 
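+ # (parseRules indexes each rule under its first character, which is + # the last letter of the affix it matches, since rules are written + # reversed.)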
+ if not self.rule_dictionary: + self.parseRules() + + return self.__doStemming(word, intact_word) + + def __doStemming(self, word, intact_word): + """Perform the actual word stemming""" + + valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") + + proceed = True + + while proceed: + + # Find the position of the last letter of the word to be stemmed + last_letter_position = self.__getLastLetter(word) + + # Only stem the word if it has a last letter and a rule matching that last letter + if ( + last_letter_position < 0 + or word[last_letter_position] not in self.rule_dictionary + ): + proceed = False + + else: + rule_was_applied = False + + # Go through each rule that matches the word's final letter + for rule in self.rule_dictionary[word[last_letter_position]]: + rule_match = valid_rule.match(rule) + if rule_match: + ( + ending_string, + intact_flag, + remove_total, + append_string, + cont_flag, + ) = rule_match.groups() + + # Convert the number of chars to remove when stemming + # from a string to an integer + remove_total = int(remove_total) + + # Proceed if word's ending matches rule's word ending + if word.endswith(ending_string[::-1]): + if intact_flag: + if word == intact_word and self.__isAcceptable( + word, remove_total + ): + word = self.__applyRule( + word, remove_total, append_string + ) + rule_was_applied = True + if cont_flag == ".": + proceed = False + break + elif self.__isAcceptable(word, remove_total): + word = self.__applyRule( + word, remove_total, append_string + ) + rule_was_applied = True + if cont_flag == ".": + proceed = False + break + # If no rules apply, the word doesn't need any more stemming + if rule_was_applied == False: + proceed = False + return word + + def __getLastLetter(self, word): + """Get the zero-based index of the last alphabetic character in this string""" + last_letter = -1 + for position in range(len(word)): + if word[position].isalpha(): + last_letter = position + else: + break + return last_letter + + def __isAcceptable(self, word, remove_total): + """Determine if the word is acceptable for stemming.""" + word_is_acceptable = False + # If the word starts with a vowel, it must be at least 2 + # characters long to be stemmed + if word[0] in "aeiouy": + if len(word) - remove_total >= 2: + word_is_acceptable = True + # If the word starts with a consonant, it must be at least 3 + # characters long (including one vowel) to be stemmed + elif len(word) - remove_total >= 3: + if word[1] in "aeiouy": + word_is_acceptable = True + elif word[2] in "aeiouy": + word_is_acceptable = True + return word_is_acceptable + + def __applyRule(self, word, remove_total, append_string): + """Apply the stemming rule to the word""" + # Remove letters from the end of the word + new_word_length = len(word) - remove_total + word = word[0:new_word_length] + + # And add new letters to the end of the truncated word + if append_string: + word += append_string + return word + + def __stripPrefix(self, word): + """Remove prefix from a word. + + This function originally taken from Whoosh. 
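+ For example, "kilometer" becomes "meter" before the stemming rules + are applied.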
+ + """ + for prefix in ( + "kilo", + "micro", + "milli", + "intra", + "ultra", + "mega", + "nano", + "pico", + "pseudo", + ): + if word.startswith(prefix): + return word[len(prefix) :] + return word + + def __repr__(self): + return "<LancasterStemmer>" diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py new file mode 100644 index 0000000000000000000000000000000000000000..294c78a18c4fb95c5adcf920346c9316c3d5168b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py @@ -0,0 +1,137 @@ +# Natural Language Toolkit: RSLP Stemmer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Tiago Tresoldi +# URL: +# For license information, see LICENSE.TXT + +# This code is based on the algorithm presented in the paper "A Stemming +# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and +# Christian Huyck, which unfortunately I had no access to. The code is a +# Python version, with some minor modifications of mine, to the description +# presented at https://www.webcitation.org/5NnvdIzOb and to the C source code +# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. +# Please note that this stemmer is intended for demonstration and educational +# purposes only. Feel free to write me for any comments, including the +# development of a different and/or better stemmer for Portuguese. I also +# suggest using NLTK's mailing list for Portuguese for any discussion. + +# Este código é baseado no algoritmo apresentado no artigo "A Stemming +# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e +# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O +# código é uma conversão para Python, com algumas pequenas modificações +# minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do +# código para linguagem C disponível em +# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor, +# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente +# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer +# comentário, inclusive sobre o desenvolvimento de um stemmer diferente +# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão +# do NLTK para o português para qualquer debate. + +from nltk.data import load +from nltk.stem.api import StemmerI + + +class RSLPStemmer(StemmerI): + """ + A stemmer for Portuguese. + + >>> from nltk.stem import RSLPStemmer + >>> st = RSLPStemmer() + >>> # opening lines of Erico Verissimo's "Música ao Longe" + >>> text = ''' + ... Clarissa risca com giz no quadro-negro a paisagem que os alunos + ... devem copiar . Uma casinha de porta e janela , em cima duma + ... coxilha .''' + >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE + ... print(st.stem(token)) + clariss risc com giz no quadro-negr a pais que os alun dev copi . + uma cas de port e janel , em cim dum coxilh .
+ """ + + def __init__(self): + self._model = [] + + self._model.append(self.read_rule("step0.pt")) + self._model.append(self.read_rule("step1.pt")) + self._model.append(self.read_rule("step2.pt")) + self._model.append(self.read_rule("step3.pt")) + self._model.append(self.read_rule("step4.pt")) + self._model.append(self.read_rule("step5.pt")) + self._model.append(self.read_rule("step6.pt")) + + def read_rule(self, filename): + rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8") + lines = rules.split("\n") + + lines = [line for line in lines if line != ""] # remove blank lines + lines = [line for line in lines if line[0] != "#"] # remove comments + + # NOTE: a simple but ugly hack to make this parser happy with double '\t's + lines = [line.replace("\t\t", "\t") for line in lines] + + # parse rules + rules = [] + for line in lines: + rule = [] + tokens = line.split("\t") + + # text to be searched for at the end of the string + rule.append(tokens[0][1:-1]) # remove quotes + + # minimum stem size to perform the replacement + rule.append(int(tokens[1])) + + # text to be replaced into + rule.append(tokens[2][1:-1]) # remove quotes + + # exceptions to this rule + rule.append([token[1:-1] for token in tokens[3].split(",")]) + + # append to the results + rules.append(rule) + + return rules + + def stem(self, word): + word = word.lower() + + # the word ends in 's'? apply rule for plural reduction + if word[-1] == "s": + word = self.apply_rule(word, 0) + + # the word ends in 'a'? apply rule for feminine reduction + if word[-1] == "a": + word = self.apply_rule(word, 1) + + # augmentative reduction + word = self.apply_rule(word, 3) + + # adverb reduction + word = self.apply_rule(word, 2) + + # noun reduction + prev_word = word + word = self.apply_rule(word, 4) + if word == prev_word: + # verb reduction + prev_word = word + word = self.apply_rule(word, 5) + if word == prev_word: + # vowel removal + word = self.apply_rule(word, 6) + + return word + + def apply_rule(self, word, rule_index): + rules = self._model[rule_index] + for rule in rules: + suffix_length = len(rule[0]) + if word[-suffix_length:] == rule[0]: # if suffix matches + if len(word) >= suffix_length + rule[1]: # if we have minimum size + if word not in rule[3]: # if not an exception + word = word[:-suffix_length] + rule[2] + break + + return word diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/snowball.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/snowball.py new file mode 100644 index 0000000000000000000000000000000000000000..4f5a35056c66efc878b2c4cdbbf98553391a3185 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/snowball.py @@ -0,0 +1,5946 @@ +# +# Natural Language Toolkit: Snowball Stemmer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Peter Michael Stahl +# Peter Ljunglof (revisions) +# Lakhdar Benzahia (co-writer) +# Assem Chelli (reviewer arabicstemmer) +# Abdelkrim Aries (reviewer arabicstemmer) +# Algorithms: Dr Martin Porter +# Assem Chelli arabic stemming algorithm +# Benzahia Lakhdar +# URL: +# For license information, see LICENSE.TXT + +""" +Snowball stemmers + +This module provides a port of the Snowball stemmers +developed by Martin Porter. + +There is also a demo function: `snowball.demo()`. 
+ +""" + +import re + +from nltk.corpus import stopwords +from nltk.stem import porter +from nltk.stem.api import StemmerI +from nltk.stem.util import prefix_replace, suffix_replace + + +class SnowballStemmer(StemmerI): + + """ + Snowball Stemmer + + The following languages are supported: + Arabic, Danish, Dutch, English, Finnish, French, German, + Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, + Spanish and Swedish. + + The algorithm for English is documented here: + + Porter, M. \"An algorithm for suffix stripping.\" + Program 14.3 (1980): 130-137. + + The algorithms have been developed by Martin Porter. + These stemmers are called Snowball, because Porter created + a programming language with this name for creating + new stemming algorithms. There is more information available + at http://snowball.tartarus.org/ + + The stemmer is invoked as shown below: + + >>> from nltk.stem import SnowballStemmer # See which languages are supported + >>> print(" ".join(SnowballStemmer.languages)) # doctest: +NORMALIZE_WHITESPACE + arabic danish dutch english finnish french german hungarian + italian norwegian porter portuguese romanian russian + spanish swedish + >>> stemmer = SnowballStemmer("german") # Choose a language + >>> stemmer.stem("Autobahnen") # Stem a word + 'autobahn' + + Invoking the stemmers that way is useful if you do not know the + language to be stemmed at runtime. Alternatively, if you already know + the language, then you can invoke the language specific stemmer directly: + + >>> from nltk.stem.snowball import GermanStemmer + >>> stemmer = GermanStemmer() + >>> stemmer.stem("Autobahnen") + 'autobahn' + + :param language: The language whose subclass is instantiated. + :type language: str or unicode + :param ignore_stopwords: If set to True, stopwords are + not stemmed and returned unchanged. + Set to False by default. + :type ignore_stopwords: bool + :raise ValueError: If there is no stemmer for the specified + language, a ValueError is raised. + """ + + languages = ( + "arabic", + "danish", + "dutch", + "english", + "finnish", + "french", + "german", + "hungarian", + "italian", + "norwegian", + "porter", + "portuguese", + "romanian", + "russian", + "spanish", + "swedish", + ) + + def __init__(self, language, ignore_stopwords=False): + if language not in self.languages: + raise ValueError(f"The language '{language}' is not supported.") + stemmerclass = globals()[language.capitalize() + "Stemmer"] + self.stemmer = stemmerclass(ignore_stopwords) + self.stem = self.stemmer.stem + self.stopwords = self.stemmer.stopwords + + def stem(self, token): + return self.stemmer.stem(self, token) + + +class _LanguageSpecificStemmer(StemmerI): + + """ + This helper subclass offers the possibility + to invoke a specific stemmer directly. + This is useful if you already know the language to be stemmed at runtime. + + Create an instance of the Snowball stemmer. + + :param ignore_stopwords: If set to True, stopwords are + not stemmed and returned unchanged. + Set to False by default. + :type ignore_stopwords: bool + """ + + def __init__(self, ignore_stopwords=False): + # The language is the name of the class, minus the final "Stemmer". + language = type(self).__name__.lower() + if language.endswith("stemmer"): + language = language[:-7] + + self.stopwords = set() + if ignore_stopwords: + try: + for word in stopwords.words(language): + self.stopwords.add(word) + except OSError as e: + raise ValueError( + "{!r} has no list of stopwords. 
Please set" + " 'ignore_stopwords' to 'False'.".format(self) + ) from e + + def __repr__(self): + """ + Print out the string representation of the respective class. + + """ + return f"<{type(self).__name__}>" + + +class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer): + """ + A word stemmer based on the original Porter stemming algorithm. + + Porter, M. \"An algorithm for suffix stripping.\" + Program 14.3 (1980): 130-137. + + A few minor modifications have been made to Porter's basic + algorithm. See the source code of the module + nltk.stem.porter for more information. + + """ + + def __init__(self, ignore_stopwords=False): + _LanguageSpecificStemmer.__init__(self, ignore_stopwords) + porter.PorterStemmer.__init__(self) + + +class _ScandinavianStemmer(_LanguageSpecificStemmer): + + """ + This subclass encapsulates a method for defining the string region R1. + It is used by the Danish, Norwegian, and Swedish stemmer. + + """ + + def _r1_scandinavian(self, word, vowels): + """ + Return the region R1 that is used by the Scandinavian stemmers. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. But then R1 is adjusted so that the region + before it contains at least three letters. + + :param word: The word whose region R1 is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region R1. + :type vowels: unicode + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses DanishStemmer, NorwegianStemmer, and + SwedishStemmer. It is not to be invoked directly! + + """ + r1 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + if 3 > len(word[: i + 1]) > 0: + r1 = word[3:] + elif len(word[: i + 1]) >= 3: + r1 = word[i + 1 :] + else: + return word + break + + return r1 + + +class _StandardStemmer(_LanguageSpecificStemmer): + + """ + This subclass encapsulates two methods for defining the standard versions + of the string regions R1, R2, and RV. + + """ + + def _r1r2_standard(self, word, vowels): + """ + Return the standard interpretations of the string regions R1 and R2. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. + + R2 is the region after the first non-vowel following a vowel + in R1, or is the null region at the end of the word if there + is no such non-vowel. + + :param word: The word whose regions R1 and R2 are determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the regions R1 and R2. + :type vowels: unicode + :return: (r1,r2), the regions R1 and R2 for the respective word. + :rtype: tuple + :note: This helper method is invoked by the respective stem method of + the subclasses DutchStemmer, FinnishStemmer, + FrenchStemmer, GermanStemmer, ItalianStemmer, + PortugueseStemmer, RomanianStemmer, and SpanishStemmer. + It is not to be invoked directly! 
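+ :note: For example, for the word "beautiful" (with the vowels + "aeiouy"), R1 is "iful" and R2 is "ul".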
+ :note: A detailed description of how to define R1 and R2 + can be found at http://snowball.tartarus.org/texts/r1r2.html + + """ + r1 = "" + r2 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + r1 = word[i + 1 :] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i - 1] in vowels: + r2 = r1[i + 1 :] + break + + return (r1, r2) + + def _rv_standard(self, word, vowels): + """ + Return the standard interpretation of the string region RV. + + If the second letter is a consonant, RV is the region after the + next following vowel. If the first two letters are vowels, RV is + the region after the next following consonant. Otherwise, RV is + the region after the third letter. + + :param word: The word whose region RV is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region RV. + :type vowels: unicode + :return: the region RV for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses ItalianStemmer, PortugueseStemmer, + RomanianStemmer, and SpanishStemmer. It is not to be + invoked directly! + + """ + rv = "" + if len(word) >= 2: + if word[1] not in vowels: + for i in range(2, len(word)): + if word[i] in vowels: + rv = word[i + 1 :] + break + + elif word[0] in vowels and word[1] in vowels: + for i in range(2, len(word)): + if word[i] not in vowels: + rv = word[i + 1 :] + break + else: + rv = word[3:] + + return rv + + +class ArabicStemmer(_StandardStemmer): + """ + https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) + The Snowball Arabic light Stemmer + Algorithm: + + - Assem Chelli + - Abdelkrim Aries + - Lakhdar Benzahia + + NLTK Version Author: + + - Lakhdar Benzahia + """ + + # Normalize_pre stes + __vocalization = re.compile( + r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]" + ) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ + + __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda + + __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟ + + # Normalize_post + __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ + + # normalize other hamza's + __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ + + __waw_hamza = re.compile(r"[\u0624]") # ؤ + + __yeh_hamza = re.compile(r"[\u0626]") # ئ + + __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ + + # Checks + __checks1 = ( + "\u0643\u0627\u0644", + "\u0628\u0627\u0644", # بال، كال + "\u0627\u0644", + "\u0644\u0644", # لل، ال + ) + + __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات + + # Suffixes + __suffix_noun_step1a = ( + "\u064a", + "\u0643", + "\u0647", # ي، ك، ه + "\u0646\u0627", + "\u0643\u0645", + "\u0647\u0627", + "\u0647\u0646", + "\u0647\u0645", # نا، كم، ها، هن، هم + "\u0643\u0645\u0627", + "\u0647\u0645\u0627", # كما، هما + ) + + __suffix_noun_step1b = "\u0646" # ن + + __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و + + __suffix_noun_step2b = "\u0627\u062a" # ات + + __suffix_noun_step2c1 = "\u062a" # ت + + __suffix_noun_step2c2 = "\u0629" # ة + + __suffix_noun_step3 = "\u064a" # ي + + __suffix_verb_step1 = ( + "\u0647", + "\u0643", # ه، ك + "\u0646\u064a", + "\u0646\u0627", + "\u0647\u0627", + "\u0647\u0645", # ني، نا، ها، هم + "\u0647\u0646", + "\u0643\u0645", + "\u0643\u0646", # هن، كم، كن + "\u0647\u0645\u0627", + "\u0643\u0645\u0627", + "\u0643\u0645\u0648", # هما، 
كما، كمو + ) + + __suffix_verb_step2a = ( + "\u062a", + "\u0627", + "\u0646", + "\u064a", # ت، ا، ن، ي + "\u0646\u0627", + "\u062a\u0627", + "\u062a\u0646", # نا، تا، تن Past + "\u0627\u0646", + "\u0648\u0646", + "\u064a\u0646", # ان، هن، ين Present + "\u062a\u0645\u0627", # تما + ) + + __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم + + __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو + + __suffix_all_alef_maqsura = "\u0649" # ى + + # Prefixes + __prefix_step1 = ( + "\u0623", # أ + "\u0623\u0623", + "\u0623\u0622", + "\u0623\u0624", + "\u0623\u0627", + "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ + ) + + __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال + + __prefix_step2b = ("\u0641", "\u0648") # ف، و + + __prefix_step3a_noun = ( + "\u0627\u0644", + "\u0644\u0644", # لل، ال + "\u0643\u0627\u0644", + "\u0628\u0627\u0644", # بال، كال + ) + + __prefix_step3b_noun = ( + "\u0628", + "\u0643", + "\u0644", # ب، ك، ل + "\u0628\u0628", + "\u0643\u0643", # بب، كك + ) + + __prefix_step3_verb = ( + "\u0633\u064a", + "\u0633\u062a", + "\u0633\u0646", + "\u0633\u0623", + ) # سي، ست، سن، سأ + + __prefix_step4_verb = ( + "\u064a\u0633\u062a", + "\u0646\u0633\u062a", + "\u062a\u0633\u062a", + ) # يست، نست، تست + + # Suffixes added due to Conjugation Verbs + __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك + + __conjugation_suffix_verb_2 = ( + "\u0646\u064a", + "\u0646\u0627", + "\u0647\u0627", # ني، نا، ها + "\u0647\u0645", + "\u0647\u0646", + "\u0643\u0645", # هم، هن، كم + "\u0643\u0646", # كن + ) + __conjugation_suffix_verb_3 = ( + "\u0647\u0645\u0627", + "\u0643\u0645\u0627", + "\u0643\u0645\u0648", + ) # هما، كما، كمو + + __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي + + __conjugation_suffix_verb_past = ( + "\u0646\u0627", + "\u062a\u0627", + "\u062a\u0646", + ) # نا، تا، تن + + __conjugation_suffix_verb_present = ( + "\u0627\u0646", + "\u0648\u0646", + "\u064a\u0646", + ) # ان، ون، ين + + # Suffixes added due to derivation Names + __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه + + __conjugation_suffix_noun_2 = ( + "\u0646\u0627", + "\u0643\u0645", # نا، كم + "\u0647\u0627", + "\u0647\u0646", + "\u0647\u0645", # ها، هن، هم + ) + + __conjugation_suffix_noun_3 = ( + "\u0643\u0645\u0627", + "\u0647\u0645\u0627", + ) # كما، هما + + # Prefixes added due to derivation Names + __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا + + __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال + + __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل + + # Prepositions letters + __prepositions1 = ("\u0643", "\u0644") # ك، ل + __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك + + is_verb = True + is_noun = True + is_defined = False + + suffixes_verb_step1_success = False + suffix_verb_step2a_success = False + suffix_verb_step2b_success = False + suffix_noun_step2c2_success = False + suffix_noun_step1a_success = False + suffix_noun_step2a_success = False + suffix_noun_step2b_success = False + suffixe_noun_step1b_success = False + prefix_step2a_success = False + prefix_step3a_noun_success = False + prefix_step3b_noun_success = False + + def __normalize_pre(self, token): + """ + :param token: string + :return: normalized token type string + """ + # strip diacritics + token = self.__vocalization.sub("", token) + # strip kasheeda + token = self.__kasheeda.sub("", token) + # strip punctuation marks + token = self.__arabic_punctuation_marks.sub("", token) + return token + + def 
__normalize_post(self, token): + # normalize last hamza + for hamza in self.__last_hamzat: + if token.endswith(hamza): + token = suffix_replace(token, hamza, "\u0621") + break + # normalize other hamzat + token = self.__initial_hamzat.sub("\u0627", token) + token = self.__waw_hamza.sub("\u0648", token) + token = self.__yeh_hamza.sub("\u064a", token) + token = self.__alefat.sub("\u0627", token) + return token + + def __checks_1(self, token): + for prefix in self.__checks1: + if token.startswith(prefix): + if prefix in self.__articles_3len and len(token) > 4: + self.is_noun = True + self.is_verb = False + self.is_defined = True + break + + if prefix in self.__articles_2len and len(token) > 3: + self.is_noun = True + self.is_verb = False + self.is_defined = True + break + + def __checks_2(self, token): + for suffix in self.__checks2: + if token.endswith(suffix): + if suffix == "\u0629" and len(token) > 2: + self.is_noun = True + self.is_verb = False + break + + if suffix == "\u0627\u062a" and len(token) > 3: + self.is_noun = True + self.is_verb = False + break + + def __Suffix_Verb_Step1(self, token): + for suffix in self.__suffix_verb_step1: + if token.endswith(suffix): + if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4: + token = token[:-1] + self.suffixes_verb_step1_success = True + break + + if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5: + token = token[:-2] + self.suffixes_verb_step1_success = True + break + + if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6: + token = token[:-3] + self.suffixes_verb_step1_success = True + break + return token + + def __Suffix_Verb_Step2a(self, token): + for suffix in self.__suffix_verb_step2a: + if token.endswith(suffix) and len(token) > 3: + if suffix == "\u062a" and len(token) >= 4: + token = token[:-1] + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4: + token = token[:-1] + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5: + token = token[:-2] # past + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_present and len(token) > 5: + token = token[:-2] # present + self.suffix_verb_step2a_success = True + break + + if suffix == "\u062a\u0645\u0627" and len(token) >= 6: + token = token[:-3] + self.suffix_verb_step2a_success = True + break + return token + + def __Suffix_Verb_Step2c(self, token): + for suffix in self.__suffix_verb_step2c: + if token.endswith(suffix): + if suffix == "\u062a\u0645\u0648" and len(token) >= 6: + token = token[:-3] + break + + if suffix == "\u0648" and len(token) >= 4: + token = token[:-1] + break + return token + + def __Suffix_Verb_Step2b(self, token): + for suffix in self.__suffix_verb_step2b: + if token.endswith(suffix) and len(token) >= 5: + token = token[:-2] + self.suffix_verb_step2b_success = True + break + return token + + def __Suffix_Noun_Step2c2(self, token): + for suffix in self.__suffix_noun_step2c2: + if token.endswith(suffix) and len(token) >= 3: + token = token[:-1] + self.suffix_noun_step2c2_success = True + break + return token + + def __Suffix_Noun_Step1a(self, token): + for suffix in self.__suffix_noun_step1a: + if token.endswith(suffix): + if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4: + token = token[:-1] + self.suffix_noun_step1a_success = True + break + + if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5: + token = token[:-2] + 
self.suffix_noun_step1a_success = True + break + + if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6: + token = token[:-3] + self.suffix_noun_step1a_success = True + break + return token + + def __Suffix_Noun_Step2a(self, token): + for suffix in self.__suffix_noun_step2a: + if token.endswith(suffix) and len(token) > 4: + token = token[:-1] + self.suffix_noun_step2a_success = True + break + return token + + def __Suffix_Noun_Step2b(self, token): + for suffix in self.__suffix_noun_step2b: + if token.endswith(suffix) and len(token) >= 5: + token = token[:-2] + self.suffix_noun_step2b_success = True + break + return token + + def __Suffix_Noun_Step2c1(self, token): + for suffix in self.__suffix_noun_step2c1: + if token.endswith(suffix) and len(token) >= 4: + token = token[:-1] + break + return token + + def __Suffix_Noun_Step1b(self, token): + for suffix in self.__suffix_noun_step1b: + if token.endswith(suffix) and len(token) > 5: + token = token[:-1] + self.suffixe_noun_step1b_success = True + break + return token + + def __Suffix_Noun_Step3(self, token): + for suffix in self.__suffix_noun_step3: + if token.endswith(suffix) and len(token) >= 3: + token = token[:-1] # ya' nisbiya + break + return token + + def __Suffix_All_alef_maqsura(self, token): + for suffix in self.__suffix_all_alef_maqsura: + if token.endswith(suffix): + token = suffix_replace(token, suffix, "\u064a") + return token + + def __Prefix_Step1(self, token): + for prefix in self.__prefix_step1: + if token.startswith(prefix) and len(token) > 3: + if prefix == "\u0623\u0623": + token = prefix_replace(token, prefix, "\u0623") + break + + elif prefix == "\u0623\u0622": + token = prefix_replace(token, prefix, "\u0622") + break + + elif prefix == "\u0623\u0624": + token = prefix_replace(token, prefix, "\u0624") + break + + elif prefix == "\u0623\u0627": + token = prefix_replace(token, prefix, "\u0627") + break + + elif prefix == "\u0623\u0625": + token = prefix_replace(token, prefix, "\u0625") + break + return token + + def __Prefix_Step2a(self, token): + for prefix in self.__prefix_step2a: + if token.startswith(prefix) and len(token) > 5: + token = token[len(prefix) :] + self.prefix_step2a_success = True + break + return token + + def __Prefix_Step2b(self, token): + for prefix in self.__prefix_step2b: + if token.startswith(prefix) and len(token) > 3: + if token[:2] not in self.__prefixes1: + token = token[len(prefix) :] + break + return token + + def __Prefix_Step3a_Noun(self, token): + for prefix in self.__prefix_step3a_noun: + if token.startswith(prefix): + if prefix in self.__articles_2len and len(token) > 4: + token = token[len(prefix) :] + self.prefix_step3a_noun_success = True + break + if prefix in self.__articles_3len and len(token) > 5: + token = token[len(prefix) :] + break + return token + + def __Prefix_Step3b_Noun(self, token): + for prefix in self.__prefix_step3b_noun: + if token.startswith(prefix): + if len(token) > 3: + if prefix == "\u0628": + token = token[len(prefix) :] + self.prefix_step3b_noun_success = True + break + + if prefix in self.__prepositions2: + token = prefix_replace(token, prefix, prefix[1]) + self.prefix_step3b_noun_success = True + break + + if prefix in self.__prepositions1 and len(token) > 4: + token = token[len(prefix) :] # BUG: cause confusion + self.prefix_step3b_noun_success = True + break + return token + + def __Prefix_Step3_Verb(self, token): + for prefix in self.__prefix_step3_verb: + if token.startswith(prefix) and len(token) > 4: + token = prefix_replace(token, prefix, 
prefix[1]) + break + return token + + def __Prefix_Step4_Verb(self, token): + for prefix in self.__prefix_step4_verb: + if token.startswith(prefix) and len(token) > 4: + token = prefix_replace(token, prefix, "\u0627\u0633\u062a") + self.is_verb = True + self.is_noun = False + break + return token + + def stem(self, word): + """ + Stem an Arabic word and return the stemmed form. + + :param word: string + :return: string + """ + # set initial values + self.is_verb = True + self.is_noun = True + self.is_defined = False + + self.suffix_verb_step2a_success = False + self.suffix_verb_step2b_success = False + self.suffix_noun_step2c2_success = False + self.suffix_noun_step1a_success = False + self.suffix_noun_step2a_success = False + self.suffix_noun_step2b_success = False + self.suffixe_noun_step1b_success = False + self.prefix_step2a_success = False + self.prefix_step3a_noun_success = False + self.prefix_step3b_noun_success = False + + modified_word = word + # guess type and properties + # checks1 + self.__checks_1(modified_word) + # checks2 + self.__checks_2(modified_word) + # Pre_Normalization + modified_word = self.__normalize_pre(modified_word) + # Avoid stopwords + if modified_word in self.stopwords or len(modified_word) <= 2: + return modified_word + # Start stemming + if self.is_verb: + modified_word = self.__Suffix_Verb_Step1(modified_word) + if self.suffixes_verb_step1_success: + modified_word = self.__Suffix_Verb_Step2a(modified_word) + if not self.suffix_verb_step2a_success: + modified_word = self.__Suffix_Verb_Step2c(modified_word) + # or next TODO: How to deal with or next instruction + else: + modified_word = self.__Suffix_Verb_Step2b(modified_word) + if not self.suffix_verb_step2b_success: + modified_word = self.__Suffix_Verb_Step2a(modified_word) + if self.is_noun: + modified_word = self.__Suffix_Noun_Step2c2(modified_word) + if not self.suffix_noun_step2c2_success: + if not self.is_defined: + modified_word = self.__Suffix_Noun_Step1a(modified_word) + # if self.suffix_noun_step1a_success: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + if not self.suffix_noun_step2a_success: + modified_word = self.__Suffix_Noun_Step2b(modified_word) + if ( + not self.suffix_noun_step2b_success + and not self.suffix_noun_step2a_success + ): + modified_word = self.__Suffix_Noun_Step2c1(modified_word) + # or next ? 
todo : how to deal with or next + else: + modified_word = self.__Suffix_Noun_Step1b(modified_word) + if self.suffixe_noun_step1b_success: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + if not self.suffix_noun_step2a_success: + modified_word = self.__Suffix_Noun_Step2b(modified_word) + if ( + not self.suffix_noun_step2b_success + and not self.suffix_noun_step2a_success + ): + modified_word = self.__Suffix_Noun_Step2c1(modified_word) + else: + if not self.is_defined: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + modified_word = self.__Suffix_Noun_Step2b(modified_word) + modified_word = self.__Suffix_Noun_Step3(modified_word) + if not self.is_noun and self.is_verb: + modified_word = self.__Suffix_All_alef_maqsura(modified_word) + + # prefixes + modified_word = self.__Prefix_Step1(modified_word) + modified_word = self.__Prefix_Step2a(modified_word) + if not self.prefix_step2a_success: + modified_word = self.__Prefix_Step2b(modified_word) + modified_word = self.__Prefix_Step3a_Noun(modified_word) + if not self.prefix_step3a_noun_success and self.is_noun: + modified_word = self.__Prefix_Step3b_Noun(modified_word) + else: + if not self.prefix_step3b_noun_success and self.is_verb: + modified_word = self.__Prefix_Step3_Verb(modified_word) + modified_word = self.__Prefix_Step4_Verb(modified_word) + + # post normalization stemming + modified_word = self.__normalize_post(modified_word) + stemmed_word = modified_word + return stemmed_word + + +class DanishStemmer(_ScandinavianStemmer): + + """ + The Danish Snowball stemmer. + + :cvar __vowels: The Danish vowels. + :type __vowels: unicode + :cvar __consonants: The Danish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Danish double consonants. + :type __double_consonants: tuple + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Danish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/danish/stemmer.html + + """ + + # The language's vowels and other important characters are defined. + __vowels = "aeiouy\xE6\xE5\xF8" + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ( + "bb", + "cc", + "dd", + "ff", + "gg", + "hh", + "jj", + "kk", + "ll", + "mm", + "nn", + "pp", + "qq", + "rr", + "ss", + "tt", + "vv", + "ww", + "xx", + "zz", + ) + __s_ending = "abcdfghjklmnoprtvyz\xE5" + + # The different suffixes, divided into the algorithm's steps + # and organized by length, are listed in tuples. + __step1_suffixes = ( + "erendes", + "erende", + "hedens", + "ethed", + "erede", + "heden", + "heder", + "endes", + "ernes", + "erens", + "erets", + "ered", + "ende", + "erne", + "eren", + "erer", + "heds", + "enes", + "eres", + "eret", + "hed", + "ene", + "ere", + "ens", + "ers", + "ets", + "en", + "er", + "es", + "et", + "e", + "s", + ) + __step2_suffixes = ("gd", "dt", "gt", "kt") + __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig") + + def stem(self, word): + """ + Stem a Danish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + # Every word is put into lower case for normalization. + word = word.lower() + + if word in self.stopwords: + return word + + # After this, the required regions are generated + # by the respective helper method. + r1 = self._r1_scandinavian(word, self.__vowels) + + # Then the actual stemming process starts. + # Every new step is explicitly indicated + # according to the descriptions on the Snowball website. + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + if r1.endswith("igst"): + word = word[:-2] + r1 = r1[:-2] + + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == "l\xF8st": + word = word[:-1] + r1 = r1[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + + if r1.endswith(self.__step2_suffixes): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 4: Undouble + for double_cons in self.__double_consonants: + if word.endswith(double_cons) and len(word) > 3: + word = word[:-1] + break + + return word + + +class DutchStemmer(_StandardStemmer): + + """ + The Dutch Snowball stemmer. + + :cvar __vowels: The Dutch vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. + :type __step3b_suffixes: tuple + :note: A detailed description of the Dutch + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/dutch/stemmer.html + + """ + + __vowels = "aeiouy\xE8" + __step1_suffixes = ("heden", "ene", "en", "se", "s") + __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") + + def stem(self, word): + """ + Stem a Dutch word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step2_success = False + + # Vowel accents are removed. + word = ( + word.replace("\xE4", "a") + .replace("\xE1", "a") + .replace("\xEB", "e") + .replace("\xE9", "e") + .replace("\xED", "i") + .replace("\xEF", "i") + .replace("\xF6", "o") + .replace("\xF3", "o") + .replace("\xFC", "u") + .replace("\xFA", "u") + ) + + # An initial 'y', a 'y' after a vowel, + # and an 'i' between self.__vowels is put into upper case. + # As from now these are treated as consonants. + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i - 1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1 :])) + + for i in range(1, len(word) - 1): + if ( + word[i - 1] in self.__vowels + and word[i] == "i" + and word[i + 1] in self.__vowels + ): + word = "".join((word[:i], "I", word[i + 1 :])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. 
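+        # Illustrative example (not part of the original source): for a
+        # word like "ober", _r1r2_standard alone would give r1 == "er";
+        # the loop below shifts R1 to word[3:] == "r" so that at least
+        # three letters precede it, as the Snowball definition requires.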
+ for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i - 1] in self.__vowels: + if 3 > len(word[: i + 1]) > 0: + r1 = word[3:] + elif len(word[: i + 1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "heden": + word = suffix_replace(word, suffix, "heid") + r1 = suffix_replace(r1, suffix, "heid") + if r2.endswith("heden"): + r2 = suffix_replace(r2, suffix, "heid") + + elif ( + suffix in ("ene", "en") + and not word.endswith("heden") + and word[-len(suffix) - 1] not in self.__vowels + and word[-len(suffix) - 3 : -len(suffix)] != "gem" + ): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif ( + suffix in ("se", "s") + and word[-len(suffix) - 1] not in self.__vowels + and word[-len(suffix) - 1] != "j" + ): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 2 + if r1.endswith("e") and word[-2] not in self.__vowels: + step2_success = True + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3a + if r2.endswith("heid") and word[-5] != "c": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + if ( + r1.endswith("en") + and word[-3] not in self.__vowels + and word[-5:-2] != "gem" + ): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3b: Derivational suffixes + for suffix in self.__step3b_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ing"): + word = word[:-3] + r2 = r2[:-3] + + if r2.endswith("ig") and word[-3] != "e": + word = word[:-2] + else: + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "ig" and word[-3] != "e": + word = word[:-2] + + elif suffix == "lijk": + word = word[:-4] + r1 = r1[:-4] + + if r1.endswith("e") and word[-2] not in self.__vowels: + word = word[:-1] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "baar": + word = word[:-4] + + elif suffix == "bar" and step2_success: + word = word[:-3] + break + + # STEP 4: Undouble vowel + if len(word) >= 4: + if word[-1] not in self.__vowels and word[-1] != "I": + if word[-3:-1] in ("aa", "ee", "oo", "uu"): + if word[-4] not in self.__vowels: + word = "".join((word[:-3], word[-3], word[-1])) + + # All occurrences of 'I' and 'Y' are put back into lower case. + word = word.replace("I", "i").replace("Y", "y") + + return word + + +class EnglishStemmer(_StandardStemmer): + + """ + The English Snowball stemmer. + + :cvar __vowels: The English vowels. + :type __vowels: unicode + :cvar __double_consonants: The English double consonants. + :type __double_consonants: tuple + :cvar __li_ending: Letters that may directly appear before a word final 'li'. + :type __li_ending: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. + :type __step1a_suffixes: tuple + :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. + :type __step1b_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
+ :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __special_words: A dictionary containing words + which have to be stemmed specially. + :type __special_words: dict + :note: A detailed description of the English + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/english/stemmer.html + """ + + __vowels = "aeiouy" + __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") + __li_ending = "cdeghkmnrt" + __step0_suffixes = ("'s'", "'s", "'") + __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") + __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") + __step2_suffixes = ( + "ization", + "ational", + "fulness", + "ousness", + "iveness", + "tional", + "biliti", + "lessli", + "entli", + "ation", + "alism", + "aliti", + "ousli", + "iviti", + "fulli", + "enci", + "anci", + "abli", + "izer", + "ator", + "alli", + "bli", + "ogi", + "li", + ) + __step3_suffixes = ( + "ational", + "tional", + "alize", + "icate", + "iciti", + "ative", + "ical", + "ness", + "ful", + ) + __step4_suffixes = ( + "ement", + "ance", + "ence", + "able", + "ible", + "ment", + "ant", + "ent", + "ism", + "ate", + "iti", + "ous", + "ive", + "ize", + "ion", + "al", + "er", + "ic", + ) + __step5_suffixes = ("e", "l") + __special_words = { + "skis": "ski", + "skies": "sky", + "dying": "die", + "lying": "lie", + "tying": "tie", + "idly": "idl", + "gently": "gentl", + "ugly": "ugli", + "early": "earli", + "only": "onli", + "singly": "singl", + "sky": "sky", + "news": "news", + "howe": "howe", + "atlas": "atlas", + "cosmos": "cosmos", + "bias": "bias", + "andes": "andes", + "inning": "inning", + "innings": "inning", + "outing": "outing", + "outings": "outing", + "canning": "canning", + "cannings": "canning", + "herring": "herring", + "herrings": "herring", + "earring": "earring", + "earrings": "earring", + "proceed": "proceed", + "proceeds": "proceed", + "proceeded": "proceed", + "proceeding": "proceed", + "exceed": "exceed", + "exceeds": "exceed", + "exceeded": "exceed", + "exceeding": "exceed", + "succeed": "succeed", + "succeeds": "succeed", + "succeeded": "succeed", + "succeeding": "succeed", + } + + def stem(self, word): + + """ + Stem an English word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords or len(word) <= 2: + return word + + elif word in self.__special_words: + return self.__special_words[word] + + # Map the different apostrophe characters to a single consistent one + word = ( + word.replace("\u2019", "\x27") + .replace("\u2018", "\x27") + .replace("\u201B", "\x27") + ) + + if word.startswith("\x27"): + word = word[1:] + + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i - 1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1 :])) + + step1a_vowel_found = False + step1b_vowel_found = False + + r1 = "" + r2 = "" + + if word.startswith(("gener", "commun", "arsen")): + if word.startswith(("gener", "arsen")): + r1 = word[5:] + else: + r1 = word[6:] + + for i in range(1, len(r1)): + if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels: + r2 = r1[i + 1 :] + break + else: + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # STEP 0 + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 1a + for suffix in self.__step1a_suffixes: + if word.endswith(suffix): + + if suffix == "sses": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("ied", "ies"): + if len(word[: -len(suffix)]) > 1: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix == "s": + for letter in word[:-2]: + if letter in self.__vowels: + step1a_vowel_found = True + break + + if step1a_vowel_found: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + break + + # STEP 1b + for suffix in self.__step1b_suffixes: + if word.endswith(suffix): + if suffix in ("eed", "eedly"): + + if r1.endswith(suffix): + word = suffix_replace(word, suffix, "ee") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ee") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ee") + else: + r2 = "" + else: + for letter in word[: -len(suffix)]: + if letter in self.__vowels: + step1b_vowel_found = True + break + + if step1b_vowel_found: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + + if word.endswith(("at", "bl", "iz")): + word = "".join((word, "e")) + r1 = "".join((r1, "e")) + + if len(word) > 5 or len(r1) >= 3: + r2 = "".join((r2, "e")) + + elif word.endswith(self.__double_consonants): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif ( + r1 == "" + and len(word) >= 3 + and word[-1] not in self.__vowels + and word[-1] not in "wxY" + and word[-2] in self.__vowels + and word[-3] not in self.__vowels + ) or ( + r1 == "" + and len(word) == 2 + and word[0] in self.__vowels + and word[1] not in self.__vowels + ): + + word = "".join((word, "e")) + + if len(r1) > 0: + r1 = "".join((r1, "e")) + + if len(r2) > 0: + r2 = "".join((r2, "e")) + break + + # STEP 1c + if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels: + word = "".join((word[:-1], "i")) + if len(r1) >= 1: + r1 = "".join((r1[:-1], "i")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = "".join((r2[:-1], "i")) + else: + r2 = "" + + # STEP 2 + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("enci", "anci", "abli"): + word = "".join((word[:-1], "e")) + + if len(r1) >= 1: + r1 = "".join((r1[:-1], 
"e")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = "".join((r2[:-1], "e")) + else: + r2 = "" + + elif suffix == "entli": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("izer", "ization"): + word = suffix_replace(word, suffix, "ize") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ize") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ize") + else: + r2 = "" + + elif suffix in ("ational", "ation", "ator"): + word = suffix_replace(word, suffix, "ate") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ate") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ate") + else: + r2 = "e" + + elif suffix in ("alism", "aliti", "alli"): + word = suffix_replace(word, suffix, "al") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "al") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "al") + else: + r2 = "" + + elif suffix == "fulness": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + elif suffix in ("ousli", "ousness"): + word = suffix_replace(word, suffix, "ous") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ous") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ous") + else: + r2 = "" + + elif suffix in ("iveness", "iviti"): + word = suffix_replace(word, suffix, "ive") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ive") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ive") + else: + r2 = "e" + + elif suffix in ("biliti", "bli"): + word = suffix_replace(word, suffix, "ble") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ble") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ble") + else: + r2 = "" + + elif suffix == "ogi" and word[-4] == "l": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix in ("fulli", "lessli"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "li" and word[-3] in self.__li_ending: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ational": + word = suffix_replace(word, suffix, "ate") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ate") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ate") + else: + r2 = "" + + elif suffix == "alize": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + + elif suffix in ("icate", "iciti", "ical"): + word = suffix_replace(word, suffix, "ic") + + if len(r1) >= len(suffix): + r1 = suffix_replace(r1, suffix, "ic") + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = suffix_replace(r2, suffix, "ic") + else: + r2 = "" + + elif suffix in ("ful", "ness"): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + + elif suffix == "ative" and r2.endswith(suffix): + word = word[:-5] + r1 = r1[:-5] + r2 = r2[:-5] + break + + # STEP 4 + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if r2.endswith(suffix): + if suffix == "ion": + if word[-4] in "st": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 5 + if r2.endswith("l") and word[-2] == "l": + word = word[:-1] + elif 
r2.endswith("e"): + word = word[:-1] + elif r1.endswith("e"): + if len(word) >= 4 and ( + word[-2] in self.__vowels + or word[-2] in "wxY" + or word[-3] not in self.__vowels + or word[-4] in self.__vowels + ): + word = word[:-1] + + word = word.replace("Y", "y") + + return word + + +class FinnishStemmer(_StandardStemmer): + + """ + The Finnish Snowball stemmer. + + :cvar __vowels: The Finnish vowels. + :type __vowels: unicode + :cvar __restricted_vowels: A subset of the Finnish vowels. + :type __restricted_vowels: unicode + :cvar __long_vowels: The Finnish vowels in their long forms. + :type __long_vowels: tuple + :cvar __consonants: The Finnish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Finnish double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the Finnish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/finnish/stemmer.html + """ + + __vowels = "aeiouy\xE4\xF6" + __restricted_vowels = "aeiou\xE4\xF6" + __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6") + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ( + "bb", + "cc", + "dd", + "ff", + "gg", + "hh", + "jj", + "kk", + "ll", + "mm", + "nn", + "pp", + "qq", + "rr", + "ss", + "tt", + "vv", + "ww", + "xx", + "zz", + ) + __step1_suffixes = ( + "kaan", + "k\xE4\xE4n", + "sti", + "kin", + "han", + "h\xE4n", + "ko", + "k\xF6", + "pa", + "p\xE4", + ) + __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en") + __step3_suffixes = ( + "siin", + "tten", + "seen", + "han", + "hen", + "hin", + "hon", + "h\xE4n", + "h\xF6n", + "den", + "tta", + "tt\xE4", + "ssa", + "ss\xE4", + "sta", + "st\xE4", + "lla", + "ll\xE4", + "lta", + "lt\xE4", + "lle", + "ksi", + "ine", + "ta", + "t\xE4", + "na", + "n\xE4", + "a", + "\xE4", + "n", + ) + __step4_suffixes = ( + "impi", + "impa", + "imp\xE4", + "immi", + "imma", + "imm\xE4", + "mpi", + "mpa", + "mp\xE4", + "mmi", + "mma", + "mm\xE4", + "eja", + "ej\xE4", + ) + + def stem(self, word): + """ + Stem a Finnish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step3_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # STEP 1: Particles etc. 
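+        # Illustrative example (not part of the original source): for
+        # "kirjakin" ("kirja" plus the particle "kin"), R1 is "jakin" and
+        # the letter before the suffix is "a", so "kin" is stripped and
+        # "kirja" remains.  The suffix "sti" is only removed when it also
+        # occurs in R2.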
+ for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "sti": + if suffix in r2: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6": + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 2: Possessives + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "si": + if word[-3] != "k": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ni": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + if word.endswith("kse"): + word = suffix_replace(word, "kse", "ksi") + + if r1.endswith("kse"): + r1 = suffix_replace(r1, "kse", "ksi") + + if r2.endswith("kse"): + r2 = suffix_replace(r2, "kse", "ksi") + + elif suffix == "an": + if word[-4:-2] in ("ta", "na") or word[-5:-2] in ( + "ssa", + "sta", + "lla", + "lta", + ): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "\xE4n": + if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in ( + "ss\xE4", + "st\xE4", + "ll\xE4", + "lt\xE4", + ): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "en": + if word[-5:-2] in ("lle", "ine"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + break + + # STEP 3: Cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"): + if ( + (suffix == "han" and word[-4] == "a") + or (suffix == "hen" and word[-4] == "e") + or (suffix == "hin" and word[-4] == "i") + or (suffix == "hon" and word[-4] == "o") + or (suffix == "h\xE4n" and word[-4] == "\xE4") + or (suffix == "h\xF6n" and word[-4] == "\xF6") + ): + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix in ("siin", "den", "tten"): + if ( + word[-len(suffix) - 1] == "i" + and word[-len(suffix) - 2] in self.__restricted_vowels + ): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + step3_success = True + else: + continue + + elif suffix == "seen": + if word[-6:-4] in self.__long_vowels: + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + step3_success = True + else: + continue + + elif suffix in ("a", "\xE4"): + if word[-2] in self.__vowels and word[-3] in self.__consonants: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + elif suffix in ("tta", "tt\xE4"): + if word[-4] == "e": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix == "n": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + if word[-2:] == "ie" or word[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + step3_success = True + break + + # STEP 4: Other endings + for suffix in self.__step4_suffixes: + if r2.endswith(suffix): + if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"): + if word[-5:-3] != "po": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 5: Plurals + if step3_success and len(r1) >= 1 and r1[-1] in "ij": + word = word[:-1] + r1 = r1[:-1] + + elif ( + not step3_success + and len(r1) >= 2 + and r1[-1] == "t" + and r1[-2] in self.__vowels + ): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + if r2.endswith("imma"): + word = word[:-4] + r1 = r1[:-4] + elif r2.endswith("mma") and r2[-5:-3] 
!= "po": + word = word[:-3] + r1 = r1[:-3] + + # STEP 6: Tidying up + if r1[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + + if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei": + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith(("oj", "uj")): + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith("jo"): + word = word[:-1] + r1 = r1[:-1] + + # If the word ends with a double consonant + # followed by zero or more vowels, the last consonant is removed. + for i in range(1, len(word)): + if word[-i] in self.__vowels: + continue + else: + if i == 1: + if word[-i - 1 :] in self.__double_consonants: + word = word[:-1] + else: + if word[-i - 1 : -i + 1] in self.__double_consonants: + word = "".join((word[:-i], word[-i + 1 :])) + break + + return word + + +class FrenchStemmer(_StandardStemmer): + + """ + The French Snowball stemmer. + + :cvar __vowels: The French vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the French + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/french/stemmer.html + """ + + __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" + __step1_suffixes = ( + "issements", + "issement", + "atrices", + "atrice", + "ateurs", + "ations", + "logies", + "usions", + "utions", + "ements", + "amment", + "emment", + "ances", + "iqUes", + "ismes", + "ables", + "istes", + "ateur", + "ation", + "logie", + "usion", + "ution", + "ences", + "ement", + "euses", + "ments", + "ance", + "iqUe", + "isme", + "able", + "iste", + "ence", + "it\xE9s", + "ives", + "eaux", + "euse", + "ment", + "eux", + "it\xE9", + "ive", + "ifs", + "aux", + "if", + ) + __step2a_suffixes = ( + "issaIent", + "issantes", + "iraIent", + "issante", + "issants", + "issions", + "irions", + "issais", + "issait", + "issant", + "issent", + "issiez", + "issons", + "irais", + "irait", + "irent", + "iriez", + "irons", + "iront", + "isses", + "issez", + "\xEEmes", + "\xEEtes", + "irai", + "iras", + "irez", + "isse", + "ies", + "ira", + "\xEEt", + "ie", + "ir", + "is", + "it", + "i", + ) + __step2b_suffixes = ( + "eraIent", + "assions", + "erions", + "assent", + "assiez", + "\xE8rent", + "erais", + "erait", + "eriez", + "erons", + "eront", + "aIent", + "antes", + "asses", + "ions", + "erai", + "eras", + "erez", + "\xE2mes", + "\xE2tes", + "ante", + "ants", + "asse", + "\xE9es", + "era", + "iez", + "ais", + "ait", + "ant", + "\xE9e", + "\xE9s", + "er", + "ez", + "\xE2t", + "ai", + "as", + "\xE9", + "a", + ) + __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB") + + def stem(self, word): + """ + Stem a French word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step1_success = False + rv_ending_found = False + step2a_success = False + step2b_success = False + + # Every occurrence of 'u' after 'q' is put into upper case. 
+ for i in range(1, len(word)): + if word[i - 1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. + # Every occurrence of 'y' preceded or + # followed by a vowel is also put into upper case. + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1 :])) + + if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: + if word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1 :])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self.__rv_french(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "eaux": + word = word[:-1] + step1_success = True + + elif suffix in ("euse", "euses"): + if suffix in r2: + word = word[: -len(suffix)] + step1_success = True + + elif suffix in r1: + word = suffix_replace(word, suffix, "eux") + step1_success = True + + elif suffix in ("ement", "ements") and suffix in rv: + word = word[: -len(suffix)] + step1_success = True + + if word[-2:] == "iv" and "iv" in r2: + word = word[:-2] + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + elif word[-3:] == "eus": + if "eus" in r2: + word = word[:-3] + elif "eus" in r1: + word = "".join((word[:-1], "x")) + + elif word[-3:] in ("abl", "iqU"): + if "abl" in r2 or "iqU" in r2: + word = word[:-3] + + elif word[-3:] in ("i\xE8r", "I\xE8r"): + if "i\xE8r" in rv or "I\xE8r" in rv: + word = "".join((word[:-3], "i")) + + elif suffix == "amment" and suffix in rv: + word = suffix_replace(word, "amment", "ant") + rv = suffix_replace(rv, "amment", "ant") + rv_ending_found = True + + elif suffix == "emment" and suffix in rv: + word = suffix_replace(word, "emment", "ent") + rv_ending_found = True + + elif ( + suffix in ("ment", "ments") + and suffix in rv + and not rv.startswith(suffix) + and rv[rv.rindex(suffix) - 1] in self.__vowels + ): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + rv_ending_found = True + + elif suffix == "aux" and suffix in r1: + word = "".join((word[:-2], "l")) + step1_success = True + + elif ( + suffix in ("issement", "issements") + and suffix in r1 + and word[-len(suffix) - 1] not in self.__vowels + ): + word = word[: -len(suffix)] + step1_success = True + + elif ( + suffix + in ( + "ance", + "iqUe", + "isme", + "able", + "iste", + "eux", + "ances", + "iqUes", + "ismes", + "ables", + "istes", + ) + and suffix in r2 + ): + word = word[: -len(suffix)] + step1_success = True + + elif ( + suffix + in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") + and suffix in r2 + ): + word = word[: -len(suffix)] + step1_success = True + + if word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif suffix in ("logie", "logies") and suffix in r2: + word = suffix_replace(word, suffix, "log") + step1_success = True + + elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2: + word = suffix_replace(word, suffix, "u") + step1_success = True + + elif suffix in ("ence", "ences") and suffix in r2: + word = suffix_replace(word, suffix, "ent") + step1_success = True + + elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: + word = word[: -len(suffix)] + step1_success = True + + if word[-4:] == "abil": + if "abil" in r2: + word = 
word[:-4] + else: + word = "".join((word[:-2], "l")) + + elif word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif word[-2:] == "iv": + if "iv" in r2: + word = word[:-2] + + elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2: + word = word[: -len(suffix)] + step1_success = True + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + if word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + break + + # STEP 2a: Verb suffixes beginning 'i' + if not step1_success or rv_ending_found: + for suffix in self.__step2a_suffixes: + if word.endswith(suffix): + if ( + suffix in rv + and len(rv) > len(suffix) + and rv[rv.rindex(suffix) - 1] not in self.__vowels + ): + word = word[: -len(suffix)] + step2a_success = True + break + + # STEP 2b: Other verb suffixes + if not step2a_success: + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + if suffix == "ions" and "ions" in r2: + word = word[:-4] + step2b_success = True + + elif suffix in ( + "eraIent", + "erions", + "\xE8rent", + "erais", + "erait", + "eriez", + "erons", + "eront", + "erai", + "eras", + "erez", + "\xE9es", + "era", + "iez", + "\xE9e", + "\xE9s", + "er", + "ez", + "\xE9", + ): + word = word[: -len(suffix)] + step2b_success = True + + elif suffix in ( + "assions", + "assent", + "assiez", + "aIent", + "antes", + "asses", + "\xE2mes", + "\xE2tes", + "ante", + "ants", + "asse", + "ais", + "ait", + "ant", + "\xE2t", + "ai", + "as", + "a", + ): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + step2b_success = True + if rv.endswith("e"): + word = word[:-1] + break + + # STEP 3 + if step1_success or step2a_success or step2b_success: + if word[-1] == "Y": + word = "".join((word[:-1], "i")) + elif word[-1] == "\xE7": + word = "".join((word[:-1], "c")) + + # STEP 4: Residual suffixes + else: + if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s": + word = word[:-1] + + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if suffix in rv: + if suffix == "ion" and suffix in r2 and rv[-4] in "st": + word = word[:-3] + + elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"): + word = suffix_replace(word, suffix, "i") + + elif suffix == "e": + word = word[:-1] + + elif suffix == "\xEB" and word[-3:-1] == "gu": + word = word[:-1] + break + + # STEP 5: Undouble + if word.endswith(("enn", "onn", "ett", "ell", "eill")): + word = word[:-1] + + # STEP 6: Un-accent + for i in range(1, len(word)): + if word[-i] not in self.__vowels: + i += 1 + else: + if i != 1 and word[-i] in ("\xE9", "\xE8"): + word = "".join((word[:-i], "e", word[-i + 1 :])) + break + + word = word.replace("I", "i").replace("U", "u").replace("Y", "y") + + return word + + def __rv_french(self, word, vowels): + """ + Return the region RV that is used by the French stemmer. + + If the word begins with two vowels, RV is the region after + the third letter. Otherwise, it is the region after the first + vowel not at the beginning of the word, or the end of the word + if these positions cannot be found. (Exceptionally, u'par', + u'col' or u'tap' at the beginning of a word is also taken to + define RV as the region to their right.) + + :param word: The French word whose region RV is determined. + :type word: str or unicode + :param vowels: The French vowels that are used to determine + the region RV. + :type vowels: unicode + :return: the region RV for the respective French word. 
+ :rtype: unicode + :note: This helper method is invoked by the stem method of + the subclass FrenchStemmer. It is not to be invoked directly! + + """ + rv = "" + if len(word) >= 2: + if word.startswith(("par", "col", "tap")) or ( + word[0] in vowels and word[1] in vowels + ): + rv = word[3:] + else: + for i in range(1, len(word)): + if word[i] in vowels: + rv = word[i + 1 :] + break + + return rv + + +class GermanStemmer(_StandardStemmer): + + """ + The German Snowball stemmer. + + :cvar __vowels: The German vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __st_ending: Letter that may directly appear before a word final 'st'. + :type __st_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the German + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/german/stemmer.html + + """ + + __vowels = "aeiouy\xE4\xF6\xFC" + __s_ending = "bdfghklmnrt" + __st_ending = "bdfghklmnt" + + __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") + __step2_suffixes = ("est", "en", "er", "st") + __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") + + def stem(self, word): + """ + Stem a German word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + word = word.replace("\xDF", "ss") + + # Every occurrence of 'u' and 'y' + # between vowels is put into upper case. + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + elif word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1 :])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. 
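+        # Illustrative example (not part of the original source): for
+        # "opern", _r1r2_standard alone would give r1 == "ern"; the loop
+        # below shifts R1 to word[3:] == "rn" so that at least three
+        # letters precede it.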
+ for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i - 1] in self.__vowels: + if 3 > len(word[: i + 1]) > 0: + r1 = word[3:] + elif len(word[: i + 1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if ( + suffix in ("en", "es", "e") + and word[-len(suffix) - 4 : -len(suffix)] == "niss" + ): + word = word[: -len(suffix) - 1] + r1 = r1[: -len(suffix) - 1] + r2 = r2[: -len(suffix) - 1] + + elif suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "st": + if word[-3] in self.__st_ending and len(word[:-3]) >= 3: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + break + + # STEP 3: Derivational suffixes + for suffix in self.__step3_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ung"): + if ( + "ig" in r2[-len(suffix) - 2 : -len(suffix)] + and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2] + ): + word = word[: -len(suffix) - 2] + else: + word = word[: -len(suffix)] + + elif ( + suffix in ("ig", "ik", "isch") + and "e" not in r2[-len(suffix) - 1 : -len(suffix)] + ): + word = word[: -len(suffix)] + + elif suffix in ("lich", "heit"): + if ( + "er" in r1[-len(suffix) - 2 : -len(suffix)] + or "en" in r1[-len(suffix) - 2 : -len(suffix)] + ): + word = word[: -len(suffix) - 2] + else: + word = word[: -len(suffix)] + + elif suffix == "keit": + if "lich" in r2[-len(suffix) - 4 : -len(suffix)]: + word = word[: -len(suffix) - 4] + + elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]: + word = word[: -len(suffix) - 2] + else: + word = word[: -len(suffix)] + break + + # Umlaut accents are removed and + # 'u' and 'y' are put back into lower case. + word = ( + word.replace("\xE4", "a") + .replace("\xF6", "o") + .replace("\xFC", "u") + .replace("U", "u") + .replace("Y", "y") + ) + + return word + + +class HungarianStemmer(_LanguageSpecificStemmer): + + """ + The Hungarian Snowball stemmer. + + :cvar __vowels: The Hungarian vowels. + :type __vowels: unicode + :cvar __digraphs: The Hungarian digraphs. + :type __digraphs: tuple + :cvar __double_consonants: The Hungarian double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. + :type __step6_suffixes: tuple + :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. + :type __step7_suffixes: tuple + :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. + :type __step8_suffixes: tuple + :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. 
+ :type __step9_suffixes: tuple + :note: A detailed description of the Hungarian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/hungarian/stemmer.html + + """ + + __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB" + __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") + __double_consonants = ( + "bb", + "cc", + "ccs", + "dd", + "ff", + "gg", + "ggy", + "jj", + "kk", + "ll", + "lly", + "mm", + "nn", + "nny", + "pp", + "rr", + "ss", + "ssz", + "tt", + "tty", + "vv", + "zz", + "zzs", + ) + + __step1_suffixes = ("al", "el") + __step2_suffixes = ( + "k\xE9ppen", + "onk\xE9nt", + "enk\xE9nt", + "ank\xE9nt", + "k\xE9pp", + "k\xE9nt", + "ban", + "ben", + "nak", + "nek", + "val", + "vel", + "t\xF3l", + "t\xF5l", + "r\xF3l", + "r\xF5l", + "b\xF3l", + "b\xF5l", + "hoz", + "hez", + "h\xF6z", + "n\xE1l", + "n\xE9l", + "\xE9rt", + "kor", + "ba", + "be", + "ra", + "re", + "ig", + "at", + "et", + "ot", + "\xF6t", + "ul", + "\xFCl", + "v\xE1", + "v\xE9", + "en", + "on", + "an", + "\xF6n", + "n", + "t", + ) + __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n") + __step4_suffixes = ( + "astul", + "est\xFCl", + "\xE1stul", + "\xE9st\xFCl", + "stul", + "st\xFCl", + ) + __step5_suffixes = ("\xE1", "\xE9") + __step6_suffixes = ( + "ok\xE9", + "\xF6k\xE9", + "ak\xE9", + "ek\xE9", + "\xE1k\xE9", + "\xE1\xE9i", + "\xE9k\xE9", + "\xE9\xE9i", + "k\xE9", + "\xE9i", + "\xE9\xE9", + "\xE9", + ) + __step7_suffixes = ( + "\xE1juk", + "\xE9j\xFCk", + "\xFCnk", + "unk", + "juk", + "j\xFCk", + "\xE1nk", + "\xE9nk", + "nk", + "uk", + "\xFCk", + "em", + "om", + "am", + "od", + "ed", + "ad", + "\xF6d", + "ja", + "je", + "\xE1m", + "\xE1d", + "\xE9m", + "\xE9d", + "m", + "d", + "a", + "e", + "o", + "\xE1", + "\xE9", + ) + __step8_suffixes = ( + "jaitok", + "jeitek", + "jaink", + "jeink", + "aitok", + "eitek", + "\xE1itok", + "\xE9itek", + "jaim", + "jeim", + "jaid", + "jeid", + "eink", + "aink", + "itek", + "jeik", + "jaik", + "\xE1ink", + "\xE9ink", + "aim", + "eim", + "aid", + "eid", + "jai", + "jei", + "ink", + "aik", + "eik", + "\xE1im", + "\xE1id", + "\xE1ik", + "\xE9im", + "\xE9id", + "\xE9ik", + "im", + "id", + "ai", + "ei", + "ik", + "\xE1i", + "\xE9i", + "i", + ) + __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k") + + def stem(self, word): + """ + Stem an Hungarian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) + + # STEP 1: Remove instrumental case + if r1.endswith(self.__step1_suffixes): + for double_cons in self.__double_consonants: + if word[-2 - len(double_cons) : -2] == double_cons: + word = "".join((word[:-4], word[-3])) + + if r1[-2 - len(double_cons) : -2] == double_cons: + r1 = "".join((r1[:-4], r1[-3])) + break + + # STEP 2: Remove frequent cases + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + + if r1.endswith("\xE1"): + word = "".join((word[:-1], "a")) + r1 = suffix_replace(r1, "\xE1", "a") + + elif r1.endswith("\xE9"): + word = "".join((word[:-1], "e")) + r1 = suffix_replace(r1, "\xE9", "e") + break + + # STEP 3: Remove special cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == "\xE9n": + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + else: + word = suffix_replace(word, suffix, "a") + r1 = suffix_replace(r1, suffix, "a") + break + + # STEP 4: Remove other cases + for suffix in self.__step4_suffixes: + if r1.endswith(suffix): + if suffix == "\xE1stul": + word = suffix_replace(word, suffix, "a") + r1 = suffix_replace(r1, suffix, "a") + + elif suffix == "\xE9st\xFCl": + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 5: Remove factive case + for suffix in self.__step5_suffixes: + if r1.endswith(suffix): + for double_cons in self.__double_consonants: + if word[-1 - len(double_cons) : -1] == double_cons: + word = "".join((word[:-3], word[-2])) + + if r1[-1 - len(double_cons) : -1] == double_cons: + r1 = "".join((r1[:-3], r1[-2])) + break + + # STEP 6: Remove owned + for suffix in self.__step6_suffixes: + if r1.endswith(suffix): + if suffix in ("\xE1k\xE9", "\xE1\xE9i"): + word = suffix_replace(word, suffix, "a") + r1 = suffix_replace(r1, suffix, "a") + + elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"): + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 7: Remove singular owner suffixes + for suffix in self.__step7_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"): + word = suffix_replace(word, suffix, "a") + r1 = suffix_replace(r1, suffix, "a") + + elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"): + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 8: Remove plural owner suffixes + for suffix in self.__step8_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in ( + "\xE1im", + "\xE1id", + "\xE1i", + "\xE1ink", + "\xE1itok", + "\xE1ik", + ): + word = suffix_replace(word, suffix, "a") + r1 = suffix_replace(r1, suffix, "a") + + elif suffix in ( + "\xE9im", + "\xE9id", + "\xE9i", + "\xE9ink", + "\xE9itek", + "\xE9ik", + ): + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 9: Remove plural suffixes + for suffix in self.__step9_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if 
suffix == "\xE1k": + word = suffix_replace(word, suffix, "a") + elif suffix == "\xE9k": + word = suffix_replace(word, suffix, "e") + else: + word = word[: -len(suffix)] + break + + return word + + def __r1_hungarian(self, word, vowels, digraphs): + """ + Return the region R1 that is used by the Hungarian stemmer. + + If the word begins with a vowel, R1 is defined as the region + after the first consonant or digraph (= two letters stand for + one phoneme) in the word. If the word begins with a consonant, + it is defined as the region after the first vowel in the word. + If the word does not contain both a vowel and consonant, R1 + is the null region at the end of the word. + + :param word: The Hungarian word whose region R1 is determined. + :type word: str or unicode + :param vowels: The Hungarian vowels that are used to determine + the region R1. + :type vowels: unicode + :param digraphs: The digraphs that are used to determine the + region R1. + :type digraphs: tuple + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + HungarianStemmer. It is not to be invoked directly! + + """ + r1 = "" + if word[0] in vowels: + for digraph in digraphs: + if digraph in word[1:]: + r1 = word[word.index(digraph[-1]) + 1 :] + return r1 + + for i in range(1, len(word)): + if word[i] not in vowels: + r1 = word[i + 1 :] + break + else: + for i in range(1, len(word)): + if word[i] in vowels: + r1 = word[i + 1 :] + break + + return r1 + + +class ItalianStemmer(_StandardStemmer): + + """ + The Italian Snowball stemmer. + + :cvar __vowels: The Italian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :note: A detailed description of the Italian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/italian/stemmer.html + + """ + + __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9" + __step0_suffixes = ( + "gliela", + "gliele", + "glieli", + "glielo", + "gliene", + "sene", + "mela", + "mele", + "meli", + "melo", + "mene", + "tela", + "tele", + "teli", + "telo", + "tene", + "cela", + "cele", + "celi", + "celo", + "cene", + "vela", + "vele", + "veli", + "velo", + "vene", + "gli", + "ci", + "la", + "le", + "li", + "lo", + "mi", + "ne", + "si", + "ti", + "vi", + ) + __step1_suffixes = ( + "atrice", + "atrici", + "azione", + "azioni", + "uzione", + "uzioni", + "usione", + "usioni", + "amento", + "amenti", + "imento", + "imenti", + "amente", + "abile", + "abili", + "ibile", + "ibili", + "mente", + "atore", + "atori", + "logia", + "logie", + "anza", + "anze", + "iche", + "ichi", + "ismo", + "ismi", + "ista", + "iste", + "isti", + "ist\xE0", + "ist\xE8", + "ist\xEC", + "ante", + "anti", + "enza", + "enze", + "ico", + "ici", + "ica", + "ice", + "oso", + "osi", + "osa", + "ose", + "it\xE0", + "ivo", + "ivi", + "iva", + "ive", + ) + __step2_suffixes = ( + "erebbero", + "irebbero", + "assero", + "assimo", + "eranno", + "erebbe", + "eremmo", + "ereste", + "eresti", + "essero", + "iranno", + "irebbe", + "iremmo", + "ireste", + "iresti", + "iscano", + "iscono", + "issero", + "arono", + "avamo", + "avano", + "avate", + "eremo", + "erete", + "erono", + "evamo", + "evano", + "evate", + "iremo", + "irete", + "irono", + "ivamo", + "ivano", + "ivate", + "ammo", + "ando", + "asse", + "assi", + "emmo", + "enda", + "ende", + "endi", + "endo", + "erai", + "erei", + "Yamo", + "iamo", + "immo", + "irai", + "irei", + "isca", + "isce", + "isci", + "isco", + "ano", + "are", + "ata", + "ate", + "ati", + "ato", + "ava", + "avi", + "avo", + "er\xE0", + "ere", + "er\xF2", + "ete", + "eva", + "evi", + "evo", + "ir\xE0", + "ire", + "ir\xF2", + "ita", + "ite", + "iti", + "ito", + "iva", + "ivi", + "ivo", + "ono", + "uta", + "ute", + "uti", + "uto", + "ar", + "ir", + ) + + def stem(self, word): + """ + Stem an Italian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step1_success = False + + # All acute accents are replaced by grave accents. + word = ( + word.replace("\xE1", "\xE0") + .replace("\xE9", "\xE8") + .replace("\xED", "\xEC") + .replace("\xF3", "\xF2") + .replace("\xFA", "\xF9") + ) + + # Every occurrence of 'u' after 'q' + # is put into upper case. + for i in range(1, len(word)): + if word[i - 1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. 
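+        # (The temporary capitals mark these letters as consonants for
+        #  the region computations below, since __vowels contains only
+        #  lower-case characters; they are lowered again at the end of
+        #  this method.)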
+ for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1 :])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if rv.endswith(suffix): + if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"): + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"): + word = suffix_replace(word, suffix, "e") + r1 = suffix_replace(r1, suffix, "e") + r2 = suffix_replace(r2, suffix, "e") + rv = suffix_replace(rv, suffix, "e") + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith( + suffix + ): + step1_success = True + word = word[:-6] + rv = rv[:-6] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ("azione", "azioni", "atore", "atori"): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("logia", "logie"): + word = word[:-2] + rv = word[:-2] + + elif suffix in ("uzione", "uzioni", "usione", "usioni"): + word = word[:-5] + rv = rv[:-5] + + elif suffix in ("enza", "enze"): + word = suffix_replace(word, suffix, "te") + rv = suffix_replace(rv, suffix, "te") + + elif suffix == "it\xE0": + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("ivo", "ivi", "iva", "ive"): + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith("at"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 3a + if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")): + word = word[:-1] + rv = rv[:-1] + + if rv.endswith("i"): + word = word[:-1] + rv = rv[:-1] + + # STEP 3b + if rv.endswith(("ch", "gh")): + word = word[:-1] + + word = word.replace("I", "i").replace("U", "u") + + return word + + +class NorwegianStemmer(_ScandinavianStemmer): + + """ + The Norwegian Snowball stemmer. + + :cvar __vowels: The Norwegian vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Norwegian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/norwegian/stemmer.html + + """ + + __vowels = "aeiouy\xE6\xE5\xF8" + __s_ending = "bcdfghjlmnoprtvyz" + __step1_suffixes = ( + "hetenes", + "hetene", + "hetens", + "heter", + "heten", + "endes", + "ande", + "ende", + "edes", + "enes", + "erte", + "ede", + "ane", + "ene", + "ens", + "ers", + "ets", + "het", + "ast", + "ert", + "en", + "ar", + "er", + "as", + "es", + "et", + "a", + "e", + "s", + ) + + __step2_suffixes = ("dt", "vt") + + __step3_suffixes = ( + "hetslov", + "eleg", + "elig", + "elov", + "slov", + "leg", + "eig", + "lig", + "els", + "lov", + "ig", + ) + + def stem(self, word): + """ + Stem a Norwegian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix in ("erte", "ert"): + word = suffix_replace(word, suffix, "er") + r1 = suffix_replace(r1, suffix, "er") + + elif suffix == "s": + if word[-2] in self.__s_ending or ( + word[-2] == "k" and word[-3] not in self.__vowels + ): + word = word[:-1] + r1 = r1[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + word = word[: -len(suffix)] + break + + return word + + +class PortugueseStemmer(_StandardStemmer): + + """ + The Portuguese Snowball stemmer. + + :cvar __vowels: The Portuguese vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
+ :type __step4_suffixes: tuple + :note: A detailed description of the Portuguese + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/portuguese/stemmer.html + + """ + + __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4" + __step1_suffixes = ( + "amentos", + "imentos", + "uço~es", + "amento", + "imento", + "adoras", + "adores", + "a\xE7o~es", + "logias", + "\xEAncias", + "amente", + "idades", + "an\xE7as", + "ismos", + "istas", + "adora", + "a\xE7a~o", + "antes", + "\xE2ncia", + "logia", + "uça~o", + "\xEAncia", + "mente", + "idade", + "an\xE7a", + "ezas", + "icos", + "icas", + "ismo", + "\xE1vel", + "\xEDvel", + "ista", + "osos", + "osas", + "ador", + "ante", + "ivas", + "ivos", + "iras", + "eza", + "ico", + "ica", + "oso", + "osa", + "iva", + "ivo", + "ira", + ) + __step2_suffixes = ( + "ar\xEDamos", + "er\xEDamos", + "ir\xEDamos", + "\xE1ssemos", + "\xEAssemos", + "\xEDssemos", + "ar\xEDeis", + "er\xEDeis", + "ir\xEDeis", + "\xE1sseis", + "\xE9sseis", + "\xEDsseis", + "\xE1ramos", + "\xE9ramos", + "\xEDramos", + "\xE1vamos", + "aremos", + "eremos", + "iremos", + "ariam", + "eriam", + "iriam", + "assem", + "essem", + "issem", + "ara~o", + "era~o", + "ira~o", + "arias", + "erias", + "irias", + "ardes", + "erdes", + "irdes", + "asses", + "esses", + "isses", + "astes", + "estes", + "istes", + "\xE1reis", + "areis", + "\xE9reis", + "ereis", + "\xEDreis", + "ireis", + "\xE1veis", + "\xEDamos", + "armos", + "ermos", + "irmos", + "aria", + "eria", + "iria", + "asse", + "esse", + "isse", + "aste", + "este", + "iste", + "arei", + "erei", + "irei", + "aram", + "eram", + "iram", + "avam", + "arem", + "erem", + "irem", + "ando", + "endo", + "indo", + "adas", + "idas", + "ar\xE1s", + "aras", + "er\xE1s", + "eras", + "ir\xE1s", + "avas", + "ares", + "eres", + "ires", + "\xEDeis", + "ados", + "idos", + "\xE1mos", + "amos", + "emos", + "imos", + "iras", + "ada", + "ida", + "ar\xE1", + "ara", + "er\xE1", + "era", + "ir\xE1", + "ava", + "iam", + "ado", + "ido", + "ias", + "ais", + "eis", + "ira", + "ia", + "ei", + "am", + "em", + "ar", + "er", + "ir", + "as", + "es", + "is", + "eu", + "iu", + "ou", + ) + __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3") + + def stem(self, word): + """ + Stem a Portuguese word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step1_success = False + step2_success = False + + word = ( + word.replace("\xE3", "a~") + .replace("\xF5", "o~") + .replace("q\xFC", "qu") + .replace("g\xFC", "gu") + ) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif ( + suffix in ("ira", "iras") + and rv.endswith(suffix) + and word[-len(suffix) - 1 : -len(suffix)] == "e" + ): + step1_success = True + + word = suffix_replace(word, suffix, "ir") + rv = suffix_replace(rv, suffix, "ir") + + elif r2.endswith(suffix): + step1_success = True + + if suffix in ("logia", "logias"): + word = suffix_replace(word, suffix, "log") + rv = suffix_replace(rv, suffix, "log") + + elif suffix in ("uça~o", "uço~es"): + word = suffix_replace(word, suffix, "u") + rv = suffix_replace(rv, suffix, "u") + + elif suffix in ("\xEAncia", "\xEAncias"): + word = suffix_replace(word, suffix, "ente") + rv = suffix_replace(rv, suffix, "ente") + + elif suffix == "mente": + word = word[:-5] + r2 = r2[:-5] + rv = rv[:-5] + + if r2.endswith(("ante", "avel", "ivel")): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idade", "idades"): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("iva", "ivo", "ivas", "ivos"): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + step2_success = True + + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 3 + if step1_success or step2_success: + if rv.endswith("i") and word[-2] == "c": + word = word[:-1] + rv = rv[:-1] + + ### STEP 4: Residual suffix + if not step1_success and not step2_success: + for suffix in self.__step4_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 5 + if rv.endswith(("e", "\xE9", "\xEA")): + word = word[:-1] + rv = rv[:-1] + + if (word.endswith("gu") and rv.endswith("u")) or ( + word.endswith("ci") and rv.endswith("i") + ): + word = word[:-1] + + elif word.endswith("\xE7"): + word = suffix_replace(word, "\xE7", "c") + + word = word.replace("a~", "\xE3").replace("o~", "\xF5") + + return word + + +class RomanianStemmer(_StandardStemmer): + + """ + The Romanian Snowball stemmer. + + :cvar __vowels: The Romanian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Romanian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/romanian/stemmer.html + + """ + + __vowels = "aeiou\u0103\xE2\xEE" + __step0_suffixes = ( + "iilor", + "ului", + "elor", + "iile", + "ilor", + "atei", + "a\u0163ie", + "a\u0163ia", + "aua", + "ele", + "iua", + "iei", + "ile", + "ul", + "ea", + "ii", + ) + __step1_suffixes = ( + "abilitate", + "abilitati", + "abilit\u0103\u0163i", + "ibilitate", + "abilit\u0103i", + "ivitate", + "ivitati", + "ivit\u0103\u0163i", + "icitate", + "icitati", + "icit\u0103\u0163i", + "icatori", + "ivit\u0103i", + "icit\u0103i", + "icator", + "a\u0163iune", + "atoare", + "\u0103toare", + "i\u0163iune", + "itoare", + "iciva", + "icive", + "icivi", + "iciv\u0103", + "icala", + "icale", + "icali", + "ical\u0103", + "ativa", + "ative", + "ativi", + "ativ\u0103", + "atori", + "\u0103tori", + "itiva", + "itive", + "itivi", + "itiv\u0103", + "itori", + "iciv", + "ical", + "ativ", + "ator", + "\u0103tor", + "itiv", + "itor", + ) + __step2_suffixes = ( + "abila", + "abile", + "abili", + "abil\u0103", + "ibila", + "ibile", + "ibili", + "ibil\u0103", + "atori", + "itate", + "itati", + "it\u0103\u0163i", + "abil", + "ibil", + "oasa", + "oas\u0103", + "oase", + "anta", + "ante", + "anti", + "ant\u0103", + "ator", + "it\u0103i", + "iune", + "iuni", + "isme", + "ista", + "iste", + "isti", + "ist\u0103", + "i\u015Fti", + "ata", + "at\u0103", + "ati", + "ate", + "uta", + "ut\u0103", + "uti", + "ute", + "ita", + "it\u0103", + "iti", + "ite", + "ica", + "ice", + "ici", + "ic\u0103", + "osi", + "o\u015Fi", + "ant", + "iva", + "ive", + "ivi", + "iv\u0103", + "ism", + "ist", + "at", + "ut", + "it", + "ic", + "os", + "iv", + ) + __step3_suffixes = ( + "seser\u0103\u0163i", + "aser\u0103\u0163i", + "iser\u0103\u0163i", + "\xE2ser\u0103\u0163i", + "user\u0103\u0163i", + "seser\u0103m", + "aser\u0103m", + "iser\u0103m", + "\xE2ser\u0103m", + "user\u0103m", + "ser\u0103\u0163i", + "sese\u015Fi", + "seser\u0103", + "easc\u0103", + "ar\u0103\u0163i", + "ur\u0103\u0163i", + "ir\u0103\u0163i", + "\xE2r\u0103\u0163i", + "ase\u015Fi", + "aser\u0103", + "ise\u015Fi", + "iser\u0103", + "\xe2se\u015Fi", + "\xE2ser\u0103", + "use\u015Fi", + "user\u0103", + "ser\u0103m", + "sesem", + "indu", + "\xE2ndu", + "eaz\u0103", + "e\u015Fti", + "e\u015Fte", + "\u0103\u015Fti", + "\u0103\u015Fte", + "ea\u0163i", + "ia\u0163i", + "ar\u0103m", + "ur\u0103m", + "ir\u0103m", + "\xE2r\u0103m", + "asem", + "isem", + "\xE2sem", + "usem", + "se\u015Fi", + "ser\u0103", + "sese", + "are", + "ere", + "ire", + "\xE2re", + "ind", + "\xE2nd", + "eze", + "ezi", + "esc", + "\u0103sc", + "eam", + "eai", + "eau", + "iam", + "iai", + "iau", + "a\u015Fi", + "ar\u0103", + "u\u015Fi", + "ur\u0103", + "i\u015Fi", + "ir\u0103", + "\xE2\u015Fi", + "\xe2r\u0103", + "ase", + "ise", + "\xE2se", + "use", + "a\u0163i", + "e\u0163i", + "i\u0163i", + "\xe2\u0163i", + "sei", + "ez", + "am", + "ai", + "au", + "ea", + "ia", + "ui", + "\xE2i", + "\u0103m", + "em", + "im", + "\xE2m", + "se", + ) + + def stem(self, word): + """ + Stem a Romanian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step1_success = False + step2_success = False + + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1 :])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1 :])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Removal of plurals and other simplifications + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + if suffix in r1: + if suffix in ("ul", "ului"): + word = word[: -len(suffix)] + + if suffix in rv: + rv = rv[: -len(suffix)] + else: + rv = "" + + elif ( + suffix == "aua" + or suffix == "atei" + or (suffix == "ile" and word[-5:-3] != "ab") + ): + word = word[:-2] + + elif suffix in ("ea", "ele", "elor"): + word = suffix_replace(word, suffix, "e") + + if suffix in rv: + rv = suffix_replace(rv, suffix, "e") + else: + rv = "" + + elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"): + word = suffix_replace(word, suffix, "i") + + if suffix in rv: + rv = suffix_replace(rv, suffix, "i") + else: + rv = "" + + elif suffix in ("a\u0163ie", "a\u0163ia"): + word = word[:-1] + break + + # STEP 1: Reduction of combining suffixes + while True: + + replacement_done = False + + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix in r1: + step1_success = True + replacement_done = True + + if suffix in ( + "abilitate", + "abilitati", + "abilit\u0103i", + "abilit\u0103\u0163i", + ): + word = suffix_replace(word, suffix, "abil") + + elif suffix == "ibilitate": + word = word[:-5] + + elif suffix in ( + "ivitate", + "ivitati", + "ivit\u0103i", + "ivit\u0103\u0163i", + ): + word = suffix_replace(word, suffix, "iv") + + elif suffix in ( + "icitate", + "icitati", + "icit\u0103i", + "icit\u0103\u0163i", + "icator", + "icatori", + "iciv", + "iciva", + "icive", + "icivi", + "iciv\u0103", + "ical", + "icala", + "icale", + "icali", + "ical\u0103", + ): + word = suffix_replace(word, suffix, "ic") + + elif suffix in ( + "ativ", + "ativa", + "ative", + "ativi", + "ativ\u0103", + "a\u0163iune", + "atoare", + "ator", + "atori", + "\u0103toare", + "\u0103tor", + "\u0103tori", + ): + word = suffix_replace(word, suffix, "at") + + if suffix in r2: + r2 = suffix_replace(r2, suffix, "at") + + elif suffix in ( + "itiv", + "itiva", + "itive", + "itivi", + "itiv\u0103", + "i\u0163iune", + "itoare", + "itor", + "itori", + ): + word = suffix_replace(word, suffix, "it") + + if suffix in r2: + r2 = suffix_replace(r2, suffix, "it") + else: + step1_success = False + break + + if not replacement_done: + break + + # STEP 2: Removal of standard suffixes + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if suffix in r2: + step2_success = True + + if suffix in ("iune", "iuni"): + if word[-5] == "\u0163": + word = "".join((word[:-5], "t")) + + elif suffix in ( + "ism", + "isme", + "ist", + "ista", + "iste", + "isti", + "ist\u0103", + "i\u015Fti", + ): + word = suffix_replace(word, suffix, "ist") + + else: + word = word[: -len(suffix)] + break + + # STEP 3: Removal of verb suffixes + if not step1_success and not step2_success: + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if suffix in rv: + if suffix in ( + "seser\u0103\u0163i", + "seser\u0103m", + "ser\u0103\u0163i", + "sese\u015Fi", + "seser\u0103", + "ser\u0103m", + "sesem", + "se\u015Fi", + "ser\u0103", + 
"sese", + "a\u0163i", + "e\u0163i", + "i\u0163i", + "\xE2\u0163i", + "sei", + "\u0103m", + "em", + "im", + "\xE2m", + "se", + ): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + else: + if ( + not rv.startswith(suffix) + and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE" + ): + word = word[: -len(suffix)] + break + + # STEP 4: Removal of final vowel + for suffix in ("ie", "a", "e", "i", "\u0103"): + if word.endswith(suffix): + if suffix in rv: + word = word[: -len(suffix)] + break + + word = word.replace("I", "i").replace("U", "u") + + return word + + +class RussianStemmer(_LanguageSpecificStemmer): + + """ + The Russian Snowball stemmer. + + :cvar __perfective_gerund_suffixes: Suffixes to be deleted. + :type __perfective_gerund_suffixes: tuple + :cvar __adjectival_suffixes: Suffixes to be deleted. + :type __adjectival_suffixes: tuple + :cvar __reflexive_suffixes: Suffixes to be deleted. + :type __reflexive_suffixes: tuple + :cvar __verb_suffixes: Suffixes to be deleted. + :type __verb_suffixes: tuple + :cvar __noun_suffixes: Suffixes to be deleted. + :type __noun_suffixes: tuple + :cvar __superlative_suffixes: Suffixes to be deleted. + :type __superlative_suffixes: tuple + :cvar __derivational_suffixes: Suffixes to be deleted. + :type __derivational_suffixes: tuple + :note: A detailed description of the Russian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/russian/stemmer.html + + """ + + __perfective_gerund_suffixes = ( + "ivshis'", + "yvshis'", + "vshis'", + "ivshi", + "yvshi", + "vshi", + "iv", + "yv", + "v", + ) + __adjectival_suffixes = ( + "ui^ushchi^ui^u", + "ui^ushchi^ai^a", + "ui^ushchimi", + "ui^ushchymi", + "ui^ushchego", + "ui^ushchogo", + "ui^ushchemu", + "ui^ushchomu", + "ui^ushchikh", + "ui^ushchykh", + "ui^ushchui^u", + "ui^ushchaia", + "ui^ushchoi^u", + "ui^ushchei^u", + "i^ushchi^ui^u", + "i^ushchi^ai^a", + "ui^ushchee", + "ui^ushchie", + "ui^ushchye", + "ui^ushchoe", + "ui^ushchei`", + "ui^ushchii`", + "ui^ushchyi`", + "ui^ushchoi`", + "ui^ushchem", + "ui^ushchim", + "ui^ushchym", + "ui^ushchom", + "i^ushchimi", + "i^ushchymi", + "i^ushchego", + "i^ushchogo", + "i^ushchemu", + "i^ushchomu", + "i^ushchikh", + "i^ushchykh", + "i^ushchui^u", + "i^ushchai^a", + "i^ushchoi^u", + "i^ushchei^u", + "i^ushchee", + "i^ushchie", + "i^ushchye", + "i^ushchoe", + "i^ushchei`", + "i^ushchii`", + "i^ushchyi`", + "i^ushchoi`", + "i^ushchem", + "i^ushchim", + "i^ushchym", + "i^ushchom", + "shchi^ui^u", + "shchi^ai^a", + "ivshi^ui^u", + "ivshi^ai^a", + "yvshi^ui^u", + "yvshi^ai^a", + "shchimi", + "shchymi", + "shchego", + "shchogo", + "shchemu", + "shchomu", + "shchikh", + "shchykh", + "shchui^u", + "shchai^a", + "shchoi^u", + "shchei^u", + "ivshimi", + "ivshymi", + "ivshego", + "ivshogo", + "ivshemu", + "ivshomu", + "ivshikh", + "ivshykh", + "ivshui^u", + "ivshai^a", + "ivshoi^u", + "ivshei^u", + "yvshimi", + "yvshymi", + "yvshego", + "yvshogo", + "yvshemu", + "yvshomu", + "yvshikh", + "yvshykh", + "yvshui^u", + "yvshai^a", + "yvshoi^u", + "yvshei^u", + "vshi^ui^u", + "vshi^ai^a", + "shchee", + "shchie", + "shchye", + "shchoe", + "shchei`", + "shchii`", + "shchyi`", + "shchoi`", + "shchem", + "shchim", + "shchym", + "shchom", + "ivshee", + "ivshie", + "ivshye", + "ivshoe", + "ivshei`", + "ivshii`", + "ivshyi`", + "ivshoi`", + "ivshem", + "ivshim", + "ivshym", + "ivshom", + "yvshee", + "yvshie", + "yvshye", + "yvshoe", + "yvshei`", + "yvshii`", + "yvshyi`", + "yvshoi`", + "yvshem", + "yvshim", + "yvshym", + "yvshom", + "vshimi", + 
"vshymi", + "vshego", + "vshogo", + "vshemu", + "vshomu", + "vshikh", + "vshykh", + "vshui^u", + "vshai^a", + "vshoi^u", + "vshei^u", + "emi^ui^u", + "emi^ai^a", + "nni^ui^u", + "nni^ai^a", + "vshee", + "vshie", + "vshye", + "vshoe", + "vshei`", + "vshii`", + "vshyi`", + "vshoi`", + "vshem", + "vshim", + "vshym", + "vshom", + "emimi", + "emymi", + "emego", + "emogo", + "ememu", + "emomu", + "emikh", + "emykh", + "emui^u", + "emai^a", + "emoi^u", + "emei^u", + "nnimi", + "nnymi", + "nnego", + "nnogo", + "nnemu", + "nnomu", + "nnikh", + "nnykh", + "nnui^u", + "nnai^a", + "nnoi^u", + "nnei^u", + "emee", + "emie", + "emye", + "emoe", + "emei`", + "emii`", + "emyi`", + "emoi`", + "emem", + "emim", + "emym", + "emom", + "nnee", + "nnie", + "nnye", + "nnoe", + "nnei`", + "nnii`", + "nnyi`", + "nnoi`", + "nnem", + "nnim", + "nnym", + "nnom", + "i^ui^u", + "i^ai^a", + "imi", + "ymi", + "ego", + "ogo", + "emu", + "omu", + "ikh", + "ykh", + "ui^u", + "ai^a", + "oi^u", + "ei^u", + "ee", + "ie", + "ye", + "oe", + "ei`", + "ii`", + "yi`", + "oi`", + "em", + "im", + "ym", + "om", + ) + __reflexive_suffixes = ("si^a", "s'") + __verb_suffixes = ( + "esh'", + "ei`te", + "ui`te", + "ui^ut", + "ish'", + "ete", + "i`te", + "i^ut", + "nno", + "ila", + "yla", + "ena", + "ite", + "ili", + "yli", + "ilo", + "ylo", + "eno", + "i^at", + "uet", + "eny", + "it'", + "yt'", + "ui^u", + "la", + "na", + "li", + "em", + "lo", + "no", + "et", + "ny", + "t'", + "ei`", + "ui`", + "il", + "yl", + "im", + "ym", + "en", + "it", + "yt", + "i^u", + "i`", + "l", + "n", + ) + __noun_suffixes = ( + "ii^ami", + "ii^akh", + "i^ami", + "ii^am", + "i^akh", + "ami", + "iei`", + "i^am", + "iem", + "akh", + "ii^u", + "'i^u", + "ii^a", + "'i^a", + "ev", + "ov", + "ie", + "'e", + "ei", + "ii", + "ei`", + "oi`", + "ii`", + "em", + "am", + "om", + "i^u", + "i^a", + "a", + "e", + "i", + "i`", + "o", + "u", + "y", + "'", + ) + __superlative_suffixes = ("ei`she", "ei`sh") + __derivational_suffixes = ("ost'", "ost") + + def stem(self, word): + """ + Stem a Russian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + if word in self.stopwords: + return word + + chr_exceeded = False + for i in range(len(word)): + if ord(word[i]) > 255: + chr_exceeded = True + break + + if not chr_exceeded: + return word + + word = self.__cyrillic_to_roman(word) + + step1_success = False + adjectival_removed = False + verb_removed = False + undouble_success = False + superlative_removed = False + + rv, r2 = self.__regions_russian(word) + + # Step 1 + for suffix in self.__perfective_gerund_suffixes: + if rv.endswith(suffix): + if suffix in ("v", "vshi", "vshis'"): + if ( + rv[-len(suffix) - 3 : -len(suffix)] == "i^a" + or rv[-len(suffix) - 1 : -len(suffix)] == "a" + ): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + step1_success = True + break + else: + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + step1_success = True + break + + if not step1_success: + for suffix in self.__reflexive_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + for suffix in self.__adjectival_suffixes: + if rv.endswith(suffix): + if suffix in ( + "i^ushchi^ui^u", + "i^ushchi^ai^a", + "i^ushchui^u", + "i^ushchai^a", + "i^ushchoi^u", + "i^ushchei^u", + "i^ushchimi", + "i^ushchymi", + "i^ushchego", + "i^ushchogo", + "i^ushchemu", + "i^ushchomu", + "i^ushchikh", + "i^ushchykh", + "shchi^ui^u", + "shchi^ai^a", + "i^ushchee", + "i^ushchie", + "i^ushchye", + "i^ushchoe", + "i^ushchei`", + "i^ushchii`", + "i^ushchyi`", + "i^ushchoi`", + "i^ushchem", + "i^ushchim", + "i^ushchym", + "i^ushchom", + "vshi^ui^u", + "vshi^ai^a", + "shchui^u", + "shchai^a", + "shchoi^u", + "shchei^u", + "emi^ui^u", + "emi^ai^a", + "nni^ui^u", + "nni^ai^a", + "shchimi", + "shchymi", + "shchego", + "shchogo", + "shchemu", + "shchomu", + "shchikh", + "shchykh", + "vshui^u", + "vshai^a", + "vshoi^u", + "vshei^u", + "shchee", + "shchie", + "shchye", + "shchoe", + "shchei`", + "shchii`", + "shchyi`", + "shchoi`", + "shchem", + "shchim", + "shchym", + "shchom", + "vshimi", + "vshymi", + "vshego", + "vshogo", + "vshemu", + "vshomu", + "vshikh", + "vshykh", + "emui^u", + "emai^a", + "emoi^u", + "emei^u", + "nnui^u", + "nnai^a", + "nnoi^u", + "nnei^u", + "vshee", + "vshie", + "vshye", + "vshoe", + "vshei`", + "vshii`", + "vshyi`", + "vshoi`", + "vshem", + "vshim", + "vshym", + "vshom", + "emimi", + "emymi", + "emego", + "emogo", + "ememu", + "emomu", + "emikh", + "emykh", + "nnimi", + "nnymi", + "nnego", + "nnogo", + "nnemu", + "nnomu", + "nnikh", + "nnykh", + "emee", + "emie", + "emye", + "emoe", + "emei`", + "emii`", + "emyi`", + "emoi`", + "emem", + "emim", + "emym", + "emom", + "nnee", + "nnie", + "nnye", + "nnoe", + "nnei`", + "nnii`", + "nnyi`", + "nnoi`", + "nnem", + "nnim", + "nnym", + "nnom", + ): + if ( + rv[-len(suffix) - 3 : -len(suffix)] == "i^a" + or rv[-len(suffix) - 1 : -len(suffix)] == "a" + ): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + adjectival_removed = True + break + else: + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + adjectival_removed = True + break + + if not adjectival_removed: + for suffix in self.__verb_suffixes: + if rv.endswith(suffix): + if suffix in ( + "la", + "na", + "ete", + "i`te", + "li", + "i`", + "l", + "em", + "n", + "lo", + "no", + "et", + "i^ut", + "ny", + "t'", + "esh'", + "nno", + ): + if ( + rv[-len(suffix) - 3 : -len(suffix)] == "i^a" + or rv[-len(suffix) - 1 : -len(suffix)] == "a" + ): + 
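+                            # Group 1 verb endings such as "la" or "et"
+                            # are removed only when directly preceded by
+                            # "a" or "i^a" (transliterated Cyrillic
+                            # "а"/"я"), per the Snowball Russian rules.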
word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + verb_removed = True + break + else: + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + verb_removed = True + break + + if not adjectival_removed and not verb_removed: + for suffix in self.__noun_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # Step 2 + if rv.endswith("i"): + word = word[:-1] + r2 = r2[:-1] + + # Step 3 + for suffix in self.__derivational_suffixes: + if r2.endswith(suffix): + word = word[: -len(suffix)] + break + + # Step 4 + if word.endswith("nn"): + word = word[:-1] + undouble_success = True + + if not undouble_success: + for suffix in self.__superlative_suffixes: + if word.endswith(suffix): + word = word[: -len(suffix)] + superlative_removed = True + break + if word.endswith("nn"): + word = word[:-1] + + if not undouble_success and not superlative_removed: + if word.endswith("'"): + word = word[:-1] + + word = self.__roman_to_cyrillic(word) + + return word + + def __regions_russian(self, word): + """ + Return the regions RV and R2 which are used by the Russian stemmer. + + In any word, RV is the region after the first vowel, + or the end of the word if it contains no vowel. + + R2 is the region after the first non-vowel following + a vowel in R1, or the end of the word if there is no such non-vowel. + + R1 is the region after the first non-vowel following a vowel, + or the end of the word if there is no such non-vowel. + + :param word: The Russian word whose regions RV and R2 are determined. + :type word: str or unicode + :return: the regions RV and R2 for the respective Russian word. + :rtype: tuple + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! + + """ + r1 = "" + r2 = "" + rv = "" + + vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") + word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E") + + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + r1 = word[i + 1 :] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i - 1] in vowels: + r2 = r1[i + 1 :] + break + + for i in range(len(word)): + if word[i] in vowels: + rv = word[i + 1 :] + break + + r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") + rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") + + return (rv, r2) + + def __cyrillic_to_roman(self, word): + """ + Transliterate a Russian word into the Roman alphabet. + + A Russian word whose letters consist of the Cyrillic + alphabet are transliterated into the Roman alphabet + in order to ease the forthcoming stemming process. + + :param word: The word that is transliterated. + :type word: unicode + :return: the transliterated word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! 
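+        :note: The digraph markers "^" and "`" (as in "i^u", "t^s",
+            "e`") distinguish the transliterated digraphs from plain
+            letter sequences, so that __roman_to_cyrillic can map them
+            back.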
+ + """ + word = ( + word.replace("\u0410", "a") + .replace("\u0430", "a") + .replace("\u0411", "b") + .replace("\u0431", "b") + .replace("\u0412", "v") + .replace("\u0432", "v") + .replace("\u0413", "g") + .replace("\u0433", "g") + .replace("\u0414", "d") + .replace("\u0434", "d") + .replace("\u0415", "e") + .replace("\u0435", "e") + .replace("\u0401", "e") + .replace("\u0451", "e") + .replace("\u0416", "zh") + .replace("\u0436", "zh") + .replace("\u0417", "z") + .replace("\u0437", "z") + .replace("\u0418", "i") + .replace("\u0438", "i") + .replace("\u0419", "i`") + .replace("\u0439", "i`") + .replace("\u041A", "k") + .replace("\u043A", "k") + .replace("\u041B", "l") + .replace("\u043B", "l") + .replace("\u041C", "m") + .replace("\u043C", "m") + .replace("\u041D", "n") + .replace("\u043D", "n") + .replace("\u041E", "o") + .replace("\u043E", "o") + .replace("\u041F", "p") + .replace("\u043F", "p") + .replace("\u0420", "r") + .replace("\u0440", "r") + .replace("\u0421", "s") + .replace("\u0441", "s") + .replace("\u0422", "t") + .replace("\u0442", "t") + .replace("\u0423", "u") + .replace("\u0443", "u") + .replace("\u0424", "f") + .replace("\u0444", "f") + .replace("\u0425", "kh") + .replace("\u0445", "kh") + .replace("\u0426", "t^s") + .replace("\u0446", "t^s") + .replace("\u0427", "ch") + .replace("\u0447", "ch") + .replace("\u0428", "sh") + .replace("\u0448", "sh") + .replace("\u0429", "shch") + .replace("\u0449", "shch") + .replace("\u042A", "''") + .replace("\u044A", "''") + .replace("\u042B", "y") + .replace("\u044B", "y") + .replace("\u042C", "'") + .replace("\u044C", "'") + .replace("\u042D", "e`") + .replace("\u044D", "e`") + .replace("\u042E", "i^u") + .replace("\u044E", "i^u") + .replace("\u042F", "i^a") + .replace("\u044F", "i^a") + ) + + return word + + def __roman_to_cyrillic(self, word): + """ + Transliterate a Russian word back into the Cyrillic alphabet. + + A Russian word formerly transliterated into the Roman alphabet + in order to ease the stemming process, is transliterated back + into the Cyrillic alphabet, its original form. + + :param word: The word that is transliterated. + :type word: str or unicode + :return: word, the transliterated word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! + + """ + word = ( + word.replace("i^u", "\u044E") + .replace("i^a", "\u044F") + .replace("shch", "\u0449") + .replace("kh", "\u0445") + .replace("t^s", "\u0446") + .replace("ch", "\u0447") + .replace("e`", "\u044D") + .replace("i`", "\u0439") + .replace("sh", "\u0448") + .replace("k", "\u043A") + .replace("e", "\u0435") + .replace("zh", "\u0436") + .replace("a", "\u0430") + .replace("b", "\u0431") + .replace("v", "\u0432") + .replace("g", "\u0433") + .replace("d", "\u0434") + .replace("e", "\u0435") + .replace("z", "\u0437") + .replace("i", "\u0438") + .replace("l", "\u043B") + .replace("m", "\u043C") + .replace("n", "\u043D") + .replace("o", "\u043E") + .replace("p", "\u043F") + .replace("r", "\u0440") + .replace("s", "\u0441") + .replace("t", "\u0442") + .replace("u", "\u0443") + .replace("f", "\u0444") + .replace("''", "\u044A") + .replace("y", "\u044B") + .replace("'", "\u044C") + ) + + return word + + +class SpanishStemmer(_StandardStemmer): + + """ + The Spanish Snowball stemmer. + + :cvar __vowels: The Spanish vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. 
+ :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Spanish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/spanish/stemmer.html + + """ + + __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" + __step0_suffixes = ( + "selas", + "selos", + "sela", + "selo", + "las", + "les", + "los", + "nos", + "me", + "se", + "la", + "le", + "lo", + ) + __step1_suffixes = ( + "amientos", + "imientos", + "amiento", + "imiento", + "acion", + "aciones", + "uciones", + "adoras", + "adores", + "ancias", + "log\xEDas", + "encias", + "amente", + "idades", + "anzas", + "ismos", + "ables", + "ibles", + "istas", + "adora", + "aci\xF3n", + "antes", + "ancia", + "log\xEDa", + "uci\xf3n", + "encia", + "mente", + "anza", + "icos", + "icas", + "ismo", + "able", + "ible", + "ista", + "osos", + "osas", + "ador", + "ante", + "idad", + "ivas", + "ivos", + "ico", + "ica", + "oso", + "osa", + "iva", + "ivo", + ) + __step2a_suffixes = ( + "yeron", + "yendo", + "yamos", + "yais", + "yan", + "yen", + "yas", + "yes", + "ya", + "ye", + "yo", + "y\xF3", + ) + __step2b_suffixes = ( + "ar\xEDamos", + "er\xEDamos", + "ir\xEDamos", + "i\xE9ramos", + "i\xE9semos", + "ar\xEDais", + "aremos", + "er\xEDais", + "eremos", + "ir\xEDais", + "iremos", + "ierais", + "ieseis", + "asteis", + "isteis", + "\xE1bamos", + "\xE1ramos", + "\xE1semos", + "ar\xEDan", + "ar\xEDas", + "ar\xE9is", + "er\xEDan", + "er\xEDas", + "er\xE9is", + "ir\xEDan", + "ir\xEDas", + "ir\xE9is", + "ieran", + "iesen", + "ieron", + "iendo", + "ieras", + "ieses", + "abais", + "arais", + "aseis", + "\xE9amos", + "ar\xE1n", + "ar\xE1s", + "ar\xEDa", + "er\xE1n", + "er\xE1s", + "er\xEDa", + "ir\xE1n", + "ir\xE1s", + "ir\xEDa", + "iera", + "iese", + "aste", + "iste", + "aban", + "aran", + "asen", + "aron", + "ando", + "abas", + "adas", + "idas", + "aras", + "ases", + "\xEDais", + "ados", + "idos", + "amos", + "imos", + "emos", + "ar\xE1", + "ar\xE9", + "er\xE1", + "er\xE9", + "ir\xE1", + "ir\xE9", + "aba", + "ada", + "ida", + "ara", + "ase", + "\xEDan", + "ado", + "ido", + "\xEDas", + "\xE1is", + "\xE9is", + "\xEDa", + "ad", + "ed", + "id", + "an", + "i\xF3", + "ar", + "er", + "ir", + "as", + "\xEDs", + "en", + "es", + ) + __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3") + + def stem(self, word): + """ + Stem a Spanish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + step1_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if not (word.endswith(suffix) and rv.endswith(suffix)): + continue + + if ( + rv[: -len(suffix)].endswith( + ( + "ando", + "\xE1ndo", + "ar", + "\xE1r", + "er", + "\xE9r", + "iendo", + "i\xE9ndo", + "ir", + "\xEDr", + ) + ) + ) or ( + rv[: -len(suffix)].endswith("yendo") + and word[: -len(suffix)].endswith("uyendo") + ): + + word = self.__replace_accented(word[: -len(suffix)]) + r1 = self.__replace_accented(r1[: -len(suffix)]) + r2 = self.__replace_accented(r2[: -len(suffix)]) + rv = self.__replace_accented(rv[: -len(suffix)]) + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if not word.endswith(suffix): + continue + + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ( + "adora", + "ador", + "aci\xF3n", + "adoras", + "adores", + "acion", + "aciones", + "ante", + "antes", + "ancia", + "ancias", + ): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("log\xEDa", "log\xEDas"): + word = suffix_replace(word, suffix, "log") + rv = suffix_replace(rv, suffix, "log") + + elif suffix in ("uci\xF3n", "uciones"): + word = suffix_replace(word, suffix, "u") + rv = suffix_replace(rv, suffix, "u") + + elif suffix in ("encia", "encias"): + word = suffix_replace(word, suffix, "ente") + rv = suffix_replace(rv, suffix, "ente") + + elif suffix == "mente": + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + if r2.endswith(("ante", "able", "ible")): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idad", "idades"): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + + for pre_suff in ("abil", "ic", "iv"): + if r2.endswith(pre_suff): + word = word[: -len(pre_suff)] + rv = rv[: -len(pre_suff)] + + elif suffix in ("ivo", "iva", "ivos", "ivas"): + word = word[: -len(suffix)] + r2 = r2[: -len(suffix)] + rv = rv[: -len(suffix)] + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 2a: Verb suffixes beginning 'y' + if not step1_success: + for suffix in self.__step2a_suffixes: + if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u": + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + break + + # STEP 2b: Other verb suffixes + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + rv = rv[: -len(suffix)] + if suffix in ("en", "es", "\xE9is", "emos"): + if word.endswith("gu"): + word = word[:-1] + + if rv.endswith("gu"): + rv = rv[:-1] + break + + # STEP 3: Residual suffix + for suffix in self.__step3_suffixes: + if rv.endswith(suffix): + word = word[: -len(suffix)] + if suffix in ("e", "\xE9"): + rv = rv[: -len(suffix)] + + if word[-2:] == "gu" and rv.endswith("u"): + word = word[:-1] + break + + word = 
self.__replace_accented(word) + + return word + + def __replace_accented(self, word): + """ + Replaces all accented letters on a word with their non-accented + counterparts. + + :param word: A spanish word, with or without accents + :type word: str or unicode + :return: a word with the accented letters (á, é, í, ó, ú) replaced with + their non-accented counterparts (a, e, i, o, u) + :rtype: str or unicode + """ + return ( + word.replace("\xE1", "a") + .replace("\xE9", "e") + .replace("\xED", "i") + .replace("\xF3", "o") + .replace("\xFA", "u") + ) + + +class SwedishStemmer(_ScandinavianStemmer): + + """ + The Swedish Snowball stemmer. + + :cvar __vowels: The Swedish vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Swedish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/swedish/stemmer.html + + """ + + __vowels = "aeiouy\xE4\xE5\xF6" + __s_ending = "bcdfghjklmnoprtvy" + __step1_suffixes = ( + "heterna", + "hetens", + "heter", + "heten", + "anden", + "arnas", + "ernas", + "ornas", + "andes", + "andet", + "arens", + "arna", + "erna", + "orna", + "ande", + "arne", + "aste", + "aren", + "ades", + "erns", + "ade", + "are", + "ern", + "ens", + "het", + "ast", + "ad", + "en", + "ar", + "er", + "or", + "as", + "es", + "at", + "a", + "e", + "s", + ) + __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") + __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") + + def stem(self, word): + """ + Stem a Swedish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + if word in self.stopwords: + return word + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[: -len(suffix)] + r1 = r1[: -len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("els", "lig", "ig"): + word = word[: -len(suffix)] + elif suffix in ("fullt", "l\xF6st"): + word = word[:-1] + break + + return word + + +def demo(): + """ + This function provides a demonstration of the Snowball stemmers. + + After invoking this function and specifying a language, + it stems an excerpt of the Universal Declaration of Human Rights + (which is a part of the NLTK corpus collection) and then prints + out the original and the stemmed text. 
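+
+    The demo is interactive: it keeps prompting for one of the
+    supported language names until 'exit' is entered.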
+ + """ + + from nltk.corpus import udhr + + udhr_corpus = { + "arabic": "Arabic_Alarabia-Arabic", + "danish": "Danish_Dansk-Latin1", + "dutch": "Dutch_Nederlands-Latin1", + "english": "English-Latin1", + "finnish": "Finnish_Suomi-Latin1", + "french": "French_Francais-Latin1", + "german": "German_Deutsch-Latin1", + "hungarian": "Hungarian_Magyar-UTF8", + "italian": "Italian_Italiano-Latin1", + "norwegian": "Norwegian-Latin1", + "porter": "English-Latin1", + "portuguese": "Portuguese_Portugues-Latin1", + "romanian": "Romanian_Romana-Latin2", + "russian": "Russian-UTF8", + "spanish": "Spanish-Latin1", + "swedish": "Swedish_Svenska-Latin1", + } + + print("\n") + print("******************************") + print("Demo for the Snowball stemmers") + print("******************************") + + while True: + + language = input( + "Please enter the name of the language " + + "to be demonstrated\n" + + "/".join(SnowballStemmer.languages) + + "\n" + + "(enter 'exit' in order to leave): " + ) + + if language == "exit": + break + + if language not in SnowballStemmer.languages: + print( + "\nOops, there is no stemmer for this language. " + + "Please try again.\n" + ) + continue + + stemmer = SnowballStemmer(language) + excerpt = udhr.words(udhr_corpus[language])[:300] + + stemmed = " ".join(stemmer.stem(word) for word in excerpt) + stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip() + excerpt = " ".join(excerpt) + excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip() + + print("\n") + print("-" * 70) + print("ORIGINAL".center(70)) + print(excerpt) + print("\n\n") + print("STEMMED RESULTS".center(70)) + print(stemmed) + print("-" * 70) + print("\n") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/stem/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e2a600fdb134503a9abad45e2870d0eee946faa5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/stem/util.py @@ -0,0 +1,25 @@ +# Natural Language Toolkit: Stemmer Utilities +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Helder +# URL: +# For license information, see LICENSE.TXT + + +def suffix_replace(original, old, new): + """ + Replaces the old suffix of the original string by a new suffix + """ + return original[: -len(old)] + new + + +def prefix_replace(original, old, new): + """ + Replaces the old prefix of the original string by a new suffix + + :param original: string + :param old: string + :param new: string + :return: string + """ + return new + original[len(old) :] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/api.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd556bbd4e22624963d311a594dd9535768b4f7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/api.py @@ -0,0 +1,296 @@ +# Natural Language Toolkit: Tagger Interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird (minor additions) +# Tom Aarsen <> +# URL: +# For license information, see LICENSE.TXT + +""" +Interface for tagging each token in a sentence with supplementary +information, such as its part of speech. +""" +from abc import ABCMeta, abstractmethod +from functools import lru_cache +from itertools import chain +from typing import Dict + +from nltk.internals import deprecated, overridden +from nltk.metrics import ConfusionMatrix, accuracy +from nltk.tag.util import untag + + +class TaggerI(metaclass=ABCMeta): + """ + A processing interface for assigning a tag to each token in a list. 
+ Tags are case sensitive strings that identify some property of each + token, such as its part of speech or its sense. + + Some taggers require specific types for their tokens. This is + generally indicated by the use of a sub-interface to ``TaggerI``. + For example, featureset taggers, which are subclassed from + ``FeaturesetTagger``, require that each token be a ``featureset``. + + Subclasses must define: + - either ``tag()`` or ``tag_sents()`` (or both) + """ + + @abstractmethod + def tag(self, tokens): + """ + Determine the most appropriate tag sequence for the given + token sequence, and return a corresponding list of tagged + tokens. A tagged token is encoded as a tuple ``(token, tag)``. + + :rtype: list(tuple(str, str)) + """ + if overridden(self.tag_sents): + return self.tag_sents([tokens])[0] + + def tag_sents(self, sentences): + """ + Apply ``self.tag()`` to each element of *sentences*. I.e.:: + + return [self.tag(sent) for sent in sentences] + """ + return [self.tag(sent) for sent in sentences] + + @deprecated("Use accuracy(gold) instead.") + def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): + """ + Score the accuracy of the tagger against the gold standard. + Strip the tags from the gold standard text, retag it using + the tagger, then compute the accuracy score. + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :rtype: float + """ + + tagged_sents = self.tag_sents(untag(sent) for sent in gold) + gold_tokens = list(chain.from_iterable(gold)) + test_tokens = list(chain.from_iterable(tagged_sents)) + return accuracy(gold_tokens, test_tokens) + + @lru_cache(maxsize=1) + def _confusion_cached(self, gold): + """ + Inner function used after ``gold`` is converted to a + ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on + creating a ConfusionMatrix. + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. + :type gold: tuple(tuple(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + tagged_sents = self.tag_sents(untag(sent) for sent in gold) + gold_tokens = [token for _word, token in chain.from_iterable(gold)] + test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] + return ConfusionMatrix(gold_tokens, test_tokens) + + def confusion(self, gold): + """ + Return a ConfusionMatrix with the tags from ``gold`` as the reference + values, with the predictions from ``tag_sents`` as the predicted values. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O P | + | N J J N N P P R R V V V V V W | + | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | + | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | + -------+----------------------------------------------------------------------------------------------+ + '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . .<20> . . . . . . . . . . 
. . . . . . . . . . . . . . |
+     EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . |
+     IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . |
+     JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . |
+    JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . |
+    JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . |
+     MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . |
+     NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . |
+    NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . |
+    NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . |
+    POS | . . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . |
+    PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . |
+   PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . |
+     RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . |
+    RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . |
+     RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . |
+     TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . |
+     VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . . . |
+    VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . |
+    VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . |
+    VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . |
+    VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . |
+    VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . |
+    WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . |
+     `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <1>|
+ -------+----------------------------------------------------------------------------------------------+
+ (row = reference; col = test)
+
+
+        :param gold: The list of tagged sentences to run the tagger with,
+            also used as the reference values in the generated confusion matrix.
+        :type gold: list(list(tuple(str, str)))
+        :rtype: ConfusionMatrix
+        """
+
+        return self._confusion_cached(tuple(tuple(sent) for sent in gold))
+
+    def recall(self, gold) -> Dict[str, float]:
+        """
+        Compute the recall for each tag from ``gold`` or from running ``tag``
+        on the tokenized sentences from ``gold``. Then, return the dictionary
+        with mappings from tag to recall. The recall is defined as:
+
+        - *r* = true positive / (true positive + false negative)
+
+        :param gold: The list of tagged sentences to score the tagger on.
+        :type gold: list(list(tuple(str, str)))
+        :return: A mapping from tags to recall
+        :rtype: Dict[str, float]
+        """
+
+        cm = self.confusion(gold)
+        return {tag: cm.recall(tag) for tag in cm._values}
+
+    def precision(self, gold):
+        """
+        Compute the precision for each tag from ``gold`` or from running ``tag``
+        on the tokenized sentences from ``gold``. Then, return the dictionary
+        with mappings from tag to precision. The precision is defined as:
+
+        - *p* = true positive / (true positive + false positive)
+
+        :param gold: The list of tagged sentences to score the tagger on.
+        :type gold: list(list(tuple(str, str)))
+        :return: A mapping from tags to precision
+        :rtype: Dict[str, float]
+        """
+
+        cm = self.confusion(gold)
+        return {tag: cm.precision(tag) for tag in cm._values}
+
+    def f_measure(self, gold, alpha=0.5):
+        """
+        Compute the f-measure for each tag from ``gold`` or from running ``tag``
+        on the tokenized sentences from ``gold``. Then, return the dictionary
+        with mappings from tag to f-measure. The f-measure is the harmonic mean
+        of the ``precision`` and ``recall``, weighted by ``alpha``.
+        In particular, given the precision *p* and recall *r* defined by:
+
+        - *p* = true positive / (true positive + false positive)
+        - *r* = true positive / (true positive + false negative)
+
+        The f-measure is:
+
+        - *1/(alpha/p + (1-alpha)/r)*
+
+        With ``alpha = 0.5``, this reduces to:
+
+        - *2pr / (p + r)*
+
+        :param gold: The list of tagged sentences to score the tagger on.
+        :type gold: list(list(tuple(str, str)))
+        :param alpha: Ratio of the cost of false negatives compared to false
+            positives. Defaults to 0.5, where the costs are equal.
+        :type alpha: float
+        :return: A mapping from tags to f-measure
+        :rtype: Dict[str, float]
+        """
+        cm = self.confusion(gold)
+        return {tag: cm.f_measure(tag, alpha) for tag in cm._values}
+
+    def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False):
+        """Tabulate the **recall**, **precision** and **f-measure**
+        for each tag from ``gold`` or from running ``tag`` on the tokenized
+        sentences from ``gold``.
+
+        >>> from nltk.tag import PerceptronTagger
+        >>> from nltk.corpus import treebank
+        >>> tagger = PerceptronTagger()
+        >>> gold_data = treebank.tagged_sents()[:10]
+        >>> print(tagger.evaluate_per_tag(gold_data))
+           Tag | Prec.  | Recall | F-measure
+        -------+--------+--------+-----------
+            '' | 1.0000 | 1.0000 | 1.0000
+             , | 1.0000 | 1.0000 | 1.0000
+        -NONE- | 0.0000 | 0.0000 | 0.0000
+             . | 1.0000 | 1.0000 | 1.0000
+            CC | 1.0000 | 1.0000 | 1.0000
+            CD | 0.7143 | 1.0000 | 0.8333
+            DT | 1.0000 | 1.0000 | 1.0000
+            EX | 1.0000 | 1.0000 | 1.0000
+            IN | 0.9167 | 0.8800 | 0.8980
+            JJ | 0.8889 | 0.8889 | 0.8889
+           JJR | 0.0000 | 0.0000 | 0.0000
+           JJS | 1.0000 | 1.0000 | 1.0000
+            MD | 1.0000 | 1.0000 | 1.0000
+            NN | 0.8000 | 0.9333 | 0.8615
+           NNP | 0.8929 | 1.0000 | 0.9434
+           NNS | 0.9500 | 1.0000 | 0.9744
+           POS | 1.0000 | 1.0000 | 1.0000
+           PRP | 1.0000 | 1.0000 | 1.0000
+          PRP$ | 1.0000 | 1.0000 | 1.0000
+            RB | 0.4000 | 1.0000 | 0.5714
+           RBR | 1.0000 | 0.5000 | 0.6667
+            RP | 1.0000 | 1.0000 | 1.0000
+            TO | 1.0000 | 1.0000 | 1.0000
+            VB | 1.0000 | 1.0000 | 1.0000
+           VBD | 0.8571 | 0.8571 | 0.8571
+           VBG | 1.0000 | 0.8000 | 0.8889
+           VBN | 1.0000 | 0.8000 | 0.8889
+           VBP | 1.0000 | 1.0000 | 1.0000
+           VBZ | 1.0000 | 1.0000 | 1.0000
+           WDT | 0.0000 | 0.0000 | 0.0000
+            `` | 1.0000 | 1.0000 | 1.0000
+
+
+        :param gold: The list of tagged sentences to score the tagger on.
+        :type gold: list(list(tuple(str, str)))
+        :param alpha: Ratio of the cost of false negatives compared to false
+            positives, as used in the f-measure computation. Defaults to 0.5,
+            where the costs are equal.
+        :type alpha: float
+        :param truncate: If specified, then only show the specified
+            number of values. Any sorting (e.g., sort_by_count)
+            will be performed before truncation.
Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on number of + occurrences of that tag in the ``gold`` data, defaults to False + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + cm = self.confusion(gold) + return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) + + def _check_params(self, train, model): + if (train and model) or (not train and not model): + raise ValueError("Must specify either training data or trained model.") + + +class FeaturesetTaggerI(TaggerI): + """ + A tagger that requires tokens to be ``featuresets``. A featureset + is a dictionary that maps from feature names to feature + values. See ``nltk.classify`` for more information about features + and featuresets. + """ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill.py new file mode 100644 index 0000000000000000000000000000000000000000..e440c8ab049cd2e7b64ee5b492b63198843554ba --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/brill.py @@ -0,0 +1,449 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from collections import Counter, defaultdict + +from nltk import jsontags +from nltk.tag import TaggerI +from nltk.tbl import Feature, Template + +###################################################################### +# Brill Templates +###################################################################### + + +@jsontags.register_tag +class Word(Feature): + """ + Feature which examines the text (word) of nearby tokens. + """ + + json_tag = "nltk.tag.brill.Word" + + @staticmethod + def extract_property(tokens, index): + """@return: The given token's text.""" + return tokens[index][0] + + +@jsontags.register_tag +class Pos(Feature): + """ + Feature which examines the tags of nearby tokens. 
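+
+    For example (a small illustration of the ``extract_property`` hook)::
+
+        >>> Pos.extract_property([("the", "DT"), ("cat", "NN")], 1)
+        'NN'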
+ """ + + json_tag = "nltk.tag.brill.Pos" + + @staticmethod + def extract_property(tokens, index): + """@return: The given token's tag.""" + return tokens[index][1] + + +def nltkdemo18(): + """ + Return 18 templates, from the original nltk demo, in multi-feature syntax + """ + return [ + Template(Pos([-1])), + Template(Pos([1])), + Template(Pos([-2])), + Template(Pos([2])), + Template(Pos([-2, -1])), + Template(Pos([1, 2])), + Template(Pos([-3, -2, -1])), + Template(Pos([1, 2, 3])), + Template(Pos([-1]), Pos([1])), + Template(Word([-1])), + Template(Word([1])), + Template(Word([-2])), + Template(Word([2])), + Template(Word([-2, -1])), + Template(Word([1, 2])), + Template(Word([-3, -2, -1])), + Template(Word([1, 2, 3])), + Template(Word([-1]), Word([1])), + ] + + +def nltkdemo18plus(): + """ + Return 18 templates, from the original nltk demo, and additionally a few + multi-feature ones (the motivation is easy comparison with nltkdemo18) + """ + return nltkdemo18() + [ + Template(Word([-1]), Pos([1])), + Template(Pos([-1]), Word([1])), + Template(Word([-1]), Word([0]), Pos([1])), + Template(Pos([-1]), Word([0]), Word([1])), + Template(Pos([-1]), Word([0]), Pos([1])), + ] + + +def fntbl37(): + """ + Return 37 templates taken from the postagging task of the + fntbl distribution https://www.cs.jhu.edu/~rflorian/fntbl/ + (37 is after excluding a handful which do not condition on Pos[0]; + fntbl can do that but the current nltk implementation cannot.) + """ + return [ + Template(Word([0]), Word([1]), Word([2])), + Template(Word([-1]), Word([0]), Word([1])), + Template(Word([0]), Word([-1])), + Template(Word([0]), Word([1])), + Template(Word([0]), Word([2])), + Template(Word([0]), Word([-2])), + Template(Word([1, 2])), + Template(Word([-2, -1])), + Template(Word([1, 2, 3])), + Template(Word([-3, -2, -1])), + Template(Word([0]), Pos([2])), + Template(Word([0]), Pos([-2])), + Template(Word([0]), Pos([1])), + Template(Word([0]), Pos([-1])), + Template(Word([0])), + Template(Word([-2])), + Template(Word([2])), + Template(Word([1])), + Template(Word([-1])), + Template(Pos([-1]), Pos([1])), + Template(Pos([1]), Pos([2])), + Template(Pos([-1]), Pos([-2])), + Template(Pos([1])), + Template(Pos([-1])), + Template(Pos([-2])), + Template(Pos([2])), + Template(Pos([1, 2, 3])), + Template(Pos([1, 2])), + Template(Pos([-3, -2, -1])), + Template(Pos([-2, -1])), + Template(Pos([1]), Word([0]), Word([1])), + Template(Pos([1]), Word([0]), Word([-1])), + Template(Pos([-1]), Word([-1]), Word([0])), + Template(Pos([-1]), Word([0]), Word([1])), + Template(Pos([-2]), Pos([-1])), + Template(Pos([1]), Pos([2])), + Template(Pos([1]), Pos([2]), Word([1])), + ] + + +def brill24(): + """ + Return 24 templates of the seminal TBL paper, Brill (1995) + """ + return [ + Template(Pos([-1])), + Template(Pos([1])), + Template(Pos([-2])), + Template(Pos([2])), + Template(Pos([-2, -1])), + Template(Pos([1, 2])), + Template(Pos([-3, -2, -1])), + Template(Pos([1, 2, 3])), + Template(Pos([-1]), Pos([1])), + Template(Pos([-2]), Pos([-1])), + Template(Pos([1]), Pos([2])), + Template(Word([-1])), + Template(Word([1])), + Template(Word([-2])), + Template(Word([2])), + Template(Word([-2, -1])), + Template(Word([1, 2])), + Template(Word([-1, 0])), + Template(Word([0, 1])), + Template(Word([0])), + Template(Word([-1]), Pos([-1])), + Template(Word([1]), Pos([1])), + Template(Word([0]), Word([-1]), Pos([-1])), + Template(Word([0]), Word([1]), Pos([1])), + ] + + +def describe_template_sets(): + """ + Print the available template sets in this demo, 
with a short description
+    """
+    import inspect
+    import sys
+
+    # a bit of magic to get all functions in this module
+    templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
+    for (name, obj) in templatesets:
+        if name == "describe_template_sets":
+            continue
+        print(name, obj.__doc__, "\n")
+
+
+######################################################################
+# The Brill Tagger
+######################################################################
+
+
+@jsontags.register_tag
+class BrillTagger(TaggerI):
+    """
+    Brill's transformational rule-based tagger. Brill taggers use an
+    initial tagger (such as ``tag.DefaultTagger``) to assign an initial
+    tag sequence to a text; and then apply an ordered list of
+    transformational rules to correct the tags of individual tokens.
+    These transformation rules are specified by the ``TagRule``
+    interface.
+
+    Brill taggers can be created directly, from an initial tagger and
+    a list of transformational rules; but more often, Brill taggers
+    are created by learning rules from a training corpus, using one
+    of the TaggerTrainers available.
+    """
+
+    json_tag = "nltk.tag.BrillTagger"
+
+    def __init__(self, initial_tagger, rules, training_stats=None):
+        """
+        :param initial_tagger: The initial tagger
+        :type initial_tagger: TaggerI
+
+        :param rules: An ordered list of transformation rules that
+            should be used to correct the initial tagging.
+        :type rules: list(TagRule)
+
+        :param training_stats: A dictionary of statistics collected
+            during training, for possible later use
+        :type training_stats: dict
+
+        """
+        self._initial_tagger = initial_tagger
+        self._rules = tuple(rules)
+        self._training_stats = training_stats
+
+    def encode_json_obj(self):
+        return self._initial_tagger, self._rules, self._training_stats
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _initial_tagger, _rules, _training_stats = obj
+        return cls(_initial_tagger, _rules, _training_stats)
+
+    def rules(self):
+        """
+        Return the ordered list of transformation rules that this tagger has learnt
+
+        :return: the ordered list of transformation rules that correct the initial tagging
+        :rtype: list of Rules
+        """
+        return self._rules
+
+    def train_stats(self, statistic=None):
+        """
+        Return a named statistic collected during training, or a dictionary of all
+        available statistics if no name given
+
+        :param statistic: name of statistic
+        :type statistic: str
+        :return: some statistic collected during training of this tagger
+        :rtype: any (but usually a number)
+        """
+        if statistic is None:
+            return self._training_stats
+        else:
+            return self._training_stats.get(statistic)
+
+    def tag(self, tokens):
+        # Inherit documentation from TaggerI
+
+        # Run the initial tagger.
+        tagged_tokens = self._initial_tagger.tag(tokens)
+
+        # Create a dictionary that maps each tag to a list of the
+        # indices of tokens that have that tag.
+        tag_to_positions = defaultdict(set)
+        for i, (token, tag) in enumerate(tagged_tokens):
+            tag_to_positions[tag].add(i)
+
+        # Apply each rule, in order.  Only try to apply rules at
+        # positions that have the desired original tag.
+        for rule in self._rules:
+            # Find the positions where it might apply
+            positions = tag_to_positions.get(rule.original_tag, [])
+            # Apply the rule at those positions.
+            changed = rule.apply(tagged_tokens, positions)
+            # Update tag_to_positions with the positions of tags that
+            # were modified.
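+            # (Rules mutate tagged_tokens in place and report the changed
+            # indices, so the position index stays accurate for later rules.)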
+            for i in changed:
+                tag_to_positions[rule.original_tag].remove(i)
+                tag_to_positions[rule.replacement_tag].add(i)
+
+        return tagged_tokens
+
+    def print_template_statistics(self, test_stats=None, printunused=True):
+        """
+        Print a list of all templates, ranked according to efficiency.
+
+        If test_stats is available, the templates are ranked according to their
+        relative contribution (summed for all rules created from a given template,
+        weighted by score) to the performance on the test set. If no test_stats, then
+        statistics collected during training are used instead. There is also
+        an unweighted measure (just counting the rules). This is less informative,
+        though, as many low-score rules will appear towards the end of training.
+
+        :param test_stats: dictionary of statistics collected during testing
+        :type test_stats: dict of str -> any (but usually numbers)
+        :param printunused: if True, print a list of all unused templates
+        :type printunused: bool
+        :return: None
+        :rtype: None
+        """
+        tids = [r.templateid for r in self._rules]
+        train_stats = self.train_stats()
+
+        trainscores = train_stats["rulescores"]
+        assert len(trainscores) == len(
+            tids
+        ), "corrupt statistics: " "{} train scores for {} rules".format(
+            trainscores, tids
+        )
+        template_counts = Counter(tids)
+        weighted_traincounts = Counter()
+        for (tid, score) in zip(tids, trainscores):
+            weighted_traincounts[tid] += score
+        tottrainscores = sum(trainscores)
+
+        # det_tplsort() is for deterministic sorting;
+        # the otherwise convenient Counter.most_common() unfortunately
+        # does not break ties deterministically
+        # between python versions and will break cross-version tests
+        def det_tplsort(tpl_value):
+            return (tpl_value[1], repr(tpl_value[0]))
+
+        def print_train_stats():
+            print(
+                "TEMPLATE STATISTICS (TRAIN) ({} templates, {} rules)".format(
+                    len(template_counts), len(tids)
+                )
+            )
+            print(
+                "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
+            )
+            head = "#ID | Score (train) | #Rules | Template"
+            print(head, "\n", "-" * len(head), sep="")
+            train_tplscores = sorted(
+                weighted_traincounts.items(), key=det_tplsort, reverse=True
+            )
+            for (tid, trainscore) in train_tplscores:
+                s = "{} | {:5d} {:5.3f} |{:4d} {:.3f} | {}".format(
+                    tid,
+                    trainscore,
+                    trainscore / tottrainscores,
+                    template_counts[tid],
+                    template_counts[tid] / len(tids),
+                    Template.ALLTEMPLATES[int(tid)],
+                )
+                print(s)
+
+        def print_testtrain_stats():
+            testscores = test_stats["rulescores"]
+            print(
+                "TEMPLATE STATISTICS (TEST AND TRAIN) ({} templates, {} rules)".format(
+                    len(template_counts), len(tids)
+                )
+            )
+            print(
+                "TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)
+            )
+            print(
+                "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
+            )
+            weighted_testcounts = Counter()
+            for (tid, score) in zip(tids, testscores):
+                weighted_testcounts[tid] += score
+            tottestscores = sum(testscores)
+            head = "#ID | Score (test) | Score (train) | #Rules | Template"
+            print(head, "\n", "-" * len(head), sep="")
+            test_tplscores = sorted(
+                weighted_testcounts.items(), key=det_tplsort, reverse=True
+            )
+            for (tid, testscore) in test_tplscores:
+                s = "{:s} |{:5d} {:6.3f} | {:4d} {:.3f} |{:4d} {:.3f} | {:s}".format(
+                    tid,
+                    testscore,
+                    testscore / tottestscores,
+                    weighted_traincounts[tid],
+                    weighted_traincounts[tid] / tottrainscores,
+                    template_counts[tid],
+                    template_counts[tid] / len(tids),
+                    Template.ALLTEMPLATES[int(tid)],
+                )
+                print(s)
+
+        def print_unused_templates():
+            usedtpls = {int(tid) for tid in tids}
+            unused = [
+                (tid, tpl)
+                for (tid, tpl) in enumerate(Template.ALLTEMPLATES)
+                if tid not in usedtpls
+            ]
+            print(f"UNUSED TEMPLATES ({len(unused)})")
+
+            for (tid, tpl) in unused:
+                print(f"{tid:03d} {str(tpl):s}")
+
+        if test_stats is None:
+            print_train_stats()
+        else:
+            print_testtrain_stats()
+        print()
+        if printunused:
+            print_unused_templates()
+        print()
+
+    def batch_tag_incremental(self, sequences, gold):
+        """
+        Tags by applying each rule to the entire corpus (rather than all rules to a
+        single sequence). The point is to collect statistics on the test set for
+        individual rules.
+
+        NOTE: This is inefficient (does not build any index, so will traverse the entire
+        corpus N times for N rules) -- usually you would not care about statistics for
+        individual rules and thus use batch_tag() instead
+
+        :param sequences: lists of token sequences (sentences, in some applications) to be tagged
+        :type sequences: list of list of strings
+        :param gold: the gold standard
+        :type gold: list of list of strings
+        :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule))
+        """
+
+        def counterrors(xs):
+            return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
+
+        testing_stats = {}
+        testing_stats["tokencount"] = sum(len(t) for t in sequences)
+        testing_stats["sequencecount"] = len(sequences)
+        tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
+        testing_stats["initialerrors"] = counterrors(tagged_tokenses)
+        testing_stats["initialacc"] = (
+            1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
+        )
+        # Apply each rule to the entire corpus, in order
+        errors = [testing_stats["initialerrors"]]
+        for rule in self._rules:
+            for tagged_tokens in tagged_tokenses:
+                rule.apply(tagged_tokens)
+            errors.append(counterrors(tagged_tokenses))
+        testing_stats["rulescores"] = [
+            err0 - err1 for (err0, err1) in zip(errors, errors[1:])
+        ]
+        testing_stats["finalerrors"] = errors[-1]
+        testing_stats["finalacc"] = (
+            1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
+        )
+        return (tagged_tokenses, testing_stats)
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hmm.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4db56705ec6e34c98b847e338fbc64ac88deb63
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/hmm.py
@@ -0,0 +1,1329 @@
+# Natural Language Toolkit: Hidden Markov Model
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Trevor Cohn
+#         Philip Blunsom
+#         Tiago Tresoldi (fixes)
+#         Steven Bird (fixes)
+#         Joseph Frazee (fixes)
+#         Steven Xu (fixes)
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+Hidden Markov Models (HMMs) are largely used to assign the correct label
+sequence to sequential data, or to assess the probability of a given label and
+data sequence. These models are finite state machines characterised by a number
+of states, transitions between these states, and output symbols emitted while
+in each state. The HMM is an extension of the Markov chain: in a Markov chain
+each state corresponds deterministically to an observable event, whereas in the
+HMM the observation is a probabilistic function of the state. HMMs share the
+Markov chain's assumption, being that the probability of transition from one
+state to another only depends on the current state - i.e. the series of states
+that led to the current state are not used. They are also time invariant.
+
+The HMM is a directed graph, with probability weighted edges (representing the
+probability of a transition between the source and sink states) where each
+vertex emits an output symbol when entered. The symbol (or observation) is
+non-deterministically generated. For this reason, knowing that a sequence of
+output observations was generated by a given HMM does not mean that the
+corresponding sequence of states (and what the current state is) is known.
+This is the 'hidden' in the hidden Markov model.
+
+Formally, an HMM can be characterised by:
+
+- the output observation alphabet. This is the set of symbols which may be
+  observed as output of the system.
+- the set of states.
+- the transition probabilities *a_{ij} = P(s_t = j | s_{t-1} = i)*. These
+  represent the probability of transition to each state from a given state.
+- the output probability matrix *b_i(k) = P(X_t = o_k | s_t = i)*. These
+  represent the probability of observing each symbol in a given state.
+- the initial state distribution. This gives the probability of starting
+  in each state.
+
+To ground this discussion, take a common NLP application, part-of-speech (POS)
+tagging. An HMM is desirable for this task as the highest probability tag
+sequence can be calculated for a given sequence of word forms. This differs
+from other tagging techniques which often tag each word individually, seeking
+to optimise each individual tagging greedily without regard to the optimal
+combination of tags for a larger unit, such as a sentence. The HMM does this
+with the Viterbi algorithm, which efficiently computes the optimal path
+through the graph given the sequence of word forms.
+
+In POS tagging the states usually have a 1:1 correspondence with the tag
+alphabet - i.e. each state represents a single tag. The output observation
+alphabet is the set of word forms (the lexicon), and the remaining three
+parameters are derived by a training regime. With this information the
+probability of a given sentence can be easily derived, by simply summing the
+probability of each distinct path through the model. Similarly, the highest
+probability tagging sequence can be derived with the Viterbi algorithm,
+yielding a state sequence which can be mapped into a tag sequence.
+
+This discussion assumes that the HMM has been trained. This is probably the
+most difficult task with the model, and requires either MLE estimates of the
+parameters or unsupervised learning using the Baum-Welch algorithm, a variant
+of EM.
+
+For more information, please consult the source code for this module,
+which includes extensive demonstration code.
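+
+A tiny supervised training sketch (illustrative only; the corpus below is a
+toy stand-in for real tagged data, and the Lidstone estimator mirrors the one
+used in ``demo_pos()``)::
+
+    from nltk.probability import LidstoneProbDist
+    from nltk.tag.hmm import HiddenMarkovModelTrainer
+
+    corpus = [
+        [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")],
+        [("the", "DT"), ("cat", "NN"), ("sleeps", "VBZ")],
+    ]
+    trainer = HiddenMarkovModelTrainer()
+    tagger = trainer.train_supervised(
+        corpus, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)
+    )
+    print(tagger.tag(["the", "dog", "sleeps"]))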
+""" + +import itertools +import re + +try: + import numpy as np +except ImportError: + pass + +from nltk.metrics import accuracy +from nltk.probability import ( + ConditionalFreqDist, + ConditionalProbDist, + DictionaryConditionalProbDist, + DictionaryProbDist, + FreqDist, + LidstoneProbDist, + MLEProbDist, + MutableProbDist, + RandomProbDist, +) +from nltk.tag.api import TaggerI +from nltk.util import LazyMap, unique_list + +_TEXT = 0 # index of text in a tuple +_TAG = 1 # index of tag in a tuple + + +def _identity(labeled_symbols): + return labeled_symbols + + +class HiddenMarkovModelTagger(TaggerI): + """ + Hidden Markov model class, a generative model for labelling sequence data. + These models define the joint probability of a sequence of symbols and + their labels (state transitions) as the product of the starting state + probability, the probability of each state transition, and the probability + of each observation being generated from each state. This is described in + more detail in the module documentation. + + This implementation is based on the HMM description in Chapter 8, Huang, + Acero and Hon, Spoken Language Processing and includes an extension for + training shallow HMM parsers or specialized HMMs as in Molina et. + al, 2002. A specialized HMM modifies training data by applying a + specialization function to create a new training set that is more + appropriate for sequential tagging with an HMM. A typical use case is + chunking. + + :param symbols: the set of output symbols (alphabet) + :type symbols: seq of any + :param states: a set of states representing state space + :type states: seq of any + :param transitions: transition probabilities; Pr(s_i | s_j) is the + probability of transition from state i given the model is in + state_j + :type transitions: ConditionalProbDistI + :param outputs: output probabilities; Pr(o_k | s_i) is the probability + of emitting symbol k when entering state i + :type outputs: ConditionalProbDistI + :param priors: initial state distribution; Pr(s_i) is the probability + of starting in state i + :type priors: ProbDistI + :param transform: an optional function for transforming training + instances, defaults to the identity function. 
+ :type transform: callable + """ + + def __init__( + self, symbols, states, transitions, outputs, priors, transform=_identity + ): + self._symbols = unique_list(symbols) + self._states = unique_list(states) + self._transitions = transitions + self._outputs = outputs + self._priors = priors + self._cache = None + self._transform = transform + + @classmethod + def _train( + cls, + labeled_sequence, + test_sequence=None, + unlabeled_sequence=None, + transform=_identity, + estimator=None, + **kwargs, + ): + + if estimator is None: + + def estimator(fd, bins): + return LidstoneProbDist(fd, 0.1, bins) + + labeled_sequence = LazyMap(transform, labeled_sequence) + symbols = unique_list(word for sent in labeled_sequence for word, tag in sent) + tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent) + + trainer = HiddenMarkovModelTrainer(tag_set, symbols) + hmm = trainer.train_supervised(labeled_sequence, estimator=estimator) + hmm = cls( + hmm._symbols, + hmm._states, + hmm._transitions, + hmm._outputs, + hmm._priors, + transform=transform, + ) + + if test_sequence: + hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) + + if unlabeled_sequence: + max_iterations = kwargs.get("max_iterations", 5) + hmm = trainer.train_unsupervised( + unlabeled_sequence, model=hmm, max_iterations=max_iterations + ) + if test_sequence: + hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) + + return hmm + + @classmethod + def train( + cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, **kwargs + ): + """ + Train a new HiddenMarkovModelTagger using the given labeled and + unlabeled training instances. Testing will be performed if test + instances are provided. + + :return: a hidden markov model tagger + :rtype: HiddenMarkovModelTagger + :param labeled_sequence: a sequence of labeled training instances, + i.e. a list of sentences represented as tuples + :type labeled_sequence: list(list) + :param test_sequence: a sequence of labeled test instances + :type test_sequence: list(list) + :param unlabeled_sequence: a sequence of unlabeled training instances, + i.e. a list of sentences represented as words + :type unlabeled_sequence: list(list) + :param transform: an optional function for transforming training + instances, defaults to the identity function, see ``transform()`` + :type transform: function + :param estimator: an optional function or class that maps a + condition's frequency distribution to its probability + distribution, defaults to a Lidstone distribution with gamma = 0.1 + :type estimator: class or function + :param verbose: boolean flag indicating whether training should be + verbose or include printed output + :type verbose: bool + :param max_iterations: number of Baum-Welch iterations to perform + :type max_iterations: int + """ + return cls._train(labeled_sequence, test_sequence, unlabeled_sequence, **kwargs) + + def probability(self, sequence): + """ + Returns the probability of the given symbol sequence. If the sequence + is labelled, then returns the joint probability of the symbol, state + sequence. Otherwise, uses the forward algorithm to find the + probability over all label sequences. 
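+
+        For example, with the toy market model built by
+        ``_market_hmm_example()`` (a sketch mirroring ``demo()`` below)::
+
+            model, states, symbols = _market_hmm_example()
+            p = model.probability([("up", None), ("up", None)])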
+ + :return: the probability of the sequence + :rtype: float + :param sequence: the sequence of symbols which must contain the TEXT + property, and optionally the TAG property + :type sequence: Token + """ + return 2 ** (self.log_probability(self._transform(sequence))) + + def log_probability(self, sequence): + """ + Returns the log-probability of the given symbol sequence. If the + sequence is labelled, then returns the joint log-probability of the + symbol, state sequence. Otherwise, uses the forward algorithm to find + the log-probability over all label sequences. + + :return: the log-probability of the sequence + :rtype: float + :param sequence: the sequence of symbols which must contain the TEXT + property, and optionally the TAG property + :type sequence: Token + """ + sequence = self._transform(sequence) + + T = len(sequence) + + if T > 0 and sequence[0][_TAG]: + last_state = sequence[0][_TAG] + p = self._priors.logprob(last_state) + self._output_logprob( + last_state, sequence[0][_TEXT] + ) + for t in range(1, T): + state = sequence[t][_TAG] + p += self._transitions[last_state].logprob( + state + ) + self._output_logprob(state, sequence[t][_TEXT]) + last_state = state + return p + else: + alpha = self._forward_probability(sequence) + p = logsumexp2(alpha[T - 1]) + return p + + def tag(self, unlabeled_sequence): + """ + Tags the sequence with the highest probability state sequence. This + uses the best_path method to find the Viterbi path. + + :return: a labelled sequence of symbols + :rtype: list + :param unlabeled_sequence: the sequence of unlabeled symbols + :type unlabeled_sequence: list + """ + unlabeled_sequence = self._transform(unlabeled_sequence) + return self._tag(unlabeled_sequence) + + def _tag(self, unlabeled_sequence): + path = self._best_path(unlabeled_sequence) + return list(zip(unlabeled_sequence, path)) + + def _output_logprob(self, state, symbol): + """ + :return: the log probability of the symbol being observed in the given + state + :rtype: float + """ + return self._outputs[state].logprob(symbol) + + def _create_cache(self): + """ + The cache is a tuple (P, O, X, S) where: + + - S maps symbols to integers. 
I.e., it is the inverse + mapping from self._symbols; for each symbol s in + self._symbols, the following is true:: + + self._symbols[S[s]] == s + + - O is the log output probabilities:: + + O[i,k] = log( P(token[t]=sym[k]|tag[t]=state[i]) ) + + - X is the log transition probabilities:: + + X[i,j] = log( P(tag[t]=state[j]|tag[t-1]=state[i]) ) + + - P is the log prior probabilities:: + + P[i] = log( P(tag[0]=state[i]) ) + """ + if not self._cache: + N = len(self._states) + M = len(self._symbols) + P = np.zeros(N, np.float32) + X = np.zeros((N, N), np.float32) + O = np.zeros((N, M), np.float32) + for i in range(N): + si = self._states[i] + P[i] = self._priors.logprob(si) + for j in range(N): + X[i, j] = self._transitions[si].logprob(self._states[j]) + for k in range(M): + O[i, k] = self._output_logprob(si, self._symbols[k]) + S = {} + for k in range(M): + S[self._symbols[k]] = k + self._cache = (P, O, X, S) + + def _update_cache(self, symbols): + # add new symbols to the symbol table and repopulate the output + # probabilities and symbol table mapping + if symbols: + self._create_cache() + P, O, X, S = self._cache + for symbol in symbols: + if symbol not in self._symbols: + self._cache = None + self._symbols.append(symbol) + # don't bother with the work if there aren't any new symbols + if not self._cache: + N = len(self._states) + M = len(self._symbols) + Q = O.shape[1] + # add new columns to the output probability table without + # destroying the old probabilities + O = np.hstack([O, np.zeros((N, M - Q), np.float32)]) + for i in range(N): + si = self._states[i] + # only calculate probabilities for new symbols + for k in range(Q, M): + O[i, k] = self._output_logprob(si, self._symbols[k]) + # only create symbol mappings for new symbols + for k in range(Q, M): + S[self._symbols[k]] = k + self._cache = (P, O, X, S) + + def reset_cache(self): + self._cache = None + + def best_path(self, unlabeled_sequence): + """ + Returns the state sequence of the optimal (most probable) path through + the HMM. Uses the Viterbi algorithm to calculate this part by dynamic + programming. + + :return: the state sequence + :rtype: sequence of any + :param unlabeled_sequence: the sequence of unlabeled symbols + :type unlabeled_sequence: list + """ + unlabeled_sequence = self._transform(unlabeled_sequence) + return self._best_path(unlabeled_sequence) + + def _best_path(self, unlabeled_sequence): + T = len(unlabeled_sequence) + N = len(self._states) + self._create_cache() + self._update_cache(unlabeled_sequence) + P, O, X, S = self._cache + + V = np.zeros((T, N), np.float32) + B = -np.ones((T, N), int) + + V[0] = P + O[:, S[unlabeled_sequence[0]]] + for t in range(1, T): + for j in range(N): + vs = V[t - 1, :] + X[:, j] + best = np.argmax(vs) + V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]] + B[t, j] = best + + current = np.argmax(V[T - 1, :]) + sequence = [current] + for t in range(T - 1, 0, -1): + last = B[t, current] + sequence.append(last) + current = last + + sequence.reverse() + return list(map(self._states.__getitem__, sequence)) + + def best_path_simple(self, unlabeled_sequence): + """ + Returns the state sequence of the optimal (most probable) path through + the HMM. Uses the Viterbi algorithm to calculate this part by dynamic + programming. This uses a simple, direct method, and is included for + teaching purposes. 
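+
+        For example (a sketch using the toy model from
+        ``_market_hmm_example()``; note the input is a bare list of symbols)::
+
+            model, states, symbols = _market_hmm_example()
+            print(model.best_path_simple(["up", "down", "up"]))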
+
+        :return: the state sequence
+        :rtype: sequence of any
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+        return self._best_path_simple(unlabeled_sequence)
+
+    def _best_path_simple(self, unlabeled_sequence):
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        V = np.zeros((T, N), np.float64)
+        B = {}
+
+        # find the starting log probabilities for each state
+        symbol = unlabeled_sequence[0]
+        for i, state in enumerate(self._states):
+            V[0, i] = self._priors.logprob(state) + self._output_logprob(state, symbol)
+            B[0, state] = None
+
+        # find the maximum log probabilities for reaching each state at time t
+        for t in range(1, T):
+            symbol = unlabeled_sequence[t]
+            for j in range(N):
+                sj = self._states[j]
+                best = None
+                for i in range(N):
+                    si = self._states[i]
+                    va = V[t - 1, i] + self._transitions[si].logprob(sj)
+                    if not best or va > best[0]:
+                        best = (va, si)
+                V[t, j] = best[0] + self._output_logprob(sj, symbol)
+                B[t, sj] = best[1]
+
+        # find the highest probability final state
+        best = None
+        for i in range(N):
+            val = V[T - 1, i]
+            if not best or val > best[0]:
+                best = (val, self._states[i])
+
+        # traverse the back-pointers B to find the state sequence
+        current = best[1]
+        sequence = [current]
+        for t in range(T - 1, 0, -1):
+            last = B[t, current]
+            sequence.append(last)
+            current = last
+
+        sequence.reverse()
+        return sequence
+
+    def random_sample(self, rng, length):
+        """
+        Randomly sample the HMM to generate a sentence of a given length. This
+        samples the prior distribution then the observation distribution and
+        transition distribution for each subsequent observation and state.
+        This will mostly generate unintelligible garbage, but can provide some
+        amusement.
+
+        :return: the randomly created state/observation sequence,
+            generated according to the HMM's probability
+            distributions. The SUBTOKENS have TEXT and TAG
+            properties containing the observation and state
+            respectively.
+        :rtype: list
+        :param rng: random number generator
+        :type rng: Random (or any object with a random() method)
+        :param length: desired output length
+        :type length: int
+        """
+
+        # sample the starting state and symbol prob dists
+        tokens = []
+        state = self._sample_probdist(self._priors, rng.random(), self._states)
+        symbol = self._sample_probdist(
+            self._outputs[state], rng.random(), self._symbols
+        )
+        tokens.append((symbol, state))
+
+        for i in range(1, length):
+            # sample the state transition and symbol prob dists
+            state = self._sample_probdist(
+                self._transitions[state], rng.random(), self._states
+            )
+            symbol = self._sample_probdist(
+                self._outputs[state], rng.random(), self._symbols
+            )
+            tokens.append((symbol, state))
+
+        return tokens
+
+    def _sample_probdist(self, probdist, p, samples):
+        cum_p = 0
+        for sample in samples:
+            add_p = probdist.prob(sample)
+            if cum_p <= p <= cum_p + add_p:
+                return sample
+            cum_p += add_p
+        raise Exception("Invalid probability distribution - does not sum to one")
+
+    def entropy(self, unlabeled_sequence):
+        """
+        Returns the entropy over labellings of the given sequence. This is
+        given by::
+
+            H(O) = - sum_S Pr(S | O) log Pr(S | O)
+
+        where the summation ranges over all state sequences, S. Let
+        *Z = Pr(O) = sum_S Pr(S, O)* where the summation ranges over all state
+        sequences and O is the observation sequence. As such the entropy can
+        be re-expressed as::
+
+            H = - sum_S Pr(S | O) log [ Pr(S, O) / Z ]
+              = log Z - sum_S Pr(S | O) log Pr(S, O)
+              = log Z - sum_S Pr(S | O) [ log Pr(S_0) + sum_t log Pr(S_t | S_{t-1}) + sum_t log Pr(O_t | S_t) ]
+
+        The order of summation for the log terms can be flipped, allowing
+        dynamic programming to be used to calculate the entropy. Specifically,
+        we use the forward and backward probabilities (alpha, beta) giving::
+
+            H = log Z - sum_s0 alpha_0(s0) beta_0(s0) / Z * log Pr(s0)
+                + sum_t,si,sj alpha_t(si) Pr(sj | si) Pr(O_t+1 | sj) beta_t(sj) / Z * log Pr(sj | si)
+                + sum_t,st alpha_t(st) beta_t(st) / Z * log Pr(O_t | st)
+
+        This simply uses alpha and beta to find the probabilities of partial
+        sequences, constrained to include the given state(s) at some point in
+        time.
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+
+        alpha = self._forward_probability(unlabeled_sequence)
+        beta = self._backward_probability(unlabeled_sequence)
+        normalisation = logsumexp2(alpha[T - 1])
+
+        entropy = normalisation
+
+        # starting state, t = 0
+        for i, state in enumerate(self._states):
+            p = 2 ** (alpha[0, i] + beta[0, i] - normalisation)
+            entropy -= p * self._priors.logprob(state)
+            # print('p(s_0 = %s) =' % state, p)
+
+        # state transitions
+        for t0 in range(T - 1):
+            t1 = t0 + 1
+            for i0, s0 in enumerate(self._states):
+                for i1, s1 in enumerate(self._states):
+                    p = 2 ** (
+                        alpha[t0, i0]
+                        + self._transitions[s0].logprob(s1)
+                        + self._outputs[s1].logprob(unlabeled_sequence[t1][_TEXT])
+                        + beta[t1, i1]
+                        - normalisation
+                    )
+                    entropy -= p * self._transitions[s0].logprob(s1)
+                    # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p)
+
+        # symbol emissions
+        for t in range(T):
+            for i, state in enumerate(self._states):
+                p = 2 ** (alpha[t, i] + beta[t, i] - normalisation)
+                entropy -= p * self._outputs[state].logprob(
+                    unlabeled_sequence[t][_TEXT]
+                )
+                # print('p(s_%d = %s) =' % (t, state), p)
+
+        return entropy
+
+    def point_entropy(self, unlabeled_sequence):
+        """
+        Returns the pointwise entropy over the possible states at each
+        position in the chain, given the observation sequence.
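+
+        Concretely, for each time step *t* this computes
+        *H_t = - sum_s Pr(S_t = s | O) log2 Pr(S_t = s | O)*,
+        using the forward and backward probabilities to obtain the
+        posterior over states at *t*.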
+ """ + unlabeled_sequence = self._transform(unlabeled_sequence) + + T = len(unlabeled_sequence) + N = len(self._states) + + alpha = self._forward_probability(unlabeled_sequence) + beta = self._backward_probability(unlabeled_sequence) + normalisation = logsumexp2(alpha[T - 1]) + + entropies = np.zeros(T, np.float64) + probs = np.zeros(N, np.float64) + for t in range(T): + for s in range(N): + probs[s] = alpha[t, s] + beta[t, s] - normalisation + + for s in range(N): + entropies[t] -= 2 ** (probs[s]) * probs[s] + + return entropies + + def _exhaustive_entropy(self, unlabeled_sequence): + unlabeled_sequence = self._transform(unlabeled_sequence) + + T = len(unlabeled_sequence) + N = len(self._states) + + labellings = [[state] for state in self._states] + for t in range(T - 1): + current = labellings + labellings = [] + for labelling in current: + for state in self._states: + labellings.append(labelling + [state]) + + log_probs = [] + for labelling in labellings: + labeled_sequence = unlabeled_sequence[:] + for t, label in enumerate(labelling): + labeled_sequence[t] = (labeled_sequence[t][_TEXT], label) + lp = self.log_probability(labeled_sequence) + log_probs.append(lp) + normalisation = _log_add(*log_probs) + + entropy = 0 + for lp in log_probs: + lp -= normalisation + entropy -= 2 ** (lp) * lp + + return entropy + + def _exhaustive_point_entropy(self, unlabeled_sequence): + unlabeled_sequence = self._transform(unlabeled_sequence) + + T = len(unlabeled_sequence) + N = len(self._states) + + labellings = [[state] for state in self._states] + for t in range(T - 1): + current = labellings + labellings = [] + for labelling in current: + for state in self._states: + labellings.append(labelling + [state]) + + log_probs = [] + for labelling in labellings: + labelled_sequence = unlabeled_sequence[:] + for t, label in enumerate(labelling): + labelled_sequence[t] = (labelled_sequence[t][_TEXT], label) + lp = self.log_probability(labelled_sequence) + log_probs.append(lp) + + normalisation = _log_add(*log_probs) + + probabilities = _ninf_array((T, N)) + + for labelling, lp in zip(labellings, log_probs): + lp -= normalisation + for t, label in enumerate(labelling): + index = self._states.index(label) + probabilities[t, index] = _log_add(probabilities[t, index], lp) + + entropies = np.zeros(T, np.float64) + for t in range(T): + for s in range(N): + entropies[t] -= 2 ** (probabilities[t, s]) * probabilities[t, s] + + return entropies + + def _transitions_matrix(self): + """Return a matrix of transition log probabilities.""" + trans_iter = ( + self._transitions[sj].logprob(si) + for sj in self._states + for si in self._states + ) + + transitions_logprob = np.fromiter(trans_iter, dtype=np.float64) + N = len(self._states) + return transitions_logprob.reshape((N, N)).T + + def _outputs_vector(self, symbol): + """ + Return a vector with log probabilities of emitting a symbol + when entering states. + """ + out_iter = (self._output_logprob(sj, symbol) for sj in self._states) + return np.fromiter(out_iter, dtype=np.float64) + + def _forward_probability(self, unlabeled_sequence): + """ + Return the forward probability matrix, a T by N array of + log-probabilities, where T is the length of the sequence and N is the + number of states. Each entry (t, s) gives the probability of being in + state s at time t after observing the partial symbol sequence up to + and including t. 
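+        (More precisely, each entry holds
+        *alpha_t(s) = log2 Pr(O_1, ..., O_t, S_t = s)*, a base-2 log joint
+        probability, consistent with the log-space arithmetic used
+        throughout this class.)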
+
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        :return: the forward log probability matrix
+        :rtype: array
+        """
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        alpha = _ninf_array((T, N))
+
+        transitions_logprob = self._transitions_matrix()
+
+        # Initialization
+        symbol = unlabeled_sequence[0][_TEXT]
+        for i, state in enumerate(self._states):
+            alpha[0, i] = self._priors.logprob(state) + self._output_logprob(
+                state, symbol
+            )
+
+        # Induction
+        for t in range(1, T):
+            symbol = unlabeled_sequence[t][_TEXT]
+            output_logprob = self._outputs_vector(symbol)
+
+            for i in range(N):
+                summand = alpha[t - 1] + transitions_logprob[i]
+                alpha[t, i] = logsumexp2(summand) + output_logprob[i]
+
+        return alpha
+
+    def _backward_probability(self, unlabeled_sequence):
+        """
+        Return the backward probability matrix, a T by N array of
+        log-probabilities, where T is the length of the sequence and N is the
+        number of states. Each entry (t, s) gives the probability of being in
+        state s at time t after observing the partial symbol sequence from t
+        .. T.
+
+        :return: the backward log probability matrix
+        :rtype: array
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        beta = _ninf_array((T, N))
+
+        transitions_logprob = self._transitions_matrix().T
+
+        # initialise the backward values;
+        # "1" is an arbitrarily chosen value from Rabiner tutorial
+        beta[T - 1, :] = np.log2(1)
+
+        # inductively calculate remaining backward values
+        for t in range(T - 2, -1, -1):
+            symbol = unlabeled_sequence[t + 1][_TEXT]
+            outputs = self._outputs_vector(symbol)
+
+            for i in range(N):
+                summand = transitions_logprob[i] + beta[t + 1] + outputs
+                beta[t, i] = logsumexp2(summand)
+
+        return beta
+
+    def test(self, test_sequence, verbose=False, **kwargs):
+        """
+        Tests the HiddenMarkovModelTagger instance.
+
+        :param test_sequence: a sequence of labeled test instances
+        :type test_sequence: list(list)
+        :param verbose: boolean flag indicating whether testing should be
+            verbose or include printed output
+        :type verbose: bool
+        """
+
+        def words(sent):
+            return [word for (word, tag) in sent]
+
+        def tags(sent):
+            return [tag for (word, tag) in sent]
+
+        def flatten(seq):
+            return list(itertools.chain(*seq))
+
+        test_sequence = self._transform(test_sequence)
+        predicted_sequence = list(map(self._tag, map(words, test_sequence)))
+
+        if verbose:
+            for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
+                print(
+                    "Test:",
+                    " ".join(f"{token}/{tag}" for (token, tag) in test_sent),
+                )
+                print()
+                print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent))
+                print()
+                print(
+                    "HMM-tagged:",
+                    " ".join(f"{token}/{tag}" for (token, tag) in predicted_sent),
+                )
+                print()
+                print(
+                    "Entropy:",
+                    self.entropy([(token, None) for (token, tag) in predicted_sent]),
+                )
+                print()
+                print("-" * 60)
+
+        test_tags = flatten(map(tags, test_sequence))
+        predicted_tags = flatten(map(tags, predicted_sequence))
+
+        acc = accuracy(test_tags, predicted_tags)
+        count = sum(len(sent) for sent in test_sequence)
+        print("accuracy over %d tokens: %.2f" % (count, acc * 100))
+
+    def __repr__(self):
+        return "<HiddenMarkovModelTagger %d states and %d output symbols>" % (
+            len(self._states),
+            len(self._symbols),
+        )
+
+
+class HiddenMarkovModelTrainer:
+    """
+    Algorithms for learning HMM parameters from training data. These include
+    both supervised learning (MLE) and unsupervised learning (Baum-Welch).
+ + Creates an HMM trainer to induce an HMM with the given states and + output symbol alphabet. A supervised and unsupervised training + method may be used. If either of the states or symbols are not given, + these may be derived from supervised training. + + :param states: the set of state labels + :type states: sequence of any + :param symbols: the set of observation symbols + :type symbols: sequence of any + """ + + def __init__(self, states=None, symbols=None): + self._states = states if states else [] + self._symbols = symbols if symbols else [] + + def train(self, labeled_sequences=None, unlabeled_sequences=None, **kwargs): + """ + Trains the HMM using both (or either of) supervised and unsupervised + techniques. + + :return: the trained model + :rtype: HiddenMarkovModelTagger + :param labelled_sequences: the supervised training data, a set of + labelled sequences of observations + ex: [ (word_1, tag_1),...,(word_n,tag_n) ] + :type labelled_sequences: list + :param unlabeled_sequences: the unsupervised training data, a set of + sequences of observations + ex: [ word_1, ..., word_n ] + :type unlabeled_sequences: list + :param kwargs: additional arguments to pass to the training methods + """ + assert labeled_sequences or unlabeled_sequences + model = None + if labeled_sequences: + model = self.train_supervised(labeled_sequences, **kwargs) + if unlabeled_sequences: + if model: + kwargs["model"] = model + model = self.train_unsupervised(unlabeled_sequences, **kwargs) + return model + + def _baum_welch_step(self, sequence, model, symbol_to_number): + + N = len(model._states) + M = len(model._symbols) + T = len(sequence) + + # compute forward and backward probabilities + alpha = model._forward_probability(sequence) + beta = model._backward_probability(sequence) + + # find the log probability of the sequence + lpk = logsumexp2(alpha[T - 1]) + + A_numer = _ninf_array((N, N)) + B_numer = _ninf_array((N, M)) + A_denom = _ninf_array(N) + B_denom = _ninf_array(N) + + transitions_logprob = model._transitions_matrix().T + + for t in range(T): + symbol = sequence[t][_TEXT] # not found? FIXME + next_symbol = None + if t < T - 1: + next_symbol = sequence[t + 1][_TEXT] # not found? FIXME + xi = symbol_to_number[symbol] + + next_outputs_logprob = model._outputs_vector(next_symbol) + alpha_plus_beta = alpha[t] + beta[t] + + if t < T - 1: + numer_add = ( + transitions_logprob + + next_outputs_logprob + + beta[t + 1] + + alpha[t].reshape(N, 1) + ) + A_numer = np.logaddexp2(A_numer, numer_add) + A_denom = np.logaddexp2(A_denom, alpha_plus_beta) + else: + B_denom = np.logaddexp2(A_denom, alpha_plus_beta) + + B_numer[:, xi] = np.logaddexp2(B_numer[:, xi], alpha_plus_beta) + + return lpk, A_numer, A_denom, B_numer, B_denom + + def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs): + """ + Trains the HMM using the Baum-Welch algorithm to maximise the + probability of the data sequence. This is a variant of the EM + algorithm, and is unsupervised in that it doesn't need the state + sequences for the symbols. The code is based on 'A Tutorial on Hidden + Markov Models and Selected Applications in Speech Recognition', + Lawrence Rabiner, IEEE, 1989. 
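+
+        A minimal sketch (illustrative only; the states, symbols and data
+        below are toy values)::
+
+            trainer = HiddenMarkovModelTrainer(
+                states=["hot", "cold"], symbols=["1", "2", "3"]
+            )
+            data = [[("3", None), ("1", None), ("2", None)]]
+            model = trainer.train_unsupervised(data, max_iterations=5)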
+ + :return: the trained model + :rtype: HiddenMarkovModelTagger + :param unlabeled_sequences: the training data, a set of + sequences of observations + :type unlabeled_sequences: list + + kwargs may include following parameters: + + :param model: a HiddenMarkovModelTagger instance used to begin + the Baum-Welch algorithm + :param max_iterations: the maximum number of EM iterations + :param convergence_logprob: the maximum change in log probability to + allow convergence + """ + + # create a uniform HMM, which will be iteratively refined, unless + # given an existing model + model = kwargs.get("model") + if not model: + priors = RandomProbDist(self._states) + transitions = DictionaryConditionalProbDist( + {state: RandomProbDist(self._states) for state in self._states} + ) + outputs = DictionaryConditionalProbDist( + {state: RandomProbDist(self._symbols) for state in self._states} + ) + model = HiddenMarkovModelTagger( + self._symbols, self._states, transitions, outputs, priors + ) + + self._states = model._states + self._symbols = model._symbols + + N = len(self._states) + M = len(self._symbols) + symbol_numbers = {sym: i for i, sym in enumerate(self._symbols)} + + # update model prob dists so that they can be modified + # model._priors = MutableProbDist(model._priors, self._states) + + model._transitions = DictionaryConditionalProbDist( + { + s: MutableProbDist(model._transitions[s], self._states) + for s in self._states + } + ) + + if update_outputs: + model._outputs = DictionaryConditionalProbDist( + { + s: MutableProbDist(model._outputs[s], self._symbols) + for s in self._states + } + ) + + model.reset_cache() + + # iterate until convergence + converged = False + last_logprob = None + iteration = 0 + max_iterations = kwargs.get("max_iterations", 1000) + epsilon = kwargs.get("convergence_logprob", 1e-6) + + while not converged and iteration < max_iterations: + A_numer = _ninf_array((N, N)) + B_numer = _ninf_array((N, M)) + A_denom = _ninf_array(N) + B_denom = _ninf_array(N) + + logprob = 0 + for sequence in unlabeled_sequences: + sequence = list(sequence) + if not sequence: + continue + + ( + lpk, + seq_A_numer, + seq_A_denom, + seq_B_numer, + seq_B_denom, + ) = self._baum_welch_step(sequence, model, symbol_numbers) + + # add these sums to the global A and B values + for i in range(N): + A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i] - lpk) + B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i] - lpk) + + A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk) + B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk) + + logprob += lpk + + # use the calculated values to update the transition and output + # probability values + for i in range(N): + logprob_Ai = A_numer[i] - A_denom[i] + logprob_Bi = B_numer[i] - B_denom[i] + + # We should normalize all probabilities (see p.391 Huang et al) + # Let sum(P) be K. + # We can divide each Pi by K to make sum(P) == 1. + # Pi' = Pi/K + # log2(Pi') = log2(Pi) - log2(K) + logprob_Ai -= logsumexp2(logprob_Ai) + logprob_Bi -= logsumexp2(logprob_Bi) + + # update output and transition probabilities + si = self._states[i] + + for j in range(N): + sj = self._states[j] + model._transitions[si].update(sj, logprob_Ai[j]) + + if update_outputs: + for k in range(M): + ok = self._symbols[k] + model._outputs[si].update(ok, logprob_Bi[k]) + + # Rabiner says the priors don't need to be updated. I don't + # believe him. 
FIXME + + # test for convergence + if iteration > 0 and abs(logprob - last_logprob) < epsilon: + converged = True + + print("iteration", iteration, "logprob", logprob) + iteration += 1 + last_logprob = logprob + + return model + + def train_supervised(self, labelled_sequences, estimator=None): + """ + Supervised training maximising the joint probability of the symbol and + state sequences. This is done via collecting frequencies of + transitions between states, symbol observations while within each + state and which states start a sentence. These frequency distributions + are then normalised into probability estimates, which can be + smoothed if desired. + + :return: the trained model + :rtype: HiddenMarkovModelTagger + :param labelled_sequences: the training data, a set of + labelled sequences of observations + :type labelled_sequences: list + :param estimator: a function taking + a FreqDist and a number of bins and returning a CProbDistI; + otherwise a MLE estimate is used + """ + + # default to the MLE estimate + if estimator is None: + estimator = lambda fdist, bins: MLEProbDist(fdist) + + # count occurrences of starting states, transitions out of each state + # and output symbols observed in each state + known_symbols = set(self._symbols) + known_states = set(self._states) + + starting = FreqDist() + transitions = ConditionalFreqDist() + outputs = ConditionalFreqDist() + for sequence in labelled_sequences: + lasts = None + for token in sequence: + state = token[_TAG] + symbol = token[_TEXT] + if lasts is None: + starting[state] += 1 + else: + transitions[lasts][state] += 1 + outputs[state][symbol] += 1 + lasts = state + + # update the state and symbol lists + if state not in known_states: + self._states.append(state) + known_states.add(state) + + if symbol not in known_symbols: + self._symbols.append(symbol) + known_symbols.add(symbol) + + # create probability distributions (with smoothing) + N = len(self._states) + pi = estimator(starting, N) + A = ConditionalProbDist(transitions, estimator, N) + B = ConditionalProbDist(outputs, estimator, len(self._symbols)) + + return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi) + + +def _ninf_array(shape): + res = np.empty(shape, np.float64) + res.fill(-np.inf) + return res + + +def logsumexp2(arr): + max_ = arr.max() + return np.log2(np.sum(2 ** (arr - max_))) + max_ + + +def _log_add(*values): + """ + Adds the logged values, returning the logarithm of the addition. 
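+
+    For example, ``_log_add(2.0, 2.0)`` returns ``3.0``, since
+    ``2**2 + 2**2 == 2**3`` (the logarithms here are base 2).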
+ """ + x = max(values) + if x > -np.inf: + sum_diffs = 0 + for value in values: + sum_diffs += 2 ** (value - x) + return x + np.log2(sum_diffs) + else: + return x + + +def _create_hmm_tagger(states, symbols, A, B, pi): + def pd(values, samples): + d = dict(zip(samples, values)) + return DictionaryProbDist(d) + + def cpd(array, conditions, samples): + d = {} + for values, condition in zip(array, conditions): + d[condition] = pd(values, samples) + return DictionaryConditionalProbDist(d) + + A = cpd(A, states, states) + B = cpd(B, states, symbols) + pi = pd(pi, states) + return HiddenMarkovModelTagger( + symbols=symbols, states=states, transitions=A, outputs=B, priors=pi + ) + + +def _market_hmm_example(): + """ + Return an example HMM (described at page 381, Huang et al) + """ + states = ["bull", "bear", "static"] + symbols = ["up", "down", "unchanged"] + A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64) + B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64) + pi = np.array([0.5, 0.2, 0.3], np.float64) + + model = _create_hmm_tagger(states, symbols, A, B, pi) + return model, states, symbols + + +def demo(): + # demonstrates HMM probability calculation + + print() + print("HMM probability calculation demo") + print() + + model, states, symbols = _market_hmm_example() + + print("Testing", model) + + for test in [ + ["up", "up"], + ["up", "down", "up"], + ["down"] * 5, + ["unchanged"] * 5 + ["up"], + ]: + + sequence = [(t, None) for t in test] + + print("Testing with state sequence", test) + print("probability =", model.probability(sequence)) + print("tagging = ", model.tag([word for (word, tag) in sequence])) + print("p(tagged) = ", model.probability(sequence)) + print("H = ", model.entropy(sequence)) + print("H_exh = ", model._exhaustive_entropy(sequence)) + print("H(point) = ", model.point_entropy(sequence)) + print("H_exh(point)=", model._exhaustive_point_entropy(sequence)) + print() + + +def load_pos(num_sents): + from nltk.corpus import brown + + sentences = brown.tagged_sents(categories="news")[:num_sents] + + tag_re = re.compile(r"[*]|--|[^+*-]+") + tag_set = set() + symbols = set() + + cleaned_sentences = [] + for sentence in sentences: + for i in range(len(sentence)): + word, tag = sentence[i] + word = word.lower() # normalize + symbols.add(word) # log this word + # Clean up the tag. + tag = tag_re.match(tag).group() + tag_set.add(tag) + sentence[i] = (word, tag) # store cleaned-up tagged token + cleaned_sentences += [sentence] + + return cleaned_sentences, list(tag_set), list(symbols) + + +def demo_pos(): + # demonstrates POS tagging using supervised training + + print() + print("HMM POS tagging demo") + print() + + print("Training HMM...") + labelled_sequences, tag_set, symbols = load_pos(20000) + trainer = HiddenMarkovModelTrainer(tag_set, symbols) + hmm = trainer.train_supervised( + labelled_sequences[10:], + estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), + ) + + print("Testing...") + hmm.test(labelled_sequences[:10], verbose=True) + + +def _untag(sentences): + unlabeled = [] + for sentence in sentences: + unlabeled.append([(token[_TEXT], None) for token in sentence]) + return unlabeled + + +def demo_pos_bw( + test=10, supervised=20, unsupervised=10, verbose=True, max_iterations=5 +): + # demonstrates the Baum-Welch algorithm in POS tagging + + print() + print("Baum-Welch demo for POS tagging") + print() + + print("Training HMM (supervised, %d sentences)..." 
% supervised) + + sentences, tag_set, symbols = load_pos(test + supervised + unsupervised) + + symbols = set() + for sentence in sentences: + for token in sentence: + symbols.add(token[_TEXT]) + + trainer = HiddenMarkovModelTrainer(tag_set, list(symbols)) + hmm = trainer.train_supervised( + sentences[test : test + supervised], + estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), + ) + + hmm.test(sentences[:test], verbose=verbose) + + print("Training (unsupervised, %d sentences)..." % unsupervised) + # it's rather slow - so only use 10 samples by default + unlabeled = _untag(sentences[test + supervised :]) + hmm = trainer.train_unsupervised( + unlabeled, model=hmm, max_iterations=max_iterations + ) + hmm.test(sentences[:test], verbose=verbose) + + +def demo_bw(): + # demo Baum Welch by generating some sequences and then performing + # unsupervised training on them + + print() + print("Baum-Welch demo for market example") + print() + + model, states, symbols = _market_hmm_example() + + # generate some random sequences + training = [] + import random + + rng = random.Random() + rng.seed(0) + for i in range(10): + item = model.random_sample(rng, 5) + training.append([(i[0], None) for i in item]) + + # train on those examples, starting with the model that generated them + trainer = HiddenMarkovModelTrainer(states, symbols) + hmm = trainer.train_unsupervised(training, model=model, max_iterations=1000) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/perceptron.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..9afe08f0c8d6a9d5852a225e6c9569a291fb1e3d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/perceptron.py @@ -0,0 +1,371 @@ +# This module is a port of the Textblob Averaged Perceptron Tagger +# Author: Matthew Honnibal , +# Long Duong (NLTK port) +# URL: +# +# Copyright 2013 Matthew Honnibal +# NLTK modifications Copyright 2015 The NLTK Project +# +# This module is provided under the terms of the MIT License. + +import logging +import pickle +import random +from collections import defaultdict + +from nltk import jsontags +from nltk.data import find, load +from nltk.tag.api import TaggerI + +try: + import numpy as np +except ImportError: + pass + +PICKLE = "averaged_perceptron_tagger.pickle" + + +@jsontags.register_tag +class AveragedPerceptron: + + """An averaged perceptron, as implemented by Matthew Honnibal. + + See more implementation details here: + https://explosion.ai/blog/part-of-speech-pos-tagger-in-python + """ + + json_tag = "nltk.tag.perceptron.AveragedPerceptron" + + def __init__(self, weights=None): + # Each feature gets its own weight vector, so weights is a dict-of-dicts + self.weights = weights if weights else {} + self.classes = set() + # The accumulated values, for the averaging. These will be keyed by + # feature/clas tuples + self._totals = defaultdict(int) + # The last time the feature was changed, for the averaging. 
Also + # keyed by feature/clas tuples + # (tstamps is short for timestamps) + self._tstamps = defaultdict(int) + # Number of instances seen + self.i = 0 + + def _softmax(self, scores): + s = np.fromiter(scores.values(), dtype=float) + exps = np.exp(s) + return exps / np.sum(exps) + + def predict(self, features, return_conf=False): + """Dot-product the features and current weights and return the best label.""" + scores = defaultdict(float) + for feat, value in features.items(): + if feat not in self.weights or value == 0: + continue + weights = self.weights[feat] + for label, weight in weights.items(): + scores[label] += value * weight + + # Do a secondary alphabetic sort, for stability + best_label = max(self.classes, key=lambda label: (scores[label], label)) + # compute the confidence + conf = max(self._softmax(scores)) if return_conf == True else None + + return best_label, conf + + def update(self, truth, guess, features): + """Update the feature weights.""" + + def upd_feat(c, f, w, v): + param = (f, c) + self._totals[param] += (self.i - self._tstamps[param]) * w + self._tstamps[param] = self.i + self.weights[f][c] = w + v + + self.i += 1 + if truth == guess: + return None + for f in features: + weights = self.weights.setdefault(f, {}) + upd_feat(truth, f, weights.get(truth, 0.0), 1.0) + upd_feat(guess, f, weights.get(guess, 0.0), -1.0) + + def average_weights(self): + """Average weights from all iterations.""" + for feat, weights in self.weights.items(): + new_feat_weights = {} + for clas, weight in weights.items(): + param = (feat, clas) + total = self._totals[param] + total += (self.i - self._tstamps[param]) * weight + averaged = round(total / self.i, 3) + if averaged: + new_feat_weights[clas] = averaged + self.weights[feat] = new_feat_weights + + def save(self, path): + """Save the pickled model weights.""" + with open(path, "wb") as fout: + return pickle.dump(dict(self.weights), fout) + + def load(self, path): + """Load the pickled model weights.""" + self.weights = load(path) + + def encode_json_obj(self): + return self.weights + + @classmethod + def decode_json_obj(cls, obj): + return cls(obj) + + +@jsontags.register_tag +class PerceptronTagger(TaggerI): + + """ + Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. + See more implementation details here: + https://explosion.ai/blog/part-of-speech-pos-tagger-in-python + + >>> from nltk.tag.perceptron import PerceptronTagger + + Train the model + + >>> tagger = PerceptronTagger(load=False) + + >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], + ... [('yes','NNS'),('it','PRP'),('beautiful','JJ')]]) + + >>> tagger.tag(['today','is','a','beautiful','day']) + [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')] + + Use the pretrain model (the default constructor) + + >>> pretrain = PerceptronTagger() + + >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split()) + [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')] + + >>> pretrain.tag("The red cat".split()) + [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')] + """ + + json_tag = "nltk.tag.sequential.PerceptronTagger" + + START = ["-START-", "-START2-"] + END = ["-END-", "-END2-"] + + def __init__(self, load=True): + """ + :param load: Load the pickled model upon instantiation. 
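+            Pass ``load=False`` to start from an empty model, e.g. when
+            training a tagger from scratch with ``train()``.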
+        """
+        self.model = AveragedPerceptron()
+        self.tagdict = {}
+        self.classes = set()
+        if load:
+            AP_MODEL_LOC = "file:" + str(
+                find("taggers/averaged_perceptron_tagger/" + PICKLE)
+            )
+            self.load(AP_MODEL_LOC)
+
+    def tag(self, tokens, return_conf=False, use_tagdict=True):
+        """
+        Tag a tokenized sentence.
+        :param tokens: the list of words to tag
+        :type tokens: list(str)
+        """
+        prev, prev2 = self.START
+        output = []
+
+        context = self.START + [self.normalize(w) for w in tokens] + self.END
+        for i, word in enumerate(tokens):
+            tag, conf = (
+                (self.tagdict.get(word), 1.0) if use_tagdict else (None, None)
+            )
+            if not tag:
+                features = self._get_features(i, word, context, prev, prev2)
+                tag, conf = self.model.predict(features, return_conf)
+            output.append((word, tag, conf) if return_conf else (word, tag))
+
+            prev2 = prev
+            prev = tag
+
+        return output
+
+    def train(self, sentences, save_loc=None, nr_iter=5):
+        """Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+        controls the number of Perceptron training iterations.
+
+        :param sentences: A list or iterator of sentences, where each sentence
+            is a list of (words, tags) tuples.
+        :param save_loc: If not ``None``, saves a pickled model in this location.
+        :param nr_iter: Number of training iterations.
+        """
+        # We'd like to allow ``sentences`` to be either a list or an iterator,
+        # the latter being especially important for a large training dataset.
+        # Because ``self._make_tagdict(sentences)`` runs regardless, we make
+        # it populate ``self._sentences`` (a list) with all the sentences.
+        # This saves the overhead of just iterating through ``sentences`` to
+        # get the list by ``sentences = list(sentences)``.
+
+        self._sentences = list()  # to be populated by self._make_tagdict...
+        self._make_tagdict(sentences)
+        self.model.classes = self.classes
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for sentence in self._sentences:
+                words, tags = zip(*sentence)
+
+                prev, prev2 = self.START
+                context = self.START + [self.normalize(w) for w in words] + self.END
+                for i, word in enumerate(words):
+                    guess = self.tagdict.get(word)
+                    if not guess:
+                        feats = self._get_features(i, word, context, prev, prev2)
+                        guess, _ = self.model.predict(feats)
+                        self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            random.shuffle(self._sentences)
+            logging.info(f"Iter {iter_}: {c}/{n}={_pc(c, n)}")
+
+        # We don't need the training sentences anymore, and we don't want to
+        # waste space on them when we pickle the trained tagger.
+        self._sentences = None
+
+        self.model.average_weights()
+        # Pickle as a binary file
+        if save_loc is not None:
+            with open(save_loc, "wb") as fout:
+                # changed protocol from -1 to 2 to make pickling Python 2 compatible
+                pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
+
+    def load(self, loc):
+        """
+        :param loc: Location of the pickled model to load.
+        :type loc: str
+        """
+
+        self.model.weights, self.tagdict, self.classes = load(loc)
+        self.model.classes = self.classes
+
+    def encode_json_obj(self):
+        return self.model.weights, self.tagdict, list(self.classes)
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        tagger = cls(load=False)
+        tagger.model.weights, tagger.tagdict, tagger.classes = obj
+        tagger.classes = set(tagger.classes)
+        tagger.model.classes = tagger.classes
+        return tagger
+
+    def normalize(self, word):
+        """
+        Normalization used in pre-processing.
+ - All words are lower cased + - Groups of digits of length 4 are represented as !YEAR; + - Other digits are represented as !DIGITS + + :rtype: str + """ + if "-" in word and word[0] != "-": + return "!HYPHEN" + if word.isdigit() and len(word) == 4: + return "!YEAR" + if word and word[0].isdigit(): + return "!DIGITS" + return word.lower() + + def _get_features(self, i, word, context, prev, prev2): + """Map tokens into a feature representation, implemented as a + {hashable: int} dict. If the features change, a new model must be + trained. + """ + + def add(name, *args): + features[" ".join((name,) + tuple(args))] += 1 + + i += len(self.START) + features = defaultdict(int) + # It's useful to have a constant feature, which acts sort of like a prior + add("bias") + add("i suffix", word[-3:]) + add("i pref1", word[0] if word else "") + add("i-1 tag", prev) + add("i-2 tag", prev2) + add("i tag+i-2 tag", prev, prev2) + add("i word", context[i]) + add("i-1 tag+i word", prev, context[i]) + add("i-1 word", context[i - 1]) + add("i-1 suffix", context[i - 1][-3:]) + add("i-2 word", context[i - 2]) + add("i+1 word", context[i + 1]) + add("i+1 suffix", context[i + 1][-3:]) + add("i+2 word", context[i + 2]) + return features + + def _make_tagdict(self, sentences): + """ + Make a tag dictionary for single-tag words. + :param sentences: A list of list of (word, tag) tuples. + """ + counts = defaultdict(lambda: defaultdict(int)) + for sentence in sentences: + self._sentences.append(sentence) + for word, tag in sentence: + counts[word][tag] += 1 + self.classes.add(tag) + freq_thresh = 20 + ambiguity_thresh = 0.97 + for word, tag_freqs in counts.items(): + tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) + n = sum(tag_freqs.values()) + # Don't add rare words to the tag dictionary + # Only add quite unambiguous words + if n >= freq_thresh and (mode / n) >= ambiguity_thresh: + self.tagdict[word] = tag + + +def _pc(n, d): + return (n / d) * 100 + + +def _load_data_conll_format(filename): + print("Read from file: ", filename) + with open(filename, "rb") as fin: + sentences = [] + sentence = [] + for line in fin.readlines(): + line = line.strip() + # print line + if len(line) == 0: + sentences.append(sentence) + sentence = [] + continue + tokens = line.split("\t") + word = tokens[1] + tag = tokens[4] + sentence.append((word, tag)) + return sentences + + +def _get_pretrain_model(): + # Train and test on English part of ConLL data (WSJ part of Penn Treebank) + # Train: section 2-11 + # Test : section 23 + tagger = PerceptronTagger() + training = _load_data_conll_format("english_ptb_train.conll") + testing = _load_data_conll_format("english_ptb_test.conll") + print("Size of training and testing (sentence)", len(training), len(testing)) + # Train and save the model + tagger.train(training, PICKLE) + print("Accuracy : ", tagger.accuracy(testing)) + + +if __name__ == "__main__": + # _get_pretrain_model() + pass diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/sequential.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..5432fdcf8a923a22ef52ba131a1fbcc8d48aaca7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/sequential.py @@ -0,0 +1,755 @@ +# Natural Language Toolkit: Sequential Backoff Taggers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird (minor additions) +# Tiago Tresoldi (original affix tagger) +# URL: +# For license information, see LICENSE.TXT + +""" +Classes for tagging sentences 
sequentially, left to right. The +abstract base class SequentialBackoffTagger serves as the base +class for all the taggers in this module. Tagging of individual words +is performed by the method ``choose_tag()``, which is defined by +subclasses of SequentialBackoffTagger. If a tagger is unable to +determine a tag for the specified token, then its backoff tagger is +consulted instead. Any SequentialBackoffTagger may serve as a +backoff tagger for any other SequentialBackoffTagger. +""" +import ast +import re +from abc import abstractmethod +from typing import List, Optional, Tuple + +from nltk import jsontags +from nltk.classify import NaiveBayesClassifier +from nltk.probability import ConditionalFreqDist +from nltk.tag.api import FeaturesetTaggerI, TaggerI + + +###################################################################### +# Abstract Base Classes +###################################################################### +class SequentialBackoffTagger(TaggerI): + """ + An abstract base class for taggers that tags words sequentially, + left to right. Tagging of individual words is performed by the + ``choose_tag()`` method, which should be defined by subclasses. If + a tagger is unable to determine a tag for the specified token, + then its backoff tagger is consulted. + + :ivar _taggers: A list of all the taggers that should be tried to + tag a token (i.e., self and its backoff taggers). + """ + + def __init__(self, backoff=None): + if backoff is None: + self._taggers = [self] + else: + self._taggers = [self] + backoff._taggers + + @property + def backoff(self): + """The backoff tagger for this tagger.""" + return self._taggers[1] if len(self._taggers) > 1 else None + + def tag(self, tokens): + # docs inherited from TaggerI + tags = [] + for i in range(len(tokens)): + tags.append(self.tag_one(tokens, i, tags)) + return list(zip(tokens, tags)) + + def tag_one(self, tokens, index, history): + """ + Determine an appropriate tag for the specified token, and + return that tag. If this tagger is unable to determine a tag + for the specified token, then its backoff tagger is consulted. + + :rtype: str + :type tokens: list + :param tokens: The list of words that are being tagged. + :type index: int + :param index: The index of the word whose tag should be + returned. + :type history: list(str) + :param history: A list of the tags for all words before *index*. + """ + tag = None + for tagger in self._taggers: + tag = tagger.choose_tag(tokens, index, history) + if tag is not None: + break + return tag + + @abstractmethod + def choose_tag(self, tokens, index, history): + """ + Decide which tag should be used for the specified token, and + return that tag. If this tagger is unable to determine a tag + for the specified token, return None -- do not consult + the backoff tagger. This method should be overridden by + subclasses of SequentialBackoffTagger. + + :rtype: str + :type tokens: list + :param tokens: The list of words that are being tagged. + :type index: int + :param index: The index of the word whose tag should be + returned. + :type history: list(str) + :param history: A list of the tags for all words before *index*. + """ + + +class ContextTagger(SequentialBackoffTagger): + """ + An abstract base class for sequential backoff taggers that choose + a tag for a token based on the value of its "context". Different + subclasses are used to define different contexts. 
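+
+    For example, ``UnigramTagger`` uses just the token itself as its
+    context, while ``NgramTagger`` combines the token with the tags of
+    the preceding n-1 tokens.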
+ + A ContextTagger chooses the tag for a token by calculating the + token's context, and looking up the corresponding tag in a table. + This table can be constructed manually; or it can be automatically + constructed based on a training corpus, using the ``_train()`` + factory method. + + :ivar _context_to_tag: Dictionary mapping contexts to tags. + """ + + def __init__(self, context_to_tag, backoff=None): + """ + :param context_to_tag: A dictionary mapping contexts to tags. + :param backoff: The backoff tagger that should be used for this tagger. + """ + super().__init__(backoff) + self._context_to_tag = context_to_tag if context_to_tag else {} + + @abstractmethod + def context(self, tokens, index, history): + """ + :return: the context that should be used to look up the tag + for the specified token; or None if the specified token + should not be handled by this tagger. + :rtype: (hashable) + """ + + def choose_tag(self, tokens, index, history): + context = self.context(tokens, index, history) + return self._context_to_tag.get(context) + + def size(self): + """ + :return: The number of entries in the table used by this + tagger to map from contexts to tags. + """ + return len(self._context_to_tag) + + def __repr__(self): + return f"<{self.__class__.__name__}: size={self.size()}>" + + def _train(self, tagged_corpus, cutoff=0, verbose=False): + """ + Initialize this ContextTagger's ``_context_to_tag`` table + based on the given training data. In particular, for each + context ``c`` in the training data, set + ``_context_to_tag[c]`` to the most frequent tag for that + context. However, exclude any contexts that are already + tagged perfectly by the backoff tagger(s). + + The old value of ``self._context_to_tag`` (if any) is discarded. + + :param tagged_corpus: A tagged corpus. Each item should be + a list of (word, tag tuples. + :param cutoff: If the most likely tag for a context occurs + fewer than cutoff times, then exclude it from the + context-to-tag table for the new tagger. + """ + + token_count = hit_count = 0 + + # A context is considered 'useful' if it's not already tagged + # perfectly by the backoff tagger. + useful_contexts = set() + + # Count how many times each tag occurs in each context. + fd = ConditionalFreqDist() + for sentence in tagged_corpus: + tokens, tags = zip(*sentence) + for index, (token, tag) in enumerate(sentence): + # Record the event. + token_count += 1 + context = self.context(tokens, index, tags[:index]) + if context is None: + continue + fd[context][tag] += 1 + # If the backoff got it wrong, this context is useful: + if self.backoff is None or tag != self.backoff.tag_one( + tokens, index, tags[:index] + ): + useful_contexts.add(context) + + # Build the context_to_tag table -- for each context, figure + # out what the most likely tag is. Only include contexts that + # we've seen at least `cutoff` times. + for context in useful_contexts: + best_tag = fd[context].max() + hits = fd[context][best_tag] + if hits > cutoff: + self._context_to_tag[context] = best_tag + hit_count += hits + + # Display some stats, if requested. 
+
+        if verbose:
+            size = len(self._context_to_tag)
+            backoff = 100 - (hit_count * 100.0) / token_count
+            pruning = 100 - (size * 100.0) / len(fd.conditions())
+            print("[Trained Unigram tagger:", end=" ")
+            print(
+                "size={}, backoff={:.2f}%, pruning={:.2f}%]".format(
+                    size, backoff, pruning
+                )
+            )
+
+
+######################################################################
+# Tagger Classes
+######################################################################
+
+
+@jsontags.register_tag
+class DefaultTagger(SequentialBackoffTagger):
+    """
+    A tagger that assigns the same tag to every token.
+
+    >>> from nltk.tag import DefaultTagger
+    >>> default_tagger = DefaultTagger('NN')
+    >>> list(default_tagger.tag('This is a test'.split()))
+    [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]
+
+    This tagger is recommended as a backoff tagger, in cases where
+    a more powerful tagger is unable to assign a tag to the word
+    (e.g. because the word was not seen during training).
+
+    :param tag: The tag to assign to each token
+    :type tag: str
+    """
+
+    json_tag = "nltk.tag.sequential.DefaultTagger"
+
+    def __init__(self, tag):
+        self._tag = tag
+        super().__init__(None)
+
+    def encode_json_obj(self):
+        return self._tag
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        tag = obj
+        return cls(tag)
+
+    def choose_tag(self, tokens, index, history):
+        return self._tag  # ignore token and history
+
+    def __repr__(self):
+        return f"<DefaultTagger: tag={self._tag}>"
+
+
+@jsontags.register_tag
+class NgramTagger(ContextTagger):
+    """
+    A tagger that chooses a token's tag based on its word string and
+    on the preceding n-1 words' tags. In particular, a tuple
+    (tags[i-n+1:i], words[i]) is looked up in a table, and the
+    corresponding tag is returned. N-gram taggers are typically
+    trained on a tagged corpus.
+
+    Train a new NgramTagger using the given training data or
+    the supplied model. In particular, construct a new tagger
+    whose table maps from each context (tags[i-n+1:i], word[i])
+    to the most frequent tag for that context. But exclude any
+    contexts that are already tagged perfectly by the backoff
+    tagger.
+
+    :param train: A tagged corpus consisting of a list of tagged
+        sentences, where each sentence is a list of (word, tag) tuples.
+    :param backoff: A backoff tagger, to be used by the new
+        tagger if it encounters an unknown context.
+    :param cutoff: If the most likely tag for a context occurs
+        fewer than *cutoff* times, then exclude it from the
+        context-to-tag table for the new tagger.
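+
+    A minimal construction sketch (the corpus slice is arbitrary, and
+    contexts missing from the table yield None without a backoff)::
+
+        from nltk.corpus import brown
+        tagger = NgramTagger(2, train=brown.tagged_sents(categories='news')[:500])
+        tagger.tag(['The', 'jury', 'said'])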
+ """ + + json_tag = "nltk.tag.sequential.NgramTagger" + + def __init__( + self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False + ): + self._n = n + self._check_params(train, model) + + super().__init__(model, backoff) + + if train: + self._train(train, cutoff, verbose) + + def encode_json_obj(self): + _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()} + if "NgramTagger" in self.__class__.__name__: + return self._n, _context_to_tag, self.backoff + else: + return _context_to_tag, self.backoff + + @classmethod + def decode_json_obj(cls, obj): + try: + _n, _context_to_tag, backoff = obj + except ValueError: + _context_to_tag, backoff = obj + + if not _context_to_tag: + return backoff + + _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()} + + if "NgramTagger" in cls.__name__: + return cls(_n, model=_context_to_tag, backoff=backoff) + else: + return cls(model=_context_to_tag, backoff=backoff) + + def context(self, tokens, index, history): + tag_context = tuple(history[max(0, index - self._n + 1) : index]) + return tag_context, tokens[index] + + +@jsontags.register_tag +class UnigramTagger(NgramTagger): + """ + Unigram Tagger + + The UnigramTagger finds the most likely tag for each word in a training + corpus, and then uses that information to assign tags to new tokens. + + >>> from nltk.corpus import brown + >>> from nltk.tag import UnigramTagger + >>> test_sent = brown.sents(categories='news')[0] + >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) + >>> for tok, tag in unigram_tagger.tag(test_sent): + ... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE + (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), + (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), + (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), + (primary, NN), (election, NN), (produced, VBD), (``, ``), + (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI), + (irregularities, NNS), (took, VBD), (place, NN), (., .), + + :param train: The corpus of training data, a list of tagged sentences + :type train: list(list(tuple(str, str))) + :param model: The tagger model + :type model: dict + :param backoff: Another tagger which this tagger will consult when it is + unable to tag a word + :type backoff: TaggerI + :param cutoff: The number of instances of training data the tagger must see + in order not to use the backoff tagger + :type cutoff: int + """ + + json_tag = "nltk.tag.sequential.UnigramTagger" + + def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): + super().__init__(1, train, model, backoff, cutoff, verbose) + + def context(self, tokens, index, history): + return tokens[index] + + +@jsontags.register_tag +class BigramTagger(NgramTagger): + """ + A tagger that chooses a token's tag based its word string and on + the preceding words' tag. In particular, a tuple consisting + of the previous tag and the word is looked up in a table, and + the corresponding tag is returned. 
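+
+    For example, with the previous tag ``'AT'`` and the word ``'dog'``,
+    the table lookup key is ``(('AT',), 'dog')``.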
+ + :param train: The corpus of training data, a list of tagged sentences + :type train: list(list(tuple(str, str))) + :param model: The tagger model + :type model: dict + :param backoff: Another tagger which this tagger will consult when it is + unable to tag a word + :type backoff: TaggerI + :param cutoff: The number of instances of training data the tagger must see + in order not to use the backoff tagger + :type cutoff: int + """ + + json_tag = "nltk.tag.sequential.BigramTagger" + + def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): + super().__init__(2, train, model, backoff, cutoff, verbose) + + +@jsontags.register_tag +class TrigramTagger(NgramTagger): + """ + A tagger that chooses a token's tag based its word string and on + the preceding two words' tags. In particular, a tuple consisting + of the previous two tags and the word is looked up in a table, and + the corresponding tag is returned. + + :param train: The corpus of training data, a list of tagged sentences + :type train: list(list(tuple(str, str))) + :param model: The tagger model + :type model: dict + :param backoff: Another tagger which this tagger will consult when it is + unable to tag a word + :type backoff: TaggerI + :param cutoff: The number of instances of training data the tagger must see + in order not to use the backoff tagger + :type cutoff: int + """ + + json_tag = "nltk.tag.sequential.TrigramTagger" + + def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): + super().__init__(3, train, model, backoff, cutoff, verbose) + + +@jsontags.register_tag +class AffixTagger(ContextTagger): + """ + A tagger that chooses a token's tag based on a leading or trailing + substring of its word string. (It is important to note that these + substrings are not necessarily "true" morphological affixes). In + particular, a fixed-length substring of the word is looked up in a + table, and the corresponding tag is returned. Affix taggers are + typically constructed by training them on a tagged corpus. + + Construct a new affix tagger. + + :param affix_length: The length of the affixes that should be + considered during training and tagging. Use negative + numbers for suffixes. + :param min_stem_length: Any words whose length is less than + min_stem_length+abs(affix_length) will be assigned a + tag of None by this tagger. 
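+
+    For example, with the default ``affix_length=-3`` the context for
+    ``'running'`` is its suffix ``'ing'``, and any word shorter than
+    ``min_stem_length + abs(affix_length)`` letters is passed on to the
+    backoff tagger (its context is None).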
+    """
+
+    json_tag = "nltk.tag.sequential.AffixTagger"
+
+    def __init__(
+        self,
+        train=None,
+        model=None,
+        affix_length=-3,
+        min_stem_length=2,
+        backoff=None,
+        cutoff=0,
+        verbose=False,
+    ):
+
+        self._check_params(train, model)
+
+        super().__init__(model, backoff)
+
+        self._affix_length = affix_length
+        self._min_word_length = min_stem_length + abs(affix_length)
+
+        if train:
+            self._train(train, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return (
+            self._affix_length,
+            self._min_word_length,
+            self._context_to_tag,
+            self.backoff,
+        )
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _affix_length, _min_word_length, _context_to_tag, backoff = obj
+        return cls(
+            affix_length=_affix_length,
+            min_stem_length=_min_word_length - abs(_affix_length),
+            model=_context_to_tag,
+            backoff=backoff,
+        )
+
+    def context(self, tokens, index, history):
+        token = tokens[index]
+        if len(token) < self._min_word_length:
+            return None
+        elif self._affix_length > 0:
+            return token[: self._affix_length]
+        else:
+            return token[self._affix_length :]
+
+
+@jsontags.register_tag
+class RegexpTagger(SequentialBackoffTagger):
+    r"""
+    Regular Expression Tagger
+
+    The RegexpTagger assigns tags to tokens by comparing their
+    word strings to a series of regular expressions. The following tagger
+    uses word suffixes to make guesses about the correct Brown Corpus part
+    of speech tag:
+
+    >>> from nltk.corpus import brown
+    >>> from nltk.tag import RegexpTagger
+    >>> test_sent = brown.sents(categories='news')[0]
+    >>> regexp_tagger = RegexpTagger(
+    ...     [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
+    ...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
+    ...      (r'.*able$', 'JJ'),                # adjectives
+    ...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
+    ...      (r'.*ly$', 'RB'),                  # adverbs
+    ...      (r'.*s$', 'NNS'),                  # plural nouns
+    ...      (r'.*ing$', 'VBG'),                # gerunds
+    ...      (r'.*ed$', 'VBD'),                 # past tense verbs
+    ...      (r'.*', 'NN')                      # nouns (default)
+    ... ])
+    >>> regexp_tagger
+    <Regexp Tagger: size=9>
+    >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
+    [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
+    ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
+    ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
+    ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
+    ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
+    ('place', 'NN'), ('.', 'NN')]
+
+    :type regexps: list(tuple(str, str))
+    :param regexps: A list of ``(regexp, tag)`` pairs, each of
+        which indicates that a word matching ``regexp`` should
+        be tagged with ``tag``. The pairs will be evaluated in
+        order. If none of the regexps match a word, it is handed
+        to the optional backoff tagger, or assigned the tag None
+        when no backoff is given.
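+
+    Note that the patterns are tried in order, so a catch-all such as
+    ``r'.*'`` must come last or it will shadow every pair after it.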
+    """
+
+    json_tag = "nltk.tag.sequential.RegexpTagger"
+
+    def __init__(
+        self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None
+    ):
+        super().__init__(backoff)
+        self._regexps = []
+        for regexp, tag in regexps:
+            try:
+                self._regexps.append((re.compile(regexp), tag))
+            except Exception as e:
+                raise Exception(
+                    f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}"
+                ) from e
+
+    def encode_json_obj(self):
+        return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        regexps, backoff = obj
+        return cls(regexps, backoff)
+
+    def choose_tag(self, tokens, index, history):
+        for regexp, tag in self._regexps:
+            if re.match(regexp, tokens[index]):
+                return tag
+        return None
+
+    def __repr__(self):
+        return f"<Regexp Tagger: size={len(self._regexps)}>"
+
+
+class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
+    """
+    A sequential tagger that uses a classifier to choose the tag for
+    each token in a sentence. The featureset input for the classifier
+    is generated by a feature detector function::
+
+        feature_detector(tokens, index, history) -> featureset
+
+    Where tokens is the list of unlabeled tokens in the sentence;
+    index is the index of the token for which feature detection
+    should be performed; and history is a list of the tags for all
+    tokens before index.
+
+    Construct a new classifier-based sequential tagger.
+
+    :param feature_detector: A function used to generate the
+        featureset input for the classifier::
+            feature_detector(tokens, index, history) -> featureset
+
+    :param train: A tagged corpus consisting of a list of tagged
+        sentences, where each sentence is a list of (word, tag) tuples.
+
+    :param classifier_builder: A function used to train a new
+        classifier based on the data in *train*. It should take
+        one argument, a list of labeled featuresets (i.e.,
+        (featureset, label) tuples).
+
+    :param classifier: The classifier that should be used by the
+        tagger. This is only useful if you want to manually
+        construct the classifier; normally, you would use *train*
+        instead.
+
+    :param backoff: A backoff tagger, used if this tagger is
+        unable to determine a tag for a given token.
+
+    :param cutoff_prob: If specified, then this tagger will fall
+        back on its backoff tagger if the probability of the most
+        likely tag is less than *cutoff_prob*.
+    """
+
+    def __init__(
+        self,
+        feature_detector=None,
+        train=None,
+        classifier_builder=NaiveBayesClassifier.train,
+        classifier=None,
+        backoff=None,
+        cutoff_prob=None,
+        verbose=False,
+    ):
+        self._check_params(train, classifier)
+
+        super().__init__(backoff)
+
+        if (train and classifier) or (not train and not classifier):
+            raise ValueError(
+                "Must specify either training data or trained classifier."
+            )
+
+        if feature_detector is not None:
+            self._feature_detector = feature_detector
+            # The feature detector function, used to generate a featureset
+            # for each token: feature_detector(tokens, index, history) -> featureset
+
+        self._cutoff_prob = cutoff_prob
+        """Cutoff probability for tagging -- if the probability of the
+        most likely tag is less than this, then use backoff."""
+
+        self._classifier = classifier
+        """The classifier used to choose a tag for each token."""
+
+        if train:
+            self._train(train, classifier_builder, verbose)
+
+    def choose_tag(self, tokens, index, history):
+        # Use our feature detector to get the featureset.
+        featureset = self.feature_detector(tokens, index, history)
+
+        # Use the classifier to pick a tag. If a cutoff probability
+        # was specified, then check that the tag's probability is
+        # higher than that cutoff first; otherwise, return None.
+        if self._cutoff_prob is None:
+            return self._classifier.classify(featureset)
+
+        pdist = self._classifier.prob_classify(featureset)
+        tag = pdist.max()
+        return tag if pdist.prob(tag) >= self._cutoff_prob else None
+
+    def _train(self, tagged_corpus, classifier_builder, verbose):
+        """
+        Build a new classifier, based on the given training data
+        *tagged_corpus*.
+        """
+
+        classifier_corpus = []
+        if verbose:
+            print("Constructing training corpus for classifier.")
+
+        for sentence in tagged_corpus:
+            history = []
+            untagged_sentence, tags = zip(*sentence)
+            for index in range(len(sentence)):
+                featureset = self.feature_detector(untagged_sentence, index, history)
+                classifier_corpus.append((featureset, tags[index]))
+                history.append(tags[index])
+
+        if verbose:
+            print(f"Training classifier ({len(classifier_corpus)} instances)")
+        self._classifier = classifier_builder(classifier_corpus)
+
+    def __repr__(self):
+        return f"<ClassifierBasedTagger: {self._classifier}>"
+
+    def feature_detector(self, tokens, index, history):
+        """
+        Return the featureset that this tagger's feature detector
+        generates for the specified token. The feature detector is a
+        function with the signature::
+
+            feature_detector(tokens, index, history) -> featureset
+
+        See ``classifier()``
+        """
+        return self._feature_detector(tokens, index, history)
+
+    def classifier(self):
+        """
+        Return the classifier that this tagger uses to choose a tag
+        for each word in a sentence. The input for this classifier is
+        generated using this tagger's feature detector.
+        See ``feature_detector()``
+        """
+        return self._classifier
+
+
+class ClassifierBasedPOSTagger(ClassifierBasedTagger):
+    """
+    A classifier-based part-of-speech tagger.
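+
+    A minimal construction sketch (the corpus slice is arbitrary, and
+    training the classifier takes a little while)::
+
+        from nltk.corpus import brown
+        tagger = ClassifierBasedPOSTagger(
+            train=brown.tagged_sents(categories='news')[:100])
+        tagger.tag(['The', 'jury', 'said'])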
+ """ + + def feature_detector(self, tokens, index, history): + word = tokens[index] + if index == 0: + prevword = prevprevword = None + prevtag = prevprevtag = None + elif index == 1: + prevword = tokens[index - 1].lower() + prevprevword = None + prevtag = history[index - 1] + prevprevtag = None + else: + prevword = tokens[index - 1].lower() + prevprevword = tokens[index - 2].lower() + prevtag = history[index - 1] + prevprevtag = history[index - 2] + + if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word): + shape = "number" + elif re.match(r"\W+$", word): + shape = "punct" + elif re.match("[A-Z][a-z]+$", word): + shape = "upcase" + elif re.match("[a-z]+$", word): + shape = "downcase" + elif re.match(r"\w+$", word): + shape = "mixedcase" + else: + shape = "other" + + features = { + "prevtag": prevtag, + "prevprevtag": prevprevtag, + "word": word, + "word.lower": word.lower(), + "suffix3": word.lower()[-3:], + "suffix2": word.lower()[-2:], + "suffix1": word.lower()[-1:], + "prevprevword": prevprevword, + "prevword": prevword, + "prevtag+word": f"{prevtag}+{word.lower()}", + "prevprevtag+word": f"{prevprevtag}+{word.lower()}", + "prevword+word": f"{prevword}+{word.lower()}", + "shape": shape, + } + return features diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/stanford.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/stanford.py new file mode 100644 index 0000000000000000000000000000000000000000..0b41821bf6cb3aff3664eb1f0b12a6f292cfb755 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/stanford.py @@ -0,0 +1,236 @@ +# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Nitin Madnani +# Rami Al-Rfou' +# URL: +# For license information, see LICENSE.TXT + +""" +A module for interfacing with the Stanford taggers. + +Tagger models need to be downloaded from https://nlp.stanford.edu/software +and the STANFORD_MODELS environment variable set (a colon-separated +list of paths). + +For more details see the documentation for StanfordPOSTagger and StanfordNERTagger. +""" + +import os +import tempfile +import warnings +from abc import abstractmethod +from subprocess import PIPE + +from nltk.internals import _java_options, config_java, find_file, find_jar, java +from nltk.tag.api import TaggerI + +_stanford_url = "https://nlp.stanford.edu/software" + + +class StanfordTagger(TaggerI): + """ + An interface to Stanford taggers. Subclasses must define: + + - ``_cmd`` property: A property that returns the command that will be + executed. + - ``_SEPARATOR``: Class constant that represents that character that + is used to separate the tokens from their tags. + - ``_JAR`` file: Class constant that represents the jar file name. + """ + + _SEPARATOR = "" + _JAR = "" + + def __init__( + self, + model_filename, + path_to_jar=None, + encoding="utf8", + verbose=False, + java_options="-mx1000m", + ): + # Raise deprecation warning. + warnings.warn( + str( + "\nThe StanfordTokenizer will " + "be deprecated in version 3.2.6.\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead." + ), + DeprecationWarning, + stacklevel=2, + ) + + if not self._JAR: + warnings.warn( + "The StanfordTagger class is not meant to be " + "instantiated directly. Did you mean " + "StanfordPOSTagger or StanfordNERTagger?" 
+ ) + self._stanford_jar = find_jar( + self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose + ) + + self._stanford_model = find_file( + model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose + ) + + self._encoding = encoding + self.java_options = java_options + + @property + @abstractmethod + def _cmd(self): + """ + A property that returns the command that will be executed. + """ + + def tag(self, tokens): + # This function should return list of tuple rather than list of list + return sum(self.tag_sents([tokens]), []) + + def tag_sents(self, sentences): + encoding = self._encoding + default_options = " ".join(_java_options) + config_java(options=self.java_options, verbose=False) + + # Create a temporary input file + _input_fh, self._input_file_path = tempfile.mkstemp(text=True) + + cmd = list(self._cmd) + cmd.extend(["-encoding", encoding]) + + # Write the actual sentences to the temporary input file + _input_fh = os.fdopen(_input_fh, "wb") + _input = "\n".join(" ".join(x) for x in sentences) + if isinstance(_input, str) and encoding: + _input = _input.encode(encoding) + _input_fh.write(_input) + _input_fh.close() + + # Run the tagger and get the output + stanpos_output, _stderr = java( + cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE + ) + stanpos_output = stanpos_output.decode(encoding) + + # Delete the temporary file + os.unlink(self._input_file_path) + + # Return java configurations to their default values + config_java(options=default_options, verbose=False) + + return self.parse_output(stanpos_output, sentences) + + def parse_output(self, text, sentences=None): + # Output the tagged sentences + tagged_sentences = [] + for tagged_sentence in text.strip().split("\n"): + sentence = [] + for tagged_word in tagged_sentence.strip().split(): + word_tags = tagged_word.strip().split(self._SEPARATOR) + sentence.append( + ("".join(word_tags[:-1]), word_tags[-1].replace("0", "").upper()) + ) + tagged_sentences.append(sentence) + return tagged_sentences + + +class StanfordPOSTagger(StanfordTagger): + """ + A class for pos tagging with Stanford Tagger. The input is the paths to: + - a model trained on training data + - (optionally) the path to the stanford tagger jar file. If not specified here, + then this jar file must be specified in the CLASSPATH environment variable. + - (optionally) the encoding of the training data (default: UTF-8) + + Example: + + >>> from nltk.tag import StanfordPOSTagger + >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP + >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP + [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] + """ + + _SEPARATOR = "_" + _JAR = "stanford-postagger.jar" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @property + def _cmd(self): + return [ + "edu.stanford.nlp.tagger.maxent.MaxentTagger", + "-model", + self._stanford_model, + "-textFile", + self._input_file_path, + "-tokenize", + "false", + "-outputFormatOptions", + "keepEmptySentences", + ] + + +class StanfordNERTagger(StanfordTagger): + """ + A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to: + + - a model trained on training data + - (optionally) the path to the stanford tagger jar file. If not specified here, + then this jar file must be specified in the CLASSPATH environment variable. 
+ - (optionally) the encoding of the training data (default: UTF-8) + + Example: + + >>> from nltk.tag import StanfordNERTagger + >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP + >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP + [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), + ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), + ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')] + """ + + _SEPARATOR = "/" + _JAR = "stanford-ner.jar" + _FORMAT = "slashTags" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @property + def _cmd(self): + # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer + return [ + "edu.stanford.nlp.ie.crf.CRFClassifier", + "-loadClassifier", + self._stanford_model, + "-textFile", + self._input_file_path, + "-outputFormat", + self._FORMAT, + "-tokenizerFactory", + "edu.stanford.nlp.process.WhitespaceTokenizer", + "-tokenizerOptions", + '"tokenizeNLs=false"', + ] + + def parse_output(self, text, sentences): + if self._FORMAT == "slashTags": + # Joint together to a big list + tagged_sentences = [] + for tagged_sentence in text.strip().split("\n"): + for tagged_word in tagged_sentence.strip().split(): + word_tags = tagged_word.strip().split(self._SEPARATOR) + tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1])) + + # Separate it according to the input + result = [] + start = 0 + for sent in sentences: + result.append(tagged_sentences[start : start + len(sent)]) + start += len(sent) + return result + + raise NotImplementedError diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/util.py new file mode 100644 index 0000000000000000000000000000000000000000..00fe164f38b08fd0893386dd7bd0196c2f6a471f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/util.py @@ -0,0 +1,72 @@ +# Natural Language Toolkit: Tagger Utilities +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + + +def str2tuple(s, sep="/"): + """ + Given the string representation of a tagged token, return the + corresponding tuple representation. The rightmost occurrence of + *sep* in *s* will be used to divide *s* into a word string and + a tag string. If *sep* does not occur in *s*, return (s, None). + + >>> from nltk.tag.util import str2tuple + >>> str2tuple('fly/NN') + ('fly', 'NN') + + :type s: str + :param s: The string representation of a tagged token. + :type sep: str + :param sep: The separator string used to separate word strings + from tags. + """ + loc = s.rfind(sep) + if loc >= 0: + return (s[:loc], s[loc + len(sep) :].upper()) + else: + return (s, None) + + +def tuple2str(tagged_token, sep="/"): + """ + Given the tuple representation of a tagged token, return the + corresponding string representation. This representation is + formed by concatenating the token's word string, followed by the + separator, followed by the token's tag. (If the tag is None, + then just return the bare word string.) + + >>> from nltk.tag.util import tuple2str + >>> tagged_token = ('fly', 'NN') + >>> tuple2str(tagged_token) + 'fly/NN' + + :type tagged_token: tuple(str, str) + :param tagged_token: The tuple representation of a tagged token. + :type sep: str + :param sep: The separator string used to separate word strings + from tags. 
+ """ + word, tag = tagged_token + if tag is None: + return word + else: + assert sep not in tag, "tag may not contain sep!" + return f"{word}{sep}{tag}" + + +def untag(tagged_sentence): + """ + Given a tagged sentence, return an untagged version of that + sentence. I.e., return a list containing the first element + of each tuple in *tagged_sentence*. + + >>> from nltk.tag.util import untag + >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')]) + ['John', 'saw', 'Mary'] + + """ + return [w for (w, t) in tagged_sentence] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d58869716298eba412964b4fbf1175ecbf244990 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/__init__.py @@ -0,0 +1,31 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Transformation Based Learning + +A general purpose package for Transformation Based Learning, +currently used by nltk.tag.BrillTagger. + +isort:skip_file +""" + +from nltk.tbl.template import Template + +# API: Template(...), Template.expand(...) + +from nltk.tbl.feature import Feature + +# API: Feature(...), Feature.expand(...) + +from nltk.tbl.rule import Rule + +# API: Rule.format(...), Rule.templatetid + +from nltk.tbl.erroranalysis import error_list diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/erroranalysis.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/erroranalysis.py new file mode 100644 index 0000000000000000000000000000000000000000..12b6e70b7934b484cf23132667da149859418615 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/erroranalysis.py @@ -0,0 +1,38 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +# returns a list of errors in string format + + +def error_list(train_sents, test_sents): + """ + Returns a list of human-readable strings indicating the errors in the + given tagging of the corpus. 
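+    Each entry shows the word with its test and gold tags in the form
+    ``word/test->gold``, flanked by its left and right context.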
+ + :param train_sents: The correct tagging of the corpus + :type train_sents: list(tuple) + :param test_sents: The tagged corpus + :type test_sents: list(tuple) + """ + hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % ( + "left context", + "word/test->gold".center(22), + "right context", + ) + errors = [hdr] + for (train_sent, test_sent) in zip(train_sents, test_sents): + for wordnum, (word, train_pos) in enumerate(train_sent): + test_pos = test_sent[wordnum][1] + if train_pos != test_pos: + left = " ".join("%s/%s" % w for w in train_sent[:wordnum]) + right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :]) + mid = f"{word}/{test_pos}->{train_pos}" + errors.append(f"{left[-25:]:>25} | {mid.center(22)} | {right[:25]}") + + return errors diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/feature.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/feature.py new file mode 100644 index 0000000000000000000000000000000000000000..70ed085d6327a0fb49cf766efd3fdaad11e3e172 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/feature.py @@ -0,0 +1,267 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from abc import ABCMeta, abstractmethod + + +class Feature(metaclass=ABCMeta): + """ + An abstract base class for Features. A Feature is a combination of + a specific property-computing method and a list of relative positions + to apply that method to. + + The property-computing method, M{extract_property(tokens, index)}, + must be implemented by every subclass. It extracts or computes a specific + property for the token at the current index. Typical extract_property() + methods return features such as the token text or tag; but more involved + methods may consider the entire sequence M{tokens} and + for instance compute the length of the sentence the token belongs to. + + In addition, the subclass may have a PROPERTY_NAME, which is how + it will be printed (in Rules and Templates, etc). If not given, defaults + to the classname. + + """ + + json_tag = "nltk.tbl.Feature" + PROPERTY_NAME = None + + def __init__(self, positions, end=None): + """ + Construct a Feature which may apply at C{positions}. + + >>> # For instance, importing some concrete subclasses (Feature is abstract) + >>> from nltk.tag.brill import Word, Pos + + >>> # Feature Word, applying at one of [-2, -1] + >>> Word([-2,-1]) + Word([-2, -1]) + + >>> # Positions need not be contiguous + >>> Word([-2,-1, 1]) + Word([-2, -1, 1]) + + >>> # Contiguous ranges can alternatively be specified giving the + >>> # two endpoints (inclusive) + >>> Pos(-3, -1) + Pos([-3, -2, -1]) + + >>> # In two-arg form, start <= end is enforced + >>> Pos(2, 1) + Traceback (most recent call last): + File "", line 1, in + File "nltk/tbl/template.py", line 306, in __init__ + raise TypeError + ValueError: illegal interval specification: (start=2, end=1) + + :type positions: list of int + :param positions: the positions at which this features should apply + :raises ValueError: illegal position specifications + + An alternative calling convention, for contiguous positions only, + is Feature(start, end): + + :type start: int + :param start: start of range where this feature should apply + :type end: int + :param end: end of range (NOTE: inclusive!) 
where this feature should apply + """ + self.positions = None # to avoid warnings + if end is None: + self.positions = tuple(sorted({int(i) for i in positions})) + else: # positions was actually not a list, but only the start index + try: + if positions > end: + raise TypeError + self.positions = tuple(range(positions, end + 1)) + except TypeError as e: + # let any kind of erroneous spec raise ValueError + raise ValueError( + "illegal interval specification: (start={}, end={})".format( + positions, end + ) + ) from e + + # set property name given in subclass, or otherwise name of subclass + self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__ + + def encode_json_obj(self): + return self.positions + + @classmethod + def decode_json_obj(cls, obj): + positions = obj + return cls(positions) + + def __repr__(self): + return f"{self.__class__.__name__}({list(self.positions)!r})" + + @classmethod + def expand(cls, starts, winlens, excludezero=False): + """ + Return a list of features, one for each start point in starts + and for each window length in winlen. If excludezero is True, + no Features containing 0 in its positions will be generated + (many tbl trainers have a special representation for the + target feature at [0]) + + For instance, importing a concrete subclass (Feature is abstract) + + >>> from nltk.tag.brill import Word + + First argument gives the possible start positions, second the + possible window lengths + + >>> Word.expand([-3,-2,-1], [1]) + [Word([-3]), Word([-2]), Word([-1])] + + >>> Word.expand([-2,-1], [1]) + [Word([-2]), Word([-1])] + + >>> Word.expand([-3,-2,-1], [1,2]) + [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])] + + >>> Word.expand([-2,-1], [1]) + [Word([-2]), Word([-1])] + + A third optional argument excludes all Features whose positions contain zero + + >>> Word.expand([-2,-1,0], [1,2], excludezero=False) + [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])] + + >>> Word.expand([-2,-1,0], [1,2], excludezero=True) + [Word([-2]), Word([-1]), Word([-2, -1])] + + All window lengths must be positive + + >>> Word.expand([-2,-1], [0]) + Traceback (most recent call last): + File "", line 1, in + File "nltk/tag/tbl/template.py", line 371, in expand + :param starts: where to start looking for Feature + ValueError: non-positive window length in [0] + + :param starts: where to start looking for Feature + :type starts: list of ints + :param winlens: window lengths where to look for Feature + :type starts: list of ints + :param excludezero: do not output any Feature with 0 in any of its positions. + :type excludezero: bool + :returns: list of Features + :raises ValueError: for non-positive window lengths + """ + if not all(x > 0 for x in winlens): + raise ValueError(f"non-positive window length in {winlens}") + xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1)) + return [cls(x) for x in xs if not (excludezero and 0 in x)] + + def issuperset(self, other): + """ + Return True if this Feature always returns True when other does + + More precisely, return True if this feature refers to the same property as other; + and this Feature looks at all positions that other does (and possibly + other positions in addition). 
+ + #For instance, importing a concrete subclass (Feature is abstract) + >>> from nltk.tag.brill import Word, Pos + + >>> Word([-3,-2,-1]).issuperset(Word([-3,-2])) + True + + >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0])) + False + + #Feature subclasses must agree + >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2])) + False + + :param other: feature with which to compare + :type other: (subclass of) Feature + :return: True if this feature is superset, otherwise False + :rtype: bool + + + """ + return self.__class__ is other.__class__ and set(self.positions) >= set( + other.positions + ) + + def intersects(self, other): + """ + Return True if the positions of this Feature intersects with those of other + + More precisely, return True if this feature refers to the same property as other; + and there is some overlap in the positions they look at. + + #For instance, importing a concrete subclass (Feature is abstract) + >>> from nltk.tag.brill import Word, Pos + + >>> Word([-3,-2,-1]).intersects(Word([-3,-2])) + True + + >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0])) + True + + >>> Word([-3,-2,-1]).intersects(Word([0])) + False + + #Feature subclasses must agree + >>> Word([-3,-2,-1]).intersects(Pos([-3,-2])) + False + + :param other: feature with which to compare + :type other: (subclass of) Feature + :return: True if feature classes agree and there is some overlap in the positions they look at + :rtype: bool + """ + + return bool( + self.__class__ is other.__class__ + and set(self.positions) & set(other.positions) + ) + + # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+), + # it will be enough to define __lt__ and __eq__ + def __eq__(self, other): + return self.__class__ is other.__class__ and self.positions == other.positions + + def __lt__(self, other): + return ( + self.__class__.__name__ < other.__class__.__name__ + or + # self.positions is a sorted tuple of ints + self.positions < other.positions + ) + + def __ne__(self, other): + return not (self == other) + + def __gt__(self, other): + return other < self + + def __ge__(self, other): + return not self < other + + def __le__(self, other): + return self < other or self == other + + @staticmethod + @abstractmethod + def extract_property(tokens, index): + """ + Any subclass of Feature must define static method extract_property(tokens, index) + + :param tokens: the sequence of tokens + :type tokens: list of tokens + :param index: the current index + :type index: int + :return: feature value + :rtype: any (but usually scalar) + """ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/template.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/template.py new file mode 100644 index 0000000000000000000000000000000000000000..41367093c02f596bc0349a87c228b3e161ae1cb7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tbl/template.py @@ -0,0 +1,325 @@ +# Natural Language Toolkit: Transformation-based learning +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Marcus Uneson +# based on previous (nltk2) version by +# Christopher Maloof, Edward Loper, Steven Bird +# URL: +# For license information, see LICENSE.TXT + +import itertools as it +from abc import ABCMeta, abstractmethod + +from nltk.tbl.feature import Feature +from nltk.tbl.rule import Rule + + +class BrillTemplateI(metaclass=ABCMeta): + """ + An interface for generating lists of transformational rules that + apply at given sentence positions. ``BrillTemplateI`` is used by + ``Brill`` training algorithms to generate candidate rules. 
+ """ + + @abstractmethod + def applicable_rules(self, tokens, i, correctTag): + """ + Return a list of the transformational rules that would correct + the ``i``-th subtoken's tag in the given token. In particular, + return a list of zero or more rules that would change + ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``. + + If the ``i``-th token already has the correct tag (i.e., if + ``tagged_tokens[i][1] == correctTag``), then + ``applicable_rules()`` should return the empty list. + + :param tokens: The tagged tokens being tagged. + :type tokens: list(tuple) + :param i: The index of the token whose tag should be corrected. + :type i: int + :param correctTag: The correct tag for the ``i``-th token. + :type correctTag: any + :rtype: list(BrillRule) + """ + + @abstractmethod + def get_neighborhood(self, token, index): + """ + Returns the set of indices *i* such that + ``applicable_rules(token, i, ...)`` depends on the value of + the *index*th token of *token*. + + This method is used by the "fast" Brill tagger trainer. + + :param token: The tokens being tagged. + :type token: list(tuple) + :param index: The index whose neighborhood should be returned. + :type index: int + :rtype: set + """ + + +class Template(BrillTemplateI): + """ + A tbl Template that generates a list of L{Rule}s that apply at a given sentence + position. In particular, each C{Template} is parameterized by a list of + independent features (a combination of a specific + property to extract and a list C{L} of relative positions at which to extract + it) and generates all Rules that: + + - use the given features, each at its own independent position; and + - are applicable to the given token. + """ + + ALLTEMPLATES = [] + # record a unique id of form "001", for each template created + # _ids = it.count(0) + + def __init__(self, *features): + + """ + Construct a Template for generating Rules. + + Takes a list of Features. A C{Feature} is a combination + of a specific property and its relative positions and should be + a subclass of L{nltk.tbl.feature.Feature}. + + An alternative calling convention (kept for backwards compatibility, + but less expressive as it only permits one feature type) is + Template(Feature, (start1, end1), (start2, end2), ...) + In new code, that would be better written + Template(Feature(start1, end1), Feature(start2, end2), ...) + + For instance, importing some features + + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Word, Pos + + Create some features + + >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1])) + + Create a single-feature template + + >>> Template(wfeat1) + Template(Word([-1])) + + Or a two-feature one + + >>> Template(wfeat1, wfeat2) + Template(Word([-1]),Word([1, 2])) + + Or a three-feature one with two different feature types + + >>> Template(wfeat1, wfeat2, pfeat) + Template(Word([-1]),Word([1, 2]),Pos([-2, -1])) + + deprecated api: Feature subclass, followed by list of (start,end) pairs + (permits only a single Feature) + + >>> Template(Word, (-2,-1), (0,0)) + Template(Word([-2, -1]),Word([0])) + + Incorrect specification raises TypeError + + >>> Template(Word, (-2,-1), Pos, (0,0)) + Traceback (most recent call last): + File "", line 1, in + File "nltk/tag/tbl/template.py", line 143, in __init__ + raise TypeError( + TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ... 
+ + :type features: list of Features + :param features: the features to build this Template on + """ + # determine the calling form: either + # Template(Feature, args1, [args2, ...)] + # Template(Feature1(args), Feature2(args), ...) + if all(isinstance(f, Feature) for f in features): + self._features = features + elif issubclass(features[0], Feature) and all( + isinstance(a, tuple) for a in features[1:] + ): + self._features = [features[0](*tp) for tp in features[1:]] + else: + raise TypeError( + "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..." + ) + self.id = f"{len(self.ALLTEMPLATES):03d}" + self.ALLTEMPLATES.append(self) + + def __repr__(self): + return "{}({})".format( + self.__class__.__name__, + ",".join([str(f) for f in self._features]), + ) + + def applicable_rules(self, tokens, index, correct_tag): + if tokens[index][1] == correct_tag: + return [] + + # For each of this Template's features, find the conditions + # that are applicable for the given token. + # Then, generate one Rule for each combination of features + # (the crossproduct of the conditions). + + applicable_conditions = self._applicable_conditions(tokens, index) + xs = list(it.product(*applicable_conditions)) + return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs] + + def _applicable_conditions(self, tokens, index): + """ + :returns: A set of all conditions for rules + that are applicable to C{tokens[index]}. + """ + conditions = [] + + for feature in self._features: + conditions.append([]) + for pos in feature.positions: + if not (0 <= index + pos < len(tokens)): + continue + value = feature.extract_property(tokens, index + pos) + conditions[-1].append((feature, value)) + return conditions + + def get_neighborhood(self, tokens, index): + # inherit docs from BrillTemplateI + + # applicable_rules(tokens, index, ...) depends on index. + neighborhood = {index} # set literal for python 2.7+ + + # applicable_rules(tokens, i, ...) depends on index if + # i+start < index <= i+end. + + allpositions = [0] + [p for feat in self._features for p in feat.positions] + start, end = min(allpositions), max(allpositions) + s = max(0, index + (-end)) + e = min(index + (-start) + 1, len(tokens)) + for i in range(s, e): + neighborhood.add(i) + return neighborhood + + @classmethod + def expand(cls, featurelists, combinations=None, skipintersecting=True): + + """ + Factory method to mass generate Templates from a list L of lists of Features. + + #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2 + #of the sublists in L; it will output all Templates formed by the Cartesian product + #of this selection, with duplicates and other semantically equivalent + #forms removed. Default for combinations is (1, len(L)). + + The feature lists may have been specified + manually, or generated from Feature.expand(). 
For instance,
+
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Word, Pos
+
+        #creating some features
+        >>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
+
+        >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([-3,-2,-1,0,1,2,3]))
+
+        >>> list(Template.expand([[wd_0], [pos_m2]]))
+        [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
+
+        >>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
+        [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
+
+        #note: with Feature.expand(), it is very easy to generate more templates
+        #than your system can handle -- for instance,
+        >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
+        >>> len(wordtpls)
+        7
+
+        >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
+        >>> len(postpls)
+        9
+
+        #and now the Cartesian product of all non-empty combinations of two wordtpls and
+        #two postpls, with semantic equivalents removed
+        >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
+        >>> len(templates)
+        713
+
+
+          will return a list of eight templates
+              Template(Word([0])),
+              Template(Word([0, 1])),
+              Template(Pos([-2])),
+              Template(Pos([-1])),
+              Template(Pos([-2]),Word([0])),
+              Template(Pos([-1]),Word([0])),
+              Template(Pos([-2]),Word([0, 1])),
+              Template(Pos([-1]),Word([0, 1]))]
+
+
+        #Templates where one feature is a subset of another, such as
+        #Template(Word([0,1]), Word([1])), will not appear in the output.
+        #By default, this non-subset constraint is tightened to disjointness:
+        #Templates of type Template(Word([0,1]), Word([1,2])) will also be filtered out.
+        #With skipintersecting=False, such Templates are allowed
+
+        WARNING: this method makes it very easy to fill all your memory when training
+        generated templates on any real-world corpus
+
+        :param featurelists: lists of Features, whose Cartesian product will return a set of Templates
+        :type featurelists: list of (list of Features)
+        :param combinations: given n featurelists: if combinations=k, all generated Templates will have
+                             k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
+        :type combinations: None, int, or (int, int)
+        :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
+        :type skipintersecting: bool
+        :returns: generator of Templates
+
+        """
+
+        def nonempty_powerset(xs):  # xs is a list
+            # itertools docs: nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
+
+            # find the correct tuple given combinations, one of {None, k, (k1,k2)}
+            k = combinations  # for brevity
+            combrange = (
+                (1, len(xs) + 1)  # n over 1 .. n over n (all non-empty combinations)
+                if k is None
+                else (k, k + 1)  # n over k (only)
+                if isinstance(k, int)
+                else (k[0], k[1] + 1)
+            )  # n over k1, n over k1+1...
n over k2 + return it.chain.from_iterable( + it.combinations(xs, r) for r in range(*combrange) + ) + + seentemplates = set() + for picks in nonempty_powerset(featurelists): + for pick in it.product(*picks): + if any( + i != j and x.issuperset(y) + for (i, x) in enumerate(pick) + for (j, y) in enumerate(pick) + ): + continue + if skipintersecting and any( + i != j and x.intersects(y) + for (i, x) in enumerate(pick) + for (j, y) in enumerate(pick) + ): + continue + thistemplate = cls(*sorted(pick)) + strpick = str(thistemplate) + #!!FIXME --this is hackish + if strpick in seentemplates: # already added + cls._poptemplate() + continue + seentemplates.add(strpick) + yield thistemplate + + @classmethod + def _cleartemplates(cls): + cls.ALLTEMPLATES = [] + + @classmethod + def _poptemplate(cls): + return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/bleu.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/bleu.doctest new file mode 100644 index 0000000000000000000000000000000000000000..d7e6e41f5a17e6f048a7264c72f657615e8567cc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/bleu.doctest @@ -0,0 +1,29 @@ +========== +BLEU tests +========== + +>>> from nltk.translate import bleu + +If the candidate has no alignment to any of the references, the BLEU score is 0. + +>>> bleu( +... ['The candidate has no alignment to any of the references'.split()], +... 'John loves Mary'.split(), +... (1,), +... ) +0 + +This is an implementation of the smoothing techniques +for segment-level BLEU scores that was presented in +Boxing Chen and Collin Cherry (2014) A Systematic Comparison of +Smoothing Techniques for Sentence-Level BLEU. In WMT14. +http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf +>>> from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction + + +>>> sentence_bleu( +... ['It is a place of quiet contemplation .'.split()], +... 'It is .'.split(), +... smoothing_function=SmoothingFunction().method4, +... )*100 +4.4267... diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/bnc.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/bnc.doctest new file mode 100644 index 0000000000000000000000000000000000000000..e02358eb78705f3458e97b2a7541dd9890ffc7f7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/bnc.doctest @@ -0,0 +1,60 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + + >>> import os.path + + >>> from nltk.corpus.reader import BNCCorpusReader + >>> import nltk.test + + >>> root = os.path.dirname(nltk.test.__file__) + >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml') + +Checking the word access. +------------------------- + + >>> len(bnc.words()) + 151 + + >>> bnc.words()[:6] + ['Ah', 'there', 'we', 'are', ',', '.'] + >>> bnc.words(stem=True)[:6] + ['ah', 'there', 'we', 'be', ',', '.'] + + >>> bnc.tagged_words()[:6] + [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] + + >>> bnc.tagged_words(c5=True)[:6] + [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] + +Testing access to the sentences. 
+-------------------------------- + + >>> len(bnc.sents()) + 15 + + >>> bnc.sents()[0] + ['Ah', 'there', 'we', 'are', ',', '.'] + >>> bnc.sents(stem=True)[0] + ['ah', 'there', 'we', 'be', ',', '.'] + + >>> bnc.tagged_sents()[0] + [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] + >>> bnc.tagged_sents(c5=True)[0] + [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] + +A not lazy loader. +------------------ + + >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False) + + >>> len(eager.words()) + 151 + >>> eager.words(stem=True)[6:17] + ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.'] + + >>> eager.tagged_words()[6:11] + [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')] + >>> eager.tagged_words(c5=True)[6:17] + [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')] + >>> len(eager.sents()) + 15 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/chat80.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/chat80.doctest new file mode 100644 index 0000000000000000000000000000000000000000..e1a2b9a3d101b36841c8c9359f7ba78633bf2a84 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/chat80.doctest @@ -0,0 +1,232 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +======= +Chat-80 +======= + +Chat-80 was a natural language system which allowed the user to +interrogate a Prolog knowledge base in the domain of world +geography. It was developed in the early '80s by Warren and Pereira; see +``_ for a description and +``_ for the source +files. + +The ``chat80`` module contains functions to extract data from the Chat-80 +relation files ('the world database'), and convert then into a format +that can be incorporated in the FOL models of +``nltk.sem.evaluate``. The code assumes that the Prolog +input files are available in the NLTK corpora directory. + +The Chat-80 World Database consists of the following files:: + + world0.pl + rivers.pl + cities.pl + countries.pl + contain.pl + borders.pl + +This module uses a slightly modified version of ``world0.pl``, in which +a set of Prolog rules have been omitted. The modified file is named +``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since +it uses a list rather than a string in the second field. + +Reading Chat-80 Files +===================== + +Chat-80 relations are like tables in a relational database. The +relation acts as the name of the table; the first argument acts as the +'primary key'; and subsequent arguments are further fields in the +table. In general, the name of the table provides a label for a unary +predicate whose extension is all the primary keys. For example, +relations in ``cities.pl`` are of the following form:: + + 'city(athens,greece,1368).' + +Here, ``'athens'`` is the key, and will be mapped to a member of the +unary predicate *city*. + +By analogy with NLTK corpora, ``chat80`` defines a number of 'items' +which correspond to the relations. + + >>> from nltk.sem import chat80 + >>> print(chat80.items) + ('borders', 'circle_of_lat', 'circle_of_long', 'city', ...) + +The fields in the table are mapped to binary predicates. The first +argument of the predicate is the primary key, while the second +argument is the data in the relevant field. 
Thus, in the above +example, the third field is mapped to the binary predicate +*population_of*, whose extension is a set of pairs such as +``'(athens, 1368)'``. + +An exception to this general framework is required by the relations in +the files ``borders.pl`` and ``contains.pl``. These contain facts of the +following form:: + + 'borders(albania,greece).' + + 'contains0(africa,central_africa).' + +We do not want to form a unary concept out the element in +the first field of these records, and we want the label of the binary +relation just to be ``'border'``/``'contain'`` respectively. + +In order to drive the extraction process, we use 'relation metadata bundles' +which are Python dictionaries such as the following:: + + city = {'label': 'city', + 'closures': [], + 'schema': ['city', 'country', 'population'], + 'filename': 'cities.pl'} + +According to this, the file ``city['filename']`` contains a list of +relational tuples (or more accurately, the corresponding strings in +Prolog form) whose predicate symbol is ``city['label']`` and whose +relational schema is ``city['schema']``. The notion of a ``closure`` is +discussed in the next section. + +Concepts +======== +In order to encapsulate the results of the extraction, a class of +``Concept``\ s is introduced. A ``Concept`` object has a number of +attributes, in particular a ``prefLabel``, an arity and ``extension``. + + >>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2'])) + >>> print(c1) + Label = 'dog' + Arity = 1 + Extension = ['d1', 'd2'] + + + +The ``extension`` attribute makes it easier to inspect the output of +the extraction. + + >>> schema = ['city', 'country', 'population'] + >>> concepts = chat80.clause2concepts('cities.pl', 'city', schema) + >>> concepts + [Concept('city'), Concept('country_of'), Concept('population_of')] + >>> for c in concepts: + ... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4])) + city: + ['athens', 'bangkok', 'barcelona', 'berlin'] + country_of: + [('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')] + population_of: + [('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')] + +In addition, the ``extension`` can be further +processed: in the case of the ``'border'`` relation, we check that the +relation is **symmetric**, and in the case of the ``'contain'`` +relation, we carry out the **transitive closure**. The closure +properties associated with a concept is indicated in the relation +metadata, as indicated earlier. + + >>> borders = set([('a1', 'a2'), ('a2', 'a3')]) + >>> c2 = chat80.Concept('borders', arity=2, extension=borders) + >>> print(c2) + Label = 'borders' + Arity = 2 + Extension = [('a1', 'a2'), ('a2', 'a3')] + >>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders) + >>> c3.close() + >>> print(c3) + Label = 'borders' + Arity = 2 + Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')] + +The ``extension`` of a ``Concept`` object is then incorporated into a +``Valuation`` object. + +Persistence +=========== +The functions ``val_dump`` and ``val_load`` are provided to allow a +valuation to be stored in a persistent database and re-loaded, rather +than having to be re-computed each time. + +Individuals and Lexical Items +============================= +As well as deriving relations from the Chat-80 data, we also create a +set of individual constants, one for each entity in the domain. The +individual constants are string-identical to the entities. 
For +example, given a data item such as ``'zloty'``, we add to the valuation +a pair ``('zloty', 'zloty')``. In order to parse English sentences that +refer to these entities, we also create a lexical item such as the +following for each individual constant:: + + PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' + +The set of rules is written to the file ``chat_pnames.fcfg`` in the +current directory. + +SQL Query +========= + +The ``city`` relation is also available in RDB form and can be queried +using SQL statements. + + >>> import nltk + >>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000" + >>> for answer in chat80.sql_query('corpora/city_database/city.db', q): + ... print("%-10s %4s" % answer) + canton 1496 + chungking 1100 + mukden 1551 + peking 2031 + shanghai 5407 + tientsin 1795 + +The (deliberately naive) grammar ``sql.fcfg`` translates from English +to SQL: + + >>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg') + % start S + S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] + VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] + VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] + NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] + PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] + AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] + NP[SEM='Country="greece"'] -> 'Greece' + NP[SEM='Country="china"'] -> 'China' + Det[SEM='SELECT'] -> 'Which' | 'What' + N[SEM='City FROM city_table'] -> 'cities' + IV[SEM=''] -> 'are' + A[SEM=''] -> 'located' + P[SEM=''] -> 'in' + +Given this grammar, we can express, and then execute, queries in English. + + >>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg') + >>> query = 'What cities are in China' + >>> for tree in cp.parse(query.split()): + ... answer = tree.label()['SEM'] + ... q = " ".join(answer) + ... print(q) + ... + SELECT City FROM city_table WHERE Country="china" + + >>> rows = chat80.sql_query('corpora/city_database/city.db', q) + >>> for r in rows: print("%s" % r, end=' ') + canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin + + +Using Valuations +----------------- + +In order to convert such an extension into a valuation, we use the +``make_valuation()`` method; setting ``read=True`` creates and returns +a new ``Valuation`` object which contains the results. + + >>> val = chat80.make_valuation(concepts, read=True) + >>> 'calcutta' in val['city'] + True + >>> [town for (town, country) in val['country_of'] if country == 'india'] + ['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras'] + >>> dom = val.domain + >>> g = nltk.sem.Assignment(dom) + >>> m = nltk.sem.Model(dom, val) + >>> m.evaluate(r'population_of(jakarta, 533)', g) + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes.doctest new file mode 100644 index 0000000000000000000000000000000000000000..d2e1195b1128f97df83e3c7e5ba386583d15242f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes.doctest @@ -0,0 +1,190 @@ +======================= + CHILDES Corpus Readers +======================= + +Read the XML version of the CHILDES corpus. + +Setup +===== + + >>> from nltk.test.childes_fixt import setup_module + >>> setup_module() + +How to use CHILDESCorpusReader +============================== + +Read the CHILDESCorpusReader class and read the CHILDES corpus saved in +the nltk_data directory. 
+ + >>> import nltk + >>> from nltk.corpus.reader import CHILDESCorpusReader + >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/') + +Reading files in the Valian corpus (Valian, 1991). + + >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') + >>> valian.fileids() + ['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',... + +Count the number of files + + >>> len(valian.fileids()) + 43 + +Printing properties of the corpus files. + + >>> corpus_data = valian.corpus(valian.fileids()) + >>> print(corpus_data[0]['Lang']) + eng + >>> for key in sorted(corpus_data[0].keys()): + ... print(key, ": ", corpus_data[0][key]) + Corpus : valian + Date : 1986-03-04 + Id : 01a + Lang : eng + Version : 2.0.1 + {http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd + +Printing information of participants of the corpus. The most common codes for +the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator). + + >>> corpus_participants = valian.participants(valian.fileids()) + >>> for this_corpus_participants in corpus_participants[:2]: + ... for key in sorted(this_corpus_participants.keys()): + ... dct = this_corpus_participants[key] + ... print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())]) + CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] + INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] + MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] + CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] + INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] + MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] + +printing words. + + >>> valian.words('Valian/01a.xml') + ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... + +printing sentences. + + >>> valian.sents('Valian/01a.xml') + [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', + 'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when', + 'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'], + ["that's", 'okay'], ... + +You can specify the participants with the argument *speaker*. + + >>> valian.words('Valian/01a.xml',speaker=['INV']) + ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... + >>> valian.words('Valian/01a.xml',speaker=['MOT']) + ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ... + >>> valian.words('Valian/01a.xml',speaker=['CHI']) + ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',... + + +tagged_words() and tagged_sents() return the usual (word,pos) tuple lists. +POS tags in the CHILDES are automatically assigned by MOR and POST programs +(MacWhinney, 2000). 
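+
+As an aside (an editor's sketch, not one of the original test outputs): since
+tagged_words() returns plain (word, tag) tuples, as the examples immediately
+below show, the MOR tag inventory of a file can be tallied with a standard
+Counter.
+
+    >>> from collections import Counter
+    >>> tag_counts = Counter(tag for word, tag in valian.tagged_words('Valian/01a.xml'))
+    >>> tag_counts['n'] > 0
+    True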
+ + >>> valian.tagged_words('Valian/01a.xml')[:30] + [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), + ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), + ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), + ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), + ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'), + ('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'), + ('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'), + ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')] + + >>> valian.tagged_sents('Valian/01a.xml')[:10] + [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), + ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), + ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), + ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), + ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')], + [("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')], + [("that's", 'pro:dem'), ('okay', 'adj')], + [('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), + ('eighty', 'det:num'), ('four', 'det:num')], + [('great', 'adj')], + [('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')], + [('correct', 'adj')], + [('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'), + ('a', 'det'), ('month', 'n'), ('ago', 'adv')]] + +When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3PS') are +used instead of the original words. + + >>> valian.words('Valian/01a.xml')[:30] + ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ... + >>> valian.words('Valian/01a.xml',stem=True)[:30] + ['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ... + +When the argument *replace* is true, the replaced words are used instead of +the original words. + + >>> valian.words('Valian/01a.xml',speaker='CHI')[247] + 'tikteat' + >>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247] + 'trick' + +When the argument *relation* is true, the relational relationships in the +sentence are returned. See Sagae et al. (2010) for details of the relational +structure adopted in the CHILDES. 
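+
+Judging from the triples shown below, the third element of each token packs
+the token's position, its head's position, and the grammatical-relation
+label, separated by "|". A small helper (ours, for illustration only; it is
+not part of the reader API) can unpack that field:
+
+    >>> def head_of(triple):
+    ...     word, pos, rel = triple
+    ...     index, head, label = rel.split('|')
+    ...     return (word, int(head), label)
+    >>> head_of(('at', 'prep', '1|0|ROOT'))
+    ('at', 0, 'ROOT')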
+ + >>> valian.words('Valian/01a.xml',relation=True)[:10] + [[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]] + +Printing age. When the argument *month* is true, the age information in +the CHILDES format is converted into the number of months. + + >>> valian.age() + ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ... + >>> valian.age('Valian/01a.xml') + ['P2Y1M3D'] + >>> valian.age('Valian/01a.xml',month=True) + [25] + +Printing MLU. The criteria for the MLU computation is broadly based on +Brown (1973). + + >>> valian.MLU() + [2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490..., + 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080..., + 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284..., + 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936..., + 5.25, 5.154696132596..., ...] + + >>> valian.MLU('Valian/01a.xml') + [2.35746606334...] + + +Basic stuff +============================== + +Count the number of words and sentences of each file. + + >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') + >>> for this_file in valian.fileids()[:6]: + ... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id']) + ... print("num of words: %i" % len(valian.words(this_file))) + ... 
print("num of sents: %i" % len(valian.sents(this_file))) + valian 01a + num of words: 3606 + num of sents: 1027 + valian 01b + num of words: 4376 + num of sents: 1274 + valian 02a + num of words: 2673 + num of sents: 801 + valian 02b + num of words: 5020 + num of sents: 1583 + valian 03a + num of words: 2743 + num of sents: 988 + valian 03b + num of words: 4409 + num of sents: 1397 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes_fixt.py new file mode 100644 index 0000000000000000000000000000000000000000..0c3b84fd5f089d55e30f10f8233ec7ce2cb5f1b7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/childes_fixt.py @@ -0,0 +1,13 @@ +def setup_module(): + import pytest + + import nltk.data + + try: + nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/") + except LookupError as e: + pytest.skip( + "The CHILDES corpus is not found. " + "It should be manually downloaded and saved/unpacked " + "to [NLTK_Data_Dir]/corpora/childes/" + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/chunk.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/chunk.doctest new file mode 100644 index 0000000000000000000000000000000000000000..68c235fd63d121cb4ea490b34170cafeabbef8fa --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/chunk.doctest @@ -0,0 +1,372 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========== + Chunking +========== + + >>> from nltk.chunk import * + >>> from nltk.chunk.util import * + >>> from nltk.chunk.regexp import * + >>> from nltk import Tree + + >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." + >>> gold_chunked_text = tagstr2tree(tagged_text) + >>> unchunked_text = gold_chunked_text.flatten() + +Chunking uses a special regexp syntax for rules that delimit the chunks. These +rules must be converted to 'regular' regular expressions before a sentence can +be chunked. + + >>> tag_pattern = "
?*" + >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) + >>> regexp_pattern + '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' + +Construct some new chunking rules. + + >>> chunk_rule = ChunkRule(r"<.*>+", "Chunk everything") + >>> strip_rule = StripRule(r"", "Strip on verbs/prepositions") + >>> split_rule = SplitRule("
", "
", + ... "Split successive determiner/noun pairs") + + +Create and score a series of chunk parsers, successively more complex. + + >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') + >>> chunked_text = chunk_parser.parse(unchunked_text) + >>> print(chunked_text) + (S + (NP + The/DT + cat/NN + sat/VBD + on/IN + the/DT + mat/NN + the/DT + dog/NN + chewed/VBD + ./.)) + + >>> chunkscore = ChunkScore() + >>> chunkscore.score(gold_chunked_text, chunked_text) + >>> print(chunkscore.precision()) + 0.0 + + >>> print(chunkscore.recall()) + 0.0 + + >>> print(chunkscore.f_measure()) + 0 + + >>> for chunk in sorted(chunkscore.missed()): print(chunk) + (NP The/DT cat/NN) + (NP the/DT dog/NN) + (NP the/DT mat/NN) + + >>> for chunk in chunkscore.incorrect(): print(chunk) + (NP + The/DT + cat/NN + sat/VBD + on/IN + the/DT + mat/NN + the/DT + dog/NN + chewed/VBD + ./.) + + >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule], + ... chunk_label='NP') + >>> chunked_text = chunk_parser.parse(unchunked_text) + >>> print(chunked_text) + (S + (NP The/DT cat/NN) + sat/VBD + on/IN + (NP the/DT mat/NN the/DT dog/NN) + chewed/VBD + ./.) + >>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) + + >>> chunkscore = ChunkScore() + >>> chunkscore.score(gold_chunked_text, chunked_text) + >>> chunkscore.precision() + 0.5 + + >>> print(chunkscore.recall()) + 0.33333333... + + >>> print(chunkscore.f_measure()) + 0.4 + + >>> for chunk in sorted(chunkscore.missed()): print(chunk) + (NP the/DT dog/NN) + (NP the/DT mat/NN) + + >>> for chunk in chunkscore.incorrect(): print(chunk) + (NP the/DT mat/NN the/DT dog/NN) + + >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule, split_rule], + ... chunk_label='NP') + >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) + # Input: +
<.> + # Chunk everything: + {
<.>} + # Strip on verbs/prepositions: + {
} {
} <.> + # Split successive determiner/noun pairs: + {
} {
}{
} <.> + >>> print(chunked_text) + (S + (NP The/DT cat/NN) + sat/VBD + on/IN + (NP the/DT mat/NN) + (NP the/DT dog/NN) + chewed/VBD + ./.) + + >>> chunkscore = ChunkScore() + >>> chunkscore.score(gold_chunked_text, chunked_text) + >>> chunkscore.precision() + 1.0 + + >>> chunkscore.recall() + 1.0 + + >>> chunkscore.f_measure() + 1.0 + + >>> chunkscore.missed() + [] + + >>> chunkscore.incorrect() + [] + + >>> chunk_parser.rules() + [+'>, '>, + ', '
'>] + +Printing parsers: + + >>> print(repr(chunk_parser)) + + >>> print(chunk_parser) + RegexpChunkParser with 3 rules: + Chunk everything + +'> + Strip on verbs/prepositions + '> + Split successive determiner/noun pairs + ', '
'> + +Regression Tests +~~~~~~~~~~~~~~~~ +ChunkParserI +------------ +`ChunkParserI` is an abstract interface -- it is not meant to be +instantiated directly. + + >>> ChunkParserI().parse([]) + Traceback (most recent call last): + . . . + NotImplementedError + + +ChunkString +----------- +ChunkString can be built from a tree of tagged tuples, a tree of +trees, or a mixed list of both: + + >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) + >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) + >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) + >>> ChunkString(t1) + '> + >>> ChunkString(t2) + '> + >>> ChunkString(t3) + '> + +Other values generate an error: + + >>> ChunkString(Tree('S', ['x'])) + Traceback (most recent call last): + . . . + ValueError: chunk structures must contain tagged tokens or trees + +The `str()` for a chunk string adds spaces to it, which makes it line +up with `str()` output for other chunk strings over the same +underlying input. + + >>> cs = ChunkString(t1) + >>> print(cs) + + >>> cs.xform('', '{}') + >>> print(cs) + {} + +The `_verify()` method makes sure that our transforms don't corrupt +the chunk string. By setting debug_level=2, `_verify()` will be +called at the end of every call to `xform`. + + >>> cs = ChunkString(t1, debug_level=3) + + >>> # tag not marked with <...>: + >>> cs.xform('', 't3') + Traceback (most recent call last): + . . . + ValueError: Transformation generated invalid chunkstring: + t3 + + >>> # brackets not balanced: + >>> cs.xform('', '{') + Traceback (most recent call last): + . . . + ValueError: Transformation generated invalid chunkstring: + { + + >>> # nested brackets: + >>> cs.xform('', '{{}}') + Traceback (most recent call last): + . . . + ValueError: Transformation generated invalid chunkstring: + {{}} + + >>> # modified tags: + >>> cs.xform('', '') + Traceback (most recent call last): + . . . + ValueError: Transformation generated invalid chunkstring: tag changed + + >>> # added tags: + >>> cs.xform('', '') + Traceback (most recent call last): + . . . + ValueError: Transformation generated invalid chunkstring: tag changed + +Chunking Rules +-------------- + +Test the different rule constructors & __repr__ methods: + + >>> r1 = RegexpChunkRule(''+ChunkString.IN_STRIP_PATTERN, + ... '{}', 'chunk and ') + >>> r2 = RegexpChunkRule(re.compile(''+ChunkString.IN_STRIP_PATTERN), + ... '{}', 'chunk and ') + >>> r3 = ChunkRule('', 'chunk and ') + >>> r4 = StripRule('', 'strip and ') + >>> r5 = UnChunkRule('', 'unchunk and ') + >>> r6 = MergeRule('', '', 'merge w/ ') + >>> r7 = SplitRule('', '', 'split from ') + >>> r8 = ExpandLeftRule('', '', 'expand left ') + >>> r9 = ExpandRightRule('', '', 'expand right ') + >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: + ... print(rule) + (?=[^\\}]*(\\{|$))'->'{}'> + (?=[^\\}]*(\\{|$))'->'{}'> + '> + '> + '> + ', ''> + ', ''> + ', ''> + ', ''> + +`tag_pattern2re_pattern()` complains if the tag pattern looks problematic: + + >>> tag_pattern2re_pattern('{}') + Traceback (most recent call last): + . . . + ValueError: Bad tag pattern: '{}' + +RegexpChunkParser +----------------- + +A warning is printed when parsing an empty sentence: + + >>> parser = RegexpChunkParser([ChunkRule('', '')]) + >>> parser.parse(Tree('S', [])) + Warning: parsing empty text + Tree('S', []) + +RegexpParser +------------ + + >>> parser = RegexpParser(''' + ... NP: {
? * *} # NP + ... P: {} # Preposition + ... V: {} # Verb + ... PP: {

} # PP -> P NP + ... VP: { *} # VP -> V (NP|PP)* + ... ''') + >>> print(repr(parser)) + + >>> print(parser) + chunk.RegexpParser with 5 stages: + RegexpChunkParser with 1 rules: + NP ? * *'> + RegexpChunkParser with 1 rules: + Preposition '> + RegexpChunkParser with 1 rules: + Verb '> + RegexpChunkParser with 1 rules: + PP -> P NP '> + RegexpChunkParser with 1 rules: + VP -> V (NP|PP)* *'> + >>> print(parser.parse(unchunked_text, trace=True)) + # Input: +

<.> + # NP: + {
} {
}{
} <.> + # Input: + <.> + # Preposition: + {} <.> + # Input: +

<.> + # Verb: + {}

{} <.> + # Input: +

<.> + # PP -> P NP: + {

} <.> + # Input: + <.> + # VP -> V (NP|PP)*: + { }{} <.> + (S + (NP The/DT cat/NN) + (VP + (V sat/VBD) + (PP (P on/IN) (NP the/DT mat/NN)) + (NP the/DT dog/NN)) + (VP (V chewed/VBD)) + ./.) + +Test parsing of other rule types: + + >>> print(RegexpParser(''' + ... X: + ... }{ # strip rule + ... }{ # split rule + ... {} # merge rule + ... {} # chunk rule w/ context + ... ''')) + chunk.RegexpParser with 1 stages: + RegexpChunkParser with 4 rules: + strip rule '> + split rule ', ''> + merge rule ', ''> + chunk rule w/ context ', '', ''> + +Illegal patterns give an error message: + + >>> print(RegexpParser('X: {} {}')) + Traceback (most recent call last): + . . . + ValueError: Illegal chunk pattern: {} {} diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify.doctest new file mode 100644 index 0000000000000000000000000000000000000000..4c47c90eeeb10d47330c41f27e12f78eff504941 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify.doctest @@ -0,0 +1,202 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +============= + Classifiers +============= + + >>> from nltk.test.classify_fixt import setup_module + >>> setup_module() + +Classifiers label tokens with category labels (or *class labels*). +Typically, labels are represented with strings (such as ``"health"`` +or ``"sports"``. In NLTK, classifiers are defined using classes that +implement the `ClassifierI` interface, which supports the following operations: + +- self.classify(featureset) +- self.classify_many(featuresets) +- self.labels() +- self.prob_classify(featureset) +- self.prob_classify_many(featuresets) + +NLTK defines several classifier classes: + +- `ConditionalExponentialClassifier` +- `DecisionTreeClassifier` +- `MaxentClassifier` +- `NaiveBayesClassifier` +- `WekaClassifier` + +Classifiers are typically created by training them on a training +corpus. + + +Regression Tests +~~~~~~~~~~~~~~~~ + +We define a very simple training corpus with 3 binary features: ['a', +'b', 'c'], and are two labels: ['x', 'y']. We use a simple feature set so +that the correct answers can be calculated analytically (although we +haven't done this yet for all tests). + + >>> import nltk + >>> train = [ + ... (dict(a=1,b=1,c=1), 'y'), + ... (dict(a=1,b=1,c=1), 'x'), + ... (dict(a=1,b=1,c=0), 'y'), + ... (dict(a=0,b=1,c=1), 'x'), + ... (dict(a=0,b=1,c=1), 'y'), + ... (dict(a=0,b=0,c=1), 'y'), + ... (dict(a=0,b=1,c=0), 'x'), + ... (dict(a=0,b=0,c=0), 'x'), + ... (dict(a=0,b=1,c=1), 'y'), + ... (dict(a=None,b=1,c=0), 'x'), + ... ] + >>> test = [ + ... (dict(a=1,b=0,c=1)), # unseen + ... (dict(a=1,b=0,c=0)), # unseen + ... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x + ... (dict(a=0,b=1,c=0)), # seen 1 time, label=x + ... ] + +Test the Naive Bayes classifier: + + >>> classifier = nltk.classify.NaiveBayesClassifier.train(train) + >>> sorted(classifier.labels()) + ['x', 'y'] + >>> classifier.classify_many(test) + ['y', 'x', 'y', 'x'] + >>> for pdist in classifier.prob_classify_many(test): + ... 
print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) + 0.2500 0.7500 + 0.5833 0.4167 + 0.3571 0.6429 + 0.7000 0.3000 + >>> classifier.show_most_informative_features() + Most Informative Features + c = 0 x : y = 2.3 : 1.0 + c = 1 y : x = 1.8 : 1.0 + a = 1 y : x = 1.7 : 1.0 + a = 0 x : y = 1.0 : 1.0 + b = 0 x : y = 1.0 : 1.0 + b = 1 x : y = 1.0 : 1.0 + +Test the Decision Tree classifier (without None): + + >>> classifier = nltk.classify.DecisionTreeClassifier.train( + ... train[:-1], entropy_cutoff=0, + ... support_cutoff=0) + >>> sorted(classifier.labels()) + ['x', 'y'] + >>> print(classifier) + c=0? .................................................. x + a=0? ................................................ x + a=1? ................................................ y + c=1? .................................................. y + + >>> classifier.classify_many(test) + ['y', 'y', 'y', 'x'] + >>> for pdist in classifier.prob_classify_many(test): + ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) + Traceback (most recent call last): + . . . + NotImplementedError + + +Test the Decision Tree classifier (with None): + + >>> classifier = nltk.classify.DecisionTreeClassifier.train( + ... train, entropy_cutoff=0, + ... support_cutoff=0) + >>> sorted(classifier.labels()) + ['x', 'y'] + >>> print(classifier) + c=0? .................................................. x + a=0? ................................................ x + a=1? ................................................ y + a=None? ............................................. x + c=1? .................................................. y + + + +Test SklearnClassifier, which requires the scikit-learn package. + + >>> from nltk.classify import SklearnClassifier + >>> from sklearn.naive_bayes import BernoulliNB + >>> from sklearn.svm import SVC + >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"), + ... ({"a": 5, "b": 2, "c": 1}, "ham"), + ... ({"a": 0, "b": 3, "c": 4}, "spam"), + ... ({"a": 5, "b": 1, "c": 1}, "ham"), + ... ({"a": 1, "b": 4, "c": 3}, "spam")] + >>> classif = SklearnClassifier(BernoulliNB()).train(train_data) + >>> test_data = [{"a": 3, "b": 2, "c": 1}, + ... {"a": 0, "b": 3, "c": 7}] + >>> classif.classify_many(test_data) + ['ham', 'spam'] + >>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data) + >>> classif.classify_many(test_data) + ['ham', 'spam'] + +Test the Maximum Entropy classifier training algorithms; they should all +generate the same results. + + >>> def print_maxent_test_header(): + ... print(' '*11+''.join([' test[%s] ' % i + ... for i in range(len(test))])) + ... print(' '*11+' p(x) p(y)'*len(test)) + ... print('-'*(11+15*len(test))) + + >>> def test_maxent(algorithm): + ... print('%11s' % algorithm, end=' ') + ... try: + ... classifier = nltk.classify.MaxentClassifier.train( + ... train, algorithm, trace=0, max_iter=1000) + ... except Exception as e: + ... print('Error: %r' % e) + ... return + ... + ... for featureset in test: + ... pdist = classifier.prob_classify(featureset) + ... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ') + ... 
print() + + >>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS') + test[0] test[1] test[2] test[3] + p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y) + ----------------------------------------------------------------------- + GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 + IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 + + >>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP + MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 + TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 + + + +Regression tests for TypedMaxentFeatureEncoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + >>> from nltk.classify import maxent + >>> train = [ + ... ({'a': 1, 'b': 1, 'c': 1}, 'y'), + ... ({'a': 5, 'b': 5, 'c': 5}, 'x'), + ... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'), + ... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'), + ... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'), + ... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x') + ... ] + + >>> test = [ + ... {'a': 1, 'b': 0.8, 'c': 1.2}, + ... {'a': 5.2, 'b': 5.1, 'c': 5} + ... ] + + >>> encoding = maxent.TypedMaxentFeatureEncoding.train( + ... train, count_cutoff=3, alwayson_features=True) + + >>> classifier = maxent.MaxentClassifier.train( + ... train, bernoulli=False, encoding=encoding, trace=0) + + >>> classifier.classify_many(test) + ['y', 'x'] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify_fixt.py new file mode 100644 index 0000000000000000000000000000000000000000..17b037281aff04a7d9a1faf56ccd9b055e1a1071 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/classify_fixt.py @@ -0,0 +1,5 @@ +# most of classify.doctest requires numpy +def setup_module(): + import pytest + + pytest.importorskip("numpy") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/collections.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/collections.doctest new file mode 100644 index 0000000000000000000000000000000000000000..01e69ec09d4b817c1f0420cba4bf4ff57e358e69 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/collections.doctest @@ -0,0 +1,31 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=========== +Collections +=========== + + >>> import nltk + >>> from nltk.collections import * + +Trie +---- + +Trie can be pickled: + + >>> import pickle + >>> trie = nltk.collections.Trie(['a']) + >>> s = pickle.dumps(trie) + >>> pickle.loads(s) + {'a': {True: None}} + +LazyIteratorList +---------------- + +Fetching the length of a LazyIteratorList object does not throw a StopIteration exception: + + >>> lil = LazyIteratorList(i for i in range(1, 11)) + >>> lil[-1] + 10 + >>> len(lil) + 10 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/collocations.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/collocations.doctest new file mode 100644 index 0000000000000000000000000000000000000000..164e598785dfd495902f086de25b3595258e1695 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/collocations.doctest @@ -0,0 +1,307 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +============== + Collocations +============== + +Overview +~~~~~~~~ + +Collocations are expressions of multiple words which commonly co-occur. For +example, the top ten bigram collocations in Genesis are listed below, as +measured using Pointwise Mutual Information. 
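+
+For orientation, PMI compares a bigram's joint frequency with what its parts
+would predict independently: pmi = log2(n_ii * n_xx / (n_ix * n_xi)), where
+n_ii is the bigram count, n_ix and n_xi are the word marginals, and n_xx is
+the total. A hand-rolled sketch (ours, for illustration; the library's own
+implementation is ``bigram_measures.pmi``, exercised further down this file)
+reproduces the Manning & Schutze figure used there:
+
+    >>> from math import log2
+    >>> def pmi(n_ii, n_ix_xi, n_xx):
+    ...     n_ix, n_xi = n_ix_xi
+    ...     return log2(n_ii * n_xx / (n_ix * n_xi))
+    >>> print('%0.2f' % pmi(20, (42, 20), 14307668))
+    18.38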
+ + >>> import nltk + >>> from nltk.collocations import * + >>> bigram_measures = nltk.collocations.BigramAssocMeasures() + >>> trigram_measures = nltk.collocations.TrigramAssocMeasures() + >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures() + >>> finder = BigramCollocationFinder.from_words( + ... nltk.corpus.genesis.words('english-web.txt')) + >>> finder.nbest(bigram_measures.pmi, 10) + [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'), + ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'), + ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'), + ('cutting', 'instrument')] + +While these words are highly collocated, the expressions are also very +infrequent. Therefore it is useful to apply filters, such as ignoring all +bigrams which occur less than three times in the corpus: + + >>> finder.apply_freq_filter(3) + >>> finder.nbest(bigram_measures.pmi, 10) + [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'), + ('ewe', 'lambs'), ('Most', 'High'), ('many', 'colors'), + ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'), + ('living', 'creature')] + +We may similarly find collocations among tagged words: + + >>> finder = BigramCollocationFinder.from_words( + ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) + >>> finder.nbest(bigram_measures.pmi, 5) + [(('1,119', 'NUM'), ('votes', 'NOUN')), + (('1962', 'NUM'), ("governor's", 'NOUN')), + (('637', 'NUM'), ('E.', 'NOUN')), + (('Alpharetta', 'NOUN'), ('prison', 'NOUN')), + (('Bar', 'NOUN'), ('Association', 'NOUN'))] + +Or tags alone: + + >>> finder = BigramCollocationFinder.from_words(t for w, t in + ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) + >>> finder.nbest(bigram_measures.pmi, 10) + [('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'), + ('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')] + +Or spanning intervening words: + + >>> finder = BigramCollocationFinder.from_words( + ... nltk.corpus.genesis.words('english-web.txt'), + ... window_size = 20) + >>> finder.apply_freq_filter(2) + >>> ignored_words = nltk.corpus.stopwords.words('english') + >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) + >>> finder.nbest(bigram_measures.likelihood_ratio, 10) + [('chief', 'chief'), ('became', 'father'), ('years', 'became'), + ('hundred', 'years'), ('lived', 'became'), ('king', 'king'), + ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'), + ('hundred', 'became')] + +Finders +~~~~~~~ + +The collocations package provides collocation finders which by default +consider all ngrams in a text as candidate collocations: + + >>> text = "I do not like green eggs and ham, I do not like them Sam I am!" 
+    >>> tokens = nltk.wordpunct_tokenize(text)
+    >>> finder = BigramCollocationFinder.from_words(tokens)
+    >>> scored = finder.score_ngrams(bigram_measures.raw_freq)
+    >>> sorted(bigram for bigram, score in scored)
+    [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'),
+    ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'),
+    ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'),
+    ('them', 'Sam')]
+
+We could otherwise construct the collocation finder from manually-derived
+FreqDists:
+
+    >>> word_fd = nltk.FreqDist(tokens)
+    >>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
+    >>> finder = BigramCollocationFinder(word_fd, bigram_fd)
+    >>> scored == finder.score_ngrams(bigram_measures.raw_freq)
+    True
+
+A similar interface is provided for trigrams:
+
+    >>> finder = TrigramCollocationFinder.from_words(tokens)
+    >>> scored = finder.score_ngrams(trigram_measures.raw_freq)
+    >>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens))
+    True
+
+We may want to select only the top n results:
+
+    >>> sorted(finder.nbest(trigram_measures.raw_freq, 2))
+    [('I', 'do', 'not'), ('do', 'not', 'like')]
+
+Alternatively, we can select those above a minimum score value:
+
+    >>> sorted(finder.above_score(trigram_measures.raw_freq,
+    ...                           1.0 / len(tuple(nltk.trigrams(tokens)))))
+    [('I', 'do', 'not'), ('do', 'not', 'like')]
+
+Now spanning intervening words:
+
+    >>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
+    >>> sorted(finder.nbest(trigram_measures.raw_freq, 4))
+    [('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')]
+
+A closer look at the finder's ngram frequencies:
+
+    >>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10]
+    [(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2),
+    (('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1),
+    ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
+    (('Sam', 'I', 'am'), 1)]
+
+A similar interface is provided for fourgrams:
+
+    >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens)
+    >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq)
+    >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4))
+    True
+
+Filtering candidates
+~~~~~~~~~~~~~~~~~~~~
+
+The full set of ngrams in a text is usually too large to be useful when
+finding collocations. It is generally useful to remove some words or
+punctuation, and to require a minimum frequency for candidate collocations.
+
+Given our sample text above, if we remove all trigrams containing personal
+pronouns from candidature, score_ngrams should return 6 fewer results, and
+'do not like' will be the only candidate which occurs more than once:
+
+    >>> finder = TrigramCollocationFinder.from_words(tokens)
+    >>> len(finder.score_ngrams(trigram_measures.raw_freq))
+    14
+    >>> finder.apply_word_filter(lambda w: w in ('I', 'me'))
+    >>> len(finder.score_ngrams(trigram_measures.raw_freq))
+    8
+    >>> sorted(finder.above_score(trigram_measures.raw_freq,
+    ...                           1.0 / len(tuple(nltk.trigrams(tokens)))))
+    [('do', 'not', 'like')]
+
+Sometimes a filter is a function on the whole ngram, rather than on each
+word; for example, we may permit 'and' to appear in the middle of a trigram,
+but not on either edge:
+
+    >>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3))
+    >>> len(finder.score_ngrams(trigram_measures.raw_freq))
+    6
+
+Finally, it is often important to remove low frequency candidates, as we
+lack sufficient evidence about their significance as collocations:
+
+    >>> finder.apply_freq_filter(2)
+    >>> len(finder.score_ngrams(trigram_measures.raw_freq))
+    1
+
+Association measures
+~~~~~~~~~~~~~~~~~~~~
+
+A number of measures are available to score collocations or other associations.
+The arguments to measure functions are marginals of a contingency table, in the
+bigram case (n_ii, (n_ix, n_xi), n_xx)::
+
+            w1    ~w1
+         ------ ------
+     w2 | n_ii | n_oi | = n_xi
+         ------ ------
+    ~w2 | n_io | n_oo |
+         ------ ------
+         = n_ix        TOTAL = n_xx
+
+We test their calculation using some known values presented in Manning and
+Schutze's text and other papers.
+
+Student's t: examples from Manning and Schutze 5.3.2
+
+    >>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668))
+    0.9999
+    >>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668))
+    4.4721
+
+Chi-square: examples from Manning and Schutze 5.3.3
+
+    >>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668))
+    1.55
+    >>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007))
+    456400
+
+Likelihood ratios: examples from Dunning, CL, 1993
+
+    >>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777))
+    270.72
+    >>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777))
+    95.29
+
+Pointwise Mutual Information: examples from Manning and Schutze 5.4
+
+    >>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668))
+    18.38
+    >>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668))
+    0.29
+
+TODO: Find authoritative results for trigrams.
+
+Using contingency table values
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While frequency counts make marginals readily available for collocation
+finding, it is common to find published contingency table values. The
+collocations package therefore provides a wrapper, ContingencyMeasures, which
+wraps an association measures class, providing association measures which
+take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the
+bigram case.
+
+    >>> from nltk.metrics import ContingencyMeasures
+    >>> cont_bigram_measures = ContingencyMeasures(bigram_measures)
+    >>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740))
+    95.29
+    >>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173))
+    1.55
+
+Ranking and correlation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+It is useful to consider the results of finding collocations as a ranking, and
+the rankings output using different association measures can be compared using
+the Spearman correlation coefficient.
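+
+For example, assuming the Genesis bigram finder built at the start of this
+howto, one way to compare two measures is to rank the same candidates by
+each and correlate the two rankings. This is only a sketch (the statements
+are skipped; the coefficient obtained depends on the measures chosen, on the
+arbitrary cutoff of 50, and on how many candidates the two rankings share):
+
+    >>> from nltk.metrics.spearman import ranks_from_sequence, spearman_correlation
+    >>> finder = BigramCollocationFinder.from_words(
+    ...     nltk.corpus.genesis.words('english-web.txt'))  # doctest: +SKIP
+    >>> pmi_top = finder.nbest(bigram_measures.pmi, 50)  # doctest: +SKIP
+    >>> lr_top = finder.nbest(bigram_measures.likelihood_ratio, 50)  # doctest: +SKIP
+    >>> # The correlation is computed over the candidates common to both rankings:
+    >>> spearman_correlation(ranks_from_sequence(pmi_top),
+    ...                      ranks_from_sequence(lr_top))  # doctest: +SKIP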
+
+Ranks can be assigned to a sorted list of results trivially by assigning
+strictly increasing ranks to each result:
+
+    >>> from nltk.metrics.spearman import *
+    >>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5']
+    >>> print(list(ranks_from_sequence(results_list)))
+    [('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)]
+
+If scores are available for each result, we may allow sufficiently similar
+results (differing by no more than rank_gap) to be assigned the same rank:
+
+    >>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0),
+    ...                   ('item4', 35.0), ('item5', 14.0)]
+    >>> print(list(ranks_from_scores(results_scored, rank_gap=5)))
+    [('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)]
+
+The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing
+two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates
+exact opposite rankings.
+
+    >>> print('%0.1f' % spearman_correlation(
+    ...     ranks_from_sequence(results_list),
+    ...     ranks_from_sequence(results_list)))
+    1.0
+    >>> print('%0.1f' % spearman_correlation(
+    ...     ranks_from_sequence(reversed(results_list)),
+    ...     ranks_from_sequence(results_list)))
+    -1.0
+    >>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4']
+    >>> print('%0.1f' % spearman_correlation(
+    ...     ranks_from_sequence(results_list),
+    ...     ranks_from_sequence(results_list2)))
+    0.6
+    >>> print('%0.1f' % spearman_correlation(
+    ...     ranks_from_sequence(reversed(results_list)),
+    ...     ranks_from_sequence(results_list2)))
+    -0.6
+
+Keywords
+~~~~~~~~
+
+Bigram association metrics can also be used to perform keyword analysis. For
+example, this finds the keywords associated with the "romance" section of the
+Brown corpus as measured by likelihood ratio:
+
+    >>> romance = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words(categories='romance') if w.isalpha())
+    >>> freq = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words() if w.isalpha())
+
+    >>> key = nltk.FreqDist()
+    >>> for w in romance:
+    ...     key[w] = bigram_measures.likelihood_ratio(romance[w], (freq[w], romance.N()), freq.N())
+
+    >>> for k,v in key.most_common(10):
+    ...     print(f'{k:10s} {v:9.3f}')
+    she         1163.325
+    i            995.961
+    her          930.528
+    you          513.149
+    of           501.891
+    is           463.386
+    had          421.615
+    he           411.000
+    the          347.632
+    said         300.811
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/concordance.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/concordance.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..8dbd81a01818b99681be51491bd3eaadd0c86e38
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/concordance.doctest
@@ -0,0 +1,75 @@
+.. Copyright (C) 2001-2016 NLTK Project
+.. For license information, see LICENSE.TXT
+
+==================================
+Concordance Example
+==================================
+
+A concordance view shows us every occurrence of a given
+word, together with some context. Here we look up the word "monstrous"
+in Moby Dick, by building a ``Text`` object over the corpus and calling
+its ``concordance()`` method:
+
+>>> from nltk.corpus import gutenberg
+>>> from nltk.text import Text
+>>> corpus = gutenberg.words('melville-moby_dick.txt')
+>>> text = Text(corpus)
+
+>>> text.concordance("monstrous")
+Displaying 11 of 11 matches:
+ong the former , one was of a most monstrous size . ... This came towards us ,
+ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
+ll over with a heathenish array of monstrous clubs and spears . Some were thick
+d as you gazed , and wondered what monstrous cannibal and savage could ever hav
+that has survived the flood ; most monstrous and most mountainous ! That Himmal
+they might scout at Moby Dick as a monstrous fable , or still worse and more de
+th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
+ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
+ere to enter upon those still more monstrous stories of them which are to be fo
+ght have been rummaged out of this monstrous cabinet there is no telling . But
+of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
+
+>>> text.concordance("monstrous")
+Displaying 11 of 11 matches:
+ong the former , one was of a most monstrous size . ... This came towards us ,
+ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
+ll over with a heathenish array of monstrous clubs and spears . Some were thick
+...
+
+We can also search for a multi-word phrase by passing a list of strings:
+
+>>> text.concordance(["monstrous", "size"])
+Displaying 2 of 2 matches:
+the former , one was of a most monstrous size . ... This came towards us , op
+Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead upo
+
+=================================
+Concordance List
+=================================
+
+Often we need to store the results of a concordance for further use.
+To do so, use the ``concordance_list()`` method, which returns the
+matches as a list instead of printing them:
+
+>>> from nltk.corpus import gutenberg
+>>> from nltk.text import Text
+>>> corpus = gutenberg.words('melville-moby_dick.txt')
+>>> text = Text(corpus)
+>>> con_list = text.concordance_list("monstrous")
+>>> con_list[2].line
+'ll over with a heathenish array of monstrous clubs and spears . Some were thick'
+>>> len(con_list)
+11
+
+=================================
+Patching Issue #2088
+=================================
+
+Regression test for https://github.com/nltk/nltk/issues/2088:
+the left slice of the left context should be clipped to 0 when `i-context` < 0.
+
+>>> from nltk import Text, word_tokenize
+>>> jane_eyre = 'Chapter 1\nTHERE was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further outdoor exercise was now out of the question.'
+>>> text = Text(word_tokenize(jane_eyre))
+>>> text.concordance_list('taking')[0].left
+['Chapter', '1', 'THERE', 'was', 'no', 'possibility', 'of']
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/corpus.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/corpus.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..06b779aeda2d7753c5a30443a0e5efaa6ed8453a
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/corpus.doctest
@@ -0,0 +1,2196 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+================
+ Corpus Readers
+================
+
+The `nltk.corpus` package defines a collection of *corpus reader*
+classes, which can be used to access the contents of a diverse set of
+corpora. The list of available corpora is given at:
+
+https://www.nltk.org/nltk_data/
+
+Each corpus reader class is specialized to handle a specific
+corpus format.
In addition, the `nltk.corpus` package automatically +creates a set of corpus reader instances that can be used to access +the corpora in the NLTK data package. +Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes +the corpus reader instances that can be used to read the corpora in +the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus +Reader Classes") describes the corpus reader classes themselves, and +discusses the issues involved in creating new corpus reader objects +and new corpus reader classes. Section `Regression Tests`_ +("Regression Tests") contains regression tests for the corpus readers +and associated functions and classes. + +.. contents:: **Table of Contents** + :depth: 4 + :backlinks: none + +--------------------- +Corpus Reader Objects +--------------------- + +Overview +======== + +NLTK includes a diverse set of corpora which can be +read using the ``nltk.corpus`` package. Each corpus is accessed by +means of a "corpus reader" object from ``nltk.corpus``: + + >>> import nltk.corpus + >>> # The Brown corpus: + >>> print(str(nltk.corpus.brown).replace('\\\\','/')) + + >>> # The Penn Treebank Corpus: + >>> print(str(nltk.corpus.treebank).replace('\\\\','/')) + + >>> # The Name Genders Corpus: + >>> print(str(nltk.corpus.names).replace('\\\\','/')) + + >>> # The Inaugural Address Corpus: + >>> print(str(nltk.corpus.inaugural).replace('\\\\','/')) + + +Most corpora consist of a set of files, each containing a document (or +other pieces of text). A list of identifiers for these files is +accessed via the ``fileids()`` method of the corpus reader: + + >>> nltk.corpus.treebank.fileids() + ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] + >>> nltk.corpus.inaugural.fileids() + ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...] + +Each corpus reader provides a variety of methods to read data from the +corpus, depending on the format of the corpus. For example, plaintext +corpora support methods to read the corpus as raw text, a list of +words, a list of sentences, or a list of paragraphs. + + >>> from nltk.corpus import inaugural + >>> inaugural.raw('1789-Washington.txt') + 'Fellow-Citizens of the Senate ...' + >>> inaugural.words('1789-Washington.txt') + ['Fellow', '-', 'Citizens', 'of', 'the', ...] + >>> inaugural.sents('1789-Washington.txt') + [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...] + >>> inaugural.paras('1789-Washington.txt') + [[['Fellow', '-', 'Citizens'...]], + [['Among', 'the', 'vicissitudes'...], + ['On', 'the', 'one', 'hand', ',', 'I'...]...]...] + +Each of these reader methods may be given a single document's item +name or a list of document item names. When given a list of document +item names, the reader methods will concatenate together the contents +of the individual documents. + + >>> l1 = len(inaugural.words('1789-Washington.txt')) + >>> l2 = len(inaugural.words('1793-Washington.txt')) + >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt'])) + >>> print('%s+%s == %s' % (l1, l2, l3)) + 1538+147 == 1685 + +If the reader methods are called without any arguments, they will +typically load all documents in the corpus. 
+
+    >>> len(inaugural.words())
+    152901
+
+If a corpus contains a README file, it can be accessed with a ``readme()`` method:
+
+    >>> inaugural.readme()[:32]
+    'C-Span Inaugural Address Corpus\n'
+
+Plaintext Corpora
+=================
+
+Here are the first few words from each of NLTK's plaintext corpora:
+
+    >>> nltk.corpus.abc.words()
+    ['PM', 'denies', 'knowledge', 'of', 'AWB', ...]
+    >>> nltk.corpus.genesis.words()
+    ['In', 'the', 'beginning', 'God', 'created', ...]
+    >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt')
+    ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...]
+    >>> nltk.corpus.inaugural.words()
+    ['Fellow', '-', 'Citizens', 'of', 'the', ...]
+    >>> nltk.corpus.state_union.words()
+    ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...]
+    >>> nltk.corpus.webtext.words()
+    ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...]
+
+Tagged Corpora
+==============
+
+In addition to the plaintext corpora, NLTK's data package also
+contains a wide variety of annotated corpora. For example, the Brown
+Corpus is annotated with part-of-speech tags, and defines additional
+methods ``tagged_*()`` which return words as `(word,tag)` tuples, rather
+than just bare word strings.
+
+    >>> from nltk.corpus import brown
+    >>> print(brown.words())
+    ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
+    >>> print(brown.tagged_words())
+    [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
+    >>> print(brown.sents())
+    [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...]
+    >>> print(brown.tagged_sents())
+    [[('The', 'AT'), ('Fulton', 'NP-TL')...],
+    [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...]
+    >>> print(brown.paras(categories='reviews'))
+    [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...],
+    ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]],
+    [['There', 'was', 'about', 'that', 'song', 'something', ...],
+    ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...]
+    >>> print(brown.tagged_paras(categories='reviews'))
+    [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...],
+    [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]],
+    [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...],
+    [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...]
+
+Similarly, the Indian Language POS-Tagged Corpus includes samples of
+Indian text annotated with part-of-speech tags:
+
+    >>> from nltk.corpus import indian
+    >>> print(indian.words()) # doctest: +SKIP
+    ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...',
+    '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...]
+    >>> print(indian.tagged_words()) # doctest: +SKIP
+    [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'),
+    ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...]
+
+Several tagged corpora support access to a simplified, universal tagset,
+e.g. where all noun tags are collapsed to a single category ``NOUN``:
+
+    >>> print(brown.tagged_sents(tagset='universal'))
+    [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...],
+    [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...]
+    >>> from nltk.corpus import conll2000, switchboard
+    >>> print(conll2000.tagged_words(tagset='universal'))
+    [('Confidence', 'NOUN'), ('in', 'ADP'), ...]
+
+Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora.
+
+Chunked Corpora
+===============
+
+The CoNLL corpora also provide chunk structures, which are encoded as
+flat trees. The CoNLL 2000 Corpus includes phrasal chunks, and the
+CoNLL 2002 Corpus includes named entity chunks.
+
+    >>> from nltk.corpus import conll2000, conll2002
+    >>> print(conll2000.sents())
+    [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...],
+    ['Chancellor', 'of', 'the', 'Exchequer', ...], ...]
+    >>> for tree in conll2000.chunked_sents()[:2]:
+    ...     print(tree)
+    (S
+      (NP Confidence/NN)
+      (PP in/IN)
+      (NP the/DT pound/NN)
+      (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
+      (NP another/DT sharp/JJ dive/NN)
+      if/IN
+      ...)
+    (S
+      Chancellor/NNP
+      (PP of/IN)
+      (NP the/DT Exchequer/NNP)
+      ...)
+    >>> print(conll2002.sents())
+    [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...]
+    >>> for tree in conll2002.chunked_sents()[:2]:
+    ...     print(tree)
+    (S
+      (LOC Sao/NC Paulo/VMI)
+      (/Fpa
+      (LOC Brasil/NC)
+      )/Fpt
+      ...)
+    (S -/Fg)
+
+.. note:: Since the CONLL corpora do not contain paragraph break
+   information, these readers do not support the ``paras()`` method.
+
+.. warning:: If you call the conll corpora reader methods without any
+   arguments, they will return the contents of the entire corpus,
+   *including* the 'test' portions of the corpus.
+
+SemCor is a subset of the Brown corpus tagged with WordNet senses and
+named entities. Both kinds of lexical items include multiword units,
+which are encoded as chunks (senses and part-of-speech tags pertain
+to the entire chunk).
+
+    >>> from nltk.corpus import semcor
+    >>> semcor.words()
+    ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
+    >>> semcor.chunks()
+    [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...]
+    >>> semcor.sents()
+    [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...],
+    ['The', 'jury', 'further', 'said', ...], ...]
+    >>> semcor.chunk_sents()
+    [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ...
+    ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...]
+    >>> list(map(str, semcor.tagged_chunks(tag='both')[:3]))
+    ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"]
+    >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]
+    [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ...
+    '(None .)'], ['(DT The)', ... '(None .)']]
+
+
+The IEER corpus is another chunked corpus. This corpus is unusual in
+that each corpus item contains multiple documents. (This reflects the
+fact that each corpus file contains multiple documents.) The IEER
+corpus defines the `parsed_docs` method, which returns the documents
+in a given item as `IEERDocument` objects:
+
+    >>> from nltk.corpus import ieer
+    >>> ieer.fileids()
+    ['APW_19980314', 'APW_19980424', 'APW_19980429',
+    'NYT_19980315', 'NYT_19980403', 'NYT_19980407']
+    >>> docs = ieer.parsed_docs('APW_19980314')
+    >>> print(docs[0])
+
+    >>> print(docs[0].docno)
+    APW19980314.0391
+    >>> print(docs[0].doctype)
+    NEWS STORY
+    >>> print(docs[0].date_time)
+    03/14/1998 10:36:00
+    >>> print(docs[0].headline)
+    (DOCUMENT Kenyans protest tax hikes)
+    >>> print(docs[0].text)
+    (DOCUMENT
+      (LOCATION NAIROBI)
+      ,
+      (LOCATION Kenya)
+      (
+      (ORGANIZATION AP)
+      )
+      _
+      (CARDINAL Thousands)
+      of
+      laborers,
+      ...
+      on
+      (DATE Saturday)
+      ...)
+
+Parsed Corpora
+==============
+
+The Treebank corpora provide a syntactic parse for each sentence. The
+NLTK data package includes a 10% sample of the Penn Treebank (in
+``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``).
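+
+Parsed sentences are returned as ``nltk.Tree`` objects, so the standard
+``Tree`` methods, such as ``label()`` and ``leaves()``, apply to them
+directly. A brief sketch (marked skipped here; the values shown are what
+the first sentence of the treebank sample would produce):
+
+    >>> t = nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0] # doctest: +SKIP
+    >>> t.label() # doctest: +SKIP
+    'S'
+    >>> t.leaves()[:5] # doctest: +SKIP
+    ['Pierre', 'Vinken', ',', '61', 'years']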
+ +Reading the Penn Treebank (Wall Street Journal sample): + + >>> from nltk.corpus import treebank + >>> print(treebank.fileids()) + ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] + >>> print(treebank.words('wsj_0003.mrg')) + ['A', 'form', 'of', 'asbestos', 'once', 'used', ...] + >>> print(treebank.tagged_words('wsj_0003.mrg')) + [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] + >>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) + (S + (S-TPC-1 + (NP-SBJ + (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) + (RRC ...)...)...) + ... + (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) + (. .)) + +If you have access to a full installation of the Penn Treebank, NLTK +can be configured to load it as well. Download the ``ptb`` package, +and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN`` +and ``WSJ`` directories of the Treebank installation (symlinks work +as well). Then use the ``ptb`` module instead of ``treebank``: + + >>> from nltk.corpus import ptb + >>> print(ptb.fileids()) # doctest: +SKIP + ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...] + >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP + ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...] + >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP + [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] + +...and so forth, like ``treebank`` but with extended fileids. Categories +specified in ``allcats.txt`` can be used to filter by genre; they consist +of ``news`` (for WSJ articles) and names of the Brown subcategories +(``fiction``, ``humor``, ``romance``, etc.): + + >>> ptb.categories() # doctest: +SKIP + ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] + >>> print(ptb.fileids('news')) # doctest: +SKIP + ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...] + >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP + ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...] + +As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank, +the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access +to a full PTB installation. + +Reading the Sinica Treebank: + + >>> from nltk.corpus import sinica_treebank + >>> print(sinica_treebank.sents()) # doctest: +SKIP + [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...] + >>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP + Tree('S', + [Tree('NP', + [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]), + Tree('V\xe2\x80\xa7\xe5\x9c\xb0', + [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']), + Tree('DE', ['\xe7\x9a\x84'])]), + Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])]) + +Reading the CoNLL 2007 Dependency Treebanks: + + >>> from nltk.corpus import conll2007 + >>> conll2007.sents('esp.train')[0] # doctest: +SKIP + ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...] + >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP + + >>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP + (fortaleció + (aumento El (del (índice (de (desempleo estadounidense))))) + hoy + considerablemente + (al + (euro + (cotizaba + , + que + (a (15.35 las GMT)) + se + (en (mercado el (de divisas) (de Fráncfort))) + (a 0,9452_dólares) + (frente_a , (0,9349_dólares los (de (mañana esta))))))) + .) + +Word Lists and Lexicons +======================= + +The NLTK data package also includes a number of lexicons and word +lists. 
These are accessed just like text corpora. The following +examples illustrate the use of the wordlist corpora: + + >>> from nltk.corpus import names, stopwords, words + >>> words.fileids() + ['en', 'en-basic'] + >>> words.words('en') + ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...] + + >>> stopwords.fileids() # doctest: +SKIP + ['arabic', 'azerbaijani', 'bengali', 'danish', 'dutch', 'english', 'finnish', 'french', ...] + >>> sorted(stopwords.words('portuguese')) + ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...] + >>> names.fileids() + ['female.txt', 'male.txt'] + >>> names.words('male.txt') + ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...] + >>> names.words('female.txt') + ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...] + +The CMU Pronunciation Dictionary corpus contains pronunciation +transcriptions for over 100,000 words. It can be accessed as a list +of entries (where each entry consists of a word, an identifier, and a +transcription) or as a dictionary from words to lists of +transcriptions. Transcriptions are encoded as tuples of phoneme +strings. + + >>> from nltk.corpus import cmudict + >>> print(cmudict.entries()[653:659]) + [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']), + ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']), + ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']), + ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']), + ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']), + ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])] + >>> # Load the entire cmudict corpus into a Python dictionary: + >>> transcr = cmudict.dict() + >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) + [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], + ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'], + ['T', 'UW1', 'L'], + ['K', 'IH1', 'T']] + + +WordNet +======= + +Please see the separate WordNet howto. + +FrameNet +======== + +Please see the separate FrameNet howto. + +PropBank +======== + +Please see the separate PropBank howto. + +SentiWordNet +============ + +Please see the separate SentiWordNet howto. + +Categorized Corpora +=================== + +Several corpora included with NLTK contain documents that have been categorized for +topic, genre, polarity, etc. In addition to the standard corpus interface, these +corpora provide access to the list of categories and the mapping between the documents +and their categories (in both directions). Access the categories using the ``categories()`` +method, e.g.: + + >>> from nltk.corpus import brown, movie_reviews, reuters + >>> brown.categories() + ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', + 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] + >>> movie_reviews.categories() + ['neg', 'pos'] + >>> reuters.categories() + ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', + 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', + 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...] 
+ +This method has an optional argument that specifies a document or a list +of documents, allowing us to map from (one or more) documents to (one or more) categories: + + >>> brown.categories('ca01') + ['news'] + >>> brown.categories(['ca01','cb01']) + ['editorial', 'news'] + >>> reuters.categories('training/9865') + ['barley', 'corn', 'grain', 'wheat'] + >>> reuters.categories(['training/9865', 'training/9880']) + ['barley', 'corn', 'grain', 'money-fx', 'wheat'] + +We can go back the other way using the optional argument of the ``fileids()`` method: + + >>> reuters.fileids('barley') + ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...] + +Both the ``categories()`` and ``fileids()`` methods return a sorted list containing +no duplicates. + +In addition to mapping between categories and documents, these corpora permit +direct access to their contents via the categories. Instead of accessing a subset +of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.: + + >>> brown.tagged_words(categories='news') + [('The', 'AT'), ('Fulton', 'NP-TL'), ...] + >>> brown.sents(categories=['editorial','reviews']) + [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', + 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', + 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', + 'the', 'day', 'it', 'convened', '.'], ...] + +Note that it is an error to specify both documents and categories. + +In the context of a text categorization system, we can easily test if the +category assigned to a document is correct as follows: + + >>> def classify(doc): return 'news' # Trivial classifier + >>> doc = 'ca01' + >>> classify(doc) in brown.categories(doc) + True + + +Other Corpora +============= + +comparative_sentences +--------------------- +A list of sentences from various sources, especially reviews and articles. Each +line contains one sentence; sentences were separated by using a sentence tokenizer. +Comparative sentences have been annotated with their type, entities, features and +keywords. + + >>> from nltk.corpus import comparative_sentences + >>> comparison = comparative_sentences.comparisons()[0] + >>> comparison.text + ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', + 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", + 'had', '.'] + >>> comparison.entity_2 + 'models' + >>> (comparison.feature, comparison.keyword) + ('rewind', 'more') + >>> len(comparative_sentences.comparisons()) + 853 + +opinion_lexicon +--------------- +A list of positive and negative opinion words or sentiment words for English. 
+ + >>> from nltk.corpus import opinion_lexicon + >>> opinion_lexicon.words()[:4] + ['2-faced', '2-faces', 'abnormal', 'abolish'] + +The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative +words: + + >>> opinion_lexicon.negative()[:4] + ['2-faced', '2-faces', 'abnormal', 'abolish'] + +Note that words from `words()` method in opinion_lexicon are sorted by file id, +not alphabetically: + + >>> opinion_lexicon.words()[0:10] + ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', + 'abominate', 'abomination', 'abort', 'aborted'] + >>> sorted(opinion_lexicon.words())[0:10] + ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', + 'abominate', 'abomination', 'abort'] + +ppattach +-------- +The Prepositional Phrase Attachment corpus is a corpus of +prepositional phrase attachment decisions. Each instance in the +corpus is encoded as a ``PPAttachment`` object: + + >>> from nltk.corpus import ppattach + >>> ppattach.attachments('training') + [PPAttachment(sent='0', verb='join', noun1='board', + prep='as', noun2='director', attachment='V'), + PPAttachment(sent='1', verb='is', noun1='chairman', + prep='of', noun2='N.V.', attachment='N'), + ...] + >>> inst = ppattach.attachments('training')[0] + >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2) + ('0', 'join', 'board', 'as', 'director') + >>> inst.attachment + 'V' + +product_reviews_1 and product_reviews_2 +--------------------------------------- +These two datasets respectively contain annotated customer reviews of 5 and 9 +products from amazon.com. + + >>> from nltk.corpus import product_reviews_1 + >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') + >>> review = camera_reviews[0] + >>> review.sents()[0] + ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', + 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] + >>> review.features() + [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), + ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), + ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), + ('option', '+1')] + +It is also possible to reach the same information directly from the stream: + + >>> product_reviews_1.features('Canon_G3.txt') + [('canon powershot g3', '+3'), ('use', '+2'), ...] + +We can compute stats for specific product features: + + >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) + >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) + >>> mean = tot / n_reviews + >>> print(n_reviews, tot, mean) + 15 24 1.6 + +pros_cons +--------- +A list of pros/cons sentences for determining context (aspect) dependent +sentiment words, which are then applied to sentiment analysis of comparative +sentences. + + >>> from nltk.corpus import pros_cons + >>> pros_cons.sents(categories='Cons') + [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', + 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], + ...] + >>> pros_cons.words('IntegratedPros.txt') + ['Easy', 'to', 'use', ',', 'economical', '!', ...] + +semcor +------ +The Brown Corpus, annotated with WordNet senses. + + >>> from nltk.corpus import semcor + >>> semcor.words('brown2/tagfiles/br-n12.xml') + ['When', 'several', 'minutes', 'had', 'passed', ...] + +senseval +-------- +The Senseval 2 corpus is a word sense disambiguation corpus. 
Each +item in the corpus corresponds to a single ambiguous word. For each +of these words, the corpus contains a list of instances, corresponding +to occurrences of that word. Each instance provides the word; a list +of word senses that apply to the word occurrence; and the word's +context. + + >>> from nltk.corpus import senseval + >>> senseval.fileids() + ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos'] + >>> senseval.instances('hard.pos') + ... + [SensevalInstance(word='hard-a', + position=20, + context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...], + senses=('HARD1',)), + SensevalInstance(word='hard-a', + position=10, + context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...], + senses=('HARD1',)), ...] + +The following code looks at instances of the word 'interest', and +displays their local context (2 words on each side) and word sense(s): + + >>> for inst in senseval.instances('interest.pos')[:10]: + ... p = inst.position + ... left = ' '.join(w for (w,t) in inst.context[p-2:p]) + ... word = ' '.join(w for (w,t) in inst.context[p:p+1]) + ... right = ' '.join(w for (w,t) in inst.context[p+1:p+3]) + ... senses = ' '.join(inst.senses) + ... print('%20s |%10s | %-15s -> %s' % (left, word, right, senses)) + declines in | interest | rates . -> interest_6 + indicate declining | interest | rates because -> interest_6 + in short-term | interest | rates . -> interest_6 + 4 % | interest | in this -> interest_5 + company with | interests | in the -> interest_5 + , plus | interest | . -> interest_6 + set the | interest | rate on -> interest_6 + 's own | interest | , prompted -> interest_4 + principal and | interest | is the -> interest_6 + increase its | interest | to 70 -> interest_5 + +sentence_polarity +----------------- +The Sentence Polarity dataset contains 5331 positive and 5331 negative processed +sentences. + + >>> from nltk.corpus import sentence_polarity + >>> sentence_polarity.sents() + [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', + 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', + 'it', 'funny', '.'], ...] + >>> sentence_polarity.categories() + ['neg', 'pos'] + >>> sentence_polarity.sents()[1] + ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', + 'could', 'possibly', 'find', 'it', 'funny', '.'] + +shakespeare +----------- +The Shakespeare corpus contains a set of Shakespeare plays, formatted +as XML files. These corpora are returned as ElementTree objects: + + >>> from nltk.corpus import shakespeare + >>> from xml.etree import ElementTree + >>> shakespeare.fileids() + ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...] + >>> play = shakespeare.xml('dream.xml') + >>> print(play) + + >>> print('%s: %s' % (play[0].tag, play[0].text)) + TITLE: A Midsummer Night's Dream + >>> personae = [persona.text for persona in + ... play.findall('PERSONAE/PERSONA')] + >>> print(personae) + ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...] + >>> # Find and print speakers not listed as personae + >>> names = [persona.split(',')[0] for persona in personae] + >>> speakers = set(speaker.text for speaker in + ... 
play.findall('*/*/*/SPEAKER')) + >>> print(sorted(speakers.difference(names))) + ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER', + 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM', + 'Prologue', 'Pyramus', 'Thisbe', 'Wall'] + +subjectivity +------------ +The Subjectivity Dataset contains 5000 subjective and 5000 objective processed +sentences. + + >>> from nltk.corpus import subjectivity + >>> subjectivity.categories() + ['obj', 'subj'] + >>> subjectivity.sents()[23] + ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', + 'happened', 'off', 'screen', '.'] + >>> subjectivity.words(categories='subj') + ['smart', 'and', 'alert', ',', 'thirteen', ...] + +toolbox +------- +The Toolbox corpus distributed with NLTK contains a sample lexicon and +several sample texts from the Rotokas language. The Toolbox corpus +reader returns Toolbox files as XML ElementTree objects. The +following example loads the Rotokas dictionary, and figures out the +distribution of part-of-speech tags for reduplicated words. + +.. doctest: +SKIP + + >>> from nltk.corpus import toolbox + >>> from nltk.probability import FreqDist + >>> from xml.etree import ElementTree + >>> import re + >>> rotokas = toolbox.xml('rotokas.dic') + >>> redup_pos_freqdist = FreqDist() + >>> # Note: we skip over the first record, which is actually + >>> # the header. + >>> for record in rotokas[1:]: + ... lexeme = record.find('lx').text + ... if re.match(r'(.*)\1$', lexeme): + ... redup_pos_freqdist[record.find('ps').text] += 1 + >>> for item, count in redup_pos_freqdist.most_common(): + ... print(item, count) + V 41 + N 14 + ??? 4 + +This example displays some records from a Rotokas text: + +.. doctest: +SKIP + + >>> river = toolbox.xml('rotokas/river.txt', key='ref') + >>> for record in river.findall('record')[:3]: + ... for piece in record: + ... if len(piece.text) > 60: + ... print('%-6s %s...' % (piece.tag, piece.text[:57])) + ... else: + ... print('%-6s %s' % (piece.tag, piece.text)) + ref Paragraph 1 + t ``Viapau oisio ra ovaupasi ... + m viapau oisio ra ovau -pa -si ... + g NEG this way/like this and forget -PROG -2/3.DL... + p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V... + f ``No ken lus tingting wanema samting papa i bin tok,'' Na... + fe ``Don't forget what Dad said,'' yelled Naomi. + ref 2 + t Osa Ira ora Reviti viapau uvupasiva. + m osa Ira ora Reviti viapau uvu -pa -si ... + g as/like name and name NEG hear/smell -PROG -2/3... + p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF... + f Tasol Ila na David no bin harim toktok. + fe But Ila and David took no notice. + ref 3 + t Ikaupaoro rokosiva ... + m ikau -pa -oro roko -si -va ... + g run/hurry -PROG -SIM go down -2/3.DL.M -RP ... + p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT.... + f Tupela i bin hariap i go long wara . + fe They raced to the river. + +timit +----- +The NLTK data package includes a fragment of the TIMIT +Acoustic-Phonetic Continuous Speech Corpus. This corpus is broken +down into small speech samples, each of which is available as a wave +file, a phonetic transcription, and a tokenized word list. + + >>> from nltk.corpus import timit + >>> print(timit.utteranceids()) + ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', + 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116', + 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...] 
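+    >>> # Each utterance id combines a dialect region ('dr1'), a speaker
+    >>> # ('fvmh0'), and a sentence id ('sa1'); cf. spkrid() and sentid() below.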
+
+    >>> item = timit.utteranceids()[5]
+    >>> print(timit.phones(item))
+    ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax',
+    's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax',
+    'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl',
+    'd', 'h#']
+    >>> print(timit.words(item))
+    ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand']
+    >>> timit.play(item) # doctest: +SKIP
+
+The corpus reader can combine the word segmentation information with
+the phonemes to produce a single tree structure:
+
+    >>> for tree in timit.phone_trees(item):
+    ...     print(tree)
+    (S
+      h#
+      (clasp k l ae s pcl p)
+      (the dh ax)
+      (screw s kcl k r ux)
+      (in ix nx)
+      (your y ax)
+      (left l eh f tcl t)
+      (hand hh ae n dcl d)
+      h#)
+
+The start time and stop time of each phoneme, word, and sentence are
+also available:
+
+    >>> print(timit.phone_times(item))
+    [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...]
+    >>> print(timit.word_times(item))
+    [('clasp', 2190, 8804), ('the', 8804, 9734), ...]
+    >>> print(timit.sent_times(item))
+    [('Clasp the screw in your left hand.', 0, 32154)]
+
+We can use these times to play selected pieces of a speech sample:
+
+    >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP
+
+The corpus reader can also be queried for information about the
+speaker and sentence identifier for a given speech sample:
+
+    >>> print(timit.spkrid(item))
+    dr1-fvmh0
+    >>> print(timit.sentid(item))
+    sx116
+    >>> print(timit.spkrinfo(timit.spkrid(item)))
+    SpeakerInfo(id='VMH0',
+                sex='F',
+                dr='1',
+                use='TRN',
+                recdate='03/11/86',
+                birthdate='01/08/60',
+                ht='5\'05"',
+                race='WHT',
+                edu='BS',
+                comments='BEST NEW ENGLAND ACCENT SO FAR')
+
+    >>> # List the speech samples from the same speaker:
+    >>> timit.utteranceids(spkrid=timit.spkrid(item))
+    ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
+
+twitter_samples
+---------------
+
+Twitter is a well-known microblog service that allows public data to be
+collected via APIs. NLTK's twitter corpus currently contains a sample of
+20k Tweets retrieved from the Twitter Streaming API.
+
+    >>> from nltk.corpus import twitter_samples
+    >>> twitter_samples.fileids()
+    ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
+
+We follow standard practice in storing full Tweets as line-separated
+JSON. These data structures can be accessed via the `docs()` method.
+However, in general it is more practical to focus just on the text field
+of the Tweets, which is accessed via the `strings()` method.
+
+    >>> twitter_samples.strings('tweets.20150430-223406.json')[:5]
+    ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...]
+
+The default tokenizer for Tweets is specialised for 'casual' text, and
+the `tokenized()` method returns a list of lists of tokens.
+
+    >>> twitter_samples.tokenized('tweets.20150430-223406.json')[:5]
+    [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...],
+    ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...]
+
+rte
+---
+The RTE (Recognizing Textual Entailment) corpus was derived from the
+RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a
+list of XML-formatted 'text'/'hypothesis' pairs.
+
+    >>> from nltk.corpus import rte
+    >>> print(rte.fileids())
+    ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml']
+    >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
+    >>> print(rtepairs)
+    [, , , ...]
+ +In the gold standard test sets, each pair is labeled according to +whether or not the text 'entails' the hypothesis; the +entailment value is mapped to an integer 1 (True) or 0 (False). + + >>> rtepairs[5] + + >>> rtepairs[5].text + 'His wife Strida won a seat in parliament after forging an alliance + with the main anti-Syrian coalition in the recent election.' + >>> rtepairs[5].hyp + 'Strida elected to parliament.' + >>> rtepairs[5].value + 1 + +The RTE corpus also supports an ``xml()`` method which produces ElementTrees. + + >>> xmltree = rte.xml('rte3_dev.xml') + >>> xmltree # doctest: +SKIP + + >>> xmltree[7].findtext('t') + "Mrs. Bush's approval ratings have remained very high, above 80%, + even as her husband's have recently dropped below 50%." + +verbnet +------- +The VerbNet corpus is a lexicon that divides verbs into classes, based +on their syntax-semantics linking behavior. The basic elements in the +lexicon are verb lemmas, such as 'abandon' and 'accept', and verb +classes, which have identifiers such as 'remove-10.1' and +'admire-31.2-1'. These class identifiers consist of a representative +verb selected from the class, followed by a numerical identifier. The +list of verb lemmas, and the list of class identifiers, can be +retrieved with the following methods: + + >>> from nltk.corpus import verbnet + >>> verbnet.lemmas()[20:25] + ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue'] + >>> verbnet.classids()[:5] + ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93'] + +The `classids()` method may also be used to retrieve the classes that +a given lemma belongs to: + + >>> verbnet.classids('accept') + ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'] + +The `classids()` method may additionally be used to retrieve all classes +within verbnet if nothing is passed: + + >>> verbnet.classids() + ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assuming_position-50', 'avoid-52', 'banish-10.2', 'battle-36.4', 'battle-36.4-1', 'begin-55.1', 'begin-55.1-1', 'being_dressed-41.3.3', 'bend-45.2', 'berry-13.7', 'bill-54.5', 'body_internal_motion-49', 'body_internal_states-40.6', 'braid-41.2.2', 'break-45.1', 'breathe-40.1.2', 'breathe-40.1.2-1', 'bring-11.3', 'bring-11.3-1', 'build-26.1', 'build-26.1-1', 'bulge-47.5.3', 'bump-18.4', 'bump-18.4-1', 'butter-9.9', 'calibratable_cos-45.6', 'calibratable_cos-45.6-1', 'calve-28', 'captain-29.8', 'captain-29.8-1', 'captain-29.8-1-1', 'care-88', 'care-88-1', 'carry-11.4', 'carry-11.4-1', 'carry-11.4-1-1', 'carve-21.2', 'carve-21.2-1', 'carve-21.2-2', 'change_bodily_state-40.8.4', 'characterize-29.2', 'characterize-29.2-1', 'characterize-29.2-1-1', 'characterize-29.2-1-2', 'chase-51.6', 'cheat-10.6', 'cheat-10.6-1', 'cheat-10.6-1-1', 'chew-39.2', 'chew-39.2-1', 'chew-39.2-2', 'chit_chat-37.6', 'clear-10.3', 'clear-10.3-1', 'cling-22.5', 'coil-9.6', 'coil-9.6-1', 'coloring-24', 'complain-37.8', 'complete-55.2', 'concealment-16', 'concealment-16-1', 'confess-37.10', 'confine-92', 'confine-92-1', 'conjecture-29.5', 'conjecture-29.5-1', 'conjecture-29.5-2', 'consider-29.9', 
'consider-29.9-1', 'consider-29.9-1-1', 'consider-29.9-1-1-1', 'consider-29.9-2', 'conspire-71', 'consume-66', 'consume-66-1', 'contiguous_location-47.8', 'contiguous_location-47.8-1', 'contiguous_location-47.8-2', 'continue-55.3', 'contribute-13.2', 'contribute-13.2-1', 'contribute-13.2-1-1', 'contribute-13.2-1-1-1', 'contribute-13.2-2', 'contribute-13.2-2-1', 'convert-26.6.2', 'convert-26.6.2-1', 'cooking-45.3', 'cooperate-73', 'cooperate-73-1', 'cooperate-73-2', 'cooperate-73-3', 'cope-83', 'cope-83-1', 'cope-83-1-1', 'correlate-86', 'correspond-36.1', 'correspond-36.1-1', 'correspond-36.1-1-1', 'cost-54.2', 'crane-40.3.2', 'create-26.4', 'create-26.4-1', 'curtsey-40.3.3', 'cut-21.1', 'cut-21.1-1', 'debone-10.8', 'declare-29.4', 'declare-29.4-1', 'declare-29.4-1-1', 'declare-29.4-1-1-1', 'declare-29.4-1-1-2', 'declare-29.4-1-1-3', 'declare-29.4-2', 'dedicate-79', 'defend-85', 'destroy-44', 'devour-39.4', 'devour-39.4-1', 'devour-39.4-2', 'differ-23.4', 'dine-39.5', 'disappearance-48.2', 'disassemble-23.3', 'discover-84', 'discover-84-1', 'discover-84-1-1', 'dress-41.1.1', 'dressing_well-41.3.2', 'drive-11.5', 'drive-11.5-1', 'dub-29.3', 'dub-29.3-1', 'eat-39.1', 'eat-39.1-1', 'eat-39.1-2', 'enforce-63', 'engender-27', 'entity_specific_cos-45.5', 'entity_specific_modes_being-47.2', 'equip-13.4.2', 'equip-13.4.2-1', 'equip-13.4.2-1-1', 'escape-51.1', 'escape-51.1-1', 'escape-51.1-2', 'escape-51.1-2-1', 'exceed-90', 'exchange-13.6', 'exchange-13.6-1', 'exchange-13.6-1-1', 'exhale-40.1.3', 'exhale-40.1.3-1', 'exhale-40.1.3-2', 'exist-47.1', 'exist-47.1-1', 'exist-47.1-1-1', 'feeding-39.7', 'ferret-35.6', 'fill-9.8', 'fill-9.8-1', 'fit-54.3', 'flinch-40.5', 'floss-41.2.1', 'focus-87', 'forbid-67', 'force-59', 'force-59-1', 'free-80', 'free-80-1', 'fulfilling-13.4.1', 'fulfilling-13.4.1-1', 'fulfilling-13.4.1-2', 'funnel-9.3', 'funnel-9.3-1', 'funnel-9.3-2', 'funnel-9.3-2-1', 'future_having-13.3', 'get-13.5.1', 'get-13.5.1-1', 'give-13.1', 'give-13.1-1', 'gobble-39.3', 'gobble-39.3-1', 'gobble-39.3-2', 'gorge-39.6', 'groom-41.1.2', 'grow-26.2', 'help-72', 'help-72-1', 'herd-47.5.2', 'hiccup-40.1.1', 'hit-18.1', 'hit-18.1-1', 'hold-15.1', 'hold-15.1-1', 'hunt-35.1', 'hurt-40.8.3', 'hurt-40.8.3-1', 'hurt-40.8.3-1-1', 'hurt-40.8.3-2', 'illustrate-25.3', 'image_impression-25.1', 'indicate-78', 'indicate-78-1', 'indicate-78-1-1', 'inquire-37.1.2', 'instr_communication-37.4', 'investigate-35.4', 'judgement-33', 'keep-15.2', 'knead-26.5', 'learn-14', 'learn-14-1', 'learn-14-2', 'learn-14-2-1', 'leave-51.2', 'leave-51.2-1', 'lecture-37.11', 'lecture-37.11-1', 'lecture-37.11-1-1', 'lecture-37.11-2', 'light_emission-43.1', 'limit-76', 'linger-53.1', 'linger-53.1-1', 'lodge-46', 'long-32.2', 'long-32.2-1', 'long-32.2-2', 'manner_speaking-37.3', 'marry-36.2', 'marvel-31.3', 'marvel-31.3-1', 'marvel-31.3-2', 'marvel-31.3-3', 'marvel-31.3-4', 'marvel-31.3-5', 'marvel-31.3-6', 'marvel-31.3-7', 'marvel-31.3-8', 'marvel-31.3-9', 'masquerade-29.6', 'masquerade-29.6-1', 'masquerade-29.6-2', 'matter-91', 'meander-47.7', 'meet-36.3', 'meet-36.3-1', 'meet-36.3-2', 'mine-10.9', 'mix-22.1', 'mix-22.1-1', 'mix-22.1-1-1', 'mix-22.1-2', 'mix-22.1-2-1', 'modes_of_being_with_motion-47.3', 'murder-42.1', 'murder-42.1-1', 'neglect-75', 'neglect-75-1', 'neglect-75-1-1', 'neglect-75-2', 'nonvehicle-51.4.2', 'nonverbal_expression-40.2', 'obtain-13.5.2', 'obtain-13.5.2-1', 'occurrence-48.3', 'order-60', 'order-60-1', 'orphan-29.7', 'other_cos-45.4', 'pain-40.8.1', 'pay-68', 'peer-30.3', 'pelt-17.2', 'performance-26.7', 
'performance-26.7-1', 'performance-26.7-1-1', 'performance-26.7-2', 'performance-26.7-2-1', 'pit-10.7', 'pocket-9.10', 'pocket-9.10-1', 'poison-42.2', 'poke-19', 'pour-9.5', 'preparing-26.3', 'preparing-26.3-1', 'preparing-26.3-2', 'price-54.4', 'push-12', 'push-12-1', 'push-12-1-1', 'put-9.1', 'put-9.1-1', 'put-9.1-2', 'put_direction-9.4', 'put_spatial-9.2', 'put_spatial-9.2-1', 'reach-51.8', 'reflexive_appearance-48.1.2', 'refrain-69', 'register-54.1', 'rely-70', 'remove-10.1', 'risk-94', 'risk-94-1', 'roll-51.3.1', 'rummage-35.5', 'run-51.3.2', 'rush-53.2', 'say-37.7', 'say-37.7-1', 'say-37.7-1-1', 'say-37.7-2', 'scribble-25.2', 'search-35.2', 'see-30.1', 'see-30.1-1', 'see-30.1-1-1', 'send-11.1', 'send-11.1-1', 'separate-23.1', 'separate-23.1-1', 'separate-23.1-2', 'settle-89', 'shake-22.3', 'shake-22.3-1', 'shake-22.3-1-1', 'shake-22.3-2', 'shake-22.3-2-1', 'sight-30.2', 'simple_dressing-41.3.1', 'slide-11.2', 'slide-11.2-1-1', 'smell_emission-43.3', 'snooze-40.4', 'sound_emission-43.2', 'sound_existence-47.4', 'spank-18.3', 'spatial_configuration-47.6', 'split-23.2', 'spray-9.7', 'spray-9.7-1', 'spray-9.7-1-1', 'spray-9.7-2', 'stalk-35.3', 'steal-10.5', 'stimulus_subject-30.4', 'stop-55.4', 'stop-55.4-1', 'substance_emission-43.4', 'succeed-74', 'succeed-74-1', 'succeed-74-1-1', 'succeed-74-2', 'suffocate-40.7', 'suspect-81', 'swarm-47.5.1', 'swarm-47.5.1-1', 'swarm-47.5.1-2', 'swarm-47.5.1-2-1', 'swat-18.2', 'talk-37.5', 'tape-22.4', 'tape-22.4-1', 'tell-37.2', 'throw-17.1', 'throw-17.1-1', 'throw-17.1-1-1', 'tingle-40.8.2', 'touch-20', 'touch-20-1', 'transcribe-25.4', 'transfer_mesg-37.1.1', 'transfer_mesg-37.1.1-1', 'transfer_mesg-37.1.1-1-1', 'try-61', 'turn-26.6.1', 'turn-26.6.1-1', 'urge-58', 'vehicle-51.4.1', 'vehicle-51.4.1-1', 'waltz-51.5', 'want-32.1', 'want-32.1-1', 'want-32.1-1-1', 'weather-57', 'weekend-56', 'wink-40.3.1', 'wink-40.3.1-1', 'wipe_instr-10.4.2', 'wipe_instr-10.4.2-1', 'wipe_manner-10.4.1', 'wipe_manner-10.4.1-1', 'wish-62', 'withdraw-82', 'withdraw-82-1', 'withdraw-82-2', 'withdraw-82-3'] + +The primary object in the lexicon is a class record, which is stored +as an ElementTree xml object. The class record for a given class +identifier is returned by the `vnclass()` method: + + >>> verbnet.vnclass('remove-10.1') + + +The `vnclass()` method also accepts "short" identifiers, such as '10.1': + + >>> verbnet.vnclass('10.1') + + +See the Verbnet documentation, or the Verbnet files, for information +about the structure of this xml. As an example, we can retrieve a +list of thematic roles for a given Verbnet class: + + >>> vn_31_2 = verbnet.vnclass('admire-31.2') + >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'): + ... print(themrole.attrib['type'], end=' ') + ... for selrestr in themrole.findall('SELRESTRS/SELRESTR'): + ... print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ') + ... print() + Theme + Experiencer [+animate] + Predicate + +The Verbnet corpus also provides a variety of pretty printing +functions that can be used to display the xml contents in a more +concise form. The simplest such method is `pprint()`: + + >>> print(verbnet.pprint('57')) + weather-57 + Subclasses: (none) + Members: blow clear drizzle fog freeze gust hail howl lightning mist + mizzle pelt pour precipitate rain roar shower sleet snow spit spot + sprinkle storm swelter teem thaw thunder + Thematic roles: + * Theme[+concrete +force] + Frames: + Intransitive (Expletive Subject) + Example: It's raining. 
+ Syntax: LEX[it] LEX[[+be]] VERB + Semantics: + * weather(during(E), Weather_type, ?Theme) + NP (Expletive Subject, Theme Object) + Example: It's raining cats and dogs. + Syntax: LEX[it] LEX[[+be]] VERB NP[Theme] + Semantics: + * weather(during(E), Weather_type, Theme) + PP (Expletive Subject, Theme-PP) + Example: It was pelting with rain. + Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme] + Semantics: + * weather(during(E), Weather_type, Theme) + +Verbnet gives us frames that link the syntax and semantics using an example. +These frames are part of the corpus and we can use `frames()` to get a frame +for a given verbnet class. + + >>> frame = verbnet.frames('57') + >>> frame == [{'example': "It's raining.", 'description': {'primary': 'Intransitive', 'secondary': 'Expletive Subject'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': '?Theme'}], 'negated': False}]}, {'example': "It's raining cats and dogs.", 'description': {'primary': 'NP', 'secondary': 'Expletive Subject, Theme Object'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': 'Theme'}], 'negated': False}]}, {'example': 'It was pelting with rain.', 'description': {'primary': 'PP', 'secondary': 'Expletive Subject, Theme-PP'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'PREP', 'modifiers': {'value': 'with', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': 'Theme'}], 'negated': False}]}] + True + +Verbnet corpus lets us access thematic roles individually using `themroles()`. + + >>> themroles = verbnet.themroles('57') + >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}] + True + +Verbnet classes may also have subclasses sharing similar syntactic and semantic properties +while having differences with the superclass. The Verbnet corpus allows us to access these +subclasses using `subclasses()`. + + >>> print(verbnet.subclasses('9.1')) #Testing for 9.1 since '57' does not have subclasses + ['put-9.1-1', 'put-9.1-2'] + + +nps_chat +-------- + +The NPS Chat Corpus, Release 1.0 consists of over 10,000 posts in age-specific +chat rooms, which have been anonymized, POS-tagged and dialogue-act tagged. + + >>> print(nltk.corpus.nps_chat.words()) + ['now', 'im', 'left', 'with', 'this', 'gay', ...] 
+ >>> print(nltk.corpus.nps_chat.tagged_words()) + [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...] + >>> print(nltk.corpus.nps_chat.tagged_posts()) + [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), + ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...] + +We can access the XML elements corresponding to individual posts. These elements +have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']`` +and ``p.attrib['user']``. They also have text content, accessed using ``p.text``. + + >>> print(nltk.corpus.nps_chat.xml_posts()) + [<Element 'Post' at 0x...>, <Element 'Post' at 0x...>, ...] + >>> posts = nltk.corpus.nps_chat.xml_posts() + >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()) + ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis', + 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer', + 'whQuestion', 'yAnswer', 'ynQuestion'] + >>> posts[0].text + 'now im left with this gay name' + +In addition to the above methods for accessing tagged text, we can navigate +the XML structure directly, as follows: + + >>> tokens = posts[0].findall('terminals/t') + >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens] + ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name'] + +multext_east +------------ + +The Multext-East Corpus consists of POS-tagged versions of George Orwell's book +1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian, +Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish. +The corpus can be accessed using the usual methods for tagged corpora. The tagset +can be transformed from the Multext-East specific MSD tags to the Universal tagset +using the "tagset" parameter of all functions returning tagged parts of the corpus. + + >>> print(nltk.corpus.multext_east.words("oana-en.xml")) + ['It', 'was', 'a', 'bright', ...] + >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml")) + [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...] + >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal")) + [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...] + + + +--------------------- +Corpus Reader Classes +--------------------- + +NLTK's *corpus reader* classes are used to access the contents of a +diverse set of corpora. Each corpus reader class is specialized to +handle a specific corpus format. Examples include the +`PlaintextCorpusReader`, which handles corpora that consist of a set +of unannotated text files, and the `BracketParseCorpusReader`, which +handles corpora that consist of files containing +parenthesis-delineated parse trees. + +Automatically Created Corpus Reader Instances +============================================= + +When the `nltk.corpus` module is imported, it automatically creates a +set of corpus reader instances that can be used to access the corpora +in the NLTK data distribution. Here is a small sample of those +corpus reader instances: + + >>> import nltk + >>> nltk.corpus.brown + <CategorizedTaggedCorpusReader in '.../corpora/brown'...> + >>> nltk.corpus.treebank + <BracketParseCorpusReader in '.../corpora/treebank/combined'...> + >>> nltk.corpus.names + <WordListCorpusReader in '.../corpora/names'...> + >>> nltk.corpus.genesis + <PlaintextCorpusReader in '.../corpora/genesis'...> + >>> nltk.corpus.inaugural + <PlaintextCorpusReader in '.../corpora/inaugural'...> + +This sample illustrates that different corpus reader classes are used +to read different corpora; but that the same corpus reader class may +be used for more than one corpus (e.g., ``genesis`` and ``inaugural``).
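+
+Since corpora load lazily, the reader class of a corpus object is only
+revealed once the corpus has been accessed. A quick illustrative check
+(not run as part of this suite; the exact classes may vary across NLTK
+versions):
+
+    >>> type(nltk.corpus.genesis) is type(nltk.corpus.inaugural)  # doctest: +SKIP
+    True
+    >>> type(nltk.corpus.brown) is type(nltk.corpus.genesis)  # doctest: +SKIP
+    False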
+ +Creating New Corpus Reader Instances +==================================== + +Although the `nltk.corpus` module automatically creates corpus reader +instances for the corpora in the NLTK data distribution, you may +sometimes need to create your own corpus reader. In particular, you +would need to create your own corpus reader if you want... + +- To access a corpus that is not included in the NLTK data + distribution. + +- To access a full copy of a corpus for which the NLTK data + distribution only provides a sample. + +- To access a corpus using a customized corpus reader (e.g., with + a customized tokenizer). + +To create a new corpus reader, you will first need to look up the +signature for that corpus reader's constructor. Different corpus +readers have different constructor signatures, but most of the +constructor signatures have the basic form:: + + SomeCorpusReader(root, files, ...options...) + +Where ``root`` is an absolute path to the directory containing the +corpus data files; ``files`` is either a list of file names (relative +to ``root``) or a regexp specifying which files should be included; +and ``options`` are additional reader-specific options. For example, +we can create a customized corpus reader for the genesis corpus that +uses a different sentence tokenizer as follows: + + >>> # Find the directory where the corpus lives. + >>> genesis_dir = nltk.data.find('corpora/genesis') + >>> # Create our custom sentence tokenizer. + >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+') + >>> # Create the new corpus reader object. + >>> my_genesis = nltk.corpus.PlaintextCorpusReader( + ... genesis_dir, r'.*\.txt', sent_tokenizer=my_sent_tokenizer) + >>> # Use the new corpus reader object. + >>> print(my_genesis.sents('english-kjv.txt')[0]) + ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', + 'and', 'the', 'earth'] + +If you wish to read your own plaintext corpus, which is stored in the +directory '/usr/share/some-corpus', then you can create a corpus +reader for it with:: + + >>> my_corpus = nltk.corpus.PlaintextCorpusReader( + ... '/usr/share/some-corpus', r'.*\.txt') # doctest: +SKIP + +For a complete list of corpus reader subclasses, see the API +documentation for `nltk.corpus.reader`. + +Corpus Types +============ + +Corpora vary widely in the types of content they include. This is +reflected in the fact that the base class `CorpusReader` only defines +a few general-purpose methods for listing and accessing the files that +make up a corpus. It is up to the subclasses to define *data access +methods* that provide access to the information in the corpus. +However, corpus reader subclasses should be consistent in their +definitions of these data access methods wherever possible. + +At a high level, corpora can be divided into three basic types: + +- A *token corpus* contains information about specific occurrences of + language use (or linguistic tokens), such as dialogues or written + texts. Examples of token corpora are collections of written text + and collections of speech. + +- A *type corpus*, or *lexicon*, contains information about a coherent + set of lexical items (or linguistic types). Examples of lexicons + are dictionaries and word lists. + +- A *language description corpus* contains information about a set of + non-lexical linguistic constructs, such as grammar rules. + +However, many individual corpora blur the distinctions between these +types. 
For example, corpora that are primarily lexicons may include +token data in the form of example sentences; and corpora that are +primarily token corpora may be accompanied by one or more word lists +or other lexical data sets. + +Because corpora vary so widely in their information content, we have +decided that it would not be wise to use separate corpus reader base +classes for different corpus types. Instead, we simply try to make +the corpus readers consistent wherever possible, but let them differ +where the underlying data itself differs. + +Common Corpus Reader Methods +============================ + +As mentioned above, there are only a handful of methods that all +corpus readers are guaranteed to implement. These methods provide +access to the files that contain the corpus data. Every corpus is +assumed to consist of one or more files, all located in a common root +directory (or in subdirectories of that root directory). The absolute +path to the root directory is stored in the ``root`` property: + + >>> import os + >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') + '.../nltk_data/corpora/genesis' + +Each file within the corpus is identified by a platform-independent +identifier, which is basically a path string that uses ``/`` as the +path separator. I.e., this identifier can be converted to a relative +path as follows: + + >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0] + >>> import os.path + >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/') + 'test/14826' + +To get a list of all data files that make up a corpus, use the +``fileids()`` method. In some corpora, these files will not all contain +the same type of data; for example, for the ``nltk.corpus.timit`` +corpus, ``fileids()`` will return a list including text files, word +segmentation files, phonetic transcription files, sound files, and +metadata files. For corpora with diverse file types, the ``fileids()`` +method will often take one or more optional arguments, which can be +used to get a list of the files with a specific file type: + + >>> nltk.corpus.timit.fileids() + ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] + >>> nltk.corpus.timit.fileids('phn') + ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...] + +In some corpora, the files are divided into distinct categories. For +these corpora, the ``fileids()`` method takes an optional argument, +which can be used to get a list of the files within a specific category: + + >>> nltk.corpus.brown.fileids('hobbies') + ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...] + +The ``abspath()`` method can be used to find the absolute path to a +corpus file, given its file identifier: + + >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') + '.../corpora/brown/ce06' + +The ``abspaths()`` method can be used to find the absolute paths for +one corpus file, a list of corpus files, or (if no fileids are specified), +all corpus files. + +This method is mainly useful as a helper method when defining corpus +data access methods, since data access methods can usually be called +with a string argument (to get a view for a specific file), with a +list argument (to get a view for a specific list of files), or with no +argument (to get a view for the whole corpus). 
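+
+Because every reader shares this small file-access API, generic helpers
+can be written against it. The following sketch is illustrative only
+(``word_counts`` is a hypothetical helper, not part of NLTK); it
+combines ``fileids()`` with a data access method to count the words in
+each file:
+
+    >>> def word_counts(corpus, fileids=None):
+    ...     # fileids=None falls through to corpus.fileids(), i.e. all files
+    ...     return {f: len(corpus.words(f)) for f in (fileids or corpus.fileids())}
+    >>> counts = word_counts(nltk.corpus.brown, ['ce06', 'ce07'])
+    >>> sorted(counts) == ['ce06', 'ce07']
+    True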
+ +Data Access Methods +=================== + +Individual corpus reader subclasses typically extend this basic set of +file-access methods with one or more *data access methods*, which provide +easy access to the data contained in the corpus. The signatures for +data access methods often have the basic form:: + + corpus_reader.some_data_access(fileids=None, ...options...) + +Where ``fileids`` can be a single file identifier string (to get a view +for a specific file); a list of file identifier strings (to get a view +for a specific list of files); or None (to get a view for the entire +corpus). Some of the common data access methods, and their return +types, are: + + - corpus.words(): list of str + - corpus.sents(): list of (list of str) + - corpus.paras(): list of (list of (list of str)) + - corpus.tagged_words(): list of (str,str) tuple + - corpus.tagged_sents(): list of (list of (str,str)) + - corpus.tagged_paras(): list of (list of (list of (str,str))) + - corpus.chunked_sents(): list of (Tree w/ (str,str) leaves) + - corpus.parsed_sents(): list of (Tree with str leaves) + - corpus.parsed_paras(): list of (list of (Tree with str leaves)) + - corpus.xml(): A single xml ElementTree + - corpus.raw(): str (unprocessed corpus contents) + +For example, the `words()` method is supported by many different +corpora, and returns a flat list of word strings: + + >>> nltk.corpus.brown.words() + ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] + >>> nltk.corpus.treebank.words() + ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...] + >>> nltk.corpus.conll2002.words() + ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...] + >>> nltk.corpus.genesis.words() + ['In', 'the', 'beginning', 'God', 'created', ...] + +On the other hand, the `tagged_words()` method is only supported by +corpora that include part-of-speech annotations: + + >>> nltk.corpus.brown.tagged_words() + [('The', 'AT'), ('Fulton', 'NP-TL'), ...] + >>> nltk.corpus.treebank.tagged_words() + [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...] + >>> nltk.corpus.conll2002.tagged_words() + [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...] + >>> nltk.corpus.genesis.tagged_words() + Traceback (most recent call last): + ... + AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words' + +Although most corpus readers use file identifiers to index their +content, some corpora use different identifiers instead. For example, +the data access methods for the ``timit`` corpus use *utterance +identifiers* to select which corpus items should be returned: + + >>> nltk.corpus.timit.utteranceids() + ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] + >>> nltk.corpus.timit.words('dr1-fvmh0/sa2') + ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that'] + +Attempting to call ``timit``\ 's data access methods with a file +identifier will result in an exception: + + >>> nltk.corpus.timit.fileids() + ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] + >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP + Traceback (most recent call last): + ... + IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd' + +As another example, the ``propbank`` corpus defines the ``roleset()`` +method, which expects a roleset identifier, not a file identifier: + + >>> roleset = nltk.corpus.propbank.roleset('eat.01') + >>> from xml.etree import ElementTree as ET + >>> print(ET.tostring(roleset).decode('utf8')) + <roleset id="eat.01" ...> + ... + </roleset>
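+
+Many tagged corpora also accept a ``tagset`` argument on their data
+access methods, which maps corpus-specific tags onto a common tagset
+(the ``multext_east`` examples above use the same mechanism). A brief
+sketch, assuming the ``universal_tagset`` mapping resource is
+installed:
+
+    >>> nltk.corpus.brown.tagged_words(tagset='universal')  # doctest: +SKIP
+    [('The', 'DET'), ('Fulton', 'NOUN'), ...]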
+ +Stream Backed Corpus Views +========================== +An important feature of NLTK's corpus readers is that many of them +access the underlying data files using "corpus views." A *corpus +view* is an object that acts like a simple data structure (such as a +list), but does not store the data elements in memory; instead, data +elements are read from the underlying data files on an as-needed +basis. + +By only loading items from the file on an as-needed basis, corpus +views maintain both memory efficiency and responsiveness. The memory +efficiency of corpus readers is important because some corpora contain +very large amounts of data, and storing the entire data set in memory +could overwhelm many machines. The responsiveness is important when +experimenting with corpora in interactive sessions and in in-class +demonstrations. + +The most common corpus view is the `StreamBackedCorpusView`, which +acts as a read-only list of tokens. Two additional corpus view +classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it +possible to create concatenations and take slices of +`StreamBackedCorpusView` objects without actually storing the +resulting list-like object's elements in memory. + +In the future, we may add additional corpus views that act like other +basic data structures, such as dictionaries. + +Writing New Corpus Readers +========================== + +In order to add support for new corpus formats, it is necessary to +define new corpus reader classes. For many corpus formats, writing +new corpus readers is relatively straight-forward. In this section, +we'll describe what's involved in creating a new corpus reader. If +you do create a new corpus reader, we encourage you to contribute it +back to the NLTK project. + +Don't Reinvent the Wheel +------------------------ +Before you start writing a new corpus reader, you should check to be +sure that the desired format can't be read using an existing corpus +reader with appropriate constructor arguments. For example, although +the `TaggedCorpusReader` assumes that words and tags are separated by +``/`` characters by default, an alternative tag-separation character +can be specified via the ``sep`` constructor argument. You should +also check whether the new corpus format can be handled by subclassing +an existing corpus reader, and tweaking a few methods or variables. + +Design +------ +If you decide to write a new corpus reader from scratch, then you +should first decide which data access methods you want the reader to +provide, and what their signatures should be. You should look at +existing corpus readers that process corpora with similar data +contents, and try to be consistent with those corpus readers whenever +possible. + +You should also consider what sets of identifiers are appropriate for +the corpus format. Where it's practical, file identifiers should be +used. However, for some corpora, it may make sense to use additional +sets of identifiers. Each set of identifiers should have a distinct +name (e.g., fileids, utteranceids, rolesets); and you should be consistent +in using that name to refer to that identifier. Do not use parameter +names like ``id``, which leave it unclear what type of identifier is +required. + +Once you've decided what data access methods and identifiers are +appropriate for your corpus, you should decide if there are any +customizable parameters that you'd like the corpus reader to handle. +These parameters make it possible to use a single corpus reader to +handle a wider variety of corpora. 
The ``sep`` argument for +`TaggedCorpusReader`, mentioned above, is an example of a customizable +corpus reader parameter. + +Implementation +-------------- + +Constructor +~~~~~~~~~~~ +If your corpus reader implements any customizable parameters, then +you'll need to override the constructor. Typically, the new +constructor will first call its base class's constructor, and then +store the customizable parameters. For example, the +`ConllChunkCorpusReader`\ 's constructor is defined as follows: + + >>> def __init__(self, root, fileids, chunk_types, encoding='utf8', + ... tagset=None, separator=None): + ... ConllCorpusReader.__init__( + ... self, root, fileids, ('words', 'pos', 'chunk'), + ... chunk_types=chunk_types, encoding=encoding, + ... tagset=tagset, separator=separator) + +If your corpus reader does not implement any customization parameters, +then you can often just inherit the base class's constructor. + +Data Access Methods +~~~~~~~~~~~~~~~~~~~ + +The most common type of data access method takes an argument +identifying which files to access, and returns a view covering those +files. This argument may be a single file identifier string (to get a +view for a specific file); a list of file identifier strings (to get a +view for a specific list of files); or None (to get a view for the +entire corpus). The method's implementation converts this argument to +a list of path names using the `abspaths()` method, which handles all +three value types (string, list, and None): + + >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) + [FileSystemPathPointer('.../corpora/brown/ca01'), + FileSystemPathPointer('.../corpora/brown/ca02'), ...] + >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) + [FileSystemPathPointer('.../corpora/brown/ce06')] + >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) + [FileSystemPathPointer('.../corpora/brown/ce06'), + FileSystemPathPointer('.../corpora/brown/ce07')] + +An example of this type of method is the `words()` method, defined by +the `PlaintextCorpusReader` as follows: + + >>> def words(self, fileids=None): + ... return concat([self.CorpusView(fileid, self._read_word_block) + ... for fileid in self.abspaths(fileids)]) + +This method first uses `abspaths()` to convert ``fileids`` to a list of +absolute paths. It then creates a corpus view for each file, using +the `PlaintextCorpusReader._read_word_block()` method to read elements +from the data file (see the discussion of corpus views below). +Finally, it combines these corpus views using the +`nltk.corpus.reader.util.concat()` function. + +When writing a corpus reader for a corpus that is never expected to be +very large, it can sometimes be appropriate to read the files +directly, rather than using a corpus view. For example, the +`WordListCorpusReader` class defines its `words()` method as follows: + + >>> def words(self, fileids=None): + ... return concat([[w for w in open(fileid).read().split('\n') if w] + ... for fileid in self.abspaths(fileids)]) + +(This is usually more appropriate for lexicons than for token corpora.) + +If the type of data returned by a data access method is one for which +NLTK has a conventional representation (e.g., words, tagged words, and +parse trees), then you should use that representation. Otherwise, you +may find it necessary to define your own representation. For data +structures that are relatively corpus-specific, it's usually best to +define new classes for these elements.
For example, the ``propbank`` +corpus defines the `PropbankInstance` class to store the semantic role +labeling instances described by the corpus; and the ``ppattach`` +corpus defines the `PPAttachment` class to store the prepositional +attachment instances described by the corpus. + +Corpus Views +~~~~~~~~~~~~ +.. (Much of the content for this section is taken from the + StreamBackedCorpusView docstring.) + +The heart of a `StreamBackedCorpusView` is its *block reader* +function, which reads zero or more tokens from a stream, and returns +them as a list. A very simple example of a block reader is: + + >>> def simple_block_reader(stream): + ... return stream.readline().split() + +This simple block reader reads a single line at a time, and returns a +single token (consisting of a string) for each whitespace-separated +substring on the line. A `StreamBackedCorpusView` built from this +block reader will act like a read-only list of all the +whitespace-separated tokens in an underlying file. + +When deciding how to define the block reader for a given corpus, +careful consideration should be given to the size of blocks handled by +the block reader. Smaller block sizes will increase the memory +requirements of the corpus view's internal data structures (by 2 +integers per block). On the other hand, larger block sizes may +decrease performance for random access to the corpus. (But note that +larger block sizes will *not* decrease performance for iteration.) + +Internally, the `StreamBackedCorpusView` class maintains a partial +mapping from token index to file position, with one entry per block. +When a token with a given index *i* is requested, the corpus view +constructs it as follows: + +1. First, it searches the toknum/filepos mapping for the token index + closest to (but less than or equal to) *i*. + +2. Then, starting at the file position corresponding to that index, it + reads one block at a time using the block reader until it reaches + the requested token. + +The toknum/filepos mapping is created lazily: it is initially empty, +but every time a new block is read, the block's initial token is added +to the mapping. (Thus, the toknum/filepos map has one entry per +block.) + +You can create your own corpus view in one of two ways: + +1. Call the `StreamBackedCorpusView` constructor, and provide your + block reader function via the ``block_reader`` argument. + +2. Subclass `StreamBackedCorpusView`, and override the + `read_block()` method. + +The first option is usually easier, but the second option can allow +you to write a single `read_block` method whose behavior can be +customized by different parameters to the subclass's constructor. For +an example of this design pattern, see the `TaggedCorpusView` class, +which is used by `TaggedCorpusReader`. + +---------------- +Regression Tests +---------------- + +The following helper functions are used to create and then delete +testing corpora that are stored in temporary directories. These +testing corpora are used to make sure the readers work correctly. + + >>> import tempfile, os.path, textwrap + >>> def make_testcorpus(ext='', **fileids): + ... root = tempfile.mkdtemp() + ... for fileid, contents in fileids.items(): + ... fileid += ext + ... f = open(os.path.join(root, fileid), 'w') + ... f.write(textwrap.dedent(contents)) + ... f.close() + ... return root + >>> def del_testcorpus(root): + ... for fileid in os.listdir(root): + ... os.remove(os.path.join(root, fileid)) + ...
os.rmdir(root) + +Plaintext Corpus Reader +======================= +The plaintext corpus reader is used to access corpora that consist of +unprocessed plaintext data. It assumes that paragraph breaks are +indicated by blank lines. Sentences and words can be tokenized using +the default tokenizers, or by custom tokenizers specified as +parameters to the constructor. + + >>> root = make_testcorpus(ext='.txt', + ... a="""\ + ... This is the first sentence. Here is another + ... sentence! And here's a third sentence. + ... + ... This is the second paragraph. Tokenization is currently + ... fairly simple, so the period in Mr. gets tokenized. + ... """, + ... b="""This is the second file.""") + + >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader + +The list of documents can be specified explicitly, or implicitly (using a +regexp). The ``ext`` argument specifies a file extension. + + >>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt']) + >>> corpus.fileids() + ['a.txt', 'b.txt'] + >>> corpus = PlaintextCorpusReader(root, r'.*\.txt') + >>> corpus.fileids() + ['a.txt', 'b.txt'] + +The directory containing the corpus is corpus.root: + + >>> str(corpus.root) == str(root) + True + +We can get a list of words, or the raw string: + + >>> corpus.words() + ['This', 'is', 'the', 'first', 'sentence', '.', ...] + >>> corpus.raw()[:40] + 'This is the first sentence. Here is ano' + +Check that reading individual documents works, and reading all documents at +once works: + + >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] + (46, [40, 6]) + >>> corpus.words('a.txt') + ['This', 'is', 'the', 'first', 'sentence', '.', ...] + >>> corpus.words('b.txt') + ['This', 'is', 'the', 'second', 'file', '.'] + >>> corpus.words()[:4], corpus.words()[-4:] + (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.']) + +We're done with the test corpus: + + >>> del_testcorpus(root) + +Test the plaintext corpora that come with nltk: + + >>> from nltk.corpus import abc, genesis, inaugural + >>> from nltk.corpus import state_union, webtext + >>> for corpus in (abc, genesis, inaugural, state_union, + ... webtext): + ... print(str(corpus).replace('\\\\','/')) + ... print(' ', repr(corpus.fileids())[:60]) + ... print(' ', repr(corpus.words()[:10])[:60]) + + ['rural.txt', 'science.txt'] + ['PM', 'denies', 'knowledge', 'of', 'AWB', ... + + ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ... + ['In', 'the', 'beginning', 'God', 'created', 'the', ... + + ['1789-Washington.txt', '1793-Washington.txt', ... + ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ... + + ['1945-Truman.txt', '1946-Truman.txt', ... + ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ... + + ['firefox.txt', 'grail.txt', 'overheard.txt', ... + ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ... + + +Tagged Corpus Reader +==================== +The Tagged Corpus reader can give us words, sentences, and paragraphs, +each tagged or untagged. All of the read methods can take one item +(in which case they return the contents of that file) or a list of +documents (in which case they concatenate the contents of those files). +By default, they apply to all documents in the corpus. + + >>> root = make_testcorpus( + ... a="""\ + ... This/det is/verb the/det first/adj sentence/noun ./punc + ... Here/det is/verb another/adj sentence/noun ./punc + ... Note/verb that/comp you/pron can/verb use/verb \ + ... any/noun tag/noun set/noun + ... + ... This/det is/verb the/det second/adj paragraph/noun ./punc + ... 
word/n without/adj a/det tag/noun :/: hello ./punc + ... """, + ... b="""\ + ... This/det is/verb the/det second/adj file/noun ./punc + ... """) + + >>> from nltk.corpus.reader.tagged import TaggedCorpusReader + >>> corpus = TaggedCorpusReader(root, list('ab')) + >>> corpus.fileids() + ['a', 'b'] + >>> str(corpus.root) == str(root) + True + >>> corpus.words() + ['This', 'is', 'the', 'first', 'sentence', '.', ...] + >>> corpus.sents() + [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...] + >>> corpus.paras() + [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...] + >>> corpus.tagged_words() + [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...] + >>> corpus.tagged_sents() + [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...] + >>> corpus.tagged_paras() + [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...] + >>> corpus.raw()[:40] + 'This/det is/verb the/det first/adj sente' + >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] + (38, [32, 6]) + >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()] + (6, [5, 1]) + >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()] + (3, [2, 1]) + >>> print(corpus.words('a')) + ['This', 'is', 'the', 'first', 'sentence', '.', ...] + >>> print(corpus.words('b')) + ['This', 'is', 'the', 'second', 'file', '.'] + >>> del_testcorpus(root) + +The Brown Corpus uses the tagged corpus reader: + + >>> from nltk.corpus import brown + >>> brown.fileids() + ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...] + >>> brown.categories() + ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', + 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] + >>> print(repr(brown.root).replace('\\\\','/')) + FileSystemPathPointer('.../corpora/brown') + >>> brown.words() + ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] + >>> brown.sents() + [['The', 'Fulton', 'County', 'Grand', ...], ...] + >>> brown.paras() + [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...] + >>> brown.tagged_words() + [('The', 'AT'), ('Fulton', 'NP-TL'), ...] + >>> brown.tagged_sents() + [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...] + >>> brown.tagged_paras() + [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...] 
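+
+As noted earlier, the tag separator is customizable. If a corpus marks
+tags with ``|`` rather than the default ``/``, the same reader class
+can be reused via the ``sep`` constructor argument. A minimal sketch,
+using a throwaway one-file corpus:
+
+    >>> root = make_testcorpus(c="This|det is|verb a|det test|noun .|punc")
+    >>> corpus = TaggedCorpusReader(root, ['c'], sep='|')
+    >>> corpus.tagged_words()
+    [('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('test', 'NOUN'), ('.', 'PUNC')]
+    >>> del_testcorpus(root)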
+ +Verbnet Corpus Reader +===================== + +Make sure we're picking up the right number of elements: + + >>> from nltk.corpus import verbnet + >>> len(verbnet.lemmas()) + 3621 + >>> len(verbnet.wordnetids()) + 4953 + >>> len(verbnet.classids()) + 429 + +Selecting classids based on various selectors: + + >>> verbnet.classids(lemma='take') + ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2', + 'fit-54.3', 'performance-26.7-2', 'steal-10.5'] + >>> verbnet.classids(wordnetid='lead%2:38:01') + ['accompany-51.7'] + >>> verbnet.classids(fileid='approve-77.xml') + ['approve-77'] + >>> verbnet.classids(classid='admire-31.2') # subclasses + ['admire-31.2-1'] + +vnclass() accepts filenames, long ids, and short ids: + + >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml')) + >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2')) + >>> c = ElementTree.tostring(verbnet.vnclass('31.2')) + >>> a == b == c + True + +fileids() can be used to get files based on verbnet class ids: + + >>> verbnet.fileids('admire-31.2') + ['admire-31.2.xml'] + >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2']) + ['admire-31.2.xml', 'obtain-13.5.2.xml'] + >>> verbnet.fileids('badidentifier') + Traceback (most recent call last): + . . . + ValueError: vnclass identifier 'badidentifier' not found + +longid() and shortid() can be used to convert identifiers: + + >>> verbnet.longid('31.2') + 'admire-31.2' + >>> verbnet.longid('admire-31.2') + 'admire-31.2' + >>> verbnet.shortid('31.2') + '31.2' + >>> verbnet.shortid('admire-31.2') + '31.2' + >>> verbnet.longid('badidentifier') + Traceback (most recent call last): + . . . + ValueError: vnclass identifier 'badidentifier' not found + >>> verbnet.shortid('badidentifier') + Traceback (most recent call last): + . . . + ValueError: vnclass identifier 'badidentifier' not found + +Corpus View Regression Tests +============================ + +Select some corpus files to play with: + + >>> import nltk.data + >>> # A very short file (160 chars): + >>> f1 = nltk.data.find('corpora/inaugural/README') + >>> # A relatively short file (791 chars): + >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt') + >>> # A longer file (32k chars): + >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt') + >>> fileids = [f1, f2, f3] + + +Concatenation +------------- +Check that concatenation works as intended. + + >>> from nltk.corpus.reader.util import * + + >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') + >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8') + >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') + >>> c123 = c1+c2+c3 + >>> print(c123) + ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...] + + >>> l1 = f1.open(encoding='utf-8').read().split() + >>> l2 = f2.open(encoding='utf-8').read().split() + >>> l3 = f3.open(encoding='utf-8').read().split() + >>> l123 = l1+l2+l3 + + >>> list(c123) == l123 + True + + >>> (c1+c2+c3)[100] == l123[100] + True + +Slicing +------- +First, do some tests with fairly small slices. These will all +generate tuple values. + + >>> from nltk.util import LazySubsequence + >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') + >>> l1 = f1.open(encoding='utf-8').read().split() + >>> print(len(c1)) + 21 + >>> len(c1) < LazySubsequence.MIN_SIZE + True + +Choose a list of indices, based on the length, that covers the +important corner cases: + + >>> indices = [-60, -30, -22, -21, -20, -1, + ... 
0, 1, 10, 20, 21, 22, 30, 60] + +Test slicing with explicit start & stop value: + + >>> for s in indices: + ... for e in indices: + ... assert list(c1[s:e]) == l1[s:e] + +Test slicing with stop=None: + + >>> for s in indices: + ... assert list(c1[s:]) == l1[s:] + +Test slicing with start=None: + + >>> for e in indices: + ... assert list(c1[:e]) == l1[:e] + +Test slicing with start=stop=None: + + >>> list(c1[:]) == list(l1[:]) + True + +Next, we'll do some tests with much longer slices. These will +generate LazySubsequence objects. + + >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') + >>> l3 = f3.open(encoding='utf-8').read().split() + >>> print(len(c3)) + 5430 + >>> len(c3) > LazySubsequence.MIN_SIZE*2 + True + +Choose a list of indices, based on the length, that covers the +important corner cases: + + >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1, + ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000] + +Test slicing with explicit start & stop value: + + >>> for s in indices: + ... for e in indices: + ... assert list(c3[s:e]) == l3[s:e] + +Test slicing with stop=None: + + >>> for s in indices: + ... assert list(c3[s:]) == l3[s:] + +Test slicing with start=None: + + >>> for e in indices: + ... assert list(c3[:e]) == l3[:e] + +Test slicing with start=stop=None: + + >>> list(c3[:]) == list(l3[:]) + True + +Multiple Iterators +------------------ +If multiple iterators are created for the same corpus view, their +iteration can be interleaved: + + >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block) + >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]] + >>> for i in range(15): + ... for iterator in iterators: + ... print('%-15s' % next(iterator), end=' ') + ... print() + My a duties in + fellow heavy of a + citizens: weight the proper + Anyone of office sense + who responsibility. upon of + has If which the + taken not, he obligation + the he is which + oath has about the + I no to oath + have conception enter, imposes. + just of or The + taken the he office + must powers is of + feel and lacking an + +SeekableUnicodeStreamReader +=========================== + +The file-like objects provided by the ``codecs`` module unfortunately +suffer from a bug that prevents them from working correctly with +corpus view objects. In particular, although they expose ``seek()`` +and ``tell()`` methods, those methods do not exhibit the expected +behavior, because they are not synchronized with the internal buffers +that are kept by the file-like objects. For example, the ``tell()`` +method will return the file position at the end of the buffers (whose +contents have not yet been returned by the stream); and therefore this +file position cannot be used to return to the 'current' location in +the stream (since ``seek()`` has no way to reconstruct the buffers). + +To get around these problems, we define a new class, +`SeekableUnicodeStreamReader`, to act as a file-like interface to +files containing encoded unicode data. This class is loosely based on +the ``codecs.StreamReader`` class. To construct a new reader, we call +the constructor with an underlying stream and an encoding name: + + >>> from io import StringIO, BytesIO + >>> from nltk.data import SeekableUnicodeStreamReader + >>> stream = BytesIO(b"""\ + ... This is a test file. + ... It is encoded in ascii. + ...
""".decode('ascii').encode('ascii')) + >>> reader = SeekableUnicodeStreamReader(stream, 'ascii') + +`SeekableUnicodeStreamReader`\ s support all of the normal operations +supplied by a read-only stream. Note that all of the read operations +return ``unicode`` objects (not ``str`` objects). + + >>> reader.read() # read the entire file. + 'This is a test file.\nIt is encoded in ascii.\n' + >>> reader.seek(0) # rewind to the start. + >>> reader.read(5) # read at most 5 bytes. + 'This ' + >>> reader.readline() # read to the end of the line. + 'is a test file.\n' + >>> reader.seek(0) # rewind to the start. + >>> for line in reader: + ... print(repr(line)) # iterate over lines + 'This is a test file.\n' + 'It is encoded in ascii.\n' + >>> reader.seek(0) # rewind to the start. + >>> reader.readlines() # read a list of line strings + ['This is a test file.\n', 'It is encoded in ascii.\n'] + >>> reader.close() + +Size argument to ``read()`` +--------------------------- +The ``size`` argument to ``read()`` specifies the maximum number of +*bytes* to read, not the maximum number of *characters*. Thus, for +encodings that use multiple bytes per character, it may return fewer +characters than the ``size`` argument: + + >>> stream = BytesIO(b"""\ + ... This is a test file. + ... It is encoded in utf-16. + ... """.decode('ascii').encode('utf-16')) + >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') + >>> reader.read(10) + 'This ' + +If a read block ends in the middle of the byte string encoding a +single character, then that byte string is stored in an internal +buffer, and re-used on the next call to ``read()``. However, if the +size argument is too small to read even a single character, even +though at least one character is available, then the ``read()`` method +will read additional bytes until it can return a single character. +This ensures that the ``read()`` method does not return an empty +string, which could be mistaken for indicating the end of the file. + + >>> reader.seek(0) # rewind to the start. + >>> reader.read(1) # we actually need to read 4 bytes + 'T' + >>> int(reader.tell()) + 4 + +The ``readline()`` method may read more than a single line of text, in +which case it stores the text that it does not return in a buffer. If +this buffer is not empty, then its contents will be included in the +value returned by the next call to ``read()``, regardless of the +``size`` argument, since they are available without reading any new +bytes from the stream: + + >>> reader.seek(0) # rewind to the start. + >>> reader.readline() # stores extra text in a buffer + 'This is a test file.\n' + >>> print(reader.linebuffer) # examine the buffer contents + ['It is encoded i'] + >>> reader.read(0) # returns the contents of the buffer + 'It is encoded i' + >>> print(reader.linebuffer) # examine the buffer contents + None + +Seek and Tell +------------- +In addition to these basic read operations, +`SeekableUnicodeStreamReader` also supports the ``seek()`` and +``tell()`` operations. However, some care must still be taken when +using these operations. In particular, the only file offsets that +should be passed to ``seek()`` are ``0`` and any offset that has been +returned by ``tell``. + + >>> stream = BytesIO(b"""\ + ... This is a test file. + ... It is encoded in utf-16. + ... """.decode('ascii').encode('utf-16')) + >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') + >>> reader.read(20) + 'This is a ' + >>> pos = reader.tell(); print(pos) + 22 + >>> reader.read(20) + 'test file.' 
+ >>> reader.seek(pos) # rewind to the position from tell. + >>> reader.read(20) + 'test file.' + +The ``seek()`` and ``tell()`` methods work properly even when +``readline()`` is used. + + >>> stream = BytesIO(b"""\ + ... This is a test file. + ... It is encoded in utf-16. + ... """.decode('ascii').encode('utf-16')) + >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') + >>> reader.readline() + 'This is a test file.\n' + >>> pos = reader.tell(); print(pos) + 44 + >>> reader.readline() + 'It is encoded in utf-16.\n' + >>> reader.seek(pos) # rewind to the position from tell. + >>> reader.readline() + 'It is encoded in utf-16.\n' + + +Squashed Bugs +============= + +svn 5276 fixed a bug in the comment-stripping behavior of +parse_sexpr_block. + + >>> from io import StringIO + >>> from nltk.corpus.reader.util import read_sexpr_block + >>> f = StringIO(b""" + ... (a b c) + ... # This line is a comment. + ... (d e f\ng h)""".decode('ascii')) + >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) + ['(a b c)'] + >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) + ['(d e f\ng h)'] + +svn 5277 fixed a bug in parse_sexpr_block, which would cause it to +enter an infinite loop if a file ended mid-sexpr, or ended with a +token that was not followed by whitespace. A related bug caused +an infinite loop if the corpus ended in an unmatched close paren -- +this was fixed in svn 5279. + + >>> f = StringIO(b""" + ... This file ends mid-sexpr + ... (hello (world""".decode('ascii')) + >>> for i in range(3): print(read_sexpr_block(f)) + ['This', 'file', 'ends', 'mid-sexpr'] + ['(hello (world'] + [] + + >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii')) + >>> for i in range(3): print(read_sexpr_block(f)) + ['This', 'file', 'has', 'no', 'trailing'] + ['whitespace.'] + [] + + >>> # Bug fixed in 5279: + >>> f = StringIO(b"a b c)".decode('ascii')) + >>> for i in range(3): print(read_sexpr_block(f)) + ['a', 'b'] + ['c)'] + [] + + +svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it +to return the wrong items when indexed starting at any index beyond +the first file. + + >>> import nltk + >>> sents = nltk.corpus.brown.sents() + >>> print(sents[6000]) + ['Cholesterol', 'and', 'thyroid'] + >>> print(sents[6000]) + ['Cholesterol', 'and', 'thyroid'] + +svn 5728 fixed a bug in Categorized*CorpusReader, which caused them +to return words from *all* files when just one file was specified. + + >>> from nltk.corpus import reuters + >>> reuters.words('training/13085') + ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...] + >>> reuters.words('training/5082') + ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...] + +svn 7227 fixed a bug in the qc corpus reader, which prevented +access to its tuples() method. + + >>> from nltk.corpus import qc + >>> qc.tuples('test.txt') + [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] + +Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. + + >>> import re + >>> import time + >>> from nltk.corpus.reader.comparative_sents import KEYWORD + >>> sizes = { + ... "short": 4000, + ... "long": 40000 + ... } + >>> exec_times = { + ... "short": [], + ... "long": [], + ... } + >>> for size_name, size in sizes.items(): + ... for j in range(9): + ... start_t = time.perf_counter() + ... payload = "( " + "(" * size + ... output = KEYWORD.findall(payload) + ... exec_times[size_name].append(time.perf_counter() - start_t) + ...
exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median + +Ideally, the execution time of such a regular expression is linear +in the length of the input. As such, we would expect exec_times["long"] +to be roughly 10 times as big as exec_times["short"]. +With the ReDoS in place, it took roughly 80 times as long. +For now, we accept values below 30 (times as long), due to the potential +for variance. This ensures that the ReDoS has certainly been reduced, +if not removed. + + >>> exec_times["long"] / exec_times["short"] < 30 # doctest: +SKIP + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/data.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/data.doctest new file mode 100644 index 0000000000000000000000000000000000000000..b340a8e463ea28f52e081290d6e6f753ce468b1f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/data.doctest @@ -0,0 +1,377 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================================= + Loading Resources From the Data Package +========================================= + + >>> import nltk.data + +Overview +~~~~~~~~ +The `nltk.data` module contains functions that can be used to load +NLTK resource files, such as corpora, grammars, and saved processing +objects. + +Loading Data Files +~~~~~~~~~~~~~~~~~~ +Resources are loaded using the function `nltk.data.load()`, which +takes as its first argument a URL specifying what file should be +loaded. The ``nltk:`` protocol loads files from the NLTK data +distribution: + + >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle') + >>> tokenizer.tokenize('Hello. This is a test. It works!') + ['Hello.', 'This is a test.', 'It works!'] + +It is important to note that there should be no space following the +colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will +not work! + +The ``nltk:`` protocol is used by default if no protocol is specified: + + >>> nltk.data.load('tokenizers/punkt/english.pickle') + + +But it is also possible to load resources from ``http:``, ``ftp:``, +and ``file:`` URLs, e.g. ``cfg = nltk.data.load('https://example.com/path/to/toy.cfg')`` + + >>> # Load a grammar using an absolute path. + >>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg') + >>> url.replace('\\', '/') + 'file:...toy.cfg' + >>> print(nltk.data.load(url)) + Grammar with 14 productions (start state = S) + S -> NP VP + PP -> P NP + ... + P -> 'on' + P -> 'in' + +The second argument to the `nltk.data.load()` function specifies the +file format, which determines how the file's contents are processed +before they are returned by ``load()``. The formats that are +currently supported by the data module are described by the dictionary +`nltk.data.FORMATS`: + + >>> for format, descr in sorted(nltk.data.FORMATS.items()): + ... print('{0:<7} {1:}'.format(format, descr)) + cfg A context free grammar. + fcfg A feature CFG. + fol A list of first order logic expressions, parsed with + nltk.sem.logic.Expression.fromstring. + json A serialized python object, stored using the json module. + logic A list of first order logic expressions, parsed with + nltk.sem.logic.LogicParser. Requires an additional logic_parser + parameter + pcfg A probabilistic CFG. + pickle A serialized python object, stored using the pickle + module. + raw The raw (byte string) contents of a file. + text The raw (unicode string) contents of a file. + val A semantic valuation, parsed by + nltk.sem.Valuation.fromstring. 
+ yaml A serialized python object, stored using the yaml module. + +`nltk.data.load()` will raise a ValueError if a bad format name is +specified: + + >>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar') + Traceback (most recent call last): + . . . + ValueError: Unknown format type! + +By default, the ``"auto"`` format is used, which chooses a format +based on the filename's extension. The mapping from file extensions +to format names is specified by `nltk.data.AUTO_FORMATS`: + + >>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()): + ... print('.%-7s -> %s' % (ext, format)) + .cfg -> cfg + .fcfg -> fcfg + .fol -> fol + .json -> json + .logic -> logic + .pcfg -> pcfg + .pickle -> pickle + .text -> text + .txt -> text + .val -> val + .yaml -> yaml + +If `nltk.data.load()` is unable to determine the format based on the +filename's extension, it will raise a ValueError: + + >>> nltk.data.load('foo.bar') + Traceback (most recent call last): + . . . + ValueError: Could not determine format for foo.bar based on its file + extension; use the "format" argument to specify the format explicitly. + +Note that by explicitly specifying the ``format`` argument, you can +override the load method's default processing behavior. For example, +to get the raw contents of any file, simply use ``format="raw"``: + + >>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text') + >>> print(s) + S -> NP VP + PP -> P NP + NP -> Det N | NP PP + VP -> V NP | VP PP + ... + +Making Local Copies +~~~~~~~~~~~~~~~~~~~ +.. This will not be visible in the html output: create a tempdir to + play in. + >>> import tempfile, os + >>> tempdir = tempfile.mkdtemp() + >>> old_dir = os.path.abspath('.') + >>> os.chdir(tempdir) + +The function `nltk.data.retrieve()` copies a given resource to a local +file. This can be useful, for example, if you want to edit one of the +sample grammars. + + >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') + Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg' + + >>> # Simulate editing the grammar. + >>> with open('toy.cfg') as inp: + ... s = inp.read().replace('NP', 'DP') + >>> with open('toy.cfg', 'w') as out: + ... _bytes_written = out.write(s) + + >>> # Load the edited grammar, & display it. + >>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg')) + >>> print(cfg) + Grammar with 14 productions (start state = S) + S -> DP VP + PP -> P DP + ... + P -> 'on' + P -> 'in' + +The second argument to `nltk.data.retrieve()` specifies the filename +for the new copy of the file. By default, the source file's filename +is used. + + >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg') + Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg' + >>> os.path.isfile('./mytoy.cfg') + True + >>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg') + Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg' + >>> os.path.isfile('./np.fcfg') + True + +If a file with the specified (or default) filename already exists in +the current directory, then `nltk.data.retrieve()` will raise a +ValueError exception. It will *not* overwrite the file: + + >>> os.path.isfile('./toy.cfg') + True + >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') + Traceback (most recent call last): + . . . + ValueError: File '...toy.cfg' already exists! + +.. This will not be visible in the html output: clean up the tempdir. + >>> os.chdir(old_dir) + >>> for f in os.listdir(tempdir): + ... 
os.remove(os.path.join(tempdir, f)) + >>> os.rmdir(tempdir) + +Finding Files in the NLTK Data Package +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The `nltk.data.find()` function searches the NLTK data package for a +given file, and returns a pointer to that file. This pointer can +either be a `FileSystemPathPointer` (whose `path` attribute gives the +absolute path of the file); or a `ZipFilePathPointer`, specifying a +zipfile and the name of an entry within that zipfile. Both pointer +types define the `open()` method, which can be used to read the string +contents of the file. + + >>> path = nltk.data.find('corpora/abc/rural.txt') + >>> str(path) + '...rural.txt' + >>> print(path.open().read(60).decode()) + PM denies knowledge of AWB kickbacks + The Prime Minister has + +Alternatively, the `nltk.data.load()` function can be used with the +keyword argument ``format="raw"``: + + >>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60] + >>> print(s.decode()) + PM denies knowledge of AWB kickbacks + The Prime Minister has + +Alternatively, you can use the keyword argument ``format="text"``: + + >>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60] + >>> print(s) + PM denies knowledge of AWB kickbacks + The Prime Minister has + +Resource Caching +~~~~~~~~~~~~~~~~ + +NLTK uses a weakref dictionary to maintain a cache of resources that +have been loaded. If you load a resource that is already stored in +the cache, then the cached copy will be returned. This behavior can +be seen by the trace output generated when verbose=True: + + >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) + <> + >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) + <> + +If you wish to load a resource from its source, bypassing the cache, +use the ``cache=False`` argument to `nltk.data.load()`. This can be +useful, for example, if the resource is loaded from a local file, and +you are actively editing that file: + + >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True) + <> + +The cache *no longer* uses weak references. A resource will not be +automatically expunged from the cache when no more objects are using +it. In the following example, when we clear the variable ``feat0``, +the reference count for the feature grammar object drops to zero. +However, the object remains cached: + + >>> del feat0 + >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', + ... verbose=True) + <> + +You can clear the entire contents of the cache, using +`nltk.data.clear_cache()`: + + >>> nltk.data.clear_cache() + +Retrieving other Data Sources +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + >>> formulas = nltk.data.load('grammars/book_grammars/background.fol') + >>> for f in formulas: print(str(f)) + all x.(boxerdog(x) -> dog(x)) + all x.(boxer(x) -> person(x)) + all x.-(dog(x) & person(x)) + all x.(married(x) <-> exists y.marry(x,y)) + all x.(bark(x) -> dog(x)) + all x y.(marry(x,y) -> (person(x) & person(y))) + -(Vincent = Mia) + -(Vincent = Fido) + -(Mia = Fido) + +Regression Tests +~~~~~~~~~~~~~~~~ +Create a temp dir for tests that write files: + + >>> import tempfile, os + >>> tempdir = tempfile.mkdtemp() + >>> old_dir = os.path.abspath('.') + >>> os.chdir(tempdir) + +The `retrieve()` function accepts all url types: + + >>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', + ... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'), + ... 'nltk:grammars/sample_grammars/toy.cfg', + ... 
+    ...         'grammars/sample_grammars/toy.cfg']
+    >>> for i, url in enumerate(urls):
+    ...     nltk.data.retrieve(url, 'toy-%d.cfg' % i)
+    Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg'
+    Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg'
+    Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg'
+    Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg'
+
+Clean up the temp dir:
+
+    >>> os.chdir(old_dir)
+    >>> for f in os.listdir(tempdir):
+    ...     os.remove(os.path.join(tempdir, f))
+    >>> os.rmdir(tempdir)
+
+Lazy Loader
+-----------
+A lazy loader is a wrapper object that defers loading a resource until
+it is accessed or used in any way. This is mainly intended for
+internal use by NLTK's corpus readers.
+
+    >>> # Create a lazy loader for toy.cfg.
+    >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
+
+    >>> # Show that it's not loaded yet:
+    >>> object.__repr__(ll)
+    '<nltk.data.LazyLoader object at ...>'
+
+    >>> # printing it is enough to cause it to be loaded:
+    >>> print(ll)
+    <Grammar with 14 productions>
+
+    >>> # Show that it's now been loaded:
+    >>> object.__repr__(ll)
+    '<nltk.grammar.CFG object at ...>'
+
+
+    >>> # Test that accessing an attribute also loads it:
+    >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
+    >>> ll.start()
+    S
+    >>> object.__repr__(ll)
+    '<nltk.grammar.CFG object at ...>'
+
+Buffered Gzip Reading and Writing
+---------------------------------
+Write performance to gzip-compressed files is extremely poor when the
+files become large, and file creation can become a bottleneck in those
+cases.
+
+Read performance from large gzipped pickle files was improved in data.py by
+buffering the reads. A similar fix can be applied to writes by buffering
+the writes in memory first.
+
+This is mainly intended for internal use. The test simply checks that reading
+and writing work as intended; it does not measure how much improvement the
+buffering provides.
+
+    >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10)
+    >>> ans = []
+    >>> for i in range(10000):
+    ...     ans.append(str(i).encode('ascii'))
+    ...     test.write(str(i).encode('ascii'))
+    >>> test.close()
+    >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb')
+    >>> test.read() == b''.join(ans)
+    True
+    >>> test.close()
+    >>> import os
+    >>> os.unlink('testbuf.gz')
+
+JSON Encoding and Decoding
+--------------------------
+JSON serialization is used instead of pickle for some classes.
+
+    >>> from nltk import jsontags
+    >>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag
+    >>> @jsontags.register_tag
+    ... class JSONSerializable:
+    ...     json_tag = 'JSONSerializable'
+    ...
+    ...     def __init__(self, n):
+    ...         self.n = n
+    ...
+    ...     def encode_json_obj(self):
+    ...         return self.n
+    ...
+    ...     @classmethod
+    ...     def decode_json_obj(cls, obj):
+    ...         n = obj
+    ...         return cls(n)
+    ...
+    >>> JSONTaggedEncoder().encode(JSONSerializable(1))
+    '{"!JSONSerializable": 1}'
+    >>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n
+    1
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/dependency.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/dependency.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..069a82cd47b48b240155296a1b6d0b2b95875bec
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/dependency.doctest
@@ -0,0 +1,241 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. 
For license information, see LICENSE.TXT + +=================== +Dependency Grammars +=================== + + >>> from nltk.grammar import DependencyGrammar + >>> from nltk.parse import ( + ... DependencyGraph, + ... ProjectiveDependencyParser, + ... NonprojectiveDependencyParser, + ... ) + +CoNLL Data +---------- + + >>> treebank_data = """Pierre NNP 2 NMOD + ... Vinken NNP 8 SUB + ... , , 2 P + ... 61 CD 5 NMOD + ... years NNS 6 AMOD + ... old JJ 2 NMOD + ... , , 2 P + ... will MD 0 ROOT + ... join VB 8 VC + ... the DT 11 NMOD + ... board NN 9 OBJ + ... as IN 9 VMOD + ... a DT 15 NMOD + ... nonexecutive JJ 15 NMOD + ... director NN 12 PMOD + ... Nov. NNP 9 VMOD + ... 29 CD 16 NMOD + ... . . 9 VMOD + ... """ + + >>> dg = DependencyGraph(treebank_data) + >>> dg.tree().pprint() + (will + (Vinken Pierre , (old (years 61)) ,) + (join (board the) (as (director a nonexecutive)) (Nov. 29) .)) + >>> for head, rel, dep in dg.triples(): + ... print( + ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})' + ... .format(h=head, r=rel, d=dep) + ... ) + (will, MD), SUB, (Vinken, NNP) + (Vinken, NNP), NMOD, (Pierre, NNP) + (Vinken, NNP), P, (,, ,) + (Vinken, NNP), NMOD, (old, JJ) + (old, JJ), AMOD, (years, NNS) + (years, NNS), NMOD, (61, CD) + (Vinken, NNP), P, (,, ,) + (will, MD), VC, (join, VB) + (join, VB), OBJ, (board, NN) + (board, NN), NMOD, (the, DT) + (join, VB), VMOD, (as, IN) + (as, IN), PMOD, (director, NN) + (director, NN), NMOD, (a, DT) + (director, NN), NMOD, (nonexecutive, JJ) + (join, VB), VMOD, (Nov., NNP) + (Nov., NNP), NMOD, (29, CD) + (join, VB), VMOD, (., .) + +Using a custom cell extractor. + + >>> def custom_extractor(cells): + ... _, tag, head, rel = cells + ... return 'spam', 'spam', tag, tag, '', head, rel + >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) + >>> dg.tree().pprint() + (spam + (spam spam spam (spam (spam spam)) spam) + (spam (spam spam) (spam (spam spam spam)) (spam spam) spam)) + +Custom cell extractors can take in and return an index. + + >>> def custom_extractor(cells, index): + ... word, tag, head, rel = cells + ... return (index, '{}-{}'.format(word, index), word, + ... tag, tag, '', head, rel) + >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) + >>> dg.tree().pprint() + (will-8 + (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7) + (join-9 + (board-11 the-10) + (as-12 (director-15 a-13 nonexecutive-14)) + (Nov.-16 29-17) + .-18)) + +Using the dependency-parsed version of the Penn Treebank corpus sample. + + >>> from nltk.corpus import dependency_treebank + >>> t = dependency_treebank.parsed_sents()[0] + >>> print(t.to_conll(3)) + Pierre NNP 2 + Vinken NNP 8 + , , 2 + 61 CD 5 + years NNS 6 + old JJ 2 + , , 2 + will MD 0 + join VB 8 + the DT 11 + board NN 9 + as IN 9 + a DT 15 + nonexecutive JJ 15 + director NN 12 + Nov. NNP 9 + 29 CD 16 + . . 8 + +Using the output of zpar (like Malt-TAB but with zero-based indexing) + + >>> zpar_data = """ + ... Pierre NNP 1 NMOD + ... Vinken NNP 7 SUB + ... , , 1 P + ... 61 CD 4 NMOD + ... years NNS 5 AMOD + ... old JJ 1 NMOD + ... , , 1 P + ... will MD -1 ROOT + ... join VB 7 VC + ... the DT 10 NMOD + ... board NN 8 OBJ + ... as IN 8 VMOD + ... a DT 14 NMOD + ... nonexecutive JJ 14 NMOD + ... director NN 11 PMOD + ... Nov. NNP 8 VMOD + ... 29 CD 15 NMOD + ... . . 7 P + ... """ + + >>> zdg = DependencyGraph(zpar_data, zero_based=True) + >>> print(zdg.tree()) + (will + (Vinken Pierre , (old (years 61)) ,) + (join (board the) (as (director a nonexecutive)) (Nov. 29)) + .) 
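+
+Whatever the indexing convention, the parsed result is a
+``DependencyGraph`` whose ``nodes`` attribute maps each address to a
+dictionary of fields. A minimal sketch of walking that structure,
+reusing the ``zdg`` graph from above (``word``, ``head`` and ``rel``
+are standard node keys)::
+
+    for address, node in sorted(zdg.nodes.items()):
+        if node['word'] is not None:   # skip the artificial root node
+            print(address, node['word'], node['head'], node['rel'])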
+ + +Projective Dependency Parsing +----------------------------- + + >>> grammar = DependencyGrammar.fromstring(""" + ... 'fell' -> 'price' | 'stock' + ... 'price' -> 'of' 'the' + ... 'of' -> 'stock' + ... 'stock' -> 'the' + ... """) + >>> print(grammar) + Dependency grammar with 5 productions + 'fell' -> 'price' + 'fell' -> 'stock' + 'price' -> 'of' 'the' + 'of' -> 'stock' + 'stock' -> 'the' + + >>> dp = ProjectiveDependencyParser(grammar) + >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])): + ... print(t) + (fell (price the (of (stock the)))) + (fell (price the of) (stock the)) + (fell (price the of the) stock) + +Non-Projective Dependency Parsing +--------------------------------- + + >>> grammar = DependencyGrammar.fromstring(""" + ... 'taught' -> 'play' | 'man' + ... 'man' -> 'the' + ... 'play' -> 'golf' | 'dog' | 'to' + ... 'dog' -> 'his' + ... """) + >>> print(grammar) + Dependency grammar with 7 productions + 'taught' -> 'play' + 'taught' -> 'man' + 'man' -> 'the' + 'play' -> 'golf' + 'play' -> 'dog' + 'play' -> 'to' + 'dog' -> 'his' + + >>> dp = NonprojectiveDependencyParser(grammar) + >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']) + + >>> print(g.root['word']) + taught + + >>> for _, node in sorted(g.nodes.items()): + ... if node['word'] is not None: + ... print('{address} {word}: {d}'.format(d=node['deps'][''], **node)) + 1 the: [] + 2 man: [1] + 3 taught: [2, 7] + 4 his: [] + 5 dog: [4] + 6 to: [] + 7 play: [5, 6, 8] + 8 golf: [] + + >>> print(g.tree()) + (taught (man the) (play (dog his) to golf)) + +Integration with MALT parser +============================ + +In case the top relation is different from the default, we can set it. In case +of MALT parser, it's set to `'null'`. + +>>> dg_str = """1 I _ NN NN _ 2 nn _ _ +... 2 shot _ NN NN _ 0 null _ _ +... 3 an _ AT AT _ 2 dep _ _ +... 4 elephant _ NN NN _ 7 nn _ _ +... 5 in _ NN NN _ 7 nn _ _ +... 6 my _ NN NN _ 7 nn _ _ +... 7 pajamas _ NNS NNS _ 3 dobj _ _ +... """ +>>> dg = DependencyGraph(dg_str, top_relation_label='null') + +>>> len(dg.nodes) +8 + +>>> dg.root['word'], dg.root['address'] +('shot', 2) + +>>> print(dg.to_conll(10)) +1 I _ NN NN _ 2 nn _ _ +2 shot _ NN NN _ 0 null _ _ +3 an _ AT AT _ 2 dep _ _ +4 elephant _ NN NN _ 7 nn _ _ +5 in _ NN NN _ 7 nn _ _ +6 my _ NN NN _ 7 nn _ _ +7 pajamas _ NNS NNS _ 3 dobj _ _ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/discourse.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/discourse.doctest new file mode 100644 index 0000000000000000000000000000000000000000..9f26172b8f021295fc7328282b71d4d0149881d5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/discourse.doctest @@ -0,0 +1,552 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +================== +Discourse Checking +================== + + >>> from nltk import * + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + +Setup +===== + + >>> from nltk.test.childes_fixt import setup_module + >>> setup_module() + +Introduction +============ + +The NLTK discourse module makes it possible to test consistency and +redundancy of simple discourses, using theorem-proving and +model-building from `nltk.inference`. + +The ``DiscourseTester`` constructor takes a list of sentences as a +parameter. + + >>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl']) + +The ``DiscourseTester`` parses each sentence into a list of logical +forms. 
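+
+Under the hood, this is ordinary feature-grammar parsing: each sentence
+is parsed and a logical form is read off the root node of each parse
+tree. A minimal sketch of that step for a single sentence, assuming the
+default ``book_grammars/discourse.fcfg`` grammar::
+
+    from nltk import parse
+
+    cp = parse.load_parser('grammars/book_grammars/discourse.fcfg')
+    for tree in cp.parse('a boxer walks'.split()):
+        # One logical form per parse of the sentence.
+        print(tree.label()['SEM'].simplify())
+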
+Once we have created a ``DiscourseTester`` object, we can
+inspect various properties of the discourse. First off, we might want
+to double-check what sentences are currently stored as the discourse.
+
+    >>> dt.sentences()
+    s0: a boxer walks
+    s1: every boxer chases a girl
+
+As you will see, each sentence receives an identifier `s`\ :sub:`i`.
+We might also want to check what grammar the ``DiscourseTester`` is
+using (by default, ``book_grammars/discourse.fcfg``):
+
+    >>> dt.grammar()
+    % start S
+    # Grammar Rules
+    S[SEM = <...>] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp]
+    NP[NUM=?n,SEM=<...>] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom]
+    NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np]
+    ...
+
+A different grammar can be invoked by using the optional ``gramfile``
+parameter when a ``DiscourseTester`` object is created.
+
+Readings and Threads
+====================
+
+Depending on the grammar used, we may find that some sentences have
+more than one logical form. To check this, use the ``readings()``
+method. Given a sentence identifier of the form `s`\ :sub:`i`, each
+reading of that sentence is given an identifier `s`\ :sub:`i`-`r`\ :sub:`j`.
+
+    >>> dt.readings()
+
+    s0 readings:
+
+    s0-r0: exists z1.(boxer(z1) & walk(z1))
+    s0-r1: exists z1.(boxerdog(z1) & walk(z1))
+
+    s1 readings:
+
+    s1-r0: all z2.(boxer(z2) -> exists z3.(girl(z3) & chase(z2,z3)))
+    s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
+
+
+In this case, the only source of ambiguity lies in the word *boxer*,
+which receives two translations: ``boxer`` and ``boxerdog``. The
+intention is that one of these corresponds to the ``person`` sense and
+one to the ``dog`` sense. In principle, we would also expect to see a
+quantifier scope ambiguity in ``s1``. However, the simple grammar we
+are using, namely ``sem4.fcfg``, doesn't support quantifier scope
+ambiguity.
+
+We can also investigate the readings of a specific sentence:
+
+    >>> dt.readings('a boxer walks')
+    The sentence 'a boxer walks' has these readings:
+     exists x.(boxer(x) & walk(x))
+     exists x.(boxerdog(x) & walk(x))
+
+Given that each sentence is two-way ambiguous, we potentially have
+four different discourse 'threads', taking all combinations of
+readings. To see these, specify the ``threaded=True`` parameter on
+the ``readings()`` method. Again, each thread is assigned an
+identifier of the form `d`\ :sub:`i`. Following the identifier is a
+list of the readings that constitute that thread.
+
+    >>> dt.readings(threaded=True)
+    d0: ['s0-r0', 's1-r0']
+    d1: ['s0-r0', 's1-r1']
+    d2: ['s0-r1', 's1-r0']
+    d3: ['s0-r1', 's1-r1']
+
+Of course, this simple-minded approach doesn't scale: a discourse with,
+say, three sentences, each of which has three readings, will generate
+27 different threads. It is an interesting exercise to consider how to
+manage discourse ambiguity more efficiently.
+
+Checking Consistency
+====================
+
+Now, we can check whether some or all of the discourse threads are
+consistent, using the ``models()`` method. With no parameter, this
+method will try to find a model for every discourse thread in the
+current discourse. However, we can also specify just one thread, say ``d1``.
+
+    >>> dt.models('d1')
+    --------------------------------------------------------------------------------
+    Model for Discourse Thread d1
+    --------------------------------------------------------------------------------
+    % number = 1
+    % seconds = 0
+
+    % Interpretation of size 2
+
+    c1 = 0.
+
+    f1(0) = 0.
+    f1(1) = 0.
+
+    boxer(0).
+    - boxer(1).
+
+    - boxerdog(0).
+    - boxerdog(1).
+
+    - girl(0).
+    - girl(1).
+
+    walk(0).
+    - walk(1).
+
+    - chase(0,0).
+    - chase(0,1).
+    - chase(1,0).
+    - chase(1,1).
+
+    Consistent discourse: d1 ['s0-r0', 's1-r1']:
+        s0-r0: exists z1.(boxer(z1) & walk(z1))
+        s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
+
+
+There are various formats for rendering **Mace4** models --- here,
+we have used the 'cooked' format (which is intended to be
+human-readable). There are a number of points to note.
+
+#. The entities in the domain are all treated as non-negative
+   integers. In this case, there are only two entities, ``0`` and
+   ``1``.
+
+#. The ``-`` symbol indicates negation. So ``0`` is the only
+   ``boxer`` and the only thing that ``walk``\ s. Nothing is a
+   ``boxerdog``, or a ``girl``, or in the ``chase`` relation. Thus the
+   universal sentence about ``boxerdog``\ s is vacuously true.
+
+#. ``c1`` is an introduced constant that denotes ``0``.
+
+#. ``f1`` is a Skolem function, but it plays no significant role in
+   this model.
+
+
+We might now want to add another sentence to the discourse, and there
+is a method ``add_sentence()`` for doing just this.
+
+    >>> dt.add_sentence('John is a boxer')
+    >>> dt.sentences()
+    s0: a boxer walks
+    s1: every boxer chases a girl
+    s2: John is a boxer
+
+We can now test all the properties as before; here, we just show a
+couple of them.
+
+    >>> dt.readings()
+
+    s0 readings:
+
+    s0-r0: exists z1.(boxer(z1) & walk(z1))
+    s0-r1: exists z1.(boxerdog(z1) & walk(z1))
+
+    s1 readings:
+
+    s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
+    s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
+
+    s2 readings:
+
+    s2-r0: boxer(John)
+    s2-r1: boxerdog(John)
+    >>> dt.readings(threaded=True)
+    d0: ['s0-r0', 's1-r0', 's2-r0']
+    d1: ['s0-r0', 's1-r0', 's2-r1']
+    d2: ['s0-r0', 's1-r1', 's2-r0']
+    d3: ['s0-r0', 's1-r1', 's2-r1']
+    d4: ['s0-r1', 's1-r0', 's2-r0']
+    d5: ['s0-r1', 's1-r0', 's2-r1']
+    d6: ['s0-r1', 's1-r1', 's2-r0']
+    d7: ['s0-r1', 's1-r1', 's2-r1']
+
+If you are interested in a particular thread, the ``expand_threads()``
+method will remind you of what readings it consists of:
+
+    >>> thread = dt.expand_threads('d1')
+    >>> for rid, reading in thread:
+    ...     print(rid, str(reading.normalize()))
+    s0-r0 exists z1.(boxer(z1) & walk(z1))
+    s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
+    s2-r1 boxerdog(John)
+
+Suppose we have already defined a discourse, as follows:
+
+    >>> dt = DiscourseTester(['A student dances', 'Every student is a person'])
+
+Now, when we add a new sentence, is it consistent with what we already
+have? The ``consistchk=True`` parameter of ``add_sentence()`` allows
+us to check:
+
+    >>> dt.add_sentence('No person dances', consistchk=True)
+    Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']:
+        s0-r0: exists z1.(student(z1) & dance(z1))
+        s1-r0: all z1.(student(z1) -> person(z1))
+        s2-r0: -exists z1.(person(z1) & dance(z1))
+
+    >>> dt.readings()
+
+    s0 readings:
+
+    s0-r0: exists z1.(student(z1) & dance(z1))
+
+    s1 readings:
+
+    s1-r0: all z1.(student(z1) -> person(z1))
+
+    s2 readings:
+
+    s2-r0: -exists z1.(person(z1) & dance(z1))
+
+So let's retract the inconsistent sentence:
+
+    >>> dt.retract_sentence('No person dances', verbose=True)
+    Current sentences are
+    s0: A student dances
+    s1: Every student is a person
+
+We can now verify that the result is consistent.
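+
+Such a check can also be carried out by hand, by handing the readings
+of a thread directly to a model builder. A minimal sketch, assuming the
+Mace4 binary is installed and on the search path::
+
+    from nltk.sem import Expression
+    from nltk.inference import Mace
+
+    read_expr = Expression.fromstring
+    readings = [read_expr('exists x.(student(x) & dance(x))'),
+                read_expr('all x.(student(x) -> person(x))')]
+    # build_model() returns True if Mace4 finds a model, i.e. the
+    # readings are jointly consistent.
+    print(Mace().build_model(None, readings))
+
+The ``models()`` method runs this kind of check for every thread of the
+discourse, and prints any model it finds: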
+ + >>> dt.models() + -------------------------------------------------------------------------------- + Model for Discourse Thread d0 + -------------------------------------------------------------------------------- + % number = 1 + % seconds = 0 + + % Interpretation of size 2 + + c1 = 0. + + dance(0). + - dance(1). + + person(0). + - person(1). + + student(0). + - student(1). + + Consistent discourse: d0 ['s0-r0', 's1-r0']: + s0-r0: exists z1.(student(z1) & dance(z1)) + s1-r0: all z1.(student(z1) -> person(z1)) + + +Checking Informativity +====================== + +Let's assume that we are still trying to extend the discourse *A +student dances.* *Every student is a person.* We add a new sentence, +but this time, we check whether it is informative with respect to what +has gone before. + + >>> dt.add_sentence('A person dances', informchk=True) + Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))': + Not informative relative to thread 'd0' + +In fact, we are just checking whether the new sentence is entailed by +the preceding discourse. + + >>> dt.models() + -------------------------------------------------------------------------------- + Model for Discourse Thread d0 + -------------------------------------------------------------------------------- + % number = 1 + % seconds = 0 + + % Interpretation of size 2 + + c1 = 0. + + c2 = 0. + + dance(0). + - dance(1). + + person(0). + - person(1). + + student(0). + - student(1). + + Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: + s0-r0: exists z1.(student(z1) & dance(z1)) + s1-r0: all z1.(student(z1) -> person(z1)) + s2-r0: exists z1.(person(z1) & dance(z1)) + + + + +Adding Background Knowledge +=========================== + +Let's build a new discourse, and look at the readings of the component sentences: + + >>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks']) + >>> dt.readings() + + s0 readings: + + s0-r0: boxer(Vincent) + s0-r1: boxerdog(Vincent) + + s1 readings: + + s1-r0: boxer(Fido) + s1-r1: boxerdog(Fido) + + s2 readings: + + s2-r0: married(Vincent) + + s3 readings: + + s3-r0: bark(Fido) + +This gives us a lot of threads: + + >>> dt.readings(threaded=True) + d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0'] + d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] + d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0'] + d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0'] + + +We can eliminate some of the readings, and hence some of the threads, +by adding background information. + + >>> import nltk.data + >>> bg = nltk.data.load('grammars/book_grammars/background.fol') + >>> dt.add_background(bg) + >>> dt.background() + all x.(boxerdog(x) -> dog(x)) + all x.(boxer(x) -> person(x)) + all x.-(dog(x) & person(x)) + all x.(married(x) <-> exists y.marry(x,y)) + all x.(bark(x) -> dog(x)) + all x y.(marry(x,y) -> (person(x) & person(y))) + -(Vincent = Mia) + -(Vincent = Fido) + -(Mia = Fido) + +The background information allows us to reject three of the threads as +inconsistent. To see what remains, use the ``filter=True`` parameter +on ``readings()``. + + >>> dt.readings(filter=True) + d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] + +The ``models()`` method gives us more information about the surviving thread. + + >>> dt.models() + -------------------------------------------------------------------------------- + Model for Discourse Thread d0 + -------------------------------------------------------------------------------- + No model found! 
+ + -------------------------------------------------------------------------------- + Model for Discourse Thread d1 + -------------------------------------------------------------------------------- + % number = 1 + % seconds = 0 + + % Interpretation of size 3 + + Fido = 0. + + Mia = 1. + + Vincent = 2. + + f1(0) = 0. + f1(1) = 0. + f1(2) = 2. + + bark(0). + - bark(1). + - bark(2). + + - boxer(0). + - boxer(1). + boxer(2). + + boxerdog(0). + - boxerdog(1). + - boxerdog(2). + + dog(0). + - dog(1). + - dog(2). + + - married(0). + - married(1). + married(2). + + - person(0). + - person(1). + person(2). + + - marry(0,0). + - marry(0,1). + - marry(0,2). + - marry(1,0). + - marry(1,1). + - marry(1,2). + - marry(2,0). + - marry(2,1). + marry(2,2). + + -------------------------------------------------------------------------------- + Model for Discourse Thread d2 + -------------------------------------------------------------------------------- + No model found! + + -------------------------------------------------------------------------------- + Model for Discourse Thread d3 + -------------------------------------------------------------------------------- + No model found! + + Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']: + s0-r0: boxer(Vincent) + s1-r0: boxer(Fido) + s2-r0: married(Vincent) + s3-r0: bark(Fido) + + Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']: + s0-r0: boxer(Vincent) + s1-r1: boxerdog(Fido) + s2-r0: married(Vincent) + s3-r0: bark(Fido) + + Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']: + s0-r1: boxerdog(Vincent) + s1-r0: boxer(Fido) + s2-r0: married(Vincent) + s3-r0: bark(Fido) + + Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']: + s0-r1: boxerdog(Vincent) + s1-r1: boxerdog(Fido) + s2-r0: married(Vincent) + s3-r0: bark(Fido) + + + +.. This will not be visible in the html output: create a tempdir to + play in. + >>> import tempfile, os + >>> tempdir = tempfile.mkdtemp() + >>> old_dir = os.path.abspath('.') + >>> os.chdir(tempdir) + +In order to play around with your own version of background knowledge, +you might want to start off with a local copy of ``background.fol``: + + >>> nltk.data.retrieve('grammars/book_grammars/background.fol') + Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol' + +After you have modified the file, the ``load_fol()`` function will parse +the strings in the file into expressions of ``nltk.sem.logic``. + + >>> from nltk.inference.discourse import load_fol + >>> mybg = load_fol(open('background.fol').read()) + +The result can be loaded as an argument of ``add_background()`` in the +manner shown earlier. + +.. This will not be visible in the html output: clean up the tempdir. + >>> os.chdir(old_dir) + >>> for f in os.listdir(tempdir): + ... os.remove(os.path.join(tempdir, f)) + >>> os.rmdir(tempdir) + >>> nltk.data.clear_cache() + + +Regression Testing from book +============================ + + >>> logic._counter._value = 0 + + >>> from nltk.tag import RegexpTagger + >>> tagger = RegexpTagger( + ... [('^(chases|runs)$', 'VB'), + ... ('^(a)$', 'ex_quant'), + ... ('^(every)$', 'univ_quant'), + ... ('^(dog|boy)$', 'NN'), + ... ('^(He)$', 'PRP') + ... 
]) + >>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger)) + >>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc) + >>> dt.readings() + + s0 readings: + + s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))]) + s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))]) + + s1 readings: + + s1-r0: ([z1],[PRO(z1), runs(z1)]) + >>> dt.readings(show_thread_readings=True) + d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)]) + d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException + >>> dt.readings(filter=True, show_thread_readings=True) + d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)]) + + >>> logic._counter._value = 0 + + >>> from nltk.parse import FeatureEarleyChartParser + >>> from nltk.sem.drt import DrtParser + >>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser()) + >>> parser = FeatureEarleyChartParser(grammar, trace=0) + >>> trees = parser.parse('Angus owns a dog'.split()) + >>> print(list(trees)[0].label()['SEM'].simplify().normalize()) + ([z1,z2],[Angus(z1), dog(z2), own(z1,z2)]) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/featgram.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/featgram.doctest new file mode 100644 index 0000000000000000000000000000000000000000..914a02b5183f57bd783bf6ecec6ca9c241c8ce24 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/featgram.doctest @@ -0,0 +1,610 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================= + Feature Grammar Parsing +========================= + +.. definitions from nltk_book/definitions.rst + +.. role:: feat + :class: feature +.. role:: fval + :class: fval +.. |rarr| unicode:: U+2192 .. right arrow +.. |dot| unicode:: U+2022 .. bullet +.. |pi| unicode:: U+03C0 + +Grammars can be parsed from strings. + + >>> import nltk + >>> from nltk import grammar, parse + >>> g = """ + ... % start DP + ... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a] + ... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that' + ... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those' + ... D[AGR=[NUM='pl', PERS=1]] -> 'we' + ... D[AGR=[PERS=2]] -> 'you' + ... N[AGR=[NUM='sg', GND='m']] -> 'boy' + ... N[AGR=[NUM='pl', GND='m']] -> 'boys' + ... N[AGR=[NUM='sg', GND='f']] -> 'girl' + ... N[AGR=[NUM='pl', GND='f']] -> 'girls' + ... N[AGR=[NUM='sg']] -> 'student' + ... N[AGR=[NUM='pl']] -> 'students' + ... """ + >>> grammar = grammar.FeatureGrammar.fromstring(g) + >>> tokens = 'these girls'.split() + >>> parser = parse.FeatureEarleyChartParser(grammar) + >>> trees = parser.parse(tokens) + >>> for tree in trees: print(tree) + (DP[AGR=[GND='f', NUM='pl', PERS=3]] + (D[AGR=[NUM='pl', PERS=3]] these) + (N[AGR=[GND='f', NUM='pl']] girls)) + +In general, when we are trying to develop even a very small grammar, +it is convenient to put the rules in a file where they can be edited, +tested and revised. Let's assume that we have saved feat0cfg as a file named +``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. 
We can +inspect it as follows: + + >>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg') + % start S + # ################### + # Grammar Productions + # ################### + # S expansion productions + S -> NP[NUM=?n] VP[NUM=?n] + # NP expansion productions + NP[NUM=?n] -> N[NUM=?n] + NP[NUM=?n] -> PropN[NUM=?n] + NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] + NP[NUM=pl] -> N[NUM=pl] + # VP expansion productions + VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n] + VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP + # ################### + # Lexical Productions + # ################### + Det[NUM=sg] -> 'this' | 'every' + Det[NUM=pl] -> 'these' | 'all' + Det -> 'the' | 'some' | 'several' + PropN[NUM=sg]-> 'Kim' | 'Jody' + N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' + N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' + IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks' + TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes' + IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk' + TV[TENSE=pres, NUM=pl] -> 'see' | 'like' + IV[TENSE=past] -> 'disappeared' | 'walked' + TV[TENSE=past] -> 'saw' | 'liked' + +Assuming we have saved feat0cfg as a file named +``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to +read the grammar into NLTK, ready for use in parsing. + + + >>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1) + >>> sent = 'Kim likes children' + >>> tokens = sent.split() + >>> tokens + ['Kim', 'likes', 'children'] + >>> trees = cp.parse(tokens) + |.Kim .like.chil.| + |[----] . .| [0:1] 'Kim' + |. [----] .| [1:2] 'likes' + |. . [----]| [2:3] 'children' + |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' * + |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] * + |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'} + |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' * + |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'} + |. . [----]| [2:3] N[NUM='pl'] -> 'children' * + |. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] * + |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'} + |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] * + |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] * + >>> for tree in trees: print(tree) + (S[] + (NP[NUM='sg'] (PropN[NUM='sg'] Kim)) + (VP[NUM='sg', TENSE='pres'] + (TV[NUM='sg', TENSE='pres'] likes) + (NP[NUM='pl'] (N[NUM='pl'] children)))) + +The parser works directly with +the underspecified productions given by the grammar. That is, the +Predictor rule does not attempt to compile out all admissible feature +combinations before trying to expand the non-terminals on the left hand +side of a production. However, when the Scanner matches an input word +against a lexical production that has been predicted, the new edge will +typically contain fully specified features; e.g., the edge +[PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. Recall from +Chapter 8 that the Fundamental (or Completer) Rule in +standard CFGs is used to combine an incomplete edge that's expecting a +nonterminal *B* with a following, complete edge whose left hand side +matches *B*. In our current setting, rather than checking for a +complete match, we test whether the expected category *B* will +unify with the left hand side *B'* of a following complete +edge. 
We will explain in more detail in Section 9.2 how +unification works; for the moment, it is enough to know that as a +result of unification, any variable values of features in *B* will be +instantiated by constant values in the corresponding feature structure +in *B'*, and these instantiated values will be used in the new edge +added by the Completer. This instantiation can be seen, for example, +in the edge +[NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)] +in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:. + +Feature structures in NLTK are ... Atomic feature values can be strings or +integers. + + >>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg') + >>> print(fs1) + [ NUM = 'sg' ] + [ TENSE = 'past' ] + +We can think of a feature structure as being like a Python dictionary, +and access its values by indexing in the usual way. + + >>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem') + >>> print(fs1['GND']) + fem + +We can also define feature structures which have complex values, as +discussed earlier. + + >>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1) + >>> print(fs2) + [ [ GND = 'fem' ] ] + [ AGR = [ NUM = 'pl' ] ] + [ [ PER = 3 ] ] + [ ] + [ POS = 'N' ] + >>> print(fs2['AGR']) + [ GND = 'fem' ] + [ NUM = 'pl' ] + [ PER = 3 ] + >>> print(fs2['AGR']['PER']) + 3 + +Feature structures can also be constructed using the ``parse()`` +method of the ``nltk.FeatStruct`` class. Note that in this case, atomic +feature values do not need to be enclosed in quotes. + + >>> f1 = nltk.FeatStruct("[NUMBER = sg]") + >>> f2 = nltk.FeatStruct("[PERSON = 3]") + >>> print(nltk.unify(f1, f2)) + [ NUMBER = 'sg' ] + [ PERSON = 3 ] + + >>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]") + >>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]") + >>> print(nltk.unify(f1, f2)) + [ [ B = 'b' ] ] + [ A = [ C = 'c' ] ] + [ [ D = 'd' ] ] + + +Feature Structures as Graphs +---------------------------- + +Feature structures are not inherently tied to linguistic objects; they are +general purpose structures for representing knowledge. For example, we +could encode information about a person in a feature structure: + + >>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]") + >>> print(person01) + [ AGE = 33 ] + [ NAME = 'Lee' ] + [ TELNO = '01 27 86 42 96' ] + +There are a number of notations for representing reentrancy in +matrix-style representations of feature structures. In NLTK, we adopt +the following convention: the first occurrence of a shared feature structure +is prefixed with an integer in parentheses, such as ``(1)``, and any +subsequent reference to that structure uses the notation +``->(1)``, as shown below. + + + >>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], + ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") + >>> print(fs) + [ ADDRESS = (1) [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ NAME = 'Lee' ] + [ ] + [ SPOUSE = [ ADDRESS -> (1) ] ] + [ [ NAME = 'Kim' ] ] + +There can be any number of tags within a single feature structure. 
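+
+Structure sharing is token identity, not mere equality of values: both
+paths lead to one and the same object. A quick sketch, using the ``fs``
+structure just defined::
+
+    shared = fs['ADDRESS']
+    also_shared = fs['SPOUSE']['ADDRESS']
+    # Reentrant values are a single shared object, so this prints True.
+    print(shared is also_shared)
+
+The next example shows a structure with two independent tags: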
+ + >>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]") + >>> print(fs3) + [ A = (1) [ B = 'b' ] ] + [ ] + [ C = (2) [] ] + [ ] + [ D -> (1) ] + [ E -> (2) ] + >>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal') + >>> fs2 = nltk.FeatStruct(CITY='Paris') + >>> print(nltk.unify(fs1, fs2)) + [ CITY = 'Paris' ] + [ NUMBER = 74 ] + [ STREET = 'rue Pascal' ] + +Unification is symmetric: + + >>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1) + True + +Unification is commutative: + + >>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96') + >>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3)) + True + +Unification between *FS*:math:`_0` and *FS*:math:`_1` will fail if the +two feature structures share a path |pi|, +but the value of |pi| in *FS*:math:`_0` is a distinct +atom from the value of |pi| in *FS*:math:`_1`. In NLTK, +this is implemented by setting the result of unification to be +``None``. + + >>> fs0 = nltk.FeatStruct(A='a') + >>> fs1 = nltk.FeatStruct(A='b') + >>> print(nltk.unify(fs0, fs1)) + None + +Now, if we look at how unification interacts with structure-sharing, +things become really interesting. + + + + >>> fs0 = nltk.FeatStruct("""[NAME=Lee, + ... ADDRESS=[NUMBER=74, + ... STREET='rue Pascal'], + ... SPOUSE= [NAME=Kim, + ... ADDRESS=[NUMBER=74, + ... STREET='rue Pascal']]]""") + >>> print(fs0) + [ ADDRESS = [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ NAME = 'Lee' ] + [ ] + [ [ ADDRESS = [ NUMBER = 74 ] ] ] + [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] + [ [ ] ] + [ [ NAME = 'Kim' ] ] + + + >>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]") + >>> print(nltk.unify(fs0, fs1)) + [ ADDRESS = [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ NAME = 'Lee' ] + [ ] + [ [ [ CITY = 'Paris' ] ] ] + [ [ ADDRESS = [ NUMBER = 74 ] ] ] + [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] + [ [ ] ] + [ [ NAME = 'Kim' ] ] + + >>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], + ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") + + + >>> print(fs2) + [ ADDRESS = (1) [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ NAME = 'Lee' ] + [ ] + [ SPOUSE = [ ADDRESS -> (1) ] ] + [ [ NAME = 'Kim' ] ] + + + >>> print(nltk.unify(fs2, fs1)) + [ [ CITY = 'Paris' ] ] + [ ADDRESS = (1) [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ NAME = 'Lee' ] + [ ] + [ SPOUSE = [ ADDRESS -> (1) ] ] + [ [ NAME = 'Kim' ] ] + + + >>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]") + >>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]") + >>> print(fs2) + [ ADDRESS1 = ?x ] + [ ADDRESS2 = ?x ] + >>> print(nltk.unify(fs1, fs2)) + [ ADDRESS1 = (1) [ NUMBER = 74 ] ] + [ [ STREET = 'rue Pascal' ] ] + [ ] + [ ADDRESS2 -> (1) ] + + + + + >>> sent = 'who do you claim that you like' + >>> tokens = sent.split() + >>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1) + >>> trees = cp.parse(tokens) + |.w.d.y.c.t.y.l.| + |[-] . . . . . .| [0:1] 'who' + |. [-] . . . . .| [1:2] 'do' + |. . [-] . . . .| [2:3] 'you' + |. . . [-] . . .| [3:4] 'claim' + |. . . . [-] . .| [4:5] 'that' + |. . . . . [-] .| [5:6] 'you' + |. . . . . . [-]| [6:7] 'like' + |# . . . . . . .| [0:0] NP[]/NP[] -> * + |. # . . . . . .| [1:1] NP[]/NP[] -> * + |. . # . . . . .| [2:2] NP[]/NP[] -> * + |. . . # . . . .| [3:3] NP[]/NP[] -> * + |. . . . # . . .| [4:4] NP[]/NP[] -> * + |. . . . . # . .| [5:5] NP[]/NP[] -> * + |. . . . . . # .| [6:6] NP[]/NP[] -> * + |. . . . . . . #| [7:7] NP[]/NP[] -> * + |[-] . . . . . 
.| [0:1] NP[+WH] -> 'who' * + |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} + |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} + |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} + |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * + |. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {} + |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} + |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} + |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} + |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * + |. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {} + |. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} + |. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {} + |. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {} + |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} + |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * + |. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {} + |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} + |. . . . [-] . .| [4:5] Comp[] -> 'that' * + |. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {} + |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} + |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * + |. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {} + |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} + |. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {} + |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * + |. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {} + |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} + |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * + |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * + |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * + |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * + |. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * + |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * + |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * + + >>> trees = list(trees) + >>> for tree in trees: print(tree) + (S[-INV] + (NP[+WH] who) + (S[+INV]/NP[] + (V[+AUX] do) + (NP[-WH] you) + (VP[]/NP[] + (V[-AUX, SUBCAT='clause'] claim) + (SBar[]/NP[] + (Comp[] that) + (S[-INV]/NP[] + (NP[-WH] you) + (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] ))))))) + +A different parser should give the same parse trees, but perhaps in a different order: + + >>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1, + ... parser=parse.FeatureEarleyChartParser) + >>> trees2 = cp2.parse(tokens) + |.w.d.y.c.t.y.l.| + |[-] . . . . . .| [0:1] 'who' + |. [-] . . . . .| [1:2] 'do' + |. . [-] . . . .| [2:3] 'you' + |. . . [-] . . .| [3:4] 'claim' + |. . . . [-] . .| [4:5] 'that' + |. . . . . [-] .| [5:6] 'you' + |. . . . . . [-]| [6:7] 'like' + |> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {} + |> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} + |> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {} + |> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {} + |> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {} + |> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} + |> . . . . . . .| [0:0] NP[+WH] -> * 'who' {} + |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * + |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} + |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} + |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} + |. > . . . . . 
.| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} + |. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} + |. > . . . . . .| [1:1] V[+AUX] -> * 'do' {} + |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} + |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} + |. > . . . . . .| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} + |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {} + |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} + |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} + |. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {} + |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * + |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} + |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} + |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} + |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {} + |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} + |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} + |. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {} + |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} + |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} + |. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} + |. . > . . . . .| [2:2] NP[-WH] -> * 'you' {} + |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * + |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} + |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} + |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} + |. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} + |. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {} + |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * + |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} + |. . . . > . . .| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {} + |. . . . > . . .| [4:4] Comp[] -> * 'that' {} + |. . . . [-] . .| [4:5] Comp[] -> 'that' * + |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} + |. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} + |. . . . . > . .| [5:5] NP[-WH] -> * 'you' {} + |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * + |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} + |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} + |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} + |. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} + |. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {} + |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * + |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} + |. . . . . . . #| [7:7] NP[]/NP[] -> * + |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * + |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * + |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * + |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * + |. 
[-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * + |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * + + >>> sorted(trees) == sorted(trees2) + True + + +Let's load a German grammar: + + >>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0) + >>> sent = 'die Katze sieht den Hund' + >>> tokens = sent.split() + >>> trees = cp.parse(tokens) + >>> for tree in trees: print(tree) + (S[] + (NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] + (Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die) + (N[AGR=[GND='fem', NUM='sg', PER=3]] Katze)) + (VP[AGR=[NUM='sg', PER=3]] + (TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht) + (NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] + (Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den) + (N[AGR=[GND='masc', NUM='sg', PER=3]] Hund)))) + +Grammar with Binding Operators +------------------------------ +The bindop.fcfg grammar is a semantic grammar that uses lambda +calculus. Each element has a core semantics, which is a single lambda +calculus expression; and a set of binding operators, which bind +variables. + +In order to make the binding operators work right, they need to +instantiate their bound variable every time they are added to the +chart. To do this, we use a special subclass of `Chart`, called +`InstantiateVarsChart`. + + >>> from nltk.parse.featurechart import InstantiateVarsChart + >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1, + ... chart_class=InstantiateVarsChart) + >>> print(cp.grammar()) + Grammar with 15 productions (start state = S[]) + S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]] + VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]] + VP[SEM=?s] -> IV[SEM=?s] + NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]] + Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' + N[SEM=[BO={/}, CORE=]] -> 'dog' + N[SEM=[BO={/}, CORE=]] -> 'cat' + N[SEM=[BO={/}, CORE=]] -> 'mouse' + IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' + IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats' + IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks' + TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' + TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks' + NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john' + NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex' + +A simple intransitive sentence: + + >>> from nltk.sem import logic + >>> logic._counter._value = 100 + + >>> trees = cp.parse('john barks'.split()) + |. john.barks.| + |[-----] .| [0:1] 'john' + |. [-----]| [1:2] 'barks' + |[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] -> 'john' * + |[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } + |. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' * + |. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] * + |[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] * + >>> for tree in trees: print(tree) + (S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] + (NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] john) + (VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] + (IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks))) + +A transitive sentence: + + >>> trees = cp.parse('john feeds a dog'.split()) + |.joh.fee. a .dog.| + |[---] . . .| [0:1] 'john' + |. [---] . 
.| [1:2] 'feeds' + |. . [---] .| [2:3] 'a' + |. . . [---]| [3:4] 'dog' + |[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] -> 'john' * + |[---> . . .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } + |. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' * + |. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: } + |. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' * + |. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: } + |. . . [---]| [3:4] N[SEM=[BO={/}, CORE=]] -> 'dog' * + |. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=]] * + |. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: } + |. [-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=]] * + |[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] * + + >>> for tree in trees: print(tree) + (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] + (NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] john) + (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] + (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) + (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] + (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) + (N[SEM=[BO={/}, CORE=]] dog)))) + +Turn down the verbosity: + + >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0, + ... chart_class=InstantiateVarsChart) + +Reuse the same lexical item twice: + + >>> trees = cp.parse('john feeds john'.split()) + >>> for tree in trees: print(tree) + (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=]] + (NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=]] john) + (VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]] + (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) + (NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=]] john))) + + >>> trees = cp.parse('a dog feeds a dog'.split()) + >>> for tree in trees: print(tree) + (S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] + (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=]] + (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) + (N[SEM=[BO={/}, CORE=]] dog)) + (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] + (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) + (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=]] + (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) + (N[SEM=[BO={/}, CORE=]] dog)))) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim.doctest new file mode 100644 index 0000000000000000000000000000000000000000..4a44ceed0efb76b764ce1e497021a92ef57b6345 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim.doctest @@ -0,0 +1,141 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT
+
+=======================================
+Demonstrate word embedding using Gensim
+=======================================
+
+    >>> from nltk.test.gensim_fixt import setup_module
+    >>> setup_module()
+
+We demonstrate three tasks:
+
+- Train word embeddings on the Brown Corpus;
+- Load a pre-trained model and perform simple queries with it; and
+- Prune the pre-trained binary model.
+
+    >>> import gensim
+
+---------------
+Train the model
+---------------
+
+Here we train a word embedding model on the Brown Corpus:
+
+    >>> from nltk.corpus import brown
+    >>> train_set = brown.sents()[:10000]
+    >>> model = gensim.models.Word2Vec(train_set)
+
+It might take some time to train the model, so once it is trained, it can be saved and reloaded as follows:
+
+    >>> model.save('brown.embedding')
+    >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
+
+The model maps each word of its vocabulary to an embedding vector, and we can easily retrieve the vector for a given word.
+
+    >>> len(new_model.wv['university'])
+    100
+
+Gensim also implements a number of supporting functions for manipulating word embeddings.
+For example, to compute the cosine similarity between 2 words:
+
+    >>> new_model.wv.similarity('university','school') > 0.3
+    True
+
+---------------------------
+Using the pre-trained model
+---------------------------
+
+NLTK includes a sample of a pre-trained model, trained on 100 billion words from the Google News dataset.
+The full model is available from https://code.google.com/p/word2vec/ (about 3 GB).
+
+    >>> from nltk.data import find
+    >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
+    >>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
+
+The sample was pruned to include only the most common words (~44k words).
+
+    >>> len(model)
+    43981
+
+Each word is represented as a vector in a space of 300 dimensions:
+
+    >>> len(model['university'])
+    300
+
+Finding the top n words that are most similar to a target word is simple. The result is a list of n words, each with its similarity score.
+
+    >>> model.most_similar(positive=['university'], topn = 3)
+    [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)]
+
+Finding the word that does not belong in a list is also supported, although implementing this yourself would be simple.
+
+    >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
+    'cereal'
+
+Mikolov et al. (2013) showed that word embeddings capture many syntactic and semantic regularities. For example,
+the vector 'King - Man + Woman' is close to 'Queen', and 'Germany - Berlin + Paris' is close to 'France'.
+
+    >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
+    [('queen', 0.71181...)]
+
+    >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
+    [('France', 0.78840...)]
+
+We can visualize the word embeddings using t-SNE (https://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
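+
+Before plotting, it helps to see the raw ingredients: ``model.index_to_key``
+lists the vocabulary in frequency order, and indexing the model with a word
+returns its vector. A small sketch, assuming the pruned sample model loaded
+above::
+
+    import numpy as np
+
+    words = model.index_to_key[:5]          # the five most frequent entries
+    X = np.array([model[w] for w in words])
+    print(X.shape)                          # one 300-dimensional row per word
+
+The snippet below stacks 1000 such rows, reduces them to 50 dimensions
+with PCA, and then to two dimensions with t-SNE: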
+
+| import numpy as np
+| labels = []
+| count = 0
+| max_count = 1000
+| X = np.zeros(shape=(max_count, len(model['university'])))
+|
+| for term in model.index_to_key:
+|     X[count] = model[term]
+|     labels.append(term)
+|     count += 1
+|     if count >= max_count: break
+|
+| # It is recommended to use PCA first to reduce to ~50 dimensions
+| from sklearn.decomposition import PCA
+| pca = PCA(n_components=50)
+| X_50 = pca.fit_transform(X)
+|
+| # Using TSNE to further reduce to 2 dimensions
+| from sklearn.manifold import TSNE
+| model_tsne = TSNE(n_components=2, random_state=0)
+| Y = model_tsne.fit_transform(X_50)
+|
+| # Show the scatter plot
+| import matplotlib.pyplot as plt
+| plt.scatter(Y[:, 0], Y[:, 1], 20)
+|
+| # Add labels
+| for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
+|     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=10)
+|
+| plt.show()
+
+------------------------------
+Prune the trained binary model
+------------------------------
+
+Here is the supporting code used to extract part of the binary model
+(GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/.
+This is how the `word2vec_sample` model was built.
+
+| import gensim
+| # Load the binary model
+| model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
+|
+| # Only keep words that appear in the Brown Corpus
+| from nltk.corpus import brown
+| words = set(brown.words())
+| print(len(words))
+|
+| # Write the retained words and their vectors to a file
+| out_file = 'pruned.word2vec.txt'
+| with open(out_file, 'w') as f:
+|     word_presented = words.intersection(model.index_to_key)
+|     f.write('{} {}\n'.format(len(word_presented), len(model['word'])))
+|
+|     for word in word_presented:
+|         f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim_fixt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee6855f3d46863f07f7e137be3f2a0fc37e7dcc3
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gensim_fixt.py
@@ -0,0 +1,4 @@
+def setup_module():
+    import pytest
+
+    pytest.importorskip("gensim")
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..7f32bb33f684adda7f3e48c4645084fd2087f2fb
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics.doctest
@@ -0,0 +1,383 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. 
For license information, see LICENSE.TXT + +============================================================================== + Glue Semantics +============================================================================== + + + +====================== +Linear logic +====================== + + >>> from nltk.sem import logic + >>> from nltk.sem.glue import * + >>> from nltk.sem.linearlogic import * + + >>> from nltk.sem.linearlogic import Expression + >>> read_expr = Expression.fromstring + +Parser + + >>> print(read_expr(r'f')) + f + >>> print(read_expr(r'(g -o f)')) + (g -o f) + >>> print(read_expr(r'(g -o (h -o f))')) + (g -o (h -o f)) + >>> print(read_expr(r'((g -o G) -o G)')) + ((g -o G) -o G) + >>> print(read_expr(r'(g -o f)(g)')) + (g -o f)(g) + >>> print(read_expr(r'((g -o G) -o G)((g -o f))')) + ((g -o G) -o G)((g -o f)) + +Simplify + + >>> print(read_expr(r'f').simplify()) + f + >>> print(read_expr(r'(g -o f)').simplify()) + (g -o f) + >>> print(read_expr(r'((g -o G) -o G)').simplify()) + ((g -o G) -o G) + >>> print(read_expr(r'(g -o f)(g)').simplify()) + f + >>> try: read_expr(r'(g -o f)(f)').simplify() + ... except LinearLogicApplicationException as e: print(e) + ... + Cannot apply (g -o f) to f. Cannot unify g with f given {} + >>> print(read_expr(r'(G -o f)(g)').simplify()) + f + >>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify()) + f + +Test BindingDict + + >>> h = ConstantExpression('h') + >>> g = ConstantExpression('g') + >>> f = ConstantExpression('f') + + >>> H = VariableExpression('H') + >>> G = VariableExpression('G') + >>> F = VariableExpression('F') + + >>> d1 = BindingDict({H: h}) + >>> d2 = BindingDict({F: f, G: F}) + >>> d12 = d1 + d2 + >>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d] + >>> all12.sort() + >>> print(all12) + ['F: f', 'G: f', 'H: h'] + + >>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h}) + True + + >>> d4 = BindingDict({F: f}) + >>> try: d4[F] = g + ... except VariableBindingException as e: print(e) + Variable F already bound to another value + +Test Unify + + >>> try: f.unify(g, BindingDict()) + ... except UnificationException as e: print(e) + ... + Cannot unify f with g given {} + + >>> f.unify(G, BindingDict()) == BindingDict({G: f}) + True + >>> try: f.unify(G, BindingDict({G: h})) + ... except UnificationException as e: print(e) + ... + Cannot unify f with G given {G: h} + >>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f}) + True + >>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f}) + True + + >>> G.unify(f, BindingDict()) == BindingDict({G: f}) + True + >>> try: G.unify(f, BindingDict({G: h})) + ... except UnificationException as e: print(e) + ... + Cannot unify G with f given {G: h} + >>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f}) + True + >>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f}) + True + + >>> G.unify(F, BindingDict()) == BindingDict({G: F}) + True + >>> try: G.unify(F, BindingDict({G: H})) + ... except UnificationException as e: print(e) + ... 
+    Cannot unify G with F given {G: H}
+    >>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F})
+    True
+    >>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F})
+    True
+
+Test Compile
+
+    >>> print(read_expr('g').compile_pos(Counter(), GlueFormula))
+    (<ConstantExpression g>, [])
+    >>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula))
+    (<ImpExpression (g -o f)>, [])
+    >>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula))
+    (<ImpExpression (g -o (h -o f))>, [])
+
+
+======================
+Glue
+======================
+
+Demo of "John walks"
+--------------------
+
+    >>> john = GlueFormula("John", "g")
+    >>> print(john)
+    John : g
+    >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
+    >>> print(walks)
+    \x.walks(x) : (g -o f)
+    >>> print(walks.applyto(john))
+    \x.walks(x)(John) : (g -o f)(g)
+    >>> print(walks.applyto(john).simplify())
+    walks(John) : f
+
+
+Demo of "A man walks"
+---------------------
+
+    >>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
+    >>> print(a)
+    \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
+    >>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)")
+    >>> print(man)
+    \x.man(x) : (gv -o gr)
+    >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
+    >>> print(walks)
+    \x.walks(x) : (g -o f)
+    >>> a_man = a.applyto(man)
+    >>> print(a_man.simplify())
+    \Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G)
+    >>> a_man_walks = a_man.applyto(walks)
+    >>> print(a_man_walks.simplify())
+    exists x.(man(x) & walks(x)) : f
+
+
+Demo of 'every girl chases a dog'
+---------------------------------
+
+Individual words:
+
+    >>> every = GlueFormula("\\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
+    >>> print(every)
+    \P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
+    >>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)")
+    >>> print(girl)
+    \x.girl(x) : (gv -o gr)
+    >>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))")
+    >>> print(chases)
+    \x y.chases(x,y) : (g -o (h -o f))
+    >>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((hv -o hr) -o ((h -o H) -o H))")
+    >>> print(a)
+    \P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H))
+    >>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)")
+    >>> print(dog)
+    \x.dog(x) : (hv -o hr)
+
+Noun Quantification can only be done one way:
+
+    >>> every_girl = every.applyto(girl)
+    >>> print(every_girl.simplify())
+    \Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G)
+    >>> a_dog = a.applyto(dog)
+    >>> print(a_dog.simplify())
+    \Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H)
+
+The first reading is achieved by combining 'chases' with 'a dog' first.
+Since 'a dog' requires something of the form '(h -o H)', we must
+get rid of the 'g' in the glue of 'chases'. We will do this with
+the '-o elimination' rule. So x1 will be our subject placeholder.
+
+    >>> xPrime = GlueFormula("x1", "g")
+    >>> print(xPrime)
+    x1 : g
+    >>> xPrime_chases = chases.applyto(xPrime)
+    >>> print(xPrime_chases.simplify())
+    \y.chases(x1,y) : (h -o f)
+    >>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases)
+    >>> print(xPrime_chases_a_dog.simplify())
+    exists x.(dog(x) & chases(x1,x)) : f
+
+Now we can retract our subject placeholder using lambda-abstraction and
+combine with the true subject.
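+
+In other words, '-o elimination' is ordinary function application to a
+temporary constant, and lambda-abstraction later retracts that constant.
+A minimal sketch of the pattern (our own illustration, built only from the
+`GlueFormula` methods used above; the helper name is not part of NLTK)::
+
+    from nltk.sem.glue import GlueFormula
+
+    def eliminate_and_abstract(functor, quantifier, name, glue):
+        # '-o elimination': saturate the functor with a placeholder constant.
+        placeholder = GlueFormula(name, glue)
+        combined = quantifier.applyto(functor.applyto(placeholder))
+        # Retract the placeholder again by lambda-abstraction.
+        return combined.lambda_abstract(placeholder)
+
+The derivation below carries out exactly these steps: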
+ + >>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime) + >>> print(chases_a_dog.simplify()) + \x1.exists x.(dog(x) & chases(x1,x)) : (g -o f) + >>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog) + >>> r1 = every_girl_chases_a_dog.simplify() + >>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f') + >>> r1 == r2 + True + +The second reading is achieved by combining 'every girl' with 'chases' first. + + >>> xPrime = GlueFormula("x1", "g") + >>> print(xPrime) + x1 : g + >>> xPrime_chases = chases.applyto(xPrime) + >>> print(xPrime_chases.simplify()) + \y.chases(x1,y) : (h -o f) + >>> yPrime = GlueFormula("x2", "h") + >>> print(yPrime) + x2 : h + >>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime) + >>> print(xPrime_chases_yPrime.simplify()) + chases(x1,x2) : f + >>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime) + >>> print(chases_yPrime.simplify()) + \x1.chases(x1,x2) : (g -o f) + >>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime) + >>> print(every_girl_chases_yPrime.simplify()) + all x.(girl(x) -> chases(x,x2)) : f + >>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime) + >>> print(every_girl_chases.simplify()) + \x2.all x.(girl(x) -> chases(x,x2)) : (h -o f) + >>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases) + >>> r1 = every_girl_chases_a_dog.simplify() + >>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f') + >>> r1 == r2 + True + + +Compilation +----------- + + >>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp) + m : (b -o a) : {1} + >>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp) + v1 : c : {1} + m : (b[1] -o a) : {2} + >>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp) + v1 : c : {1} + v2 : d : {2} + m : (b[1, 2] -o a) : {3} + >>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp) + v1 : d : {1} + v2 : c : {2} + m : (e[1] -o (b[2] -o a)) : {3} + >>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp) + v1 : (d -o c) : {1} + m : (b[1] -o a) : {2} + >>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp) + v1 : e : {1} + v2 : (d[1] -o c) : {2} + m : (b[2] -o a) : {3} + + +Demo of 'a man walks' using Compilation +--------------------------------------- + +Premises + + >>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))') + >>> print(a) + \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) + + >>> man = GlueFormula('\\x.man(x)', '(gv -o gr)') + >>> print(man) + \x.man(x) : (gv -o gr) + + >>> walks = GlueFormula('\\x.walks(x)', '(g -o f)') + >>> print(walks) + \x.walks(x) : (g -o f) + +Compiled Premises: + + >>> counter = Counter() + >>> ahc = a.compile(counter) + >>> g1 = ahc[0] + >>> print(g1) + v1 : gv : {1} + >>> g2 = ahc[1] + >>> print(g2) + v2 : g : {2} + >>> g3 = ahc[2] + >>> print(g3) + \P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3} + >>> g4 = man.compile(counter)[0] + >>> print(g4) + \x.man(x) : (gv -o gr) : {4} + >>> g5 = walks.compile(counter)[0] + >>> print(g5) + \x.walks(x) : (g -o f) : {5} + +Derivation: + + >>> g14 = g4.applyto(g1) + >>> print(g14.simplify()) + man(v1) : gr : {1, 4} + >>> g134 = g3.applyto(g14) + >>> print(g134.simplify()) + \Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4} + >>> g25 = g5.applyto(g2) + >>> print(g25.simplify()) + walks(v2) : f : {2, 5} + >>> 
g12345 = g134.applyto(g25) + >>> print(g12345.simplify()) + exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5} + +--------------------------------- +Dependency Graph to Glue Formulas +--------------------------------- + >>> from nltk.corpus.reader.dependency import DependencyGraph + + >>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _ + ... 2 sees _ VB VB _ 0 ROOT _ _ + ... 3 a _ ex_quant ex_quant _ 4 SPEC _ _ + ... 4 dog _ NN NN _ 2 OBJ _ _ + ... """) + >>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph) + >>> print(gfl) # doctest: +SKIP + [\x y.sees(x,y) : (f -o (i -o g)), + \x.dog(x) : (iv -o ir), + \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)), + \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)), + \x.John(x) : (fv -o fr)] + >>> glue = Glue() + >>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str): + ... print(r) + exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2))) + exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1))) + +----------------------------------- +Dependency Graph to LFG f-structure +----------------------------------- + >>> from nltk.sem.lfg import FStructure + + >>> fstruct = FStructure.read_depgraph(depgraph) + + >>> print(fstruct) # doctest: +SKIP + f:[pred 'sees' + obj h:[pred 'dog' + spec 'a'] + subj g:[pred 'John']] + + >>> fstruct.to_depgraph().tree().pprint() + (sees (dog a) John) + +--------------------------------- +LFG f-structure to Glue +--------------------------------- + >>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP + [\x y.sees(x,y) : (i -o (g -o f)), + \x.dog(x) : (gv -o gr), + \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)), + \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)), + \x.John(x) : (iv -o ir)] + +.. see gluesemantics_malt.doctest for more diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt.doctest new file mode 100644 index 0000000000000000000000000000000000000000..75a6fafdfd50c0a7606303ed25869c77a8454eb7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/gluesemantics_malt.doctest @@ -0,0 +1,69 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +.. see also: gluesemantics.doctest + +============================================================================== + Glue Semantics +============================================================================== + + >>> from nltk.test.gluesemantics_malt_fixt import setup_module + >>> setup_module() + + >>> from nltk.sem.glue import * + >>> nltk.sem.logic._counter._value = 0 + +-------------------------------- +Initialize the Dependency Parser +-------------------------------- + >>> from nltk.parse.malt import MaltParser + + >>> tagger = RegexpTagger( + ... [('^(John|Mary)$', 'NNP'), + ... ('^(sees|chases)$', 'VB'), + ... ('^(a)$', 'ex_quant'), + ... ('^(every)$', 'univ_quant'), + ... ('^(girl|dog)$', 'NN') + ... ]).tag + >>> depparser = MaltParser(tagger=tagger) + +-------------------- +Automated Derivation +-------------------- + >>> glue = Glue(depparser=depparser) + >>> readings = glue.parse_to_meaning('every girl chases a dog'.split()) + >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): + ... 
print(reading.normalize()) + all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2))) + exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1))) + + >>> drtglue = DrtGlue(depparser=depparser) + >>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split()) + >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): + ... print(reading) + ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))]) + ([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))]) + +-------------- +With inference +-------------- + +Checking for equality of two DRSs is very useful when generating readings of a sentence. +For example, the ``glue`` module generates two readings for the sentence +*John sees Mary*: + + >>> from nltk.sem.glue import DrtGlue + >>> readings = drtglue.parse_to_meaning('John sees Mary'.split()) + >>> for drs in sorted([r.simplify().normalize() for r in readings], key=str): + ... print(drs) + ([z1,z2],[John(z1), Mary(z2), sees(z1,z2)]) + ([z1,z2],[Mary(z1), John(z2), sees(z2,z1)]) + +However, it is easy to tell that these two readings are logically the +same, and therefore one of them is superfluous. We can use the theorem prover +to determine this equivalence, and then delete one of them. A particular +theorem prover may be specified, or the argument may be left off to use the +default. + + >>> readings[0].equiv(readings[1]) + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/index.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/index.doctest new file mode 100644 index 0000000000000000000000000000000000000000..7d6799700d343d9031f5c187b58875f122999cbf --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/index.doctest @@ -0,0 +1,100 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +.. _align howto: align.html +.. _ccg howto: ccg.html +.. _chat80 howto: chat80.html +.. _childes howto: childes.html +.. _chunk howto: chunk.html +.. _classify howto: classify.html +.. _collocations howto: collocations.html +.. _compat howto: compat.html +.. _corpus howto: corpus.html +.. _data howto: data.html +.. _dependency howto: dependency.html +.. _discourse howto: discourse.html +.. _drt howto: drt.html +.. _featgram howto: featgram.html +.. _featstruct howto: featstruct.html +.. _framenet howto: framenet.html +.. _generate howto: generate.html +.. _gluesemantics howto: gluesemantics.html +.. _gluesemantics_malt howto: gluesemantics_malt.html +.. _grammar howto: grammar.html +.. _grammartestsuites howto: grammartestsuites.html +.. _index howto: index.html +.. _inference howto: inference.html +.. _internals howto: internals.html +.. _japanese howto: japanese.html +.. _logic howto: logic.html +.. _metrics howto: metrics.html +.. _misc howto: misc.html +.. _nonmonotonic howto: nonmonotonic.html +.. _parse howto: parse.html +.. _portuguese_en howto: portuguese_en.html +.. _probability howto: probability.html +.. _propbank howto: propbank.html +.. _relextract howto: relextract.html +.. _resolution howto: resolution.html +.. _semantics howto: semantics.html +.. _simple howto: simple.html +.. _stem howto: stem.html +.. _tag howto: tag.html +.. _tokenize howto: tokenize.html +.. _toolbox howto: toolbox.html +.. _tree howto: tree.html +.. _treetransforms howto: treetransforms.html +.. _util howto: util.html +.. _wordnet howto: wordnet.html +.. 
_wordnet_lch howto: wordnet_lch.html + +=========== +NLTK HOWTOs +=========== + +* `align HOWTO`_ +* `ccg HOWTO`_ +* `chat80 HOWTO`_ +* `childes HOWTO`_ +* `chunk HOWTO`_ +* `classify HOWTO`_ +* `collocations HOWTO`_ +* `compat HOWTO`_ +* `corpus HOWTO`_ +* `data HOWTO`_ +* `dependency HOWTO`_ +* `discourse HOWTO`_ +* `drt HOWTO`_ +* `featgram HOWTO`_ +* `featstruct HOWTO`_ +* `framenet HOWTO`_ +* `generate HOWTO`_ +* `gluesemantics HOWTO`_ +* `gluesemantics_malt HOWTO`_ +* `grammar HOWTO`_ +* `grammartestsuites HOWTO`_ +* `index HOWTO`_ +* `inference HOWTO`_ +* `internals HOWTO`_ +* `japanese HOWTO`_ +* `logic HOWTO`_ +* `metrics HOWTO`_ +* `misc HOWTO`_ +* `nonmonotonic HOWTO`_ +* `parse HOWTO`_ +* `portuguese_en HOWTO`_ +* `probability HOWTO`_ +* `propbank HOWTO`_ +* `relextract HOWTO`_ +* `resolution HOWTO`_ +* `semantics HOWTO`_ +* `simple HOWTO`_ +* `stem HOWTO`_ +* `tag HOWTO`_ +* `tokenize HOWTO`_ +* `toolbox HOWTO`_ +* `tree HOWTO`_ +* `treetransforms HOWTO`_ +* `util HOWTO`_ +* `wordnet HOWTO`_ +* `wordnet_lch HOWTO`_ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/internals.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/internals.doctest new file mode 100644 index 0000000000000000000000000000000000000000..7c6dc8595a2dd9d637a12e0c751ca3fec6bffa98 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/internals.doctest @@ -0,0 +1,161 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================================== + Unit tests for the nltk.utilities module +========================================== + +overridden() +~~~~~~~~~~~~ + >>> from nltk.internals import overridden + +The typical use case is in defining methods for an interface or +abstract base class, in such a way that subclasses don't have to +implement all of the methods: + + >>> class EaterI(object): + ... '''Subclass must define eat() or batch_eat().''' + ... def eat(self, food): + ... if overridden(self.batch_eat): + ... return self.batch_eat([food])[0] + ... else: + ... raise NotImplementedError() + ... def batch_eat(self, foods): + ... return [self.eat(food) for food in foods] + +As long as a subclass implements one method, it will be used to +perform the other method: + + >>> class GoodEater1(EaterI): + ... def eat(self, food): + ... return 'yum' + >>> GoodEater1().eat('steak') + 'yum' + >>> GoodEater1().batch_eat(['steak', 'peas']) + ['yum', 'yum'] + + >>> class GoodEater2(EaterI): + ... def batch_eat(self, foods): + ... return ['yum' for food in foods] + >>> GoodEater2().eat('steak') + 'yum' + >>> GoodEater2().batch_eat(['steak', 'peas']) + ['yum', 'yum'] + +But if a subclass doesn't implement either one, then they'll get an +error when they try to call them. (nb this is better than infinite +recursion): + + >>> class BadEater1(EaterI): + ... pass + >>> BadEater1().eat('steak') + Traceback (most recent call last): + . . . + NotImplementedError + >>> BadEater1().batch_eat(['steak', 'peas']) + Traceback (most recent call last): + . . . + NotImplementedError + +Trying to use the abstract base class itself will also result in an +error: + + >>> class EaterI(EaterI): + ... pass + >>> EaterI().eat('steak') + Traceback (most recent call last): + . . . + NotImplementedError + >>> EaterI().batch_eat(['steak', 'peas']) + Traceback (most recent call last): + . . . + NotImplementedError + +It's ok to use intermediate abstract classes: + + >>> class AbstractEater(EaterI): + ... pass + + >>> class GoodEater3(AbstractEater): + ... def eat(self, food): + ... 
return 'yum' + ... + >>> GoodEater3().eat('steak') + 'yum' + >>> GoodEater3().batch_eat(['steak', 'peas']) + ['yum', 'yum'] + + >>> class GoodEater4(AbstractEater): + ... def batch_eat(self, foods): + ... return ['yum' for food in foods] + >>> GoodEater4().eat('steak') + 'yum' + >>> GoodEater4().batch_eat(['steak', 'peas']) + ['yum', 'yum'] + + >>> class BadEater2(AbstractEater): + ... pass + >>> BadEater2().eat('steak') + Traceback (most recent call last): + . . . + NotImplementedError + >>> BadEater2().batch_eat(['steak', 'peas']) + Traceback (most recent call last): + . . . + NotImplementedError + +Here's some extra tests: + + >>> class A(object): + ... def f(x): pass + >>> class B(A): + ... def f(x): pass + >>> class C(A): pass + >>> class D(B): pass + + >>> overridden(A().f) + False + >>> overridden(B().f) + True + >>> overridden(C().f) + False + >>> overridden(D().f) + True + +It works for classic classes, too: + + >>> class A: + ... def f(x): pass + >>> class B(A): + ... def f(x): pass + >>> class C(A): pass + >>> class D(B): pass + >>> overridden(A().f) + False + >>> overridden(B().f) + True + >>> overridden(C().f) + False + >>> overridden(D().f) + True + + +read_str() +~~~~~~~~~~~~ + >>> from nltk.internals import read_str + +Test valid scenarios + + >>> read_str("'valid string'", 0) + ('valid string', 14) + +Now test invalid scenarios + + >>> read_str("should error", 0) + Traceback (most recent call last): + ... + nltk.internals.ReadError: Expected open quote at 0 + >>> read_str("'should error", 0) + Traceback (most recent call last): + ... + nltk.internals.ReadError: Expected close quote at 1 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/japanese.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/japanese.doctest new file mode 100644 index 0000000000000000000000000000000000000000..7fc4b2a0fcb63d3335423fd51b7627219aa93e6b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/japanese.doctest @@ -0,0 +1,48 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +============================ +Japanese Language Processing +============================ + + >>> from nltk import * + +------------- +Corpus Access +------------- + +KNB Corpus +---------- + + >>> from nltk.corpus import knbc + +Access the words: this should produce a list of strings: + + >>> type(knbc.words()[0]) is not bytes + True + +Access the sentences: this should produce a list of lists of strings: + + >>> type(knbc.sents()[0][0]) is not bytes + True + +Access the tagged words: this should produce a list of word, tag pairs: + + >>> type(knbc.tagged_words()[0]) + <... 'tuple'> + +Access the tagged sentences: this should produce a list of lists of word, tag pairs: + + >>> type(knbc.tagged_sents()[0][0]) + <... 'tuple'> + + +JEITA Corpus +------------ + + >>> from nltk.corpus import jeita + +Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string: + + >>> type(jeita.tagged_words()[0][1]) is not bytes + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/lm.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/lm.doctest new file mode 100644 index 0000000000000000000000000000000000000000..2ccbbe1c4e0b9e006034ceaf9e50fb6ee1c6c0cc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/lm.doctest @@ -0,0 +1,135 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +.. 
-*- coding: utf-8 -*-
+
+
+Regression Tests
+================
+
+
+Issue 167
+---------
+https://github.com/nltk/nltk/issues/167
+
+    >>> from nltk.corpus import brown
+    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
+    >>> ngram_order = 3
+    >>> train_data, vocab_data = padded_everygram_pipeline(
+    ...     ngram_order,
+    ...     brown.sents(categories="news")
+    ... )
+
+    >>> from nltk.lm import WittenBellInterpolated
+    >>> lm = WittenBellInterpolated(ngram_order)
+    >>> lm.fit(train_data, vocab_data)
+
+
+A sentence containing an unseen word should have infinite entropy, because
+Witten-Bell is ultimately based on MLE, which cannot handle unseen ngrams.
+Crucially, it shouldn't raise any exceptions for unseen words.
+
+    >>> from nltk.util import ngrams
+    >>> sent = ngrams("This is a sentence with the word aaddvark".split(), 3)
+    >>> lm.entropy(sent)
+    inf
+
+If we remove all unseen ngrams from the sentence, we'll get a non-infinite value
+for the entropy.
+
+    >>> sent = ngrams("This is a sentence".split(), 3)
+    >>> round(lm.entropy(sent), 14)
+    10.23701322869105
+
+
+Issue 367
+---------
+https://github.com/nltk/nltk/issues/367
+
+Reproducing Dan Blanchard's example:
+https://github.com/nltk/nltk/issues/367#issuecomment-14646110
+
+    >>> from nltk.lm import Lidstone, Vocabulary
+    >>> word_seq = list('aaaababaaccbacb')
+    >>> ngram_order = 2
+    >>> from nltk.util import everygrams
+    >>> train_data = [everygrams(word_seq, max_len=ngram_order)]
+    >>> V = Vocabulary(['a', 'b', 'c', '<s>'])
+    >>> lm = Lidstone(0.2, ngram_order, vocabulary=V)
+    >>> lm.fit(train_data)
+
+For the doctest to work, we have to sort the vocabulary keys.
+
+    >>> V_keys = sorted(V)
+    >>> round(sum(lm.score(w, ("b",)) for w in V_keys), 6)
+    1.0
+    >>> round(sum(lm.score(w, ("a",)) for w in V_keys), 6)
+    1.0
+
+    >>> [lm.score(w, ("b",)) for w in V_keys]
+    [0.05, 0.05, 0.8, 0.05, 0.05]
+    >>> [round(lm.score(w, ("a",)), 4) for w in V_keys]
+    [0.0222, 0.0222, 0.4667, 0.2444, 0.2444]
+
+
+Here we reproduce @afourney's comment:
+https://github.com/nltk/nltk/issues/367#issuecomment-15686289
+
+    >>> sent = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
+    >>> ngram_order = 3
+    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
+    >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, [sent])
+    >>> from nltk.lm import Lidstone
+    >>> lm = Lidstone(0.2, ngram_order)
+    >>> lm.fit(train_data, vocab_data)
+
+The vocabulary includes the "UNK" symbol as well as two padding symbols.
+
+    >>> len(lm.vocab)
+    6
+    >>> word = "foo"
+    >>> context = ("bar", "baz")
+
+The raw counts.
+
+    >>> lm.context_counts(context)[word]
+    0
+    >>> lm.context_counts(context).N()
+    1
+
+Counts with Lidstone smoothing.
+
+    >>> lm.context_counts(context)[word] + lm.gamma
+    0.2
+    >>> lm.context_counts(context).N() + len(lm.vocab) * lm.gamma
+    2.2
+
+Without any backoff, just using Lidstone smoothing, P("foo" | "bar", "baz") should be:
+0.2 / 2.2 ~= 0.090909
+
+    >>> round(lm.score(word, context), 6)
+    0.090909
+
+
+Issue 380
+---------
+https://github.com/nltk/nltk/issues/380
+
+Reproducing a setup akin to this comment:
+https://github.com/nltk/nltk/issues/380#issue-12879030
+
+For speed, take only the first 100 sentences of Reuters; this shouldn't
+affect the test.
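+
+As a side note (an illustration only, not part of the regression test), here
+is a rough sketch of what `padded_everygram_pipeline` yields for a toy corpus,
+assuming NLTK's default padding symbols ``<s>`` and ``</s>``::
+
+    from nltk.lm.preprocessing import padded_everygram_pipeline
+
+    train, vocab = padded_everygram_pipeline(2, [["a", "b"]])
+    # `vocab` is a flat stream of padded tokens: <s>, a, b, </s>
+    print(list(vocab))
+    # `train` yields, per sentence, every 1-gram and 2-gram over the padded tokens
+    print([list(sent) for sent in train])
+
+The regression test itself follows.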
+
+    >>> from nltk.corpus import reuters
+    >>> sents = reuters.sents()[:100]
+    >>> ngram_order = 3
+    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
+    >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, sents)
+
+    >>> from nltk.lm import Lidstone
+    >>> lm = Lidstone(0.2, ngram_order)
+    >>> lm.fit(train_data, vocab_data)
+    >>> lm.score("said", ("<s>",)) < 1
+    True
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/logic.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/logic.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..c853e3794ba796705d2a5954037117dd66fecf15
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/logic.doctest
@@ -0,0 +1,1096 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+=======================
+Logic & Lambda Calculus
+=======================
+
+The `nltk.sem.logic` package allows expressions of First-Order Logic (FOL) to be
+parsed into ``Expression`` objects. In addition to FOL, the parser
+handles lambda-abstraction with variables of higher order.
+
+--------
+Overview
+--------
+
+    >>> from nltk.sem.logic import *
+
+The default inventory of logical constants is the following:
+
+    >>> boolean_ops()
+    negation           -
+    conjunction        &
+    disjunction        |
+    implication        ->
+    equivalence        <->
+    >>> equality_preds()
+    equality           =
+    inequality         !=
+    >>> binding_ops()
+    existential        exists
+    universal          all
+    lambda             \
+
+----------------
+Regression Tests
+----------------
+
+
+Untyped Logic
++++++++++++++
+
+Process logical expressions conveniently:
+
+    >>> read_expr = Expression.fromstring
+
+Test for equality under alpha-conversion
+========================================
+
+    >>> e1 = read_expr('exists x.P(x)')
+    >>> print(e1)
+    exists x.P(x)
+    >>> e2 = e1.alpha_convert(Variable('z'))
+    >>> print(e2)
+    exists z.P(z)
+    >>> e1 == e2
+    True
+
+
+    >>> l = read_expr(r'\X.\X.X(X)(1)').simplify()
+    >>> id = read_expr(r'\X.X(X)')
+    >>> l == id
+    True
+
+Test numerals
+=============
+
+    >>> zero = read_expr(r'\F x.x')
+    >>> one = read_expr(r'\F x.F(x)')
+    >>> two = read_expr(r'\F x.F(F(x))')
+    >>> three = read_expr(r'\F x.F(F(F(x)))')
+    >>> four = read_expr(r'\F x.F(F(F(F(x))))')
+    >>> succ = read_expr(r'\N F x.F(N(F,x))')
+    >>> plus = read_expr(r'\M N F x.M(F,N(F,x))')
+    >>> mult = read_expr(r'\M N F.M(N(F))')
+    >>> pred = read_expr(r'\N F x.(N(\G H.H(G(F)))(\u.x)(\u.u))')
+    >>> v1 = ApplicationExpression(succ, zero).simplify()
+    >>> v1 == one
+    True
+    >>> v2 = ApplicationExpression(succ, v1).simplify()
+    >>> v2 == two
+    True
+    >>> v3 = ApplicationExpression(ApplicationExpression(plus, v1), v2).simplify()
+    >>> v3 == three
+    True
+    >>> v4 = ApplicationExpression(ApplicationExpression(mult, v2), v2).simplify()
+    >>> v4 == four
+    True
+    >>> v5 = ApplicationExpression(pred, ApplicationExpression(pred, v4)).simplify()
+    >>> v5 == two
+    True
+
+Overloaded operators also exist, for convenience.
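+
+The mapping is: ``-`` builds a negation, ``&`` a conjunction, ``|`` a
+disjunction, ``>`` an implication and ``<`` an equivalence. A small sketch of
+the idea (our own illustration, using only the API shown above)::
+
+    from nltk.sem.logic import Expression
+
+    read_expr = Expression.fromstring
+    p, q = read_expr('P'), read_expr('Q')
+
+    print(-p)     # -P
+    print(p & q)  # (P & Q)
+    print(p | q)  # (P | Q)
+    print(p > q)  # (P -> Q)
+    print(p < q)  # (P <-> Q)
+
+The numeral combinators above can be exercised the same way: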
+
+    >>> print(succ(zero).simplify() == one)
+    True
+    >>> print(plus(one,two).simplify() == three)
+    True
+    >>> print(mult(two,two).simplify() == four)
+    True
+    >>> print(pred(pred(four)).simplify() == two)
+    True
+
+    >>> john = read_expr(r'john')
+    >>> man = read_expr(r'\x.man(x)')
+    >>> walk = read_expr(r'\x.walk(x)')
+    >>> man(john).simplify()
+    <ApplicationExpression man(john)>
+    >>> print(-walk(john).simplify())
+    -walk(john)
+    >>> print((man(john) & walk(john)).simplify())
+    (man(john) & walk(john))
+    >>> print((man(john) | walk(john)).simplify())
+    (man(john) | walk(john))
+    >>> print((man(john) > walk(john)).simplify())
+    (man(john) -> walk(john))
+    >>> print((man(john) < walk(john)).simplify())
+    (man(john) <-> walk(john))
+
+Python's built-in lambda operator can also be used with Expressions:
+
+    >>> john = VariableExpression(Variable('john'))
+    >>> run_var = VariableExpression(Variable('run'))
+    >>> run = lambda x: run_var(x)
+    >>> run(john)
+    <ApplicationExpression run(john)>
+
+
+``betaConversionTestSuite.pl``
+------------------------------
+
+Tests based on Blackburn & Bos' book, *Representation and Inference
+for Natural Language*.
+
+    >>> x1 = read_expr(r'\P.P(mia)(\x.walk(x))').simplify()
+    >>> x2 = read_expr(r'walk(mia)').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'exists x.(man(x) & ((\P.exists x.(woman(x) & P(x)))(\y.love(x,y))))').simplify()
+    >>> x2 = read_expr(r'exists x.(man(x) & exists y.(woman(y) & love(x,y)))').simplify()
+    >>> x1 == x2
+    True
+    >>> x1 = read_expr(r'\a.sleep(a)(mia)').simplify()
+    >>> x2 = read_expr(r'sleep(mia)').simplify()
+    >>> x1 == x2
+    True
+    >>> x1 = read_expr(r'\a.\b.like(b,a)(mia)').simplify()
+    >>> x2 = read_expr(r'\b.like(b,mia)').simplify()
+    >>> x1 == x2
+    True
+    >>> x1 = read_expr(r'\a.(\b.like(b,a)(vincent))').simplify()
+    >>> x2 = read_expr(r'\a.like(vincent,a)').simplify()
+    >>> x1 == x2
+    True
+    >>> x1 = read_expr(r'\a.((\b.like(b,a)(vincent)) & sleep(a))').simplify()
+    >>> x2 = read_expr(r'\a.(like(vincent,a) & sleep(a))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'(\a.\b.like(b,a)(mia)(vincent))').simplify()
+    >>> x2 = read_expr(r'like(vincent,mia)').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'P((\a.sleep(a)(vincent)))').simplify()
+    >>> x2 = read_expr(r'P(sleep(vincent))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'\A.A((\b.sleep(b)(vincent)))').simplify()
+    >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'\A.A(sleep(vincent))').simplify()
+    >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'(\A.A(vincent)(\b.sleep(b)))').simplify()
+    >>> x2 = read_expr(r'sleep(vincent)').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'\A.believe(mia,A(vincent))(\b.sleep(b))').simplify()
+    >>> x2 = read_expr(r'believe(mia,sleep(vincent))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'(\A.(A(vincent) & A(mia)))(\b.sleep(b))').simplify()
+    >>> x2 = read_expr(r'(sleep(vincent) & sleep(mia))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'\A.\B.(\C.C(A(vincent))(\d.probably(d)) & (\C.C(B(mia))(\d.improbably(d))))(\f.walk(f))(\f.talk(f))').simplify()
+    >>> x2 = read_expr(r'(probably(walk(vincent)) & improbably(talk(mia)))').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\d.\f.love(d,f))))(jules)(mia)').simplify()
+    >>> x2 = read_expr(r'love(jules,mia)').simplify()
+    >>> x1 == x2
+    True
+
+    >>> x1 = read_expr(r'(\A.\B.exists c.(A(c) & B(c)))(\d.boxer(d),\d.sleep(d))').simplify()
+    >>> x2 =
read_expr(r'exists c.(boxer(c) & sleep(c))').simplify() + >>> x1 == x2 + True + + >>> x1 = read_expr(r'\A.Z(A)(\c.\a.like(a,c))').simplify() + >>> x2 = read_expr(r'Z(\c.\a.like(a,c))').simplify() + >>> x1 == x2 + True + + >>> x1 = read_expr(r'\A.\b.A(b)(\c.\b.like(b,c))').simplify() + >>> x2 = read_expr(r'\b.(\c.\b.like(b,c)(b))').simplify() + >>> x1 == x2 + True + + >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\b.\a.loves(b,a))))(jules)(mia)').simplify() + >>> x2 = read_expr(r'loves(jules,mia)').simplify() + >>> x1 == x2 + True + + >>> x1 = read_expr(r'(\A.\b.(exists b.A(b) & A(b)))(\c.boxer(c))(vincent)').simplify() + >>> x2 = read_expr(r'((exists b.boxer(b)) & boxer(vincent))').simplify() + >>> x1 == x2 + True + +Test Parser +=========== + + >>> print(read_expr(r'john')) + john + >>> print(read_expr(r'x')) + x + >>> print(read_expr(r'-man(x)')) + -man(x) + >>> print(read_expr(r'--man(x)')) + --man(x) + >>> print(read_expr(r'(man(x))')) + man(x) + >>> print(read_expr(r'((man(x)))')) + man(x) + >>> print(read_expr(r'man(x) <-> tall(x)')) + (man(x) <-> tall(x)) + >>> print(read_expr(r'(man(x) <-> tall(x))')) + (man(x) <-> tall(x)) + >>> print(read_expr(r'(man(x) & tall(x) & walks(x))')) + (man(x) & tall(x) & walks(x)) + >>> print(read_expr(r'(man(x) & tall(x) & walks(x))').first) + (man(x) & tall(x)) + >>> print(read_expr(r'man(x) | tall(x) & walks(x)')) + (man(x) | (tall(x) & walks(x))) + >>> print(read_expr(r'((man(x) & tall(x)) | walks(x))')) + ((man(x) & tall(x)) | walks(x)) + >>> print(read_expr(r'man(x) & (tall(x) | walks(x))')) + (man(x) & (tall(x) | walks(x))) + >>> print(read_expr(r'(man(x) & (tall(x) | walks(x)))')) + (man(x) & (tall(x) | walks(x))) + >>> print(read_expr(r'P(x) -> Q(x) <-> R(x) | S(x) & T(x)')) + ((P(x) -> Q(x)) <-> (R(x) | (S(x) & T(x)))) + >>> print(read_expr(r'exists x.man(x)')) + exists x.man(x) + >>> print(read_expr(r'exists x.(man(x) & tall(x))')) + exists x.(man(x) & tall(x)) + >>> print(read_expr(r'exists x.(man(x) & tall(x) & walks(x))')) + exists x.(man(x) & tall(x) & walks(x)) + >>> print(read_expr(r'-P(x) & Q(x)')) + (-P(x) & Q(x)) + >>> read_expr(r'-P(x) & Q(x)') == read_expr(r'(-P(x)) & Q(x)') + True + >>> print(read_expr(r'\x.man(x)')) + \x.man(x) + >>> print(read_expr(r'\x.man(x)(john)')) + \x.man(x)(john) + >>> print(read_expr(r'\x.man(x)(john) & tall(x)')) + (\x.man(x)(john) & tall(x)) + >>> print(read_expr(r'\x.\y.sees(x,y)')) + \x y.sees(x,y) + >>> print(read_expr(r'\x y.sees(x,y)')) + \x y.sees(x,y) + >>> print(read_expr(r'\x.\y.sees(x,y)(a)')) + (\x y.sees(x,y))(a) + >>> print(read_expr(r'\x y.sees(x,y)(a)')) + (\x y.sees(x,y))(a) + >>> print(read_expr(r'\x.\y.sees(x,y)(a)(b)')) + ((\x y.sees(x,y))(a))(b) + >>> print(read_expr(r'\x y.sees(x,y)(a)(b)')) + ((\x y.sees(x,y))(a))(b) + >>> print(read_expr(r'\x.\y.sees(x,y)(a,b)')) + ((\x y.sees(x,y))(a))(b) + >>> print(read_expr(r'\x y.sees(x,y)(a,b)')) + ((\x y.sees(x,y))(a))(b) + >>> print(read_expr(r'((\x.\y.sees(x,y))(a))(b)')) + ((\x y.sees(x,y))(a))(b) + >>> print(read_expr(r'P(x)(y)(z)')) + P(x,y,z) + >>> print(read_expr(r'P(Q)')) + P(Q) + >>> print(read_expr(r'P(Q(x))')) + P(Q(x)) + >>> print(read_expr(r'(\x.exists y.walks(x,y))(x)')) + (\x.exists y.walks(x,y))(x) + >>> print(read_expr(r'exists x.(x = john)')) + exists x.(x = john) + >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))')) + ((\P Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x)) + >>> a = read_expr(r'exists c.exists b.A(b,c) & A(b,c)') + >>> b = read_expr(r'(exists c.(exists b.A(b,c))) & A(b,c)') + >>> 
print(a == b) + True + >>> a = read_expr(r'exists c.(exists b.A(b,c) & A(b,c))') + >>> b = read_expr(r'exists c.((exists b.A(b,c)) & A(b,c))') + >>> print(a == b) + True + >>> print(read_expr(r'exists x.x = y')) + exists x.(x = y) + >>> print(read_expr('A(B)(C)')) + A(B,C) + >>> print(read_expr('(A(B))(C)')) + A(B,C) + >>> print(read_expr('A((B)(C))')) + A(B(C)) + >>> print(read_expr('A(B(C))')) + A(B(C)) + >>> print(read_expr('(A)(B(C))')) + A(B(C)) + >>> print(read_expr('(((A)))(((B))(((C))))')) + A(B(C)) + >>> print(read_expr(r'A != B')) + -(A = B) + >>> print(read_expr('P(x) & x=y & P(y)')) + (P(x) & (x = y) & P(y)) + >>> try: print(read_expr(r'\walk.walk(x)')) + ... except LogicalExpressionException as e: print(e) + 'walk' is an illegal variable name. Constants may not be abstracted. + \walk.walk(x) + ^ + >>> try: print(read_expr(r'all walk.walk(john)')) + ... except LogicalExpressionException as e: print(e) + 'walk' is an illegal variable name. Constants may not be quantified. + all walk.walk(john) + ^ + >>> try: print(read_expr(r'x(john)')) + ... except LogicalExpressionException as e: print(e) + 'x' is an illegal predicate name. Individual variables may not be used as predicates. + x(john) + ^ + + >>> from nltk.sem.logic import LogicParser # hack to give access to custom quote chars + >>> lpq = LogicParser() + >>> lpq.quote_chars = [("'", "'", "\\", False)] + >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) + (man(x) & tall's,(x) & walks(x)) + >>> lpq.quote_chars = [("'", "'", "\\", True)] + >>> print(lpq.parse(r"'tall\'s,'")) + 'tall\'s,' + >>> print(lpq.parse(r"'spaced name(x)'")) + 'spaced name(x)' + >>> print(lpq.parse(r"-'tall\'s,'(x)")) + -'tall\'s,'(x) + >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) + (man(x) & 'tall\'s,'(x) & walks(x)) + + +Simplify +======== + + >>> print(read_expr(r'\x.man(x)(john)').simplify()) + man(john) + >>> print(read_expr(r'\x.((man(x)))(john)').simplify()) + man(john) + >>> print(read_expr(r'\x.\y.sees(x,y)(john, mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'\x y.sees(x,y)(john, mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'\x.\y.sees(x,y)(john)(mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'\x y.sees(x,y)(john)(mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'\x.\y.sees(x,y)(john)').simplify()) + \y.sees(john,y) + >>> print(read_expr(r'\x y.sees(x,y)(john)').simplify()) + \y.sees(john,y) + >>> print(read_expr(r'(\x.\y.sees(x,y)(john))(mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'(\x y.sees(x,y)(john))(mary)').simplify()) + sees(john,mary) + >>> print(read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify()) + exists x.(man(x) & exists y.walks(x,y)) + >>> e1 = read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(y))').simplify() + >>> e2 = read_expr(r'exists x.(man(x) & exists z1.walks(y,z1))') + >>> e1 == e2 + True + >>> print(read_expr(r'(\P Q.exists x.(P(x) & Q(x)))(\x.dog(x))').simplify()) + \Q.exists x.(dog(x) & Q(x)) + >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))').simplify()) + exists x.(dog(x) & bark(x)) + >>> print(read_expr(r'\P.(P(x)(y))(\a b.Q(a,b))').simplify()) + Q(x,y) + +Replace +======= + + >>> a = read_expr(r'a') + >>> x = read_expr(r'x') + >>> y = read_expr(r'y') + >>> z = read_expr(r'z') + + >>> print(read_expr(r'man(x)').replace(x.variable, a, False)) + man(a) + >>> print(read_expr(r'(man(x) & tall(x))').replace(x.variable, a, False)) + (man(a) & tall(a)) + >>> 
print(read_expr(r'exists x.man(x)').replace(x.variable, a, False)) + exists x.man(x) + >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, True)) + exists a.man(a) + >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, False)) + exists x.give(x,a,z) + >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, True)) + exists x.give(x,a,z) + >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, False) + >>> e2 = read_expr(r'exists z1.give(z1,x,z)') + >>> e1 == e2 + True + >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, True) + >>> e2 = read_expr(r'exists z1.give(z1,x,z)') + >>> e1 == e2 + True + >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, False)) + \x y z.give(x,y,z) + >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, True)) + \x a z.give(x,a,z) + >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, False)) + \x y.give(x,y,a) + >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, True)) + \x y.give(x,y,a) + >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, False) + >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') + >>> e1 == e2 + True + >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, True) + >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') + >>> e1 == e2 + True + >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, False)) + \x.give(x,y,y) + >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, True)) + \x.give(x,y,y) + + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + >>> e1 = read_expr('e1') + >>> e2 = read_expr('e2') + >>> print(read_expr('exists e1 e2.(walk(e1) & talk(e2))').replace(e1.variable, e2, True)) + exists e2 e01.(walk(e2) & talk(e01)) + + +Variables / Free +================ + + >>> examples = [r'walk(john)', + ... r'walk(x)', + ... r'?vp(?np)', + ... r'see(john,mary)', + ... r'exists x.walk(x)', + ... r'\x.see(john,x)', + ... r'\x.see(john,x)(mary)', + ... r'P(x)', + ... r'\P.P(x)', + ... r'aa(x,bb(y),cc(z),P(w),u)', + ... r'bo(?det(?n),@x)'] + >>> examples = [read_expr(e) for e in examples] + + >>> for e in examples: + ... print('%-25s' % e, sorted(e.free())) + walk(john) [] + walk(x) [Variable('x')] + ?vp(?np) [] + see(john,mary) [] + exists x.walk(x) [] + \x.see(john,x) [] + (\x.see(john,x))(mary) [] + P(x) [Variable('P'), Variable('x')] + \P.P(x) [Variable('x')] + aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] + bo(?det(?n),@x) [] + + >>> for e in examples: + ... print('%-25s' % e, sorted(e.constants())) + walk(john) [Variable('john')] + walk(x) [] + ?vp(?np) [Variable('?np')] + see(john,mary) [Variable('john'), Variable('mary')] + exists x.walk(x) [] + \x.see(john,x) [Variable('john')] + (\x.see(john,x))(mary) [Variable('john'), Variable('mary')] + P(x) [] + \P.P(x) [] + aa(x,bb(y),cc(z),P(w),u) [] + bo(?det(?n),@x) [Variable('?n'), Variable('@x')] + + >>> for e in examples: + ... print('%-25s' % e, sorted(e.predicates())) + walk(john) [Variable('walk')] + walk(x) [Variable('walk')] + ?vp(?np) [Variable('?vp')] + see(john,mary) [Variable('see')] + exists x.walk(x) [Variable('walk')] + \x.see(john,x) [Variable('see')] + (\x.see(john,x))(mary) [Variable('see')] + P(x) [] + \P.P(x) [] + aa(x,bb(y),cc(z),P(w),u) [Variable('aa'), Variable('bb'), Variable('cc')] + bo(?det(?n),@x) [Variable('?det'), Variable('bo')] + + >>> for e in examples: + ... 
print('%-25s' % e, sorted(e.variables()))
+    walk(john) []
+    walk(x) [Variable('x')]
+    ?vp(?np) [Variable('?np'), Variable('?vp')]
+    see(john,mary) []
+    exists x.walk(x) []
+    \x.see(john,x) []
+    (\x.see(john,x))(mary) []
+    P(x) [Variable('P'), Variable('x')]
+    \P.P(x) [Variable('x')]
+    aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')]
+    bo(?det(?n),@x) [Variable('?det'), Variable('?n'), Variable('@x')]
+
+
+`normalize`
+    >>> print(read_expr(r'\e083.(walk(e083, z472) & talk(e092, z938))').normalize())
+    \e01.(walk(e01,z3) & talk(e02,z4))
+
+Typed Logic
++++++++++++
+
+    >>> from nltk.sem.logic import LogicParser
+    >>> tlp = LogicParser(True)
+    >>> print(tlp.parse(r'man(x)').type)
+    ?
+    >>> print(tlp.parse(r'walk(angus)').type)
+    ?
+    >>> print(tlp.parse(r'-man(x)').type)
+    t
+    >>> print(tlp.parse(r'(man(x) <-> tall(x))').type)
+    t
+    >>> print(tlp.parse(r'exists x.(man(x) & tall(x))').type)
+    t
+    >>> print(tlp.parse(r'\x.man(x)').type)
+    <e,?>
+    >>> print(tlp.parse(r'john').type)
+    e
+    >>> print(tlp.parse(r'\x y.sees(x,y)').type)
+    <e,<e,?>>
+    >>> print(tlp.parse(r'\x.man(x)(john)').type)
+    ?
+    >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)').type)
+    <e,?>
+    >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)(mary)').type)
+    ?
+    >>> print(tlp.parse(r'\P.\Q.exists x.(P(x) & Q(x))').type)
+    <<e,t>,<<e,t>,t>>
+    >>> print(tlp.parse(r'\x.y').type)
+    <?,e>
+    >>> print(tlp.parse(r'\P.P(x)').type)
+    <<e,?>,?>
+
+    >>> parsed = tlp.parse('see(john,mary)')
+    >>> print(parsed.type)
+    ?
+    >>> print(parsed.function)
+    see(john)
+    >>> print(parsed.function.type)
+    <e,?>
+    >>> print(parsed.function.function)
+    see
+    >>> print(parsed.function.function.type)
+    <e,<e,?>>
+
+    >>> parsed = tlp.parse('P(x,y)')
+    >>> print(parsed)
+    P(x,y)
+    >>> print(parsed.type)
+    ?
+    >>> print(parsed.function)
+    P(x)
+    >>> print(parsed.function.type)
+    <e,?>
+    >>> print(parsed.function.function)
+    P
+    >>> print(parsed.function.function.type)
+    <e,<e,?>>
+
+    >>> print(tlp.parse(r'P').type)
+    ?
+
+    >>> print(tlp.parse(r'P', {'P': 't'}).type)
+    t
+
+    >>> a = tlp.parse(r'P(x)')
+    >>> print(a.type)
+    ?
+    >>> print(a.function.type)
+    <e,?>
+    >>> print(a.argument.type)
+    e
+
+    >>> a = tlp.parse(r'-P(x)')
+    >>> print(a.type)
+    t
+    >>> print(a.term.type)
+    t
+    >>> print(a.term.function.type)
+    <e,t>
+    >>> print(a.term.argument.type)
+    e
+
+    >>> a = tlp.parse(r'P & Q')
+    >>> print(a.type)
+    t
+    >>> print(a.first.type)
+    t
+    >>> print(a.second.type)
+    t
+
+    >>> a = tlp.parse(r'(P(x) & Q(x))')
+    >>> print(a.type)
+    t
+    >>> print(a.first.type)
+    t
+    >>> print(a.first.function.type)
+    <e,t>
+    >>> print(a.first.argument.type)
+    e
+    >>> print(a.second.type)
+    t
+    >>> print(a.second.function.type)
+    <e,t>
+    >>> print(a.second.argument.type)
+    e
+
+    >>> a = tlp.parse(r'\x.P(x)')
+    >>> print(a.type)
+    <e,?>
+    >>> print(a.term.function.type)
+    <e,?>
+    >>> print(a.term.argument.type)
+    e
+
+    >>> a = tlp.parse(r'\P.P(x)')
+    >>> print(a.type)
+    <<e,?>,?>
+    >>> print(a.term.function.type)
+    <e,?>
+    >>> print(a.term.argument.type)
+    e
+
+    >>> a = tlp.parse(r'(\x.P(x)(john)) & Q(x)')
+    >>> print(a.type)
+    t
+    >>> print(a.first.type)
+    t
+    >>> print(a.first.function.type)
+    <e,t>
+    >>> print(a.first.function.term.function.type)
+    <e,t>
+    >>> print(a.first.function.term.argument.type)
+    e
+    >>> print(a.first.argument.type)
+    e
+
+    >>> a = tlp.parse(r'\x y.P(x,y)(john)(mary) & Q(x)')
+    >>> print(a.type)
+    t
+    >>> print(a.first.type)
+    t
+    >>> print(a.first.function.type)
+    <e,t>
+    >>> print(a.first.function.function.type)
+    <e,<e,t>>
+
+    >>> a = tlp.parse(r'--P')
+    >>> print(a.type)
+    t
+    >>> print(a.term.type)
+    t
+    >>> print(a.term.term.type)
+    t
+
+    >>> tlp.parse(r'\x y.P(x,y)').type
+    <e,<e,?>>
+    >>> tlp.parse(r'\x y.P(x,y)', {'P': '<e,<e,t>>'}).type
+    <e,<e,t>>
+
+    >>> a = tlp.parse(r'\P y.P(john,y)(\x y.see(x,y))')
+    >>> a.type
+    <e,?>
+    >>> a.function.type
+    <<e,<e,?>>,<e,?>>
+    >>> a.function.term.term.function.function.type
+    <e,<e,?>>
+    >>> a.argument.type
+    <e,<e,?>>
+
+    >>> a = tlp.parse(r'exists c f.(father(c) = f)')
+    >>> a.type
+    t
+    >>> a.term.term.type
+    t
+    >>> a.term.term.first.type
+    e
+    >>> a.term.term.first.function.type
+    <e,e>
+    >>> a.term.term.second.type
+    e
+
+typecheck()
+
+    >>> a = tlp.parse('P(x)')
+    >>> b = tlp.parse('Q(x)')
+    >>> a.type
+    ?
+    >>> c = a & b
+    >>> c.first.type
+    ?
+    >>> c.typecheck()
+    {...}
+    >>> c.first.type
+    t
+
+    >>> a = tlp.parse('P(x)')
+    >>> b = tlp.parse('P(x) & Q(x)')
+    >>> a.type
+    ?
+    >>> typecheck([a,b])
+    {...}
+    >>> a.type
+    t
+
+    >>> e = tlp.parse(r'man(x)')
+    >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': '<e,?>'})
+    True
+    >>> sig = {'man': '<e,t>'}
+    >>> e = tlp.parse(r'man(x)', sig)
+    >>> print(e.function.type)
+    <e,t>
+    >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': '<e,t>'})
+    True
+    >>> print(e.function.type)
+    <e,t>
+    >>> print(dict((k,str(v)) for k,v in e.typecheck(sig).items()) == {'x': 'e', 'man': '<e,t>'})
+    True
+
+findtype()
+
+    >>> print(tlp.parse(r'man(x)').findtype(Variable('man')))
+    <e,?>
+    >>> print(tlp.parse(r'see(x,y)').findtype(Variable('see')))
+    <e,<e,?>>
+    >>> print(tlp.parse(r'P(Q(R(x)))').findtype(Variable('Q')))
+    ?
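+
+Before the string-based examples, note that types can also be built
+programmatically; a small sketch (ours, using the same names the star-import
+above provides)::
+
+    from nltk.sem.logic import ComplexType, ENTITY_TYPE, TRUTH_TYPE
+
+    # <e,t>: a function from individuals to truth values.
+    et = ComplexType(ENTITY_TYPE, TRUTH_TYPE)
+    print(et)         # <e,t>
+    print(et.first)   # e
+    print(et.second)  # t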
+
+reading types from strings
+
+    >>> Type.fromstring('e')
+    e
+    >>> Type.fromstring('<e,?>')
+    <e,?>
+    >>> Type.fromstring('<<e,?>,<e,?>>')
+    <<e,?>,<e,?>>
+    >>> Type.fromstring('<<e,?>,?>')
+    <<e,?>,?>
+
+alternative type format
+
+    >>> Type.fromstring('e').str()
+    'IND'
+    >>> Type.fromstring('<e,?>').str()
+    '(IND -> ANY)'
+    >>> Type.fromstring('<<e,t>,t>').str()
+    '((IND -> BOOL) -> BOOL)'
+
+Type.__eq__()
+
+    >>> from nltk.sem.logic import *
+
+    >>> e = ENTITY_TYPE
+    >>> t = TRUTH_TYPE
+    >>> a = ANY_TYPE
+    >>> et = ComplexType(e,t)
+    >>> eet = ComplexType(e,ComplexType(e,t))
+    >>> at = ComplexType(a,t)
+    >>> ea = ComplexType(e,a)
+    >>> aa = ComplexType(a,a)
+
+    >>> e == e
+    True
+    >>> t == t
+    True
+    >>> e == t
+    False
+    >>> a == t
+    False
+    >>> t == a
+    False
+    >>> a == a
+    True
+    >>> et == et
+    True
+    >>> a == et
+    False
+    >>> et == a
+    False
+    >>> a == ComplexType(a,aa)
+    True
+    >>> ComplexType(a,aa) == a
+    True
+
+matches()
+
+    >>> e.matches(t)
+    False
+    >>> a.matches(t)
+    True
+    >>> t.matches(a)
+    True
+    >>> a.matches(et)
+    True
+    >>> et.matches(a)
+    True
+    >>> ea.matches(eet)
+    True
+    >>> eet.matches(ea)
+    True
+    >>> aa.matches(et)
+    True
+    >>> aa.matches(t)
+    True
+
+Type error during parsing
+=========================
+
+    >>> try: print(tlp.parse(r'exists x y.(P(x) & P(x,y))'))
+    ... except InconsistentTypeHierarchyException as e: print(e)
+    The variable 'P' was found in multiple places with different types.
+    >>> try: tlp.parse(r'\x y.see(x,y)(\x.man(x))')
+    ... except TypeException as e: print(e)
+    The function '\x y.see(x,y)' is of type '<e,<e,?>>' and cannot be applied to '\x.man(x)' of type '<e,?>'. Its argument must match type 'e'.
+    >>> try: tlp.parse(r'\P x y.-P(x,y)(\x.-man(x))')
+    ... except TypeException as e: print(e)
+    The function '\P x y.-P(x,y)' is of type '<<e,<e,t>>,<e,<e,t>>>' and cannot be applied to '\x.-man(x)' of type '<e,t>'. Its argument must match type '<e,<e,t>>'.
+
+    >>> a = tlp.parse(r'-talk(x)')
+    >>> signature = a.typecheck()
+    >>> try: print(tlp.parse(r'-talk(x,y)', signature))
+    ... except InconsistentTypeHierarchyException as e: print(e)
+    The variable 'talk' was found in multiple places with different types.
+
+    >>> a = tlp.parse(r'-P(x)')
+    >>> b = tlp.parse(r'-P(x,y)')
+    >>> a.typecheck()
+    {...}
+    >>> b.typecheck()
+    {...}
+    >>> try: typecheck([a,b])
+    ... except InconsistentTypeHierarchyException as e: print(e)
+    The variable 'P' was found in multiple places with different types.
+
+    >>> a = tlp.parse(r'P(x)')
+    >>> b = tlp.parse(r'P(x,y)')
+    >>> signature = {'P': '<e,t>'}
+    >>> a.typecheck(signature)
+    {...}
+    >>> try: typecheck([a,b], signature)
+    ... except InconsistentTypeHierarchyException as e: print(e)
+    The variable 'P' was found in multiple places with different types.
+
+Parse errors
+============
+
+    >>> try: read_expr(r'')
+    ... except LogicalExpressionException as e: print(e)
+    End of input found. Expression expected.
+    <BLANKLINE>
+    ^
+    >>> try: read_expr(r'(')
+    ... except LogicalExpressionException as e: print(e)
+    End of input found. Expression expected.
+    (
+    ^
+    >>> try: read_expr(r')')
+    ... except LogicalExpressionException as e: print(e)
+    Unexpected token: ')'. Expression expected.
+    )
+    ^
+    >>> try: read_expr(r'()')
+    ... except LogicalExpressionException as e: print(e)
+    Unexpected token: ')'. Expression expected.
+    ()
+    ^
+    >>> try: read_expr(r'(P(x) & Q(x)')
+    ... except LogicalExpressionException as e: print(e)
+    End of input found. Expected token ')'.
+    (P(x) & Q(x)
+    ^
+    >>> try: read_expr(r'(P(x) &')
+    ... except LogicalExpressionException as e: print(e)
+    End of input found. Expression expected.
+ (P(x) & + ^ + >>> try: read_expr(r'(P(x) | )') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + (P(x) | ) + ^ + >>> try: read_expr(r'P(x) ->') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + P(x) -> + ^ + >>> try: read_expr(r'P(x') + ... except LogicalExpressionException as e: print(e) + End of input found. Expected token ')'. + P(x + ^ + >>> try: read_expr(r'P(x,') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + P(x, + ^ + >>> try: read_expr(r'P(x,)') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + P(x,) + ^ + >>> try: read_expr(r'exists') + ... except LogicalExpressionException as e: print(e) + End of input found. Variable and Expression expected following quantifier 'exists'. + exists + ^ + >>> try: read_expr(r'exists x') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + exists x + ^ + >>> try: read_expr(r'exists x.') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + exists x. + ^ + >>> try: read_expr(r'\ ') + ... except LogicalExpressionException as e: print(e) + End of input found. Variable and Expression expected following lambda operator. + \ + ^ + >>> try: read_expr(r'\ x') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + \ x + ^ + >>> try: read_expr(r'\ x y') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + \ x y + ^ + >>> try: read_expr(r'\ x.') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + \ x. + ^ + >>> try: read_expr(r'P(x)Q(x)') + ... except LogicalExpressionException as e: print(e) + Unexpected token: 'Q'. + P(x)Q(x) + ^ + >>> try: read_expr(r'(P(x)Q(x)') + ... except LogicalExpressionException as e: print(e) + Unexpected token: 'Q'. Expected token ')'. + (P(x)Q(x) + ^ + >>> try: read_expr(r'exists x y') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + exists x y + ^ + >>> try: read_expr(r'exists x y.') + ... except LogicalExpressionException as e: print(e) + End of input found. Expression expected. + exists x y. + ^ + >>> try: read_expr(r'exists x -> y') + ... except LogicalExpressionException as e: print(e) + Unexpected token: '->'. Expression expected. + exists x -> y + ^ + + + >>> try: read_expr(r'A -> ((P(x) & Q(x)) -> Z') + ... except LogicalExpressionException as e: print(e) + End of input found. Expected token ')'. + A -> ((P(x) & Q(x)) -> Z + ^ + >>> try: read_expr(r'A -> ((P(x) &) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> ((P(x) &) -> Z + ^ + >>> try: read_expr(r'A -> ((P(x) | )) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> ((P(x) | )) -> Z + ^ + >>> try: read_expr(r'A -> (P(x) ->) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (P(x) ->) -> Z + ^ + >>> try: read_expr(r'A -> (P(x) -> Z') + ... except LogicalExpressionException as e: print(e) + End of input found. Expected token ')'. + A -> (P(x) -> Z + ^ + >>> try: read_expr(r'A -> (P(x,) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. 
+ A -> (P(x,) -> Z + ^ + >>> try: read_expr(r'A -> (P(x,)) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (P(x,)) -> Z + ^ + >>> try: read_expr(r'A -> (exists) -> Z') + ... except LogicalExpressionException as e: print(e) + ')' is an illegal variable name. Constants may not be quantified. + A -> (exists) -> Z + ^ + >>> try: read_expr(r'A -> (exists x) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (exists x) -> Z + ^ + >>> try: read_expr(r'A -> (exists x.) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (exists x.) -> Z + ^ + >>> try: read_expr(r'A -> (\ ) -> Z') + ... except LogicalExpressionException as e: print(e) + ')' is an illegal variable name. Constants may not be abstracted. + A -> (\ ) -> Z + ^ + >>> try: read_expr(r'A -> (\ x) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (\ x) -> Z + ^ + >>> try: read_expr(r'A -> (\ x y) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (\ x y) -> Z + ^ + >>> try: read_expr(r'A -> (\ x.) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (\ x.) -> Z + ^ + >>> try: read_expr(r'A -> (P(x)Q(x)) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: 'Q'. Expected token ')'. + A -> (P(x)Q(x)) -> Z + ^ + >>> try: read_expr(r'A -> ((P(x)Q(x)) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: 'Q'. Expected token ')'. + A -> ((P(x)Q(x)) -> Z + ^ + >>> try: read_expr(r'A -> (all x y) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (all x y) -> Z + ^ + >>> try: read_expr(r'A -> (exists x y.) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: ')'. Expression expected. + A -> (exists x y.) -> Z + ^ + >>> try: read_expr(r'A -> (exists x -> y) -> Z') + ... except LogicalExpressionException as e: print(e) + Unexpected token: '->'. Expression expected. + A -> (exists x -> y) -> Z + ^ diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/metrics.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/metrics.doctest new file mode 100644 index 0000000000000000000000000000000000000000..d8592d646083ed430df8350eea4eb9e9c1eac249 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/metrics.doctest @@ -0,0 +1,321 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +======= +Metrics +======= + +----- +Setup +----- + + >>> import pytest + >>> _ = pytest.importorskip("numpy") + + +The `nltk.metrics` package provides a variety of *evaluation measures* +which can be used for a wide variety of NLP tasks. + + >>> from nltk.metrics import * + +------------------ +Standard IR Scores +------------------ + +We can use standard scores from information retrieval to test the +performance of taggers, chunkers, etc. 
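+
+For instance, tag accuracy is simply the fraction of positions at which the
+reference and test sequences agree. A quick hand computation (an illustration
+only) of the value that ``accuracy()`` reports below::
+
+    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+    # Count matching positions and divide by the sequence length.
+    agree = sum(r == t for r, t in zip(reference, test))
+    print(agree / len(reference))  # 0.8
+
+The examples below use the NLTK implementations: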
+ + >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() + >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() + >>> print(accuracy(reference, test)) + 0.8 + + +The following measures apply to sets: + + >>> reference_set = set(reference) + >>> test_set = set(test) + >>> precision(reference_set, test_set) + 1.0 + >>> print(recall(reference_set, test_set)) + 0.8 + >>> print(f_measure(reference_set, test_set)) + 0.88888888888... + +Measuring the likelihood of the data, given probability distributions: + + >>> from nltk import FreqDist, MLEProbDist + >>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf")) + >>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss")) + >>> print(log_likelihood(['a', 'd'], [pdist1, pdist2])) + -2.7075187496... + + +---------------- +Distance Metrics +---------------- + +String edit distance (Levenshtein): + + >>> edit_distance("rain", "shine") + 3 + >>> edit_distance_align("shine", "shine") + [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] + >>> edit_distance_align("rain", "brainy") + [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)] + >>> edit_distance_align("", "brainy") + [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)] + >>> edit_distance_align("", "") + [(0, 0)] + +Other distance measures: + + >>> s1 = set([1,2,3,4]) + >>> s2 = set([3,4,5]) + >>> binary_distance(s1, s2) + 1.0 + >>> print(jaccard_distance(s1, s2)) + 0.6 + >>> print(masi_distance(s1, s2)) + 0.868 + +---------------------- +Miscellaneous Measures +---------------------- + +Rank Correlation works with two dictionaries mapping keys to ranks. +The dictionaries should have the same set of keys. + + >>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3}) + 0.5 + +Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings). +Segmentations are represented using strings of zeros and ones. + + >>> s1 = "000100000010" + >>> s2 = "000010000100" + >>> s3 = "100000010000" + >>> s4 = "000000000000" + >>> s5 = "111111111111" + >>> windowdiff(s1, s1, 3) + 0.0 + >>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3 + True + >>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8 + True + >>> windowdiff(s1, s4, 3) + 0.5 + >>> windowdiff(s1, s5, 3) + 1.0 + +---------------- +Confusion Matrix +---------------- + + >>> reference = 'This is the reference data. Testing 123. aoaeoeoe' + >>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe' + >>> print(ConfusionMatrix(reference, test)) + | . 1 2 3 T _ a c d e f g h i n o r s t z | + --+-------------------------------------------+ + |<8>. . . . . 1 . . . . . . . . . . . . . . | + . | .<2>. . . . . . . . . . . . . . . . . . . | + 1 | . .<1>. . . . . . . . . . . . . . . . . . | + 2 | . . .<1>. . . . . . . . . . . . . . . . . | + 3 | . . . .<1>. . . . . . . . . . . . . . . . | + T | . . . . .<2>. . . . . . . . . . . . . . . | + _ | . . . . . .<.>. . . . . . . . . . . . . . | + a | . . . . . . .<4>. . . . . . . . . . . . . | + c | . . . . . . . .<1>. . . . . . . . . . . . | + d | . . . . . . . . .<1>. . . . . . . . . . . | + e | . . . . . . . . . .<6>. . . 3 . . . . . . | + f | . . . . . . . . . . .<1>. . . . . . . . . | + g | . . . . . . . . . . . .<1>. . . . . . . . | + h | . . . . . . . . . . . . .<2>. . . . . . . | + i | . . . . . . . . . . 1 . . .<1>. 1 . . . . | + n | . . . . . . . . . . . . . . .<2>. . . . . | + o | . . . . . . . . . . . . . . . .<3>. . . . | + r | . . . . . . . . . . . . . . . . .<2>. . . | + s | . . . . . . . . . 
. . . . . . . . .<2>. 1 |
+ t | . . . . . . . . . . . . . . . . . . .<3>. |
+ z | . . . . . . . . . . . . . . . . . . . .<.>|
+ --+-------------------------------------------+
+ (row = reference; col = test)
+
+
+ >>> cm = ConfusionMatrix(reference, test)
+ >>> print(cm.pretty_format(sort_by_count=True))
+ | e a i o s t . T h n r 1 2 3 c d f g _ z |
+ --+-------------------------------------------+
+ |<8>. . . . . . . . . . . . . . . . . . 1 . |
+ e | .<6>. 3 . . . . . . . . . . . . . . . . . |
+ a | . .<4>. . . . . . . . . . . . . . . . . . |
+ i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
+ o | . . . .<3>. . . . . . . . . . . . . . . . |
+ s | . . . . .<2>. . . . . . . . . . . . . . 1 |
+ t | . . . . . .<3>. . . . . . . . . . . . . . |
+ . | . . . . . . .<2>. . . . . . . . . . . . . |
+ T | . . . . . . . .<2>. . . . . . . . . . . . |
+ h | . . . . . . . . .<2>. . . . . . . . . . . |
+ n | . . . . . . . . . .<2>. . . . . . . . . . |
+ r | . . . . . . . . . . .<2>. . . . . . . . . |
+ 1 | . . . . . . . . . . . .<1>. . . . . . . . |
+ 2 | . . . . . . . . . . . . .<1>. . . . . . . |
+ 3 | . . . . . . . . . . . . . .<1>. . . . . . |
+ c | . . . . . . . . . . . . . . .<1>. . . . . |
+ d | . . . . . . . . . . . . . . . .<1>. . . . |
+ f | . . . . . . . . . . . . . . . . .<1>. . . |
+ g | . . . . . . . . . . . . . . . . . .<1>. . |
+ _ | . . . . . . . . . . . . . . . . . . .<.>. |
+ z | . . . . . . . . . . . . . . . . . . . .<.>|
+ --+-------------------------------------------+
+ (row = reference; col = test)
+
+
+ >>> print(cm.pretty_format(sort_by_count=True, truncate=10))
+ | e a i o s t . T h |
+ --+---------------------+
+ |<8>. . . . . . . . . |
+ e | .<6>. 3 . . . . . . |
+ a | . .<4>. . . . . . . |
+ i | . 1 .<1>1 . . . . . |
+ o | . . . .<3>. . . . . |
+ s | . . . . .<2>. . . . |
+ t | . . . . . .<3>. . . |
+ . | . . . . . . .<2>. . |
+ T | . . . . . . . .<2>. |
+ h | . . . . . . . . .<2>|
+ --+---------------------+
+ (row = reference; col = test)
+
+
+ >>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
+ | 1 |
+ | 1 2 3 4 5 6 7 8 9 0 |
+ ---+---------------------+
+ 1 |<8>. . . . . . . . . |
+ 2 | .<6>. 3 . . . . . . |
+ 3 | . .<4>. . . . . . . |
+ 4 | . 1 .<1>1 . . . . . |
+ 5 | . . . .<3>. . . . . |
+ 6 | . . . . .<2>. . . . |
+ 7 | . . . . . .<3>. . . |
+ 8 | . . . . . . .<2>. . |
+ 9 | . . . . . . . .<2>. |
+ 10 | . . . . . . . . .<2>|
+ ---+---------------------+
+ (row = reference; col = test)
+ Value key:
+ 1:
+ 2: e
+ 3: a
+ 4: i
+ 5: o
+ 6: s
+ 7: t
+ 8: .
+ 9: T
+ 10: h
+
+
+For "e", the number of true positives should be 6, while the number of false negatives is 3.
+So, the recall ought to be 6 / (6 + 3):
+
+ >>> cm.recall("e") # doctest: +ELLIPSIS
+ 0.666666...
+
+For "e", there is just 1 false positive, so the precision should be 6 / (6 + 1):
+
+ >>> cm.precision("e") # doctest: +ELLIPSIS
+ 0.857142...
+
+The f-measure with the default value of ``alpha = 0.5`` should then be:
+
+* *1/(alpha/p + (1-alpha)/r) =*
+* *1/(0.5/p + 0.5/r) =*
+* *2pr / (p + r) =*
+* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =*
+* *0.749999...*
+
+ >>> cm.f_measure("e") # doctest: +ELLIPSIS
+ 0.749999...
+
+--------------------
+Association measures
+--------------------
+
+These measures are useful to determine whether the co-occurrence of two random
+events is meaningful. They are used, for instance, to distinguish collocations
+from other pairs of adjacent words.
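+
+Pointwise mutual information, for instance, compares the observed joint count
+of a bigram with the count expected if its two words occurred independently.
+As a rough cross-check of the standard log2 definition (a sketch only,
+assuming NLTK's ``pmi`` follows that definition; the measures are exercised
+more fully below):
+
+ >>> import math
+ >>> from nltk.metrics import BigramAssocMeasures
+ >>> n_ii, (n_ix, n_xi), n_xx = 20, (42, 20), 14307668
+ >>> by_hand = math.log(n_ii * n_xx / (n_ix * n_xi), 2)
+ >>> abs(BigramAssocMeasures.pmi(n_ii, (n_ix, n_xi), n_xx) - by_hand) < 1e-6
+ True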
+
+Here are some examples of bigram association calculations, taken from
+Chapter 5 of Manning and Schütze's SNLP, 2nd ed.
+
+ >>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
+ >>> bam = BigramAssocMeasures
+ >>> bam.raw_freq(20, (42, 20), N) == 20. / N
+ True
+ >>> bam.student_t(n_new_companies, (n_new, n_companies), N)
+ 0.999...
+ >>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
+ 1.54...
+ >>> bam.likelihood_ratio(150, (12593, 932), N)
+ 1291...
+
+For the other association measures, we check that the expected ordering
+between a more strongly and a less strongly associated bigram is preserved:
+
+ >>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
+ True
+ >>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
+ True
+ >>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
+ True
+ >>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
+ True
+ >>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
+ True
+ >>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
+ True
+ >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
+ False
+
+For trigrams, we have to provide more count information:
+
+ >>> n_w1_w2_w3 = 20
+ >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
+ >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
+ >>> n_w1, n_w2, n_w3 = 100, 200, 300
+ >>> uni_counts = (n_w1, n_w2, n_w3)
+ >>> N = 14307668
+ >>> tam = TrigramAssocMeasures
+ >>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
+ True
+ >>> uni_counts2 = (n_w1, n_w2, 100)
+ >>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+ >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
+ True
+
+
+For fourgrams, we have to provide still more count information:
+
+ >>> n_w1_w2_w3_w4 = 5
+ >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
+ >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
+ >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
+ >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
+ >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
+ >>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
+ >>> N = 14307668
+ >>> qam = QuadgramAssocMeasures
+ >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
+ True
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/misc.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/misc.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..6fd9f64af39be1f3cc4cd0456ac2631c0f403d64
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/misc.doctest
@@ -0,0 +1,118 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+--------------------------------------------------------------------------------
+Unit tests for the miscellaneous sort functions.
+--------------------------------------------------------------------------------
+
+ >>> from copy import deepcopy
+ >>> from nltk.misc.sort import *
+
+A (very) small list of unsorted integers.
+
+ >>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20]
+
+Test each sorting method - each method returns the number of operations
+required to sort the data, and sorts in-place (destructively - hence the need
+for multiple copies).
+
+ >>> sorted_data = deepcopy(test_data)
+ >>> selection(sorted_data)
+ 66
+
+ >>> sorted_data
+ [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
+
+ >>> sorted_data = deepcopy(test_data)
+ >>> bubble(sorted_data)
+ 30
+
+ >>> sorted_data
+ [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
+
+ >>> sorted_data = deepcopy(test_data)
+ >>> merge(sorted_data)
+ 30
+
+ >>> sorted_data
+ [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
+
+ >>> sorted_data = deepcopy(test_data)
+ >>> quick(sorted_data)
+ 13
+
+ >>> sorted_data
+ [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
+
+--------------------------------------------------------------------------------
+Unit tests for Wordfinder class
+--------------------------------------------------------------------------------
+
+ >>> import random
+
+ >>> # The following is not enough for reproducibility under Python 2/3
+ >>> # (see https://bugs.python.org/issue9025) so this test is skipped.
+ >>> random.seed(12345)
+
+ >>> from nltk.misc import wordfinder
+ >>> wordfinder.word_finder() # doctest: +SKIP
+ Word Finder
+
+ J V L A I R O T A T I S I V O D E R E T
+ H U U B E A R O E P O C S O R E T N E P
+ A D A U Z E E S R A P P A L L M E N T R
+ C X A D Q S Z T P E O R S N G P J A D E
+ I G Y K K T I A A R G F I D T E L C N S
+ R E C N B H T R L T N N B W N T A O A I
+ A Y I L O E I A M E I A A Y U R P L L D
+ G L T V S T S F E A D I P H D O O H N I
+ R L S E C I N I L R N N M E C G R U E A
+ A A Y G I C E N L L E O I G Q R T A E L
+ M R C E T I S T A E T L L E U A E N R L
+ O U O T A S E E C S O O N H Y P A T G Y
+ E M H O M M D R E S F P U L T H C F N V
+ L A C A I M A M A N L B R U T E D O M I
+ O R I L N E E E E E U A R S C R Y L I P
+ H T R K E S N N M S I L A S R E V I N U
+ T X T A A O U T K S E T A R R E S I B J
+ A E D L E L J I F O O R P E L K N I R W
+ K H A I D E Q O P R I C K T I M B E R P
+ Z K D O O H G N I H T U R V E Y D R O P
+
+ 1: INTERCHANGER
+ 2: TEARLESSNESS
+ 3: UNIVERSALISM
+ 4: DESENSITIZER
+ 5: INTERMENTION
+ 6: TRICHOCYSTIC
+ 7: EXTRAMURALLY
+ 8: VEGETOALKALI
+ 9: PALMELLACEAE
+ 10: AESTHETICISM
+ 11: PETROGRAPHER
+ 12: VISITATORIAL
+ 13: OLEOMARGARIC
+ 14: WRINKLEPROOF
+ 15: PRICKTIMBER
+ 16: PRESIDIALLY
+ 17: SCITAMINEAE
+ 18: ENTEROSCOPE
+ 19: APPALLMENT
+ 20: TURVEYDROP
+ 21: THINGHOOD
+ 22: BISERRATE
+ 23: GREENLAND
+ 24: BRUTEDOM
+ 25: POLONIAN
+ 26: ACOLHUAN
+ 27: LAPORTEA
+ 28: TENDING
+ 29: TEREDO
+ 30: MESOLE
+ 31: UNLIMP
+ 32: OSTARA
+ 33: PILY
+ 34: DUNT
+ 35: ONYX
+ 36: KATH
+ 37: JUNE
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/nonmonotonic.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/nonmonotonic.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..dc0a2a098443963812d40a7d490d19e6355ca3c7
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/nonmonotonic.doctest
@@ -0,0 +1,293 @@
+.. Copyright (C) 2001-2022 NLTK Project
+..
For license information, see LICENSE.TXT + +====================== +Nonmonotonic Reasoning +====================== + + >>> from nltk.test.setup_fixt import check_binary + >>> check_binary('mace4') + + >>> from nltk import * + >>> from nltk.inference.nonmonotonic import * + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + >>> read_expr = logic.Expression.fromstring + +------------------------ +Closed Domain Assumption +------------------------ + +The only entities in the domain are those found in the assumptions or goal. +If the domain only contains "A" and "B", then the expression "exists x.P(x)" can +be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced +with "P(A) & P(B)". + + >>> p1 = read_expr(r'all x.(man(x) -> mortal(x))') + >>> p2 = read_expr(r'man(Socrates)') + >>> c = read_expr(r'mortal(Socrates)') + >>> prover = Prover9Command(c, [p1,p2]) + >>> prover.prove() + True + >>> cdp = ClosedDomainProver(prover) + >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP + (man(Socrates) -> mortal(Socrates)) + man(Socrates) + >>> cdp.prove() + True + + >>> p1 = read_expr(r'exists x.walk(x)') + >>> p2 = read_expr(r'man(Socrates)') + >>> c = read_expr(r'walk(Socrates)') + >>> prover = Prover9Command(c, [p1,p2]) + >>> prover.prove() + False + >>> cdp = ClosedDomainProver(prover) + >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP + walk(Socrates) + man(Socrates) + >>> cdp.prove() + True + + >>> p1 = read_expr(r'exists x.walk(x)') + >>> p2 = read_expr(r'man(Socrates)') + >>> p3 = read_expr(r'-walk(Bill)') + >>> c = read_expr(r'walk(Socrates)') + >>> prover = Prover9Command(c, [p1,p2,p3]) + >>> prover.prove() + False + >>> cdp = ClosedDomainProver(prover) + >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP + (walk(Socrates) | walk(Bill)) + man(Socrates) + -walk(Bill) + >>> cdp.prove() + True + + >>> p1 = read_expr(r'walk(Socrates)') + >>> p2 = read_expr(r'walk(Bill)') + >>> c = read_expr(r'all x.walk(x)') + >>> prover = Prover9Command(c, [p1,p2]) + >>> prover.prove() + False + >>> cdp = ClosedDomainProver(prover) + >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP + walk(Socrates) + walk(Bill) + >>> print(cdp.goal()) # doctest: +SKIP + (walk(Socrates) & walk(Bill)) + >>> cdp.prove() + True + + >>> p1 = read_expr(r'girl(mary)') + >>> p2 = read_expr(r'dog(rover)') + >>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))') + >>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))') + >>> p5 = read_expr(r'chase(mary, rover)') + >>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))') + >>> prover = Prover9Command(c, [p1,p2,p3,p4,p5]) + >>> print(prover.prove()) + False + >>> cdp = ClosedDomainProver(prover) + >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP + girl(mary) + dog(rover) + ((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary))) + ((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary))) + chase(mary,rover) + >>> print(cdp.goal()) # doctest: +SKIP + ((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary)))) + >>> print(cdp.prove()) + True + +----------------------- +Unique Names Assumption +----------------------- + +No two entities in the domain represent the same entity unless it can be +explicitly proven that they do. Therefore, if the domain contains "A" and "B", +then add the assumption "-(A = B)" if it is not the case that +" \|- (A = B)". 
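+
+For example, with three constants in the domain, one such inequality is added
+per unordered pair. A toy sketch of that enumeration (using
+``itertools.combinations`` and the ``read_expr`` defined above; the actual
+``UniqueNamesProver`` derives the constants from the proof command's
+assumptions):
+
+ >>> from itertools import combinations
+ >>> constants = ['Socrates', 'Bill', 'William']
+ >>> for a, b in combinations(constants, 2):
+ ...     print(read_expr('-(%s = %s)' % (a, b)))
+ -(Socrates = Bill)
+ -(Socrates = William)
+ -(Bill = William)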
+
+ >>> p1 = read_expr(r'man(Socrates)')
+ >>> p2 = read_expr(r'man(Bill)')
+ >>> c = read_expr(r'exists x.exists y.-(x = y)')
+ >>> prover = Prover9Command(c, [p1,p2])
+ >>> prover.prove()
+ False
+ >>> unp = UniqueNamesProver(prover)
+ >>> for a in unp.assumptions(): print(a) # doctest: +SKIP
+ man(Socrates)
+ man(Bill)
+ -(Socrates = Bill)
+ >>> unp.prove()
+ True
+
+ >>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))')
+ >>> p2 = read_expr(r'Bill = William')
+ >>> p3 = read_expr(r'Bill = Billy')
+ >>> c = read_expr(r'-walk(William)')
+ >>> prover = Prover9Command(c, [p1,p2,p3])
+ >>> prover.prove()
+ False
+ >>> unp = UniqueNamesProver(prover)
+ >>> for a in unp.assumptions(): print(a) # doctest: +SKIP
+ all x.(walk(x) -> (x = Socrates))
+ (Bill = William)
+ (Bill = Billy)
+ -(William = Socrates)
+ -(Billy = Socrates)
+ -(Socrates = Bill)
+ >>> unp.prove()
+ True
+
+-----------------------
+Closed World Assumption
+-----------------------
+
+The only entities that have certain properties are those that are stated to
+have them. We implement this assumption by "completing" predicates.
+
+If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion
+of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then
+"all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the
+assumptions don't contain anything that is "P", then "all x.-P(x)" is the
+completion of "P".
+
+ >>> p1 = read_expr(r'walk(Socrates)')
+ >>> p2 = read_expr(r'-(Socrates = Bill)')
+ >>> c = read_expr(r'-walk(Bill)')
+ >>> prover = Prover9Command(c, [p1,p2])
+ >>> prover.prove()
+ False
+ >>> cwp = ClosedWorldProver(prover)
+ >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
+ walk(Socrates)
+ -(Socrates = Bill)
+ all z1.(walk(z1) -> (z1 = Socrates))
+ >>> cwp.prove()
+ True
+
+ >>> p1 = read_expr(r'see(Socrates, John)')
+ >>> p2 = read_expr(r'see(John, Mary)')
+ >>> p3 = read_expr(r'-(Socrates = John)')
+ >>> p4 = read_expr(r'-(John = Mary)')
+ >>> c = read_expr(r'-see(Socrates, Mary)')
+ >>> prover = Prover9Command(c, [p1,p2,p3,p4])
+ >>> prover.prove()
+ False
+ >>> cwp = ClosedWorldProver(prover)
+ >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
+ see(Socrates,John)
+ see(John,Mary)
+ -(Socrates = John)
+ -(John = Mary)
+ all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary))))
+ >>> cwp.prove()
+ True
+
+ >>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))')
+ >>> p2 = read_expr(r'bird(Tweety)')
+ >>> p3 = read_expr(r'-ostrich(Sam)')
+ >>> p4 = read_expr(r'Sam != Tweety')
+ >>> c = read_expr(r'-bird(Sam)')
+ >>> prover = Prover9Command(c, [p1,p2,p3,p4])
+ >>> prover.prove()
+ False
+ >>> cwp = ClosedWorldProver(prover)
+ >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
+ all x.(ostrich(x) -> bird(x))
+ bird(Tweety)
+ -ostrich(Sam)
+ -(Sam = Tweety)
+ all z7.-ostrich(z7)
+ all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8)))
+ >>> print(cwp.prove())
+ True
+
+-----------------------
+Multi-Decorator Example
+-----------------------
+
+Decorators can be nested to combine multiple assumptions.
+ + >>> p1 = read_expr(r'see(Socrates, John)') + >>> p2 = read_expr(r'see(John, Mary)') + >>> c = read_expr(r'-see(Socrates, Mary)') + >>> prover = Prover9Command(c, [p1,p2]) + >>> print(prover.prove()) + False + >>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) + >>> print(cmd.prove()) + True + +----------------- +Default Reasoning +----------------- + >>> logic._counter._value = 0 + >>> premises = [] + +define the taxonomy + + >>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))')) + >>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))')) + >>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))')) + >>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))')) + >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))')) + +default the properties using abnormalities + + >>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly + >>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly + >>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly + +specify abnormal entities + + >>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight + >>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird + >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich + +define entities + + >>> premises.append(read_expr(r'elephant(el)')) + >>> premises.append(read_expr(r'dove(do)')) + >>> premises.append(read_expr(r'ostrich(os)')) + +print the augmented assumptions list + + >>> prover = Prover9Command(None, premises) + >>> command = UniqueNamesProver(ClosedWorldProver(prover)) + >>> for a in command.assumptions(): print(a) # doctest: +SKIP + all x.(elephant(x) -> animal(x)) + all x.(bird(x) -> animal(x)) + all x.(dove(x) -> bird(x)) + all x.(ostrich(x) -> bird(x)) + all x.(flying_ostrich(x) -> ostrich(x)) + all x.((animal(x) & -Ab1(x)) -> -fly(x)) + all x.((bird(x) & -Ab2(x)) -> fly(x)) + all x.((ostrich(x) & -Ab3(x)) -> -fly(x)) + all x.(bird(x) -> Ab1(x)) + all x.(ostrich(x) -> Ab2(x)) + all x.(flying_ostrich(x) -> Ab3(x)) + elephant(el) + dove(do) + ostrich(os) + all z1.(animal(z1) -> (elephant(z1) | bird(z1))) + all z2.(Ab1(z2) -> bird(z2)) + all z3.(bird(z3) -> (dove(z3) | ostrich(z3))) + all z4.(dove(z4) -> (z4 = do)) + all z5.(Ab2(z5) -> ostrich(z5)) + all z6.(Ab3(z6) -> flying_ostrich(z6)) + all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7))) + all z8.-flying_ostrich(z8) + all z9.(elephant(z9) -> (z9 = el)) + -(el = os) + -(el = do) + -(os = do) + + >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove() + True + >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove() + True + >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove() + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en.doctest new file mode 100644 index 0000000000000000000000000000000000000000..5c40554b05c2cf9b121c24070c4e4ee47404dd6a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en.doctest @@ -0,0 +1,568 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT + +================================== +Examples for Portuguese Processing +================================== + +This HOWTO contains a variety of examples relating to the Portuguese language. +It is intended to be read in conjunction with the NLTK book +(``https://www.nltk.org/book/``). For instructions on running the Python +interpreter, please see the section *Getting Started with Python*, in Chapter 1. + +-------------------------------------------- +Python Programming, with Portuguese Examples +-------------------------------------------- + +Chapter 1 of the NLTK book contains many elementary programming examples, all +with English texts. In this section, we'll see some corresponding examples +using Portuguese. Please refer to the chapter for full discussion. *Vamos!* + + >>> from nltk.test.portuguese_en_fixt import setup_module + >>> setup_module() + + >>> from nltk.examples.pt import * + *** Introductory Examples for the NLTK Book *** + Loading ptext1, ... and psent1, ... + Type the name of the text or sentence to view it. + Type: 'texts()' or 'sents()' to list the materials. + ptext1: Memórias Póstumas de Brás Cubas (1881) + ptext2: Dom Casmurro (1899) + ptext3: Gênesis + ptext4: Folha de Sao Paulo (1994) + + +Any time we want to find out about these texts, we just have +to enter their names at the Python prompt: + + >>> ptext2 + + +Searching Text +-------------- + +A concordance permits us to see words in context. + + >>> ptext1.concordance('olhos') + Building index... + Displaying 25 of 138 matches: + De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t + orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor + xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr + gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa + me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f + mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos . + ... + +For a given word, we can find words with a similar text distribution: + + >>> ptext1.similar('chegar') + Building word-context index... + acabada acudir aludir avistar bramanismo casamento cheguei com contar + contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe + >>> ptext3.similar('chegar') + Building word-context index... + achar alumiar arrombar destruir governar guardar ir lavrar passar que + toda tomar ver vir + +We can search for the statistically significant collocations in a text: + + >>> ptext1.collocations() + Building collocations list + Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia + seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa; + por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias + depois; Passeio Público; olhar para; das coisas + +We can search for words in context, with the help of *regular expressions*, e.g.: + + >>> ptext1.findall(" (<.*>)") + estúpidos; e; fechados; rutilantes; súplices; a; do; babavam; + na; moles; se; da; umas; espraiavam; chamejantes; espetados; + ... + +We can automatically generate random text based on a given text, e.g.: + + >>> ptext3.generate() # doctest: +SKIP + No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até + à ave dos céus , { que } será . 
Disse mais Abrão : Dá - me a mulher
+ que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não
+ poderemos descer ; mas , do campo ainda não estava na casa do teu
+ pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o
+ varão , porque habitava na terra de Node , da mão de Esaú : Jeús ,
+ Jalão e Corá
+
+Texts as List of Words
+----------------------
+
+A few sentences have been defined for you.
+
+ >>> psent1
+ ['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais',
+ 'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',',
+ 'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais',
+ 'genu\xedna', 'fei\xe7\xe3o', '.']
+ >>>
+
+Notice that the sentence has been *tokenized*. Each token is
+represented as a string, displayed using quotes, e.g. ``'coisa'``.
+Some strings contain special characters, e.g. ``\xf3``,
+the internal representation for ó.
+The tokens are combined in the form of a *list*. How long is this list?
+
+ >>> len(psent1)
+ 25
+ >>>
+
+What is the vocabulary of this sentence?
+
+ >>> sorted(set(psent1))
+ [',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era',
+ 'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no',
+ 'o', 'que', 'sua', 'verdadeiramente']
+ >>>
+
+Let's iterate over each item in ``psent2``, and print information for each:
+
+ >>> for w in psent2:
+ ...     print(w, len(w), w[-1])
+ ...
+ Não 3 o
+ consultes 9 s
+ dicionários 11 s
+ . 1 .
+
+Notice that we accessed the last character of the string ``w`` using ``w[-1]``.
+
+We just saw a ``for`` loop above. Another useful control structure is a
+*list comprehension*.
+
+ >>> [w.upper() for w in psent2]
+ ['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.']
+ >>> [w for w in psent1 if w.endswith('a')]
+ ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna']
+ >>> [w for w in ptext4 if len(w) > 15]
+ ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro',
+ 'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente']
+
+We can examine the relative frequency of words in a text, using ``FreqDist``:
+
+ >>> fd1 = FreqDist(ptext1)
+ >>> fd1
+
+ >>> fd1['olhos']
+ 137
+ >>> fd1.max()
+ ','
+ >>> fd1.samples()[:100]
+ [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o',
+ '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu',
+ 'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?',
+ 'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia',
+ 'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito',
+ 'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem',
+ 'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem',
+ 'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia',
+ 't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma',
+ 'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim',
+ 'ent\xe3o', 'dizia', 'aos', 'Borba']
+
+---------------
+Reading Corpora
+---------------
+
+Accessing the Machado Text Corpus
+---------------------------------
+
+NLTK includes the complete works of Machado de Assis.
+
+ >>> from nltk.corpus import machado
+ >>> machado.fileids()
+ ['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...]
+
+Each file corresponds to one of the works of Machado de Assis.
To see a complete +list of works, you can look at the corpus README file: ``print machado.readme()``. +Let's access the text of the *Posthumous Memories of Brás Cubas*. + +We can access the text as a list of characters, and access 200 characters starting +from position 10,000. + + >>> raw_text = machado.raw('romance/marm05.txt') + >>> raw_text[10000:10200] + u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde + da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a + tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape' + +However, this is not a very useful way to work with a text. We generally think +of a text as a sequence of words and punctuation, not characters: + + >>> text1 = machado.words('romance/marm05.txt') + >>> text1 + ['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...] + >>> len(text1) + 77098 + >>> len(set(text1)) + 10848 + +Here's a program that finds the most common ngrams that contain a +particular target word. + + >>> from nltk import ngrams, FreqDist + >>> target_word = 'olhos' + >>> fd = FreqDist(ng + ... for ng in ngrams(text1, 5) + ... if target_word in ng) + >>> for hit in fd.samples(): + ... print(' '.join(hit)) + ... + , com os olhos no + com os olhos no ar + com os olhos no chão + e todos com os olhos + me estar com os olhos + os olhos estúpidos , a + os olhos na costura , + os olhos no ar , + , com os olhos espetados + , com os olhos estúpidos + , com os olhos fitos + , com os olhos naquele + , com os olhos para + + +Accessing the MacMorpho Tagged Corpus +------------------------------------- + +NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text, +with over a million words of +journalistic texts extracted from ten sections of +the daily newspaper *Folha de Sao Paulo*, 1994. + +We can access this corpus as a sequence of words or tagged words as follows: + + >>> import nltk.corpus + >>> nltk.corpus.mac_morpho.words() + ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...] + >>> nltk.corpus.mac_morpho.sents() + [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o', + 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'], + ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional', + 'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...] + >>> nltk.corpus.mac_morpho.tagged_words() + [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...] + +We can also access it in sentence chunks. + + >>> nltk.corpus.mac_morpho.tagged_sents() + [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'), + ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'), + ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), + ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'), + ('Paulo', 'NPROP')], + [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), + ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'), + ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), + ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...] + +This data can be used to train taggers (examples below for the Floresta treebank). + +Accessing the Floresta Portuguese Treebank +------------------------------------------ + +The NLTK data distribution includes the +"Floresta Sinta(c)tica Corpus" version 7.4, available from +``https://www.linguateca.pt/Floresta/``. 
+ +We can access this corpus as a sequence of words or tagged words as follows: + + >>> from nltk.corpus import floresta + >>> floresta.words() + ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...] + >>> floresta.tagged_words() + [('Um', '>N+art'), ('revivalismo', 'H+n'), ...] + +The tags consist of some syntactic information, followed by a plus sign, +followed by a conventional part-of-speech tag. Let's strip off the material before +the plus sign: + + >>> def simplify_tag(t): + ... if "+" in t: + ... return t[t.index("+")+1:] + ... else: + ... return t + >>> twords = floresta.tagged_words() + >>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords] + >>> twords[:10] + [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'), + ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')] + +Pretty printing the tagged words: + + >>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10])) + um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art + +Count the word tokens and types, and determine the most common word: + + >>> words = floresta.words() + >>> len(words) + 211852 + >>> fd = nltk.FreqDist(words) + >>> len(fd) + 29421 + >>> fd.max() + 'de' + +List the 20 most frequent tags, in order of decreasing frequency: + + >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()] + >>> fd = nltk.FreqDist(tags) + >>> fd.keys()[:20] + ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.', + 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp', + 'pron-pers', '\xab', '\xbb', 'conj-s', '}'] + +We can also access the corpus grouped by sentence: + + >>> floresta.sents() + [['Um', 'revivalismo', 'refrescante'], + ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite', + 'algarvia', '.'], ...] + >>> floresta.tagged_sents() + [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')], + [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'), + ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'), + ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')], + ...] + >>> floresta.parsed_sents() + [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']), + Tree('N<+adj', ['refrescante'])]), + Tree('STA+fcl', + [Tree('SUBJ+np', [Tree('>N+art', ['O']), + Tree('H+prop', ['7_e_Meio'])]), + Tree('P+v-fin', ['\xe9']), + Tree('SC+np', + [Tree('>N+art', ['um']), + Tree('H+n', ['ex-libris']), + Tree('N<+pp', [Tree('H+prp', ['de']), + Tree('P<+np', [Tree('>N+art', ['a']), + Tree('H+n', ['noite']), + Tree('N<+adj', ['algarvia'])])])]), + Tree('.', ['.'])]), ...] + +To view a parse tree, use the ``draw()`` method, e.g.: + + >>> psents = floresta.parsed_sents() + >>> psents[5].draw() # doctest: +SKIP + +Character Encodings +------------------- + +Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1). + + >>> import os, nltk.test + >>> testdir = os.path.split(nltk.test.__file__)[0] + >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1') + >>> text[:60] + 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' + >>> print(text[:60]) + O 7 e Meio é um ex-libris da noite algarvia. + É uma das mais + +For more information about character encodings and Python, please see section 3.3 of the book. 
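+
+As a minimal sketch (reusing ``testdir`` and the ``floresta.txt`` fixture
+from above), the Latin-1 bytes can be re-encoded as UTF-8, the encoding most
+modern tools expect:
+
+ >>> data = open(os.path.join(testdir, 'floresta.txt'), 'rb').read()
+ >>> utf8_bytes = data.decode('ISO 8859-1').encode('utf-8')
+ >>> utf8_bytes[:10]
+ b'O 7 e Meio'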
+
+----------------
+Processing Tasks
+----------------
+
+
+Simple Concordancing
+--------------------
+
+Here's a function that takes a word and a specified amount of context (measured
+in characters), and generates a concordance for that word.
+
+ >>> def concordance(word, context=30):
+ ...     for sent in floresta.sents():
+ ...         if word in sent:
+ ...             pos = sent.index(word)
+ ...             left = ' '.join(sent[:pos])
+ ...             right = ' '.join(sent[pos+1:])
+ ...             print('%*s %s %-*s' %
+ ...                 (context, left[-context:], word, context, right[:context]))
+
+ >>> concordance("dar") # doctest: +SKIP
+ anduru , foi o suficiente para dar a volta a o resultado .
+ 1. O P?BLICO veio dar a a imprensa di?ria portuguesa
+ A fartura de pensamento pode dar maus resultados e n?s n?o quer
+ Come?a a dar resultados a pol?tica de a Uni
+ ial come?ar a incorporar- lo e dar forma a um ' site ' que tem se
+ r com Constantino para ele lhe dar tamb?m os pap?is assinados .
+ va a brincar , pois n?o lhe ia dar procura??o nenhuma enquanto n?
+ ?rica como o ant?doto capaz de dar sentido a o seu enorme poder .
+ . . .
+ >>> concordance("vender") # doctest: +SKIP
+ er recebido uma encomenda para vender 4000 blindados a o Iraque .
+ m?rico_Amorim caso conseguisse vender o lote de ac??es de o empres?r
+ mpre ter jovens simp?ticos a ? vender ? chega ! }
+ Disse que o governo vai vender ? desde autom?vel at? particip
+ ndiciou ontem duas pessoas por vender carro com ?gio .
+ A inten??o de Fleury ? vender as a??es para equilibrar as fi
+
+Part-of-Speech Tagging
+----------------------
+
+Let's begin by getting the tagged sentence data, and simplifying the tags
+as described earlier.
+
+ >>> from nltk.corpus import floresta
+ >>> tsents = floresta.tagged_sents()
+ >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
+ >>> train = tsents[100:]
+ >>> test = tsents[:100]
+
+We already know that ``n`` is the most common tag, so we can set up a
+default tagger that tags every word as a noun, and see how well it does:
+
+ >>> tagger0 = nltk.DefaultTagger('n')
+ >>> nltk.tag.accuracy(tagger0, test)
+ 0.17697228144989338
+
+Evidently, about one in every six words is a noun. Let's improve on this by
+training a unigram tagger:
+
+ >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
+ >>> nltk.tag.accuracy(tagger1, test)
+ 0.87029140014214645
+
+Next a bigram tagger:
+
+ >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1)
+ >>> nltk.tag.accuracy(tagger2, test)
+ 0.89019189765458417
+
+
+Sentence Segmentation
+---------------------
+
+Punkt is a language-neutral sentence segmentation tool. Here we load a model
+that has been pre-trained on Portuguese, and use it to split the raw text of
+a Machado novel into sentences:
+
+ >>> sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle')
+ >>> raw_text = machado.raw('romance/marm05.txt')
+ >>> sentences = sent_tokenizer.tokenize(raw_text)
+ >>> for sent in sentences[1000:1005]:
+ ...     print("<<", sent, ">>")
+ ...
+ << Em verdade, parecia ainda mais mulher do que era;
+ seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a
+ compostura da mulher casada. >>
+ << Talvez essa circunstância lhe diminuía um pouco da
+ graça virginal. >>
+ << Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu
+ escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro
+ do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de
+ diamante... >>
+ << Digo lá dentro, porque cá fora o
+ que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e
+ começou a bater as asas em derredor de D. Eusébia.
>> + << D. Eusébia deu um grito, + levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >> + +The sentence tokenizer can be trained and evaluated on other text. +The source text (from the Floresta Portuguese Treebank) contains one sentence per line. +We read the text, split it into its lines, and then join these lines together using +spaces. Now the information about sentence breaks has been discarded. We split this +material into training and testing data: + + >>> import os, nltk.test + >>> testdir = os.path.split(nltk.test.__file__)[0] + >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1') + >>> lines = text.split('\n') + >>> train = ' '.join(lines[10:]) + >>> test = ' '.join(lines[:10]) + +Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences: + + >>> stok = nltk.PunktSentenceTokenizer(train) + >>> print(stok.tokenize(test)) + ['O 7 e Meio \xe9 um ex-libris da noite algarvia.', + '\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira, + que continua a manter os tra\xe7os decorativos e as clientelas de sempre.', + '\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite, + a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa, + Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa, + que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.', + 'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio, + cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas + j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao + Calypso e encontramo-nos na Locomia\xbb.', + 'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o + do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos, + v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios, + j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta + aquele membro do Governo.', + 'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado, + que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em + fundos comunit\xe1rios\xbb.', + 'Mas como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?', + '\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas, + eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam + os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb, + dado serem organismos do Estado.', + 'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT + est\xe1 cada vez mais enfraquecida.', + 'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.'] + +NLTK's data collection includes a trained model for Portuguese sentence +segmentation, which can be loaded as follows. It is faster to load a trained model than +to retrain it. + + >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle') + +Stemming +-------- + +NLTK includes the RSLP Portuguese stemmer. 
Here we use it to stem some Portuguese text: + + >>> stemmer = nltk.stem.RSLPStemmer() + >>> stemmer.stem("copiar") + 'copi' + >>> stemmer.stem("paisagem") + 'pais' + + +Stopwords +--------- + +NLTK includes Portuguese stopwords: + + >>> stopwords = nltk.corpus.stopwords.words('portuguese') + >>> stopwords[:10] + ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9'] + +Now we can use these to filter text. Let's find the most frequent words (other than stopwords) +and print them in descending order of frequency: + + >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords) + >>> for word in list(fd.keys())[:20]: + ... print(word, fd[word]) + , 13444 + . 7725 + « 2369 + » 2310 + é 1305 + o 1086 + } 1047 + { 1044 + a 897 + ; 633 + em 516 + ser 466 + sobre 349 + os 313 + anos 301 + ontem 292 + ainda 279 + segundo 256 + ter 249 + dois 231 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en_fixt.py new file mode 100644 index 0000000000000000000000000000000000000000..1e86682b0810ef1299cf353ae606db9f9e9114d7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/portuguese_en_fixt.py @@ -0,0 +1,4 @@ +def setup_module(): + import pytest + + pytest.skip("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability.doctest new file mode 100644 index 0000000000000000000000000000000000000000..134d97be65af3c3779fd8c81ad5dd02cdc63aaef --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability.doctest @@ -0,0 +1,306 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=========== +Probability +=========== + + >>> from nltk.test.probability_fixt import setup_module + >>> setup_module() + + >>> import nltk + >>> from nltk.probability import * + +FreqDist +-------- + + >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] + >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.'] + + >>> fd1 = nltk.FreqDist(text1) + >>> fd1 == nltk.FreqDist(text1) + True + +Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order. 
+ + >>> import itertools + >>> both = nltk.FreqDist(text1 + text2) + >>> both_most_common = both.most_common() + >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1])))) + [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)] + + >>> both == fd1 + nltk.FreqDist(text2) + True + >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged + True + + >>> fd2 = nltk.FreqDist(text2) + >>> fd1.update(fd2) + >>> fd1 == both + True + + >>> fd1 = nltk.FreqDist(text1) + >>> fd1.update(text2) + >>> fd1 == both + True + + >>> fd1 = nltk.FreqDist(text1) + >>> fd2 = nltk.FreqDist(fd1) + >>> fd2 == fd1 + True + +``nltk.FreqDist`` can be pickled: + + >>> import pickle + >>> fd1 = nltk.FreqDist(text1) + >>> pickled = pickle.dumps(fd1) + >>> fd1 == pickle.loads(pickled) + True + +Mathematical operations: + + >>> FreqDist('abbb') + FreqDist('bcc') + FreqDist({'b': 4, 'c': 2, 'a': 1}) + >>> FreqDist('abbbc') - FreqDist('bccd') + FreqDist({'b': 2, 'a': 1}) + >>> FreqDist('abbb') | FreqDist('bcc') + FreqDist({'b': 3, 'c': 2, 'a': 1}) + >>> FreqDist('abbb') & FreqDist('bcc') + FreqDist({'b': 1}) + +ConditionalFreqDist +------------------- + + >>> cfd1 = ConditionalFreqDist() + >>> cfd1[1] = FreqDist('abbbb') + >>> cfd1[2] = FreqDist('xxxxyy') + >>> cfd1 + + + >>> cfd2 = ConditionalFreqDist() + >>> cfd2[1] = FreqDist('bbccc') + >>> cfd2[2] = FreqDist('xxxyyyzz') + >>> cfd2[3] = FreqDist('m') + >>> cfd2 + + + >>> r = cfd1 + cfd2 + >>> [(i,r[i]) for i in r.conditions()] + [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))] + + >>> r = cfd1 - cfd2 + >>> [(i,r[i]) for i in r.conditions()] + [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))] + + >>> r = cfd1 | cfd2 + >>> [(i,r[i]) for i in r.conditions()] + [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))] + + >>> r = cfd1 & cfd2 + >>> [(i,r[i]) for i in r.conditions()] + [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))] + +Testing some HMM estimators +--------------------------- + +We extract a small part (500 sentences) of the Brown corpus + + >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] + >>> print(len(corpus)) + 500 + +We create a HMM trainer - note that we need the tags and symbols +from the whole corpus, not just the training corpus + + >>> from nltk.util import unique_list + >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) + >>> print(len(tag_set)) + 92 + >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) + >>> print(len(symbols)) + 1464 + >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) + +We divide the corpus into 90% training and 10% testing + + >>> train_corpus = [] + >>> test_corpus = [] + >>> for i in range(len(corpus)): + ... if i % 10: + ... train_corpus += [corpus[i]] + ... else: + ... test_corpus += [corpus[i]] + >>> print(len(train_corpus)) + 450 + >>> print(len(test_corpus)) + 50 + +And now we can test the estimators + + >>> def train_and_test(est): + ... hmm = trainer.train_supervised(train_corpus, estimator=est) + ... 
print('%.2f%%' % (100 * hmm.accuracy(test_corpus)))
+
+Maximum Likelihood Estimation
+-----------------------------
+- this resulted in an initialization error before r7209
+
+ >>> mle = lambda fd, bins: MLEProbDist(fd)
+ >>> train_and_test(mle)
+ 22.75%
+
+Laplace (= Lidstone with gamma==1)
+
+ >>> train_and_test(LaplaceProbDist)
+ 66.04%
+
+Expected Likelihood Estimation (= Lidstone with gamma==0.5)
+
+ >>> train_and_test(ELEProbDist)
+ 73.01%
+
+Lidstone Estimation, for gamma==0.1, 0.5 and 1
+(the latter two should give exactly the same results as ELE and Laplace above)
+
+ >>> def lidstone(gamma):
+ ...     return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
+ >>> train_and_test(lidstone(0.1))
+ 82.51%
+ >>> train_and_test(lidstone(0.5))
+ 73.01%
+ >>> train_and_test(lidstone(1.0))
+ 66.04%
+
+Witten Bell Estimation
+----------------------
+- This resulted in a ZeroDivisionError before r7209
+
+ >>> train_and_test(WittenBellProbDist)
+ 88.12%
+
+Good Turing Estimation
+----------------------
+
+ >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
+ >>> train_and_test(gt)
+ 86.93%
+
+Kneser Ney Estimation
+---------------------
+Since the Kneser-Ney distribution is best suited for trigrams, we must adjust
+our testing accordingly.
+
+ >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
+ ...     for x, y, z in nltk.trigrams(sent)]
+ ...     for sent in corpus[:100]]
+
+We will then need to redefine the rest of the training/testing variables:
+
+ >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
+ >>> len(tag_set)
+ 906
+
+ >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
+ >>> len(symbols)
+ 1341
+
+ >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
+ >>> train_corpus = []
+ >>> test_corpus = []
+
+ >>> for i in range(len(corpus)):
+ ...     if i % 10:
+ ...         train_corpus += [corpus[i]]
+ ...     else:
+ ...         test_corpus += [corpus[i]]
+
+ >>> len(train_corpus)
+ 90
+ >>> len(test_corpus)
+ 10
+
+ >>> kn = lambda fd, bins: KneserNeyProbDist(fd)
+ >>> train_and_test(kn)
+ 0.86%
+
+Still to be added:
+- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist
+
+Squashed bugs
+-------------
+
+Issue 511: override pop and popitem to invalidate the cache
+
+ >>> fd = nltk.FreqDist('a')
+ >>> list(fd.keys())
+ ['a']
+ >>> fd.pop('a')
+ 1
+ >>> list(fd.keys())
+ []
+
+Issue 533: access cumulative frequencies with no arguments
+
+ >>> fd = nltk.FreqDist('aab')
+ >>> list(fd._cumulative_frequencies(['a']))
+ [2.0]
+ >>> list(fd._cumulative_frequencies(['a', 'b']))
+ [2.0, 3.0]
+
+Issue 579: override clear to reset some variables
+
+ >>> fd = FreqDist('aab')
+ >>> fd.clear()
+ >>> fd.N()
+ 0
+
+Issue 351: fix the fileids method of CategorizedCorpusReader so that it does
+not inadvertently add errant categories
+
+ >>> from nltk.corpus import brown
+ >>> brown.fileids('blah')
+ Traceback (most recent call last):
+ ...
+ ValueError: Category blah not found
+ >>> brown.categories()
+ ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
+
+Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default;
+otherwise any unseen events get a probability of zero, i.e.,
+they don't get smoothed
+
+ >>> from nltk import SimpleGoodTuringProbDist, FreqDist
+ >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
+ >>> p = SimpleGoodTuringProbDist(fd)
+ >>> p.prob('a')
+ 0.017649766667026317...
+ >>> p.prob('o')
+ 0.08433050215340411...
+ >>> p.prob('z')
+ 0.022727272727272728...
+ >>> p.prob('foobar')
+ 0.022727272727272728...
+
+``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and
+``ConditionalFreqDist`` can be pickled:
+
+ >>> import pickle
+ >>> pd = MLEProbDist(fd)
+ >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
+ True
+ >>> dpd = DictionaryConditionalProbDist({'x': pd})
+ >>> unpickled = pickle.loads(pickle.dumps(dpd))
+ >>> dpd['x'].prob('a')
+ 0.011363636...
+ >>> dpd['x'].prob('a') == unpickled['x'].prob('a')
+ True
+ >>> cfd = nltk.probability.ConditionalFreqDist()
+ >>> cfd['foo']['hello'] += 1
+ >>> cfd['foo']['hello'] += 1
+ >>> cfd['bar']['hello'] += 1
+ >>> cfd2 = pickle.loads(pickle.dumps(cfd))
+ >>> cfd2 == cfd
+ True
+ >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
+ >>> cpd2 = pickle.loads(pickle.dumps(cpd))
+ >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
+ True
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability_fixt.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67809384d3780fa9d1b3efcf4ca51e10cd4be00
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/probability_fixt.py
@@ -0,0 +1,8 @@
+# probability.doctest uses HMM which requires numpy;
+# skip probability.doctest if numpy is not available
+
+
+def setup_module():
+    import pytest
+
+    pytest.importorskip("numpy")
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/propbank.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/propbank.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..2840c5845353028c93789db5f51d5997ae00329c
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/propbank.doctest
@@ -0,0 +1,176 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+========
+PropBank
+========
+
+The PropBank Corpus provides predicate-argument annotation for the
+entire Penn Treebank. Each verb in the treebank is annotated by a single
+instance in PropBank, containing information about the location of
+the verb, and the location and identity of its arguments:
+
+ >>> from nltk.corpus import propbank
+ >>> pb_instances = propbank.instances()
+ >>> print(pb_instances)
+ [,
+ , ...]
+ +Each propbank instance defines the following member variables: + + - Location information: `fileid`, `sentnum`, `wordnum` + - Annotator information: `tagger` + - Inflection information: `inflection` + - Roleset identifier: `roleset` + - Verb (aka predicate) location: `predicate` + - Argument locations and types: `arguments` + +The following examples show the types of these arguments: + + >>> inst = pb_instances[103] + >>> (inst.fileid, inst.sentnum, inst.wordnum) + ('wsj_0004.mrg', 8, 16) + >>> inst.tagger + 'gold' + >>> inst.inflection + + >>> infl = inst.inflection + >>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice + ('v', 'p', '-', '-', 'a') + >>> inst.roleset + 'rise.01' + >>> inst.predicate + PropbankTreePointer(16, 0) + >>> inst.arguments + ((PropbankTreePointer(0, 2), 'ARG1'), + (PropbankTreePointer(13, 1), 'ARGM-DIS'), + (PropbankTreePointer(17, 1), 'ARG4-to'), + (PropbankTreePointer(20, 1), 'ARG3-from')) + +The location of the predicate and of the arguments are encoded using +`PropbankTreePointer` objects, as well as `PropbankChainTreePointer` +objects and `PropbankSplitTreePointer` objects. A +`PropbankTreePointer` consists of a `wordnum` and a `height`: + + >>> print(inst.predicate.wordnum, inst.predicate.height) + 16 0 + +This identifies the tree constituent that is headed by the word that +is the `wordnum`\ 'th token in the sentence, and whose span is found +by going `height` nodes up in the tree. This type of pointer is only +useful if we also have the corresponding tree structure, since it +includes empty elements such as traces in the word number count. The +trees for 10% of the standard PropBank Corpus are contained in the +`treebank` corpus: + + >>> tree = inst.tree + + >>> from nltk.corpus import treebank + >>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum] + + >>> inst.predicate.select(tree) + Tree('VBD', ['rose']) + >>> for (argloc, argid) in inst.arguments: + ... print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50])) + ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP ( + ARGM-DIS (PP (IN for) (NP (NN example))) + ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %))) + ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %))) + +Propbank tree pointers can be converted to standard tree locations, +which are usually easier to work with, using the `treepos()` method: + + >>> treepos = inst.predicate.treepos(tree) + >>> print (treepos, tree[treepos]) + (4, 0) (VBD rose) + +In some cases, argument locations will be encoded using +`PropbankChainTreePointer`\ s (for trace chains) or +`PropbankSplitTreePointer`\ s (for discontinuous constituents). Both +of these objects contain a single member variable, `pieces`, +containing a list of the constituent pieces. They also define the +method `select()`, which will return a tree containing all the +elements of the argument. (A new head node is created, labeled +"*CHAIN*" or "*SPLIT*", since the argument is not a single constituent +in the original tree). Sentence #6 contains an example of an argument +that is both discontinuous and contains a chain: + + >>> inst = pb_instances[6] + >>> inst.roleset + 'expose.01' + >>> argloc, argid = inst.arguments[2] + >>> argloc + + >>> argloc.pieces + [, PropbankTreePointer(27, 0)] + >>> argloc.pieces[0].pieces + ... 
+ [PropbankTreePointer(22, 1), PropbankTreePointer(24, 0), + PropbankTreePointer(25, 1)] + >>> print(argloc.select(inst.tree)) + (*CHAIN* + (*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers))) + (-NONE- *)) + +The PropBank Corpus also provides access to the frameset files, which +define the argument labels used by the annotations, on a per-verb +basis. Each frameset file contains one or more predicates, such as +'turn' or 'turn_on', each of which is divided into coarse-grained word +senses called rolesets. For each roleset, the frameset file provides +descriptions of the argument roles, along with examples. + + >>> expose_01 = propbank.roleset('expose.01') + >>> turn_01 = propbank.roleset('turn.01') + >>> print(turn_01) + + >>> for role in turn_01.findall("roles/role"): + ... print(role.attrib['n'], role.attrib['descr']) + 0 turner + 1 thing turning + m direction, location + + >>> from xml.etree import ElementTree + >>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip()) + + + John turned the key in the lock. + + John + turned + the key + in the lock + + +Note that the standard corpus distribution only contains 10% of the +treebank, so the parse trees are not available for instances starting +at 9353: + + >>> inst = pb_instances[9352] + >>> inst.fileid + 'wsj_0199.mrg' + >>> print(inst.tree) + (S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...)) + >>> print(inst.predicate.select(inst.tree)) + (VB begin) + + >>> inst = pb_instances[9353] + >>> inst.fileid + 'wsj_0200.mrg' + >>> print(inst.tree) + None + >>> print(inst.predicate.select(inst.tree)) + Traceback (most recent call last): + . . . + ValueError: Parse tree not available + +However, if you supply your own version of the treebank corpus (by +putting it before the nltk-provided version on `nltk.data.path`, or +by creating a `ptb` directory as described above and using the +`propbank_ptb` module), then you can access the trees for all +instances. + +A list of the verb lemmas contained in PropBank is returned by the +`propbank.verbs()` method: + + >>> propbank.verbs() + ['abandon', 'abate', 'abdicate', 'abet', 'abide', ...] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest new file mode 100644 index 0000000000000000000000000000000000000000..54fbbe746a5d8eb8e8b729f2551c5e1cfbc4e704 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest @@ -0,0 +1,263 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +====================== +Information Extraction +====================== + +Information Extraction standardly consists of three subtasks: + +#. Named Entity Recognition + +#. Relation Extraction + +#. Template Filling + +Named Entities +~~~~~~~~~~~~~~ + +The IEER corpus is marked up for a variety of Named Entities. A Named +Entity (more strictly, a Named Entity mention) is a name of an +entity belonging to a specified class. For example, the Named Entity +classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so +on. Within NLTK, Named Entities are represented as subtrees within a +chunk structure: the class name is treated as node label, while the +entity mention itself appears as the leaves of the subtree. 
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..54fbbe746a5d8eb8e8b729f2551c5e1cfbc4e704
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/relextract.doctest
@@ -0,0 +1,263 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+======================
+Information Extraction
+======================
+
+Information Extraction standardly consists of three subtasks:
+
+#. Named Entity Recognition
+
+#. Relation Extraction
+
+#. Template Filling
+
+Named Entities
+~~~~~~~~~~~~~~
+
+The IEER corpus is marked up for a variety of Named Entities. A Named
+Entity (more strictly, a Named Entity mention) is a name of an
+entity belonging to a specified class. For example, the Named Entity
+classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so
+on. Within NLTK, Named Entities are represented as subtrees within a
+chunk structure: the class name is treated as node label, while the
+entity mention itself appears as the leaves of the subtree. This is
+illustrated below, where we have shown an extract of the chunk
+representation of document NYT_19980315.064:
+
+    >>> from nltk.corpus import ieer
+    >>> docs = ieer.parsed_docs('NYT_19980315')
+    >>> tree = docs[1].text
+    >>> print(tree)
+    (DOCUMENT
+    ...
+      ``It's
+      a
+      chance
+      to
+      think
+      about
+      first-level
+      questions,''
+      said
+      Ms.
+      (PERSON Cohn)
+      ,
+      a
+      partner
+      in
+      the
+      (ORGANIZATION McGlashan & Sarrail)
+      firm
+      in
+      (LOCATION San Mateo)
+      ,
+      (LOCATION Calif.)
+      ...)
+
+Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan &
+Sarrail*, *San Mateo* and *Calif.*.
+
+The CoNLL2002 Dutch and Spanish data is treated similarly, although in
+this case, the strings are also POS tagged.
+
+    >>> from nltk.corpus import conll2002
+    >>> for doc in conll2002.chunked_sents('ned.train')[27]:
+    ...     print(doc)
+    ('Het', 'Art')
+    (ORG Hof/N van/Prep Cassatie/N)
+    ('verbrak', 'V')
+    ('het', 'Art')
+    ('arrest', 'N')
+    ('zodat', 'Conj')
+    ('het', 'Pron')
+    ('moest', 'V')
+    ('worden', 'V')
+    ('overgedaan', 'V')
+    ('door', 'Prep')
+    ('het', 'Art')
+    ('hof', 'N')
+    ('van', 'Prep')
+    ('beroep', 'N')
+    ('van', 'Prep')
+    (LOC Antwerpen/N)
+    ('.', 'Punc')
+
+Relation Extraction
+~~~~~~~~~~~~~~~~~~~
+
+Relation Extraction standardly consists of identifying specified
+relations between Named Entities. For example, assuming that we can
+recognize ORGANIZATIONs and LOCATIONs in text, we might want to also
+recognize pairs *(o, l)* of these kinds of entities such that *o* is
+located in *l*.
+
+The `sem.relextract` module provides some tools to help carry out a
+simple version of this task. The `tree2semi_rel()` function splits a chunk
+document into a list of two-member lists, each of which consists of a
+(possibly empty) string followed by a `Tree` (i.e., a Named Entity):
+
+    >>> from nltk.sem import relextract
+    >>> pairs = relextract.tree2semi_rel(tree)
+    >>> for s, tree in pairs[18:22]:
+    ...     print('("...%s", %s)' % (" ".join(s[-5:]),tree))
+    ("...about first-level questions,'' said Ms.", (PERSON Cohn))
+    ("..., a partner in the", (ORGANIZATION McGlashan & Sarrail))
+    ("...firm in", (LOCATION San Mateo))
+    ("...,", (LOCATION Calif.))
+
+The function `semi_rel2reldict()` processes triples of these pairs, i.e.,
+pairs of the form ``((string1, Tree1), (string2, Tree2), (string3,
+Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is
+the subject of the relation, ``string2`` is the filler
+and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are
+stored as left and right context respectively.
+
+    >>> reldicts = relextract.semi_rel2reldict(pairs)
+    >>> for k, v in sorted(reldicts[0].items()):
+    ...     print(k, '=>', v)
+    filler => of messages to their own ``Cyberia'' ...
+    lcon => transactions.'' Each week, they post
+    objclass => ORGANIZATION
+    objsym => white_house
+    objtext => White House
+    rcon => for access to its planned
+    subjclass => CARDINAL
+    subjsym => hundreds
+    subjtext => hundreds
+    untagged_filler => of messages to their own ``Cyberia'' ...
+
+The next example shows some of the values for two `reldict`\ s
+corresponding to the ``'NYT_19980315'`` text extract shown earlier.
+
+    >>> for r in reldicts[18:20]:
+    ...     print('=' * 20)
+    ...     print(r['subjtext'])
+    ...     print(r['filler'])
+    ...     print(r['objtext'])
+    ====================
+    Cohn
+    , a partner in the
+    McGlashan & Sarrail
+    ====================
+    McGlashan & Sarrail
+    firm in
+    San Mateo
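+
+Since a `reldict` is an ordinary dictionary, candidates can also be
+filtered with plain Python before reaching for the dedicated filtering
+function introduced next. A minimal sketch (the variable names are
+ours)::
+
+    # Keep ORGANIZATION-subject / LOCATION-object candidates, mimicking
+    # by hand part of what extract_rels() does below.
+    org_loc = [r for r in reldicts
+               if r['subjclass'] == 'ORGANIZATION'
+               and r['objclass'] == 'LOCATION']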
+The `extract_rels()` function allows us to filter the `reldict`\ s
+according to the classes of the subject and object named entities. In
+addition, we can specify that the filler text has to match a given
+regular expression, as illustrated in the next example. Here, we are
+looking for pairs of entities in the IN relation, where IN has
+signature <ORG, LOC>.
+
+    >>> import re
+    >>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
+    >>> for fileid in ieer.fileids():
+    ...     for doc in ieer.parsed_docs(fileid):
+    ...         for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
+    ...             print(relextract.rtuple(rel))
+    [ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy']
+    [ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon']
+    [ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut']
+    [ORG: 'U.N.'] 'failures in' [LOC: 'Africa']
+    [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
+    [ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa']
+    [ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a']
+    [ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky']
+    [ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak']
+    [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
+    [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
+    [ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
+    [ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
+    [ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
+    [ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
+    [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
+    ...
+
+The next example illustrates a case where the pattern is a disjunction
+of roles that a PERSON can occupy in an ORGANIZATION.
+
+    >>> roles = r"""
+    ... (.*(
+    ... analyst|
+    ... chair(wo)?man|
+    ... commissioner|
+    ... counsel|
+    ... director|
+    ... economist|
+    ... editor|
+    ... executive|
+    ... foreman|
+    ... governor|
+    ... head|
+    ... lawyer|
+    ... leader|
+    ... librarian).*)|
+    ... manager|
+    ... partner|
+    ... president|
+    ... producer|
+    ... professor|
+    ... researcher|
+    ... spokes(wo)?man|
+    ... writer|
+    ... ,\sof\sthe?\s*  # "X, of (the) Y"
+    ... """
+    >>> ROLES = re.compile(roles, re.VERBOSE)
+    >>> for fileid in ieer.fileids():
+    ...     for doc in ieer.parsed_docs(fileid):
+    ...         for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
+    ...             print(relextract.rtuple(rel))
+    [PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly']
+    [PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika']
+    [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
+    [PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo']
+    [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
+    [PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation']
+    ...
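+
+Verbose patterns like ``roles`` are easy to get subtly wrong, so it can
+help to probe them against a few hand-picked filler strings before
+scanning a whole corpus. A small sketch (the sample fillers below are
+invented for illustration)::
+
+    samples = [", chief counsel for the", "became a foreman at the", ", near the"]
+    for s in samples:
+        # ROLES was compiled with re.VERBOSE above; only the first two
+        # fillers contain a role word, so only they should match.
+        print(repr(s), bool(ROLES.match(s)))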
""" + >>> DE = re.compile(de, re.VERBOSE) + >>> rels = [rel for doc in conll2002.chunked_sents('esp.train') + ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] + >>> for r in rels[:10]: + ... print(relextract.clause(r, relsym='DE')) + DE('tribunal_supremo', 'victoria') + DE('museo_de_arte', 'alcorc\xf3n') + DE('museo_de_bellas_artes', 'a_coru\xf1a') + DE('siria', 'l\xedbano') + DE('uni\xf3n_europea', 'pek\xedn') + DE('ej\xe9rcito', 'rogberi') + DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n') + DE('psoe', 'villanueva_de_la_serena') + DE('ej\xe9rcito', 'l\xedbano') + DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta') + >>> vnv = """ + ... ( + ... is/V| + ... was/V| + ... werd/V| + ... wordt/V + ... ) + ... .* + ... van/Prep + ... """ + >>> VAN = re.compile(vnv, re.VERBOSE) + >>> for doc in conll2002.chunked_sents('ned.train'): + ... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN): + ... print(relextract.clause(r, relsym="VAN")) + VAN("cornet_d'elzius", 'buitenlandse_handel') + VAN('johan_rottiers', 'kardinaal_van_roey_instituut') + VAN('annie_lennox', 'eurythmics') diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/semantics.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/semantics.doctest new file mode 100644 index 0000000000000000000000000000000000000000..fdddd5fe944e8bf9fe2d09cf9aa2afa84e776366 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/semantics.doctest @@ -0,0 +1,667 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========= +Semantics +========= + + >>> # Setup tests by setting the counter to 0 + >>> from nltk.sem import logic + >>> logic._counter._value = 0 + + >>> import nltk + >>> from nltk.sem import Valuation, Model + >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), + ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), + ... ('dog', set(['d1'])), + ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] + >>> val = Valuation(v) + >>> dom = val.domain + >>> m = Model(dom, val) + +Evaluation +---------- + +The top-level method of a ``Model`` instance is ``evaluate()``, which +assigns a semantic value to expressions of the ``logic`` module, under +an assignment ``g``: + + >>> dom = val.domain + >>> g = nltk.sem.Assignment(dom) + >>> m.evaluate('all x.(boy(x) -> - girl(x))', g) + True + + +``evaluate()`` calls a recursive function ``satisfy()``, which in turn +calls a function ``i()`` to interpret non-logical constants and +individual variables. ``i()`` delegates the interpretation of these to +the the model's ``Valuation`` and the variable assignment ``g`` +respectively. Any atomic expression which cannot be assigned a value +by ``i`` raises an ``Undefined`` exception; this is caught by +``evaluate``, which returns the string ``'Undefined'``. + + >>> m.evaluate('walk(adam)', g, trace=2) + + 'walk(adam)' is undefined under M, g + 'Undefined' + +Batch Processing +---------------- + +The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to +help with processing multiple sentences. Here's an example of the first of these: + + >>> sents = ['Mary walks'] + >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') + >>> for result in results: + ... for (synrep, semrep) in result: + ... 
+Batch Processing
+----------------
+
+The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to
+help with processing multiple sentences. Here's an example of the first of these:
+
+    >>> sents = ['Mary walks']
+    >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg')
+    >>> for result in results:
+    ...     for (synrep, semrep) in result:
+    ...         print(synrep)
+    (S[SEM=<walk(mary)>]
+      (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>]
+        (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary))
+      (VP[NUM='sg', SEM=<\x.walk(x)>]
+        (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks)))
+
+In order to provide backwards compatibility with 'legacy' grammars where the semantics value
+is specified with a lowercase
+``sem`` feature, the relevant feature name can be passed to the function using the
+``semkey`` parameter, as shown here:
+
+    >>> sents = ['raining']
+    >>> g = nltk.grammar.FeatureGrammar.fromstring("""
+    ... % start S
+    ... S[sem=<raining>] -> 'raining'
+    ... """)
+    >>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem')
+    >>> for result in results:
+    ...     for (synrep, semrep) in result:
+    ...         print(semrep)
+    raining
+
+The function ``evaluate_sents()`` works in a similar manner, but also needs to be
+passed a ``Model`` against which the semantic representations are evaluated.
+
+Unit Tests
+==========
+
+
+Unit tests for relations and valuations
+---------------------------------------
+
+    >>> from nltk.sem import *
+
+Relations are sets of tuples, all of the same length.
+
+    >>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')])
+    >>> is_rel(s1)
+    True
+    >>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)])
+    >>> is_rel(s2)
+    Traceback (most recent call last):
+      . . .
+    ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths
+    >>> s3 = set(['d1', 'd2'])
+    >>> is_rel(s3)
+    Traceback (most recent call last):
+      . . .
+    ValueError: Set set(['d2', 'd1']) contains sequences of different lengths
+    >>> s4 = set2rel(s3)
+    >>> is_rel(s4)
+    True
+    >>> is_rel(set())
+    True
+    >>> null_binary_rel = set([(None, None)])
+    >>> is_rel(null_binary_rel)
+    True
+
+Sets of entities are converted into sets of singleton tuples
+(containing strings).
+
+    >>> sorted(set2rel(s3))
+    [('d1',), ('d2',)]
+    >>> sorted(set2rel(set([1,3,5,])))
+    ['1', '3', '5']
+    >>> set2rel(set()) == set()
+    True
+    >>> set2rel(set2rel(s3)) == set2rel(s3)
+    True
+
+Predication is evaluated by set membership.
+
+    >>> ('d1', 'd2') in s1
+    True
+    >>> ('d2', 'd2') in s1
+    False
+    >>> ('d1',) in s1
+    False
+    >>> 'd2' in s1
+    False
+    >>> ('d1',) in s4
+    True
+    >>> ('d1',) in set()
+    False
+    >>> 'd1' in null_binary_rel
+    False
+
+
+    >>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())])
+    >>> sorted(val['dog'])
+    [('d1',), ('d2',)]
+    >>> val.domain == set(['d1', 'd2'])
+    True
+    >>> print(val.symbols)
+    ['Fido', 'dog', 'walk']
+
+
+Parse a valuation from a string.
+
+    >>> v = """
+    ... john => b1
+    ... mary => g1
+    ... suzie => g2
+    ... fido => d1
+    ... tess => d2
+    ... noosa => n
+    ... girl => {g1, g2}
+    ... boy => {b1, b2}
+    ... dog => {d1, d2}
+    ... bark => {d1, d2}
+    ... walk => {b1, g2, d1}
+    ... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
+    ... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)}
+    ... in => {(b1, n), (b2, n), (d2, n)}
+    ... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)}
+    ...
""" + >>> val = Valuation.fromstring(v) + + >>> print(val) # doctest: +SKIP + {'bark': set([('d1',), ('d2',)]), + 'boy': set([('b1',), ('b2',)]), + 'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]), + 'dog': set([('d1',), ('d2',)]), + 'fido': 'd1', + 'girl': set([('g2',), ('g1',)]), + 'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]), + 'john': 'b1', + 'mary': 'g1', + 'noosa': 'n', + 'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]), + 'suzie': 'g2', + 'tess': 'd2', + 'walk': set([('d1',), ('b1',), ('g2',)]), + 'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])} + + +Unit tests for function argument application in a Model +------------------------------------------------------- + + >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ + ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), + ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])), + ... ('kiss', null_binary_rel)] + >>> val = Valuation(v) + >>> dom = val.domain + >>> m = Model(dom, val) + >>> g = Assignment(dom) + >>> sorted(val['boy']) + [('b1',), ('b2',)] + >>> ('b1',) in val['boy'] + True + >>> ('g1',) in val['boy'] + False + >>> ('foo',) in val['boy'] + False + >>> ('b1', 'g1') in val['love'] + True + >>> ('b1', 'b1') in val['kiss'] + False + >>> sorted(val.domain) + ['b1', 'b2', 'd1', 'g1', 'g2'] + + +Model Tests +=========== + +Extension of Lambda expressions + + >>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ + ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), + ... ('dog', set(['d1'])), + ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] + + >>> val0 = Valuation(v0) + >>> dom0 = val0.domain + >>> m0 = Model(dom0, val0) + >>> g0 = Assignment(dom0) + + >>> print(m0.evaluate(r'\x. \y. love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}}) + True + >>> print(m0.evaluate(r'\x. dog(x) (adam)', g0)) + False + >>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0)) + True + >>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) + True + >>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}) + True + >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}) + True + >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0)) + True + >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0)) + True + >>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0)) + False + >>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0)) + True + >>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}) + True + >>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'}) + True + >>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) + True + + +Propositional Model Test +------------------------ + + >>> tests = [ + ... ('P & Q', True), + ... 
('P & R', False), + ... ('- P', False), + ... ('- R', True), + ... ('- - P', True), + ... ('- (P & R)', True), + ... ('P | R', True), + ... ('R | P', True), + ... ('R | R', False), + ... ('- P | R', False), + ... ('P | - P', True), + ... ('P -> Q', True), + ... ('P -> R', False), + ... ('R -> P', True), + ... ('P <-> P', True), + ... ('R <-> R', True), + ... ('P <-> R', False), + ... ] + >>> val1 = Valuation([('P', True), ('Q', True), ('R', False)]) + >>> dom = set([]) + >>> m = Model(dom, val1) + >>> g = Assignment(dom) + >>> for (sent, testvalue) in tests: + ... semvalue = m.evaluate(sent, g) + ... if semvalue == testvalue: + ... print('*', end=' ') + * * * * * * * * * * * * * * * * * + + +Test of i Function +------------------ + + >>> from nltk.sem import Expression + >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), + ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), + ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] + >>> val = Valuation(v) + >>> dom = val.domain + >>> m = Model(dom, val) + >>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')]) + >>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z'] + >>> parsed_exprs = [Expression.fromstring(e) for e in exprs] + >>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x + >>> for parsed in parsed_exprs: + ... try: + ... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g)))) + ... except Undefined: + ... print("'%s' is Undefined" % parsed) + 'adam' gets value b1 + 'girl' gets value [('g1',), ('g2',)] + 'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')] + 'walks' is Undefined + 'x' gets value b1 + 'y' gets value g2 + 'z' is Undefined + +Test for formulas in Model +-------------------------- + + >>> tests = [ + ... ('love(adam, betty)', True), + ... ('love(adam, sue)', 'Undefined'), + ... ('dog(fido)', True), + ... ('- dog(fido)', False), + ... ('- - dog(fido)', True), + ... ('- dog(sue)', 'Undefined'), + ... ('dog(fido) & boy(adam)', True), + ... ('- (dog(fido) & boy(adam))', False), + ... ('- dog(fido) & boy(adam)', False), + ... ('dog(fido) | boy(adam)', True), + ... ('- (dog(fido) | boy(adam))', False), + ... ('- dog(fido) | boy(adam)', True), + ... ('- dog(fido) | - boy(adam)', False), + ... ('dog(fido) -> boy(adam)', True), + ... ('- (dog(fido) -> boy(adam))', False), + ... ('- dog(fido) -> boy(adam)', True), + ... ('exists x . love(adam, x)', True), + ... ('all x . love(adam, x)', False), + ... ('fido = fido', True), + ... ('exists x . all y. love(x, y)', False), + ... ('exists x . (x = fido)', True), + ... ('all x . (dog(x) | - dog(x))', True), + ... ('adam = mia', 'Undefined'), + ... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}), + ... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}), + ... ('exists z1. boy(z1)', True), + ... ('exists x. (boy(x) & - (x = adam))', True), + ... ('exists x. (boy(x) & all y. love(y, x))', False), + ... ('all x. (boy(x) | girl(x))', False), + ... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False), + ... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True), + ... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False), + ... ('all x. (dog(x) -> - girl(x))', True), + ... ('exists x. exists y. (love(x, y) & love(x, y))', True), + ... ] + >>> for (sent, testvalue) in tests: + ... semvalue = m.evaluate(sent, g) + ... if semvalue == testvalue: + ... 
print('*', end=' ') + ... else: + ... print(sent, semvalue) + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + + + +Satisfier Tests +--------------- + + >>> formulas = [ + ... 'boy(x)', + ... '(x = x)', + ... '(boy(x) | girl(x))', + ... '(boy(x) & girl(x))', + ... 'love(adam, x)', + ... 'love(x, adam)', + ... '- (x = adam)', + ... 'exists z22. love(x, z22)', + ... 'exists y. love(y, x)', + ... 'all y. (girl(y) -> love(x, y))', + ... 'all y. (girl(y) -> love(y, x))', + ... 'all y. (girl(y) -> (boy(x) & love(y, x)))', + ... 'boy(x) & all y. (girl(y) -> love(x, y))', + ... 'boy(x) & all y. (girl(y) -> love(y, x))', + ... 'boy(x) & exists y. (girl(y) & love(y, x))', + ... 'girl(x) -> dog(x)', + ... 'all y. (dog(y) -> (x = y))', + ... '- exists y. love(y, x)', + ... 'exists y. (love(adam, y) & love(y, x))' + ... ] + >>> g.purge() + >>> g.add('x', 'b1') + {'x': 'b1'} + >>> for f in formulas: + ... try: + ... print("'%s' gets value: %s" % (f, m.evaluate(f, g))) + ... except Undefined: + ... print("'%s' is Undefined" % f) + 'boy(x)' gets value: True + '(x = x)' gets value: True + '(boy(x) | girl(x))' gets value: True + '(boy(x) & girl(x))' gets value: False + 'love(adam, x)' gets value: False + 'love(x, adam)' gets value: False + '- (x = adam)' gets value: False + 'exists z22. love(x, z22)' gets value: True + 'exists y. love(y, x)' gets value: True + 'all y. (girl(y) -> love(x, y))' gets value: False + 'all y. (girl(y) -> love(y, x))' gets value: True + 'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True + 'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False + 'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True + 'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True + 'girl(x) -> dog(x)' gets value: True + 'all y. (dog(y) -> (x = y))' gets value: False + '- exists y. love(y, x)' gets value: False + 'exists y. (love(adam, y) & love(y, x))' gets value: True + + >>> from nltk.sem import Expression + >>> for fmla in formulas: + ... p = Expression.fromstring(fmla) + ... g.purge() + ... print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g)))) + Satisfiers of 'boy(x)': + ['b1', 'b2'] + Satisfiers of '(x = x)': + ['b1', 'b2', 'd1', 'g1', 'g2'] + Satisfiers of '(boy(x) | girl(x))': + ['b1', 'b2', 'g1', 'g2'] + Satisfiers of '(boy(x) & girl(x))': + [] + Satisfiers of 'love(adam,x)': + ['g1'] + Satisfiers of 'love(x,adam)': + ['g1', 'g2'] + Satisfiers of '-(x = adam)': + ['b2', 'd1', 'g1', 'g2'] + Satisfiers of 'exists z22.love(x,z22)': + ['b1', 'b2', 'g1', 'g2'] + Satisfiers of 'exists y.love(y,x)': + ['b1', 'g1', 'g2'] + Satisfiers of 'all y.(girl(y) -> love(x,y))': + [] + Satisfiers of 'all y.(girl(y) -> love(y,x))': + ['b1'] + Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))': + ['b1'] + Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))': + [] + Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))': + ['b1'] + Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))': + ['b1'] + Satisfiers of '(girl(x) -> dog(x))': + ['b1', 'b2', 'd1'] + Satisfiers of 'all y.(dog(y) -> (x = y))': + ['d1'] + Satisfiers of '-exists y.love(y,x)': + ['b2', 'd1'] + Satisfiers of 'exists y.(love(adam,y) & love(y,x))': + ['b1'] + + +Tests based on the Blackburn & Bos testsuite +-------------------------------------------- + + >>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), + ... ('honey_bunny', 'd4'), ('yolanda', 'd5'), + ... ('customer', set(['d1', 'd2'])), + ... ('robber', set(['d3', 'd4'])), + ... 
('love', set([('d3', 'd4')]))] + >>> val1 = Valuation(v1) + >>> dom1 = val1.domain + >>> m1 = Model(dom1, val1) + >>> g1 = Assignment(dom1) + + >>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), + ... ('honey_bunny', 'd4'), ('yolanda', 'd4'), + ... ('customer', set(['d1', 'd2', 'd5', 'd6'])), + ... ('robber', set(['d3', 'd4'])), + ... ('love', set([(None, None)]))] + >>> val2 = Valuation(v2) + >>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6']) + >>> m2 = Model(dom2, val2) + >>> g2 = Assignment(dom2) + >>> g21 = Assignment(dom2) + >>> g21.add('y', 'd3') + {'y': 'd3'} + + >>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'), + ... ('vincent', 'd4'), + ... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])), + ... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])), + ... ('in', set([('d5', 'd7'), ('d5', 'd8')])), + ... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))] + >>> val3 = Valuation(v3) + >>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']) + >>> m3 = Model(dom3, val3) + >>> g3 = Assignment(dom3) + + >>> tests = [ + ... ('exists x. robber(x)', m1, g1, True), + ... ('exists x. exists y. love(y, x)', m1, g1, True), + ... ('exists x0. exists x1. love(x1, x0)', m2, g2, False), + ... ('all x. all y. love(y, x)', m2, g2, False), + ... ('- (all x. all y. love(y, x))', m2, g2, True), + ... ('all x. all y. - love(y, x)', m2, g2, True), + ... ('yolanda = honey_bunny', m2, g2, True), + ... ('mia = honey_bunny', m2, g2, 'Undefined'), + ... ('- (yolanda = honey_bunny)', m2, g2, False), + ... ('- (mia = honey_bunny)', m2, g2, 'Undefined'), + ... ('all x. (robber(x) | customer(x))', m2, g2, True), + ... ('- (all x. (robber(x) | customer(x)))', m2, g2, False), + ... ('(robber(x) | customer(x))', m2, g2, 'Undefined'), + ... ('(robber(y) | customer(y))', m2, g21, True), + ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), + ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), + ... ('- exists x. woman(x)', m3, g3, False), + ... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), + ... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), + ... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False), + ... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False), + ... ('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'), + ... ] + + >>> for item in tests: + ... sentence, model, g, testvalue = item + ... semvalue = model.evaluate(sentence, g) + ... if semvalue == testvalue: + ... print('*', end=' ') + ... g.purge() + * * * * * * * * * * * * * * * * * * * * * * + + +Tests for mapping from syntax to semantics +------------------------------------------ + +Load a valuation from a file. + + >>> import nltk.data + >>> from nltk.sem.util import parse_sents + >>> val = nltk.data.load('grammars/sample_grammars/valuation1.val') + >>> dom = val.domain + >>> m = Model(dom, val) + >>> g = Assignment(dom) + >>> gramfile = 'grammars/sample_grammars/sem2.fcfg' + >>> inputs = ['John sees a girl', 'every dog barks'] + >>> parses = parse_sents(inputs, gramfile) + >>> for sent, trees in zip(inputs, parses): + ... print() + ... print("Sentence: %s" % sent) + ... for tree in trees: + ... print("Parse:\n %s" %tree) + ... 
print("Semantics: %s" % root_semrep(tree)) + + Sentence: John sees a girl + Parse: + (S[SEM=] + (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] + (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) + (VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>] + (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) + (NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>] + (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) + (Nom[NUM='sg', SEM=<\x.girl(x)>] + (N[NUM='sg', SEM=<\x.girl(x)>] girl))))) + Semantics: exists x.(girl(x) & see(john,x)) + + Sentence: every dog barks + Parse: + (S[SEM= bark(x))>] + (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] + (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) + (Nom[NUM='sg', SEM=<\x.dog(x)>] + (N[NUM='sg', SEM=<\x.dog(x)>] dog))) + (VP[NUM='sg', SEM=<\x.bark(x)>] + (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) + Semantics: all x.(dog(x) -> bark(x)) + + >>> sent = "every dog barks" + >>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0] + >>> for (syntree, semrep) in result: + ... print(syntree) + ... print() + ... print(semrep) + (S[SEM= bark(x))>] + (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] + (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) + (Nom[NUM='sg', SEM=<\x.dog(x)>] + (N[NUM='sg', SEM=<\x.dog(x)>] dog))) + (VP[NUM='sg', SEM=<\x.bark(x)>] + (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) + + all x.(dog(x) -> bark(x)) + + >>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0] + >>> for (syntree, semrel, value) in result: + ... print(syntree) + ... print() + ... print(semrep) + ... print() + ... print(value) + (S[SEM= bark(x))>] + (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] + (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) + (Nom[NUM='sg', SEM=<\x.dog(x)>] + (N[NUM='sg', SEM=<\x.dog(x)>] dog))) + (VP[NUM='sg', SEM=<\x.bark(x)>] + (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) + + all x.(dog(x) -> bark(x)) + + True + + >>> sents = ['Mary walks', 'John sees a dog'] + >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') + >>> for result in results: + ... for (synrep, semrep) in result: + ... print(synrep) + (S[SEM=] + (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] + (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) + (VP[NUM='sg', SEM=<\x.walk(x)>] + (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) + (S[SEM=] + (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] + (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) + (VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>] + (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) + (NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>] + (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) + (Nom[NUM='sg', SEM=<\x.dog(x)>] + (N[NUM='sg', SEM=<\x.dog(x)>] dog))))) + +Cooper Storage +-------------- + + >>> from nltk.sem import cooper_storage as cs + >>> sentence = 'every girl chases a dog' + >>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg') + >>> semrep = trees[0].label()['SEM'] + >>> cs_semrep = cs.CooperStore(semrep) + >>> print(cs_semrep.core) + chase(z2,z4) + >>> for bo in cs_semrep.store: + ... 
print(bo) + bo(\P.all x.(girl(x) -> P(x)),z2) + bo(\P.exists x.(dog(x) & P(x)),z4) + >>> cs_semrep.s_retrieve(trace=True) + Permutation 1 + (\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4)) + (\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4))) + Permutation 2 + (\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4)) + (\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x))) + + >>> for reading in cs_semrep.readings: + ... print(reading) + exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x))) + all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4))) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiment.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiment.doctest new file mode 100644 index 0000000000000000000000000000000000000000..0f2fbd2b112a7d2d3df22a4012c80bb6197471ea --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiment.doctest @@ -0,0 +1,236 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=================== +Sentiment Analysis +=================== + + >>> from nltk.classify import NaiveBayesClassifier + >>> from nltk.corpus import subjectivity + >>> from nltk.sentiment import SentimentAnalyzer + >>> from nltk.sentiment.util import * + + >>> n_instances = 100 + >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]] + >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] + >>> len(subj_docs), len(obj_docs) + (100, 100) + +Each document is represented by a tuple (sentence, label). The sentence is tokenized, +so it is represented by a list of strings: + + >>> subj_docs[0] + (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', + 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj') + +We separately split subjective and objective instances to keep a balanced uniform +class distribution in both train and test sets. + + >>> train_subj_docs = subj_docs[:80] + >>> test_subj_docs = subj_docs[80:100] + >>> train_obj_docs = obj_docs[:80] + >>> test_obj_docs = obj_docs[80:100] + >>> training_docs = train_subj_docs+train_obj_docs + >>> testing_docs = test_subj_docs+test_obj_docs + + >>> sentim_analyzer = SentimentAnalyzer() + >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs]) + +We use simple unigram word features, handling negation: + + >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) + >>> len(unigram_feats) + 83 + >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) + +We apply features to obtain a feature-value representation of our datasets: + + >>> training_set = sentim_analyzer.apply_features(training_docs) + >>> test_set = sentim_analyzer.apply_features(testing_docs) + +We can now train our classifier on the training set, and subsequently output the +evaluation results: + + >>> trainer = NaiveBayesClassifier.train + >>> classifier = sentim_analyzer.train(trainer, training_set) + Training classifier + >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()): + ... print('{0}: {1}'.format(key, value)) + Evaluating NaiveBayesClassifier results... + Accuracy: 0.8 + F-measure [obj]: 0.8 + F-measure [subj]: 0.8 + Precision [obj]: 0.8 + Precision [subj]: 0.8 + Recall [obj]: 0.8 + Recall [subj]: 0.8 + + +Vader +------ + + >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer + >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example + ... 
"VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted) + ... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted) + ... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled + ... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity + ... "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score + ... "The book was good.", # positive sentence + ... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted) + ... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence + ... "A really bad, horrible book.", # negative sentence with booster words + ... "At least it isn't a horrible book.", # negated negative sentence with contraction + ... ":) and :D", # emoticons handled + ... "", # an empty string is correctly handled + ... "Today sux", # negative slang handled + ... "Today sux!", # negative slang with punctuation emphasis handled + ... "Today SUX!", # negative slang with capitalization emphasis + ... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but" + ... ] + >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \ + ... Unbelievably bad acting!! Poor direction. VERY poor production. \ + ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!" + + >>> from nltk import tokenize + >>> lines_list = tokenize.sent_tokenize(paragraph) + >>> sentences.extend(lines_list) + + >>> tricky_sentences = [ + ... "Most automated sentiment analysis tools are shit.", + ... "VADER sentiment analysis is the shit.", + ... "Sentiment analysis has never been good.", + ... "Sentiment analysis with VADER has never been this good.", + ... "Warren Beatty has never been so entertaining.", + ... "I won't say that the movie is astounding and I wouldn't claim that \ + ... the movie is too banal either.", + ... "I like to hate Michael Bay films, but I couldn't fault this one", + ... "I like to hate Michael Bay films, BUT I couldn't help but fault this one", + ... "It's one thing to watch an Uwe Boll film, but another thing entirely \ + ... to pay for it", + ... "The movie was too good", + ... "This movie was actually neither that funny, nor super witty.", + ... "This movie doesn't care about cleverness, wit or any other kind of \ + ... intelligent humor.", + ... "Those who find ugly meanings in beautiful things are corrupt without \ + ... being charming.", + ... "There are slow and repetitive parts, BUT it has just enough spice to \ + ... keep it interesting.", + ... "The script is not fantastic, but the acting is decent and the cinematography \ + ... is EXCELLENT!", + ... "Roger Dodger is one of the most compelling variations on this theme.", + ... "Roger Dodger is one of the least compelling variations on this theme.", + ... "Roger Dodger is at least compelling as a variation on the theme.", + ... "they fall in love with the product", + ... "but then it breaks", + ... "usually around the time the 90 day warranty expires", + ... "the twin towers collapsed today", + ... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \ + ... under orders and in the ''least offensive way possible.''" + ... 
] + >>> sentences.extend(tricky_sentences) + >>> for sentence in sentences: + ... sid = SentimentIntensityAnalyzer() + ... print(sentence) + ... ss = sid.polarity_scores(sentence) + ... for k in sorted(ss): + ... print('{0}: {1}, '.format(k, ss[k]), end='') + ... print() + VADER is smart, handsome, and funny. + compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746, + VADER is smart, handsome, and funny! + compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752, + VADER is very smart, handsome, and funny. + compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701, + VADER is VERY SMART, handsome, and FUNNY. + compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754, + VADER is VERY SMART, handsome, and FUNNY!!! + compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767, + VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!! + compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706, + The book was good. + compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492, + The book was kind of good. + compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343, + The plot was good, but the characters are uncompelling and the dialog is not great. + compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094, + A really bad, horrible book. + compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0, + At least it isn't a horrible book. + compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363, + :) and :D + compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876, + + compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, + Today sux + compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0, + Today sux! + compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0, + Today SUX! + compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0, + Today kinda sux! But I'll get by, lol + compound: 0.5249, neg: 0.138, neu: 0.517, pos: 0.344, + It was one of the worst movies I've seen, despite good reviews. + compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0, + Unbelievably bad acting!! + compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0, + Poor direction. + compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0, + VERY poor production. + compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0, + The movie was bad. + compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0, + Very bad movie. + compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0, + VERY bad movie. + compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0, + VERY BAD movie. + compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0, + VERY BAD movie! + compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0, + Most automated sentiment analysis tools are shit. + compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0, + VADER sentiment analysis is the shit. + compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444, + Sentiment analysis has never been good. + compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0, + Sentiment analysis with VADER has never been this good. + compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297, + Warren Beatty has never been so entertaining. + compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384, + I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either. 
+ compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149, + I like to hate Michael Bay films, but I couldn't fault this one + compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309, + I like to hate Michael Bay films, BUT I couldn't help but fault this one + compound: -0.1531, neg: 0.277, neu: 0.477, pos: 0.246, + It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it + compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0, + The movie was too good + compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42, + This movie was actually neither that funny, nor super witty. + compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0, + This movie doesn't care about cleverness, wit or any other kind of intelligent humor. + compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239, + Those who find ugly meanings in beautiful things are corrupt without being charming. + compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192, + There are slow and repetitive parts, BUT it has just enough spice to keep it interesting. + compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186, + The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT! + compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301, + Roger Dodger is one of the most compelling variations on this theme. + compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166, + Roger Dodger is one of the least compelling variations on this theme. + compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0, + Roger Dodger is at least compelling as a variation on the theme. + compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16, + they fall in love with the product + compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412, + but then it breaks + compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, + usually around the time the 90 day warranty expires + compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, + the twin towers collapsed today + compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0, + However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.'' + compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074, diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiwordnet.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiwordnet.doctest new file mode 100644 index 0000000000000000000000000000000000000000..02dad5565e051f67f805909c4e2c547271920e00 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/sentiwordnet.doctest @@ -0,0 +1,41 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT
+
+======================
+SentiWordNet Interface
+======================
+
+SentiWordNet can be imported like this:
+
+    >>> from nltk.corpus import sentiwordnet as swn
+
+------------
+SentiSynsets
+------------
+
+    >>> breakdown = swn.senti_synset('breakdown.n.03')
+    >>> print(breakdown)
+    <breakdown.n.03: PosScore=0.0 NegScore=0.25>
+    >>> breakdown.pos_score()
+    0.0
+    >>> breakdown.neg_score()
+    0.25
+    >>> breakdown.obj_score()
+    0.75
+
+
+------
+Lookup
+------
+
+    >>> list(swn.senti_synsets('slow'))
+    [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
+    SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
+    SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
+    SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
+    SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
+    SentiSynset('behind.r.03')]
+
+    >>> happy = swn.senti_synsets('happy', 'a')
+
+    >>> all = swn.all_senti_synsets()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/setup_fixt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/setup_fixt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7f3a27464b1875107354eb01e1fe9467c653539
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/setup_fixt.py
@@ -0,0 +1,26 @@
+from nltk.internals import find_binary, find_jar
+
+
+def check_binary(binary: str, **args):
+    """Skip a test via `pytest.skip` if the `binary` executable is not found.
+    Keyword arguments are passed to `nltk.internals.find_binary`."""
+    import pytest
+
+    try:
+        find_binary(binary, **args)
+    except LookupError:
+        pytest.skip(f"Skipping test because the {binary} binary was not found.")
+
+
+def check_jar(name_pattern: str, **args):
+    """Skip a test via `pytest.skip` if the `name_pattern` jar is not found.
+    Keyword arguments are passed to `nltk.internals.find_jar`.
+
+    TODO: Investigate why the CoreNLP tests that rely on this check_jar failed
+    on the CI. https://github.com/nltk/nltk/pull/3060#issuecomment-1268355108
+    """
+    import pytest
+
+    pytest.skip(
+        "Skipping test because the doctests requiring jars are inconsistent on the CI."
+    )
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/stem.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/stem.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..c257c716fbbadd13b560b3733a563aeff500e291
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/stem.doctest
@@ -0,0 +1,105 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+==========
+ Stemmers
+==========
+
+Overview
+~~~~~~~~
+
+Stemmers remove morphological affixes from words, leaving only the
+word stem.
+
+    >>> from nltk.stem import *
+
+Unit tests for the Porter stemmer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    >>> from nltk.stem.porter import *
+
+Create a new Porter stemmer.
+
+    >>> stemmer = PorterStemmer()
+
+Test the stemmer on various pluralised words.
+
+    >>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
+    ...            'died', 'agreed', 'owned', 'humbled', 'sized',
+    ...            'meeting', 'stating', 'siezing', 'itemization',
+    ...            'sensational', 'traditional', 'reference', 'colonizer',
+    ...            'plotted']
+
+    >>> singles = [stemmer.stem(plural) for plural in plurals]
+
+    >>> print(' '.join(singles))
+    caress fli die mule deni die agre own humbl size meet
+    state siez item sensat tradit refer colon plot
+
+
+Unit tests for Snowball stemmer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    >>> from nltk.stem.snowball import SnowballStemmer
+
+See which languages are supported.
+ + >>> print(" ".join(SnowballStemmer.languages)) + arabic danish dutch english finnish french german hungarian italian + norwegian porter portuguese romanian russian spanish swedish + +Create a new instance of a language specific subclass. + + >>> stemmer = SnowballStemmer("english") + +Stem a word. + + >>> print(stemmer.stem("running")) + run + +Decide not to stem stopwords. + + >>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True) + >>> print(stemmer.stem("having")) + have + >>> print(stemmer2.stem("having")) + having + +The 'english' stemmer is better than the original 'porter' stemmer. + + >>> print(SnowballStemmer("english").stem("generously")) + generous + >>> print(SnowballStemmer("porter").stem("generously")) + gener + +.. note:: + + Extra stemmer tests can be found in `nltk.test.unit.test_stem`. + +Unit tests for ARLSTem Stemmer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + >>> from nltk.stem.arlstem import ARLSTem + +Create a Stemmer instance. + + >>> stemmer = ARLSTem() + +Stem a word. + + >>> stemmer.stem('يعمل') + 'عمل' + +Unit tests for ARLSTem2 Stemmer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + >>> from nltk.stem.arlstem2 import ARLSTem2 + +Create a Stemmer instance. + + >>> stemmer = ARLSTem2() + +Stem a word. + + >>> stemmer.stem('يعمل') + 'عمل' diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/tag.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tag.doctest new file mode 100644 index 0000000000000000000000000000000000000000..36eed718270061aea8d08bbde73cdfbbe887c3b5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tag.doctest @@ -0,0 +1,475 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +Evaluation of Taggers +===================== + +Evaluating the standard NLTK PerceptronTagger using Accuracy, +Precision, Recall and F-measure for each of the tags. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[10:20] + >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS + 0.885931... + + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7647 | 1.0000 | 0.8667 + DT | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.5882 | 0.8333 | 0.6897 + JJR | 1.0000 | 1.0000 | 1.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + RB | 0.8000 | 1.0000 | 0.8889 + RBR | 0.0000 | 0.0000 | 0.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.9231 | 0.8889 + VBG | 1.0000 | 1.0000 | 1.0000 + VBN | 0.8333 | 0.5556 | 0.6667 + VBP | 0.5714 | 0.8000 | 0.6667 + VBZ | 1.0000 | 1.0000 | 1.0000 + WP | 1.0000 | 1.0000 | 1.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + +List only the 10 most common tags: + + >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) + Tag | Prec. 
| Recall | F-measure + -------+--------+--------+----------- + IN | 1.0000 | 1.0000 | 1.0000 + DT | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + CD | 0.7647 | 1.0000 | 0.8667 + VBD | 0.8571 | 0.9231 | 0.8889 + JJ | 0.5882 | 0.8333 | 0.6897 + , | 1.0000 | 1.0000 | 1.0000 + + +Similarly, we can display the confusion matrix for this tagger. + + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O | + | N J J N N P P R V V V V V | + | ' E C C D I J J J N N N O R R B T V B B B B B W ` | + | ' , - . : C D T N J R S N P S S P B R O B D G N P Z P ` | + -------+-------------------------------------------------------------------------------------+ + '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . | + JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | + JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | + VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | + VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | + VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | + VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| + -------+-------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Brill Trainer with evaluation +============================= + + >>> # Perform the relevant imports. + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Pos, Word + >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger + + >>> # Load some data + >>> from nltk.corpus import treebank + >>> training_data = treebank.tagged_sents()[:100] + >>> baseline_data = treebank.tagged_sents()[100:200] + >>> gold_data = treebank.tagged_sents()[200:300] + >>> testing_data = [untag(s) for s in gold_data] + + >>> backoff = RegexpTagger([ + ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'(The|the|A|a|An|an)$', 'AT'), # articles + ... (r'.*able$', 'JJ'), # adjectives + ... 
(r'.*ness$', 'NN'), # nouns formed from adjectives + ... (r'.*ly$', 'RB'), # adverbs + ... (r'.*s$', 'NNS'), # plural nouns + ... (r'.*ing$', 'VBG'), # gerunds + ... (r'.*ed$', 'VBD'), # past tense verbs + ... (r'.*', 'NN') # nouns (default) + ... ]) + +We've now created a simple ``RegexpTagger``, which tags according to the regular expression +rules it has been supplied. This tagger in and of itself does not have a great accuracy. + + >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS + 0.245014... + +Neither does a simple ``UnigramTagger``. This tagger is trained on some data, +and will then first try to match unigrams (i.e. tokens) of the sentence it has +to tag to the learned data. + + >>> unigram_tagger = UnigramTagger(baseline_data) + >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS + 0.581196... + +The lackluster accuracy here can be explained with the following example: + + >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) + [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), + ('to', 'TO'), ('be', 'VB'), ('tagged', None)] + +As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). +The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. + +In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real +baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now +the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` +encounters an OOV token. + + >>> baseline = UnigramTagger(baseline_data, backoff=backoff) + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS + 0.7537647... + +That is already much better. We can investigate the performance further by running +``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* +of each tag. + + >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.9674 | 0.2738 | 0.4269 + NN | 0.4111 | 0.9136 | 0.5670 + IN | 0.9383 | 0.9580 | 0.9480 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7393 | 0.9630 | 0.8365 + -NONE- | 1.0000 | 0.8345 | 0.9098 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6429 | 0.8804 | 0.7431 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.7778 | 0.3684 | 0.5000 + VBN | 0.9375 | 0.3000 | 0.4545 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9643 | 0.6429 | 0.7714 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6471 | 0.5789 | 0.6111 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + +It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. 
+With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see +a similar effect with `"JJ"`. + +We can also see a very expected result: The precision of `"NN"` is low, while the recall +is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and +``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So, +we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` +for many tokens that shouldn't be `"NN"`. + +This method gives us some insight in what parts of the tagger needs more attention, and why. +However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually +tagged as. +To help that, we can create a confusion matrix. + + >>> print(baseline.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . 
. . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | + VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, +we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. +This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be +tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. + +This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses +templates to attempt to improve the performance of the tagger. + + >>> # Set up templates + >>> Template._cleartemplates() #clear any templates created in earlier tests + >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] + + >>> # Construct a BrillTaggerTrainer + >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) + >>> tagger1 = tt.train(training_data, max_rules=10) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) + Finding initial useful rules... + Found 618 useful rules. 
+ + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 13 14 1 4 | NN->VB if Pos:TO@[-1] + 8 8 0 0 | NN->VB if Pos:MD@[-1] + 7 10 3 22 | NN->IN if Pos:NNS@[-1] + 5 5 0 0 | NN->VBP if Pos:PRP@[-1] + 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] + 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] + 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] + 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] + 4 6 2 2 | NN->NNP if Pos:NNP@[-1] + 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] + + >>> tagger1.rules()[1:3] + (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) + + >>> tagger1.print_template_statistics(printunused=False) + TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) + TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 + #ID | Score (train) | #Rules | Template + -------------------------------------------- + 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) + 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) + + + + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS + 0.769230... + + >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.8298 | 0.3600 | 0.5021 + NN | 0.4435 | 0.8364 | 0.5797 + IN | 0.8476 | 0.9580 | 0.8994 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7464 | 0.9630 | 0.8410 + -NONE- | 1.0000 | 0.8414 | 0.9139 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6723 | 0.8696 | 0.7583 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.8103 | 0.8246 | 0.8174 + VBN | 0.9130 | 0.4200 | 0.5753 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9667 | 0.6905 | 0.8056 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6316 | 0.6316 | 0.6316 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + + >>> print(tagger1.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<122> . . . . . . . . 1 . . . . 
22 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | + VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
<10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) + >>> tagged[33][12:] + [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), + ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), + ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] + +Regression Tests +~~~~~~~~~~~~~~~~ + +Sequential Taggers +------------------ + +Add tests for: + - make sure backoff is being done correctly. + - make sure ngram taggers don't use previous sentences for context. + - make sure ngram taggers see 'beginning of the sentence' as a + unique context + - make sure regexp tagger's regexps are tried in order + - train on some simple examples, & make sure that the size & the + generated models are correct. + - make sure cutoff works as intended + - make sure that ngram models only exclude contexts covered by the + backoff tagger if the backoff tagger gets that context correct at + *all* locations. + + +Regression Testing for issue #1025 +================================== + +We want to ensure that a RegexpTagger can be created with more than 100 patterns +and does not fail with: "AssertionError: sorry, but this version only supports 100 named groups" + + >>> from nltk.tag import RegexpTagger + >>> patterns = [(str(i), 'NNP',) for i in range(200)] + >>> tagger = RegexpTagger(patterns) + +Regression Testing for issue #2483 +================================== + +Ensure that tagging with pos_tag (PerceptronTagger) does not throw an IndexError +when attempting tagging an empty string. What it must return instead is not +strictly defined. + + >>> from nltk.tag import pos_tag + >>> pos_tag(['', 'is', 'a', 'beautiful', 'day']) + [...] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/tokenize.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tokenize.doctest new file mode 100644 index 0000000000000000000000000000000000000000..2e70048eebe97817ee950c19010ace8b836fd0cc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tokenize.doctest @@ -0,0 +1,397 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + + >>> from nltk.tokenize import * + +Regression Tests: NLTKWordTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tokenizing some test strings. + + >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." + >>> word_tokenize(s1) + ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'] + >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said." + >>> word_tokenize(s2) + ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] + >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." + >>> word_tokenize(s3) + ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.'] + >>> s4 = "I cannot cannot work under these conditions!" 
+ >>> word_tokenize(s4) + ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'] + >>> s5 = "The company spent $30,000,000 last year." + >>> word_tokenize(s5) + ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'] + >>> s6 = "The company spent 40.75% of its income last year." + >>> word_tokenize(s6) + ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'] + >>> s7 = "He arrived at 3:00 pm." + >>> word_tokenize(s7) + ['He', 'arrived', 'at', '3:00', 'pm', '.'] + >>> s8 = "I bought these items: books, pencils, and pens." + >>> word_tokenize(s8) + ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'] + >>> s9 = "Though there were 150, 100 of them were old." + >>> word_tokenize(s9) + ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'] + >>> s10 = "There were 300,000, but that wasn't enough." + >>> word_tokenize(s10) + ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'] + >>> s11 = "It's more'n enough." + >>> word_tokenize(s11) + ['It', "'s", 'more', "'n", 'enough', '.'] + +Gathering the spans of the tokenized strings. + + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + + >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' + >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), + ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), + ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), + ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), + ... (82, 83), (83, 84)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', + ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', + ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + +Testing improvement made to the TreebankWordTokenizer + + >>> sx1 = '\xabNow that I can do.\xbb' + >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb'] + >>> word_tokenize(sx1) == expected + True + >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.' + >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] + >>> word_tokenize(sx2) == expected + True + + +Testing treebank's detokenizer + + >>> from nltk.tokenize.treebank import TreebankWordDetokenizer + >>> detokenizer = TreebankWordDetokenizer() + >>> s = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." 
+ >>> detokenizer.detokenize(word_tokenize(s)) + 'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.' + >>> s = "\"We beat some pretty good teams to get here,\" Slocum said." + >>> detokenizer.detokenize(word_tokenize(s)) + '"We beat some pretty good teams to get here," Slocum said.' + >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." + >>> detokenizer.detokenize(word_tokenize(s)) + 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' + >>> s = "I cannot cannot work under these conditions!" + >>> detokenizer.detokenize(word_tokenize(s)) + 'I cannot cannot work under these conditions!' + >>> s = "The company spent $30,000,000 last year." + >>> detokenizer.detokenize(word_tokenize(s)) + 'The company spent $30,000,000 last year.' + >>> s = "The company spent 40.75% of its income last year." + >>> detokenizer.detokenize(word_tokenize(s)) + 'The company spent 40.75% of its income last year.' + >>> s = "He arrived at 3:00 pm." + >>> detokenizer.detokenize(word_tokenize(s)) + 'He arrived at 3:00 pm.' + >>> s = "I bought these items: books, pencils, and pens." + >>> detokenizer.detokenize(word_tokenize(s)) + 'I bought these items: books, pencils, and pens.' + >>> s = "Though there were 150, 100 of them were old." + >>> detokenizer.detokenize(word_tokenize(s)) + 'Though there were 150, 100 of them were old.' + >>> s = "There were 300,000, but that wasn't enough." + >>> detokenizer.detokenize(word_tokenize(s)) + "There were 300,000, but that wasn't enough." + >>> s = 'How "are" you?' + >>> detokenizer.detokenize(word_tokenize(s)) + 'How "are" you?' + >>> s = "Hello (world)" + >>> detokenizer.detokenize(word_tokenize(s)) + 'Hello (world)' + >>> s = ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' + >>> detokenizer.detokenize(word_tokenize(s)) + ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' + >>> s = "Sentence ending with (parentheses)" + >>> detokenizer.detokenize(word_tokenize(s)) + 'Sentence ending with (parentheses)' + >>> s = "(Sentence) starting with parentheses." + >>> detokenizer.detokenize(word_tokenize(s)) + '(Sentence) starting with parentheses.' + >>> s = "I've" + >>> detokenizer.detokenize(word_tokenize(s)) + "I've" + >>> s = "Don't" + >>> detokenizer.detokenize(word_tokenize(s)) + "Don't" + >>> s = "I'd" + >>> detokenizer.detokenize(word_tokenize(s)) + "I'd" + + +Sentence tokenization in word_tokenize: + + >>> s11 = "I called Dr. Jones. I called Dr. Jones." + >>> word_tokenize(s11) + ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.'] + >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen " + ... "Kuchen einzukaufen. Ich muss.") + >>> word_tokenize(s12) + ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw', + '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] + >>> word_tokenize(s12, 'german') + ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.', + 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] + + +Regression Tests: Regexp Tokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some additional test strings. + + >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" + ... 
"two of them.\n\nThanks.") + >>> s2 = ("Alas, it has not rained today. When, do you think, " + ... "will it rain again?") + >>> s3 = ("

Although this is not the case here, we must " + ... "not relax our vigilance!

") + + >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False) + [', ', '. ', ', ', ', ', '?'] + >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True) + ['Alas', 'it has not rained today', 'When', 'do you think', + 'will it rain again'] + +Take care to avoid using capturing groups: + + >>> regexp_tokenize(s3, r'', gaps=False) + ['

', '', '', '

'] + >>> regexp_tokenize(s3, r'', gaps=False) + ['

', '', '', '

'] + >>> regexp_tokenize(s3, r'', gaps=True) + ['Although this is ', 'not', + ' the case here, we must not relax our vigilance!'] + +Named groups are capturing groups, and confuse the tokenizer: + + >>> regexp_tokenize(s3, r'b|p)>', gaps=False) + ['p', 'b', 'b', 'p'] + >>> regexp_tokenize(s3, r'b|p)>', gaps=True) + ['p', 'Although this is ', 'b', 'not', 'b', + ' the case here, we must not relax our vigilance!', 'p'] + +Make sure that nested groups don't confuse the tokenizer: + + >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False) + ['las', 'has', 'rai', 'rai'] + >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True) + ['A', ', it ', ' not ', 'ned today. When, do you think, will it ', + 'n again?'] + +Back-references require capturing groups, and these are not supported: + + >>> regexp_tokenize("aabbbcccc", r'(.)\1') + ['a', 'b', 'c', 'c'] + +A simple sentence tokenizer '\.(\s+|$)' + + >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True) + ['Good muffins cost $3.88\nin New York', + 'Please buy me\ntwo of them', 'Thanks'] + + +Regression Tests: TweetTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks. + + >>> from nltk.tokenize import TweetTokenizer + >>> tknzr = TweetTokenizer() + >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" + >>> tknzr.tokenize(s0) + ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] + >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)" + >>> tknzr.tokenize(s1) + ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)'] + >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn" + >>> tknzr.tokenize(s2) + ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn'] + >>> s3 = "@Insanomania They do... Their mentality doesn't :(" + >>> tknzr.tokenize(s3) + ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':('] + >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!" + >>> tknzr.tokenize(s4) + ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!'] + >>> tknzr = TweetTokenizer(reduce_len=True) + >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" + >>> tknzr.tokenize(s5) + ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':('] + +It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. Setting `strip_handles` to True, the tokenizer will remove Twitter handles (e.g. usernames). Setting `reduce_len` to True, repeated character sequences of length 3 or greater will be replaced with sequences of length 3. + + >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) + >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!' + >>> tknzr.tokenize(s6) + [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] + >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.' + >>> tknzr.tokenize(s7) + [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.'] + >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.' 
+ >>> tknzr.tokenize(s8) + ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.'] + +The `preserve_case` parameter (default: True) allows to convert uppercase tokens to lowercase tokens. Emoticons are not affected: + + >>> tknzr = TweetTokenizer(preserve_case=False) + >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" + >>> tknzr.tokenize(s9) + ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P'] + +It should not hang on long sequences of the same punctuation character. + + >>> tknzr = TweetTokenizer() + >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L" + >>> tknzr.tokenize(s10) + ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L'] + +Tokenizing multiple sentences at once: + + >>> tknzr = TweetTokenizer() + >>> sentences = [ + ... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--", + ... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P", + ... "@_willy65: No place for @chuck tonight. Sorry." + ... ] + >>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE + [['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'], + ['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'], + ['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']] + + +Regression Tests: PunktSentenceTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The sentence splitter should remove whitespace following the sentence boundary. + + >>> pst = PunktSentenceTokenizer() + >>> pst.tokenize('See Section 3). Or Section 2). ') + ['See Section 3).', 'Or Section 2).'] + >>> pst.tokenize('See Section 3.) Or Section 2.) ') + ['See Section 3.)', 'Or Section 2.)'] + >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False) + ['See Section 3.', ') Or Section 2.', ')'] + + +Two instances of PunktSentenceTokenizer should not share PunktParameters. + + >>> pst = PunktSentenceTokenizer() + >>> pst2 = PunktSentenceTokenizer() + >>> pst._params is pst2._params + False + +Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067 + + >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer + >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters + >>> pbc = PunktBaseClass(lang_vars=None, params=None) + >>> type(pbc._params) + + >>> type(pbc._lang_vars) + + >>> pt = PunktTrainer(lang_vars=None) + >>> type(pt._lang_vars) + + >>> pst = PunktSentenceTokenizer(lang_vars=None) + >>> type(pst._lang_vars) + + +Testing that inputs can start with dots. + + >>> pst = PunktSentenceTokenizer(lang_vars=None) + >>> pst.tokenize(". This input starts with a dot. 
This used to cause issues.") + ['.', 'This input starts with a dot.', 'This used to cause issues.'] + +Regression Tests: align_tokens +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Post-hoc alignment of tokens with a source string + + >>> from nltk.tokenize.util import align_tokens + >>> list(align_tokens([''], "")) + [(0, 0)] + >>> list(align_tokens([''], " ")) + [(0, 0)] + >>> list(align_tokens([], "")) + [] + >>> list(align_tokens([], " ")) + [] + >>> list(align_tokens(['a'], "a")) + [(0, 1)] + >>> list(align_tokens(['abc', 'def'], "abcdef")) + [(0, 3), (3, 6)] + >>> list(align_tokens(['abc', 'def'], "abc def")) + [(0, 3), (4, 7)] + >>> list(align_tokens(['ab', 'cd'], "ab cd ef")) + [(0, 2), (3, 5)] + >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef")) + [(0, 2), (3, 5), (6, 8)] + >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef")) + Traceback (most recent call last): + .... + ValueError: substring "efg" not found in "ab cd ef" + >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef")) + Traceback (most recent call last): + .... + ValueError: substring "gh" not found in "ab cd ef" + >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday.")) + [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)] + + +Regression Tests: MWETokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pickle an MWETokenizer + + >>> from nltk.tokenize import MWETokenizer + >>> import pickle + + >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') + >>> p = pickle.dumps(tokenizer) + >>> unpickeled = pickle.loads(p) + >>> unpickeled.tokenize("An hors d'oeuvre tonight, sir?".split()) + ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] + + +Regression Tests: TextTilingTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm. 
+ + >>> from nltk.tokenize import TextTilingTokenizer + >>> from nltk.corpus import brown + >>> tt = TextTilingTokenizer() + >>> tt.tokenize(brown.raw()[0:1000]) + ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"] + +Test that `ValueError` exceptions are raised when illegal arguments are used. + + >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000]) + Traceback (most recent call last): + ... + ValueError: Similarity method foo not recognized + >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000]) + Traceback (most recent call last): + ... + ValueError: Smoothing method bar not recognized diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/toolbox.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/toolbox.doctest new file mode 100644 index 0000000000000000000000000000000000000000..53ab65876cd5d52166ac25f8621b8323ebab5beb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/toolbox.doctest @@ -0,0 +1,306 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=============================== +Unit test cases for ``toolbox`` +=============================== + + >>> from nltk import toolbox + +-------------------------- +``toolbox.StandardFormat`` +-------------------------- + + >>> f = toolbox.StandardFormat() + +``toolbox.StandardFormat.open()`` +--------------------------------- + >>> import os, tempfile + >>> (fd, fname) = tempfile.mkstemp() + >>> tf = os.fdopen(fd, "w") + >>> _ = tf.write('\\lx a value\n\\lx another value\n') + >>> tf.close() + >>> f = toolbox.StandardFormat() + >>> f.open(fname) + >>> list(f.fields()) + [('lx', 'a value'), ('lx', 'another value')] + >>> f.close() + >>> os.unlink(fname) + +``toolbox.StandardFormat.open_string()`` +---------------------------------------- + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\\lx another value\n') + >>> list(f.fields()) + [('lx', 'a value'), ('lx', 'another value')] + >>> f.close() + +``toolbox.StandardFormat.close()`` +---------------------------------- + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\\lx another value\n') + >>> list(f.fields()) + [('lx', 'a value'), ('lx', 'another value')] + >>> f.close() + +``toolbox.StandardFormat.line_num`` +--------------------------------------- + +``StandardFormat.line_num`` contains the line number of the last line returned: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n') + >>> line_nums = [] + >>> for l in f.raw_fields(): + ... 
line_nums.append(f.line_num) + >>> line_nums + [1, 2, 3] + +``StandardFormat.line_num`` contains the line number of the last line returned: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') + >>> line_nums = [] + >>> for l in f.raw_fields(): + ... line_nums.append(f.line_num) + >>> line_nums + [2, 5, 7] + +``StandardFormat.line_num`` doesn't exist before opening or after closing +a file or string: + + >>> f = toolbox.StandardFormat() + >>> f.line_num + Traceback (most recent call last): + ... + AttributeError: 'StandardFormat' object has no attribute 'line_num' + >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') + >>> line_nums = [] + >>> for l in f.raw_fields(): + ... line_nums.append(f.line_num) + >>> line_nums + [2, 5, 7] + >>> f.close() + >>> f.line_num + Traceback (most recent call last): + ... + AttributeError: 'StandardFormat' object has no attribute 'line_num' + +``toolbox.StandardFormat.raw_fields()`` +--------------------------------------- +``raw_fields()`` returns an iterator over tuples of two strings representing the +marker and its value. The marker is given without the backslash and the value +without its trailing newline: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\\lx another value\n') + >>> list(f.raw_fields()) + [('lx', 'a value'), ('lx', 'another value')] + +an empty file returns nothing: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('') + >>> list(f.raw_fields()) + [] + +file with only a newline returns WHAT SHOULD IT RETURN???: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\n') + >>> list(f.raw_fields()) + [(None, '')] + +file with only one field should be parsed ok: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx one value\n') + >>> list(f.raw_fields()) + [('lx', 'one value')] + +file without a trailing newline should be parsed ok: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\\lx another value') + >>> list(f.raw_fields()) + [('lx', 'a value'), ('lx', 'another value')] + +trailing white space is preserved except for the final newline: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') + >>> list(f.raw_fields()) + [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')] + +line wrapping is preserved: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') + >>> list(f.raw_fields()) + [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] + +file beginning with a multiline record should be parsed ok: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') + >>> list(f.raw_fields()) + [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] + +file ending with a multiline record should be parsed ok: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n') + >>> list(f.raw_fields()) + [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')] + +file beginning with a BOM should be parsed ok: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n') + >>> list(f.raw_fields()) + [('lx', 'a value'), ('lx', 'another value')] + +file beginning with two BOMs should 
ignore only the first one: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n') + >>> list(f.raw_fields()) + [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')] + +should not ignore a BOM not at the beginning of the file: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n') + >>> list(f.raw_fields()) + [('lx', 'a value\n\xef\xbb\xbf\\lx another value')] + +``toolbox.StandardFormat.fields()`` +----------------------------------- +trailing white space is not preserved: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') + >>> list(f.fields()) + [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')] + +multiline fields are unwrapped: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') + >>> list(f.fields()) + [('lx', 'a value more of the value and still more'), ('lc', 'another val')] + +markers +------- +A backslash in the first position on a new line indicates the start of a +marker. The backslash is not part of the marker: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\mk a value\n') + >>> list(f.fields()) + [('mk', 'a value')] + +If the backslash occurs later in the line it does not indicate the start +of a marker: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\mk a value\n \\mk another one\n') + >>> list(f.raw_fields()) + [('mk', 'a value\n \\mk another one')] + +There is no specific limit to the length of a marker: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\this_is_an_extremely_long_marker value\n') + >>> list(f.fields()) + [('this_is_an_extremely_long_marker', 'value')] + +A marker can contain any non white space character: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789 value\n') + >>> list(f.fields()) + [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')] + +A marker is terminated by any white space character: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one') + >>> list(f.fields()) + [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')] + +Consecutive whitespace characters (except newline) are treated the same as one: + + >>> f = toolbox.StandardFormat() + >>> f.open_string('\\mk \t\r\fa value\n') + >>> list(f.fields()) + [('mk', 'a value')] + +----------------------- +``toolbox.ToolboxData`` +----------------------- + + >>> db = toolbox.ToolboxData() + +``toolbox.ToolboxData.parse()`` +------------------------------- +check that normal parsing works: + + >>> from xml.etree import ElementTree + >>> td = toolbox.ToolboxData() + >>> s = """\\_sh v3.0 400 Rotokas Dictionary + ... \\_DateStampHasFourDigitYear + ... + ... \\lx kaa + ... \\ps V.A + ... \\ge gag + ... \\gp nek i pas + ... + ... \\lx kaa + ... \\ps V.B + ... \\ge strangle + ... \\gp pasim nek + ... """ + >>> td.open_string(s) + >>> tree = td.parse(key='lx') + >>> tree.tag + 'toolbox_data' + >>> ElementTree.tostring(list(tree)[0]).decode('utf8') + '
<_sh>v3.0 400 Rotokas Dictionary<_DateStampHasFourDigitYear />
' + >>> ElementTree.tostring(list(tree)[1]).decode('utf8') + 'kaaV.Agagnek i pas' + >>> ElementTree.tostring(list(tree)[2]).decode('utf8') + 'kaaV.Bstranglepasim nek' + +check that guessing the key marker works: + + >>> from xml.etree import ElementTree + >>> td = toolbox.ToolboxData() + >>> s = """\\_sh v3.0 400 Rotokas Dictionary + ... \\_DateStampHasFourDigitYear + ... + ... \\lx kaa + ... \\ps V.A + ... \\ge gag + ... \\gp nek i pas + ... + ... \\lx kaa + ... \\ps V.B + ... \\ge strangle + ... \\gp pasim nek + ... """ + >>> td.open_string(s) + >>> tree = td.parse() + >>> ElementTree.tostring(list(tree)[0]).decode('utf8') + '
<_sh>v3.0 400 Rotokas Dictionary<_DateStampHasFourDigitYear />
' + >>> ElementTree.tostring(list(tree)[1]).decode('utf8') + 'kaaV.Agagnek i pas' + >>> ElementTree.tostring(list(tree)[2]).decode('utf8') + 'kaaV.Bstranglepasim nek' + +----------------------- +``toolbox`` functions +----------------------- + +``toolbox.to_sfm_string()`` +------------------------------- diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/translate.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/translate.doctest new file mode 100644 index 0000000000000000000000000000000000000000..c1fd5bbc1380c7907152e79f2d956a53273244e5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/translate.doctest @@ -0,0 +1,240 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +.. -*- coding: utf-8 -*- + +========= +Alignment +========= + +Corpus Reader +------------- + + >>> from nltk.corpus import comtrans + >>> words = comtrans.words('alignment-en-fr.txt') + >>> for word in words[:6]: + ... print(word) + Resumption + of + the + session + I + declare + >>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0] + >>> als + AlignedSent(['Resumption', 'of', 'the', 'session'], + ['Reprise', 'de', 'la', 'session'], + Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) + + +Alignment Objects +----------------- + +Aligned sentences are simply a mapping between words in a sentence: + + >>> print(" ".join(als.words)) + Resumption of the session + >>> print(" ".join(als.mots)) + Reprise de la session + >>> als.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]) + + +Usually we look at them from the perspective of a source to a target language, +but they are easily inverted: + + >>> als.invert() + AlignedSent(['Reprise', 'de', 'la', 'session'], + ['Resumption', 'of', 'the', 'session'], + Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) + + +We can create new alignments, but these need to be in the correct range of +the corresponding sentences: + + >>> from nltk.translate import Alignment, AlignedSent + >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'], + ... ['Resumption', 'of', 'the', 'session'], + ... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])) + Traceback (most recent call last): + ... + IndexError: Alignment is outside boundary of mots + + +You can set alignments with any sequence of tuples, so long as the first two +indexes of the tuple are the alignment indices: + + >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) + + >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) + Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))]) + + +Alignment Algorithms +-------------------- + +EM for IBM Model 1 +~~~~~~~~~~~~~~~~~~ + +Here is an example from Koehn, 2010: + + >>> from nltk.translate import IBMModel1 + >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']), + ... AlignedSent(['the', 'book'], ['das', 'Buch']), + ... 
AlignedSent(['a', 'book'], ['ein', 'Buch'])] + >>> em_ibm1 = IBMModel1(corpus, 20) + >>> print(round(em_ibm1.translation_table['the']['das'], 1)) + 1.0 + >>> print(round(em_ibm1.translation_table['book']['das'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['house']['das'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['the']['Buch'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['book']['Buch'], 1)) + 1.0 + >>> print(round(em_ibm1.translation_table['a']['Buch'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['book']['ein'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['a']['ein'], 1)) + 1.0 + >>> print(round(em_ibm1.translation_table['the']['Haus'], 1)) + 0.0 + >>> print(round(em_ibm1.translation_table['house']['Haus'], 1)) + 1.0 + >>> print(round(em_ibm1.translation_table['book'][None], 1)) + 0.5 + +And using an NLTK corpus. We train on only 10 sentences, since it is so slow: + + >>> from nltk.corpus import comtrans + >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20) + >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1)) + 0.2 + >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1)) + 1.0 + + +Evaluation +---------- +The evaluation metrics for alignments are usually not interested in the +contents of alignments but more often the comparison to a "gold standard" +alignment that has been been constructed by human experts. For this reason we +often want to work just with raw set operations against the alignment points. +This then gives us a very clean form for defining our evaluation metrics. + +.. Note:: + The AlignedSent class has no distinction of "possible" or "sure" + alignments. Thus all alignments are treated as "sure". + +Consider the following aligned sentence for evaluation: + + >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'], + ... ['Reprise', 'de', 'la', 'session'], + ... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])) + +Precision +~~~~~~~~~ +``precision = |A∩P| / |A|`` + +**Precision** is probably the most well known evaluation metric and it is implemented +in `nltk.metrics.scores.precision`_. Since precision is simply interested in the +proportion of correct alignments, we calculate the ratio of the number of our +test alignments (*A*) that match a possible alignment (*P*), over the number of +test alignments provided. There is no penalty for missing a possible alignment +in our test alignments. An easy way to game this metric is to provide just one +test alignment that is in *P* [OCH2000]_. + +Here are some examples: + + >>> from nltk.metrics import precision + >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)]) + >>> precision(Alignment([]), als.alignment) + 0.0 + >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) + 1.0 + >>> precision(Alignment([(0,0), (3,3)]), als.alignment) + 0.5 + >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment) + 0.5 + >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) + 1.0 + >>> precision(als.alignment, my_als.alignment) + 0.6 + + +.. _nltk.metrics.scores.precision: + https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision + + +Recall +~~~~~~ +``recall = |A∩S| / |S|`` + +**Recall** is another well known evaluation metric that has a set based +implementation in NLTK as `nltk.metrics.scores.recall`_. 
Since recall is +simply interested in the proportion of found alignments, we calculate the +ratio of the number of our test alignments (*A*) that match a sure alignment +(*S*) over the number of sure alignments. There is no penalty for producing +a lot of test alignments. An easy way to game this metric is to include every +possible alignment in our test alignments, regardless if they are correct or +not [OCH2000]_. + +Here are some examples: + + >>> from nltk.metrics import recall + >>> print(recall(Alignment([]), als.alignment)) + None + >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) + 1.0 + >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment) + 1.0 + >>> recall(Alignment([(0,0), (3,3)]), als.alignment) + 1.0 + >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) + 0.66666... + >>> recall(als.alignment, my_als.alignment) + 0.75 + + +.. _nltk.metrics.scores.recall: + https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall + + +Alignment Error Rate (AER) +~~~~~~~~~~~~~~~~~~~~~~~~~~ +``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)`` + +**Alignment Error Rate** is commonly used metric for assessing sentence +alignments. It combines precision and recall metrics together such that a +perfect alignment must have all of the sure alignments and may have some +possible alignments [MIHALCEA2003]_ [KOEHN2010]_. + +.. Note:: + [KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)`` + in his book, but corrects it to the above in his online errata. This is + in line with [MIHALCEA2003]_. + +Here are some examples: + + >>> from nltk.translate import alignment_error_rate + >>> alignment_error_rate(Alignment([]), als.alignment) + 1.0 + >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) + 0.0 + >>> alignment_error_rate(als.alignment, my_als.alignment) + 0.333333... + >>> alignment_error_rate(als.alignment, my_als.alignment, + ... als.alignment | Alignment([(1,2), (2,1)])) + 0.222222... + + +.. [OCH2000] Och, F. and Ney, H. (2000) + *Statistical Machine Translation*, EAMT Workshop + +.. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003) + *An evaluation exercise for word alignment*, HLT-NAACL 2003 + +.. [KOEHN2010] Koehn, P. (2010) + *Statistical Machine Translation*, Cambridge University Press diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/tree.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tree.doctest new file mode 100644 index 0000000000000000000000000000000000000000..100a0020cfda23611cc2e88af1bce8e8b25cfc0f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/tree.doctest @@ -0,0 +1,1223 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +=============================== + Unit tests for nltk.tree.Tree +=============================== + + >>> from nltk.tree import * + +Some trees to run tests on: + + >>> dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])]) + >>> dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])]) + >>> vp = Tree('vp', [Tree('v', ['chased']), dp2]) + >>> tree = Tree('s', [dp1, vp]) + >>> print(tree) + (s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat)))) + +The node label is accessed using the `label()` method: + + >>> dp1.label(), dp2.label(), vp.label(), tree.label() + ('dp', 'dp', 'vp', 's') + + >>> print(tree[1,1,1,0]) + cat + +The `treepositions` method returns a list of the tree positions of +subtrees and leaves in a tree. 
By default, it gives the position of +every tree, subtree, and leaf, in prefix order: + + >>> print(tree.treepositions()) + [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0), (1, 1, 0, 0), (1, 1, 1), (1, 1, 1, 0)] + +In addition to `str` and `repr`, several methods exist to convert a +tree object to one of several standard tree encodings: + + >>> print(tree.pformat_latex_qtree()) + \Tree [.s + [.dp [.d the ] [.np dog ] ] + [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ] + +There is also a fancy ASCII art representation: + + >>> tree.pretty_print() + s + ________|_____ + | vp + | _____|___ + dp | dp + ___|___ | ___|___ + d np v d np + | | | | | + the dog chased the cat + + >>> tree.pretty_print(unicodelines=True, nodedist=4) + s + ┌──────────────┴────────┐ + │ vp + │ ┌────────┴──────┐ + dp │ dp + ┌──────┴──────┐ │ ┌──────┴──────┐ + d np v d np + │ │ │ │ │ + the dog chased the cat + +Trees can be initialized from treebank strings: + + >>> tree2 = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') + >>> print(tree2) + (S (NP I) (VP (V enjoyed) (NP my cookie))) + +Trees can be compared for equality: + + >>> tree == Tree.fromstring(str(tree)) + True + >>> tree2 == Tree.fromstring(str(tree2)) + True + >>> tree == tree2 + False + >>> tree == Tree.fromstring(str(tree2)) + False + >>> tree2 == Tree.fromstring(str(tree)) + False + + >>> tree != Tree.fromstring(str(tree)) + False + >>> tree2 != Tree.fromstring(str(tree2)) + False + >>> tree != tree2 + True + >>> tree != Tree.fromstring(str(tree2)) + True + >>> tree2 != Tree.fromstring(str(tree)) + True + + >>> tree < tree2 or tree > tree2 + True + +Tree Parsing +============ + +The class method `Tree.fromstring()` can be used to parse trees, and it +provides some additional options. + + >>> tree = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') + >>> print(tree) + (S (NP I) (VP (V enjoyed) (NP my cookie))) + +When called on a subclass of `Tree`, it will create trees of that +type: + + >>> tree = ImmutableTree.fromstring('(VP (V enjoyed) (NP my cookie))') + >>> print(tree) + (VP (V enjoyed) (NP my cookie)) + >>> print(type(tree)) + + >>> tree[1] = 'x' + Traceback (most recent call last): + . . . + ValueError: ImmutableTree may not be modified + >>> del tree[0] + Traceback (most recent call last): + . . . + ValueError: ImmutableTree may not be modified + +The ``brackets`` parameter can be used to specify two characters that +should be used as brackets: + + >>> print(Tree.fromstring('[S [NP I] [VP [V enjoyed] [NP my cookie]]]', + ... brackets='[]')) + (S (NP I) (VP (V enjoyed) (NP my cookie))) + >>> print(Tree.fromstring(' >>', + ... brackets='<>')) + (S (NP I) (VP (V enjoyed) (NP my cookie))) + +If ``brackets`` is not a string, or is not exactly two characters, +then `Tree.fromstring` raises an exception: + + >>> Tree.fromstring(' >', brackets='') + Traceback (most recent call last): + . . . + TypeError: brackets must be a length-2 string + >>> Tree.fromstring(' >', brackets='<<>>') + Traceback (most recent call last): + . . . + TypeError: brackets must be a length-2 string + >>> Tree.fromstring(' >', brackets=12) + Traceback (most recent call last): + . . . + TypeError: brackets must be a length-2 string + >>> Tree.fromstring('<>', brackets=('<<','>>')) + Traceback (most recent call last): + . . . + TypeError: brackets must be a length-2 string + +(We may add support for multi-character brackets in the future, in +which case the ``brackets=('<<','>>')`` example would start working.) 
+ +Whitespace brackets are not permitted: + + >>> Tree.fromstring('(NP my cookie\n', brackets='(\n') + Traceback (most recent call last): + . . . + TypeError: whitespace brackets not allowed + +If an invalid tree is given to Tree.fromstring, then it raises a +ValueError, with a description of the problem: + + >>> Tree.fromstring('(NP my cookie) (NP my milk)') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected 'end-of-string' but got '(NP' + at index 15. + "...y cookie) (NP my mil..." + ^ + >>> Tree.fromstring(')NP my cookie(') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected '(' but got ')' + at index 0. + ")NP my coo..." + ^ + >>> Tree.fromstring('(NP my cookie))') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected 'end-of-string' but got ')' + at index 14. + "...my cookie))" + ^ + >>> Tree.fromstring('my cookie)') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected '(' but got 'my' + at index 0. + "my cookie)" + ^ + >>> Tree.fromstring('(NP my cookie') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected ')' but got 'end-of-string' + at index 13. + "... my cookie" + ^ + >>> Tree.fromstring('') + Traceback (most recent call last): + . . . + ValueError: Tree.fromstring(): expected '(' but got 'end-of-string' + at index 0. + "" + ^ + +Trees with no children are supported: + + >>> print(Tree.fromstring('(S)')) + (S ) + >>> print(Tree.fromstring('(X (Y) (Z))')) + (X (Y ) (Z )) + +Trees with an empty node label and no children are supported: + + >>> print(Tree.fromstring('()')) + ( ) + >>> print(Tree.fromstring('(X () ())')) + (X ( ) ( )) + +Trees with an empty node label and children are supported, but only if the +first child is not a leaf (otherwise, it will be treated as the node label). + + >>> print(Tree.fromstring('((A) (B) (C))')) + ( (A ) (B ) (C )) + >>> print(Tree.fromstring('((A) leaf)')) + ( (A ) leaf) + >>> print(Tree.fromstring('(((())))')) + ( ( ( ( )))) + +The optional arguments `read_node` and `read_leaf` may be used to +transform the string values of nodes or leaves. + + >>> print(Tree.fromstring('(A b (C d e) (F (G h i)))', + ... read_node=lambda s: '<%s>' % s, + ... read_leaf=lambda s: '"%s"' % s)) + (
"b" ( "d" "e") ( ( "h" "i"))) + +These transformation functions are typically used when the node or +leaf labels should be parsed to a non-string value (such as a feature +structure). If node and leaf labels need to be able to include +whitespace, then you must also use the optional `node_pattern` and +`leaf_pattern` arguments. + + >>> from nltk.featstruct import FeatStruct + >>> tree = Tree.fromstring('([cat=NP] [lex=the] [lex=dog])', + ... read_node=FeatStruct, read_leaf=FeatStruct) + >>> tree.set_label(tree.label().unify(FeatStruct('[num=singular]'))) + >>> print(tree) + ([cat='NP', num='singular'] [lex='the'] [lex='dog']) + +The optional argument ``remove_empty_top_bracketing`` can be used to +remove any top-level empty bracketing that occurs. + + >>> print(Tree.fromstring('((S (NP I) (VP (V enjoyed) (NP my cookie))))', + ... remove_empty_top_bracketing=True)) + (S (NP I) (VP (V enjoyed) (NP my cookie))) + +It will not remove a top-level empty bracketing with multiple children: + + >>> print(Tree.fromstring('((A a) (B b))')) + ( (A a) (B b)) + + +Tree.fromlist() +--------------- +The class method `Tree.fromlist()` can be used to parse trees +that are expressed as nested lists, such as those produced by +the tree() function from the wordnet module. + + >>> from nltk.corpus import wordnet as wn + >>> t=Tree.fromlist(wn.synset('dog.n.01').tree(lambda s:s.hypernyms())) + >>> print(t.height()) + 14 + >>> print(t.leaves()) + ["Synset('entity.n.01')", "Synset('entity.n.01')"] + >>> t.pretty_print() + Synset('dog.n.01') + _________________|__________________ + Synset('canine.n. | + 02') | + | | + Synset('carnivor | + e.n.01') | + | | + Synset('placenta | + l.n.01') | + | | + Synset('mammal.n. | + 01') | + | | + Synset('vertebra | + te.n.01') | + | | + Synset('chordate. Synset('domestic + n.01') _animal.n.01') + | | + Synset('animal.n. Synset('animal.n. + 01') 01') + | | + Synset('organism. Synset('organism. + n.01') n.01') + | | + Synset('living_t Synset('living_t + hing.n.01') hing.n.01') + | | + Synset('whole.n. Synset('whole.n. + 02') 02') + | | + Synset('object.n. Synset('object.n. + 01') 01') + | | + Synset('physical Synset('physical + _entity.n.01') _entity.n.01') + | | + Synset('entity.n. Synset('entity.n. + 01') 01') + + + +Parented Trees +============== +`ParentedTree` is a subclass of `Tree` that automatically maintains +parent pointers for single-parented trees. Parented trees can be +created directly from a node label and a list of children: + + >>> ptree = ( + ... ParentedTree('VP', [ + ... ParentedTree('VERB', ['saw']), + ... ParentedTree('NP', [ + ... ParentedTree('DET', ['the']), + ... ParentedTree('NOUN', ['dog'])])])) + >>> print(ptree) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + +Parented trees can be created from strings using the classmethod +`ParentedTree.fromstring`: + + >>> ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') + >>> print(ptree) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + >>> print(type(ptree)) + + +Parented trees can also be created by using the classmethod +`ParentedTree.convert` to convert another type of tree to a parented +tree: + + >>> tree = Tree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') + >>> ptree = ParentedTree.convert(tree) + >>> print(ptree) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + >>> print(type(ptree)) + + +.. clean-up: + + >>> del tree + +`ParentedTree`\ s should never be used in the same tree as `Tree`\ s +or `MultiParentedTree`\ s. 
Mixing tree implementations may result in +incorrect parent pointers and in `TypeError` exceptions: + + >>> # Inserting a Tree in a ParentedTree gives an exception: + >>> ParentedTree('NP', [ + ... Tree('DET', ['the']), Tree('NOUN', ['dog'])]) + Traceback (most recent call last): + . . . + TypeError: Can not insert a non-ParentedTree into a ParentedTree + + >>> # inserting a ParentedTree in a Tree gives incorrect parent pointers: + >>> broken_tree = Tree('NP', [ + ... ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])]) + >>> print(broken_tree[0].parent()) + None + +Parented Tree Methods +------------------------ +In addition to all the methods defined by the `Tree` class, the +`ParentedTree` class adds six new methods whose values are +automatically updated whenever a parented tree is modified: `parent()`, +`parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and +`treeposition()`. + +The `parent()` method contains a `ParentedTree`\ 's parent, if it has +one; and ``None`` otherwise. `ParentedTree`\ s that do not have +parents are known as "root trees." + + >>> for subtree in ptree.subtrees(): + ... print(subtree) + ... print(' Parent = %s' % subtree.parent()) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + Parent = None + (VERB saw) + Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (NP (DET the) (NOUN dog)) + Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (DET the) + Parent = (NP (DET the) (NOUN dog)) + (NOUN dog) + Parent = (NP (DET the) (NOUN dog)) + +The `parent_index()` method stores the index of a tree in its parent's +child list. If a tree does not have a parent, then its `parent_index` +is ``None``. + + >>> for subtree in ptree.subtrees(): + ... print(subtree) + ... print(' Parent Index = %s' % subtree.parent_index()) + ... assert (subtree.parent() is None or + ... subtree.parent()[subtree.parent_index()] is subtree) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + Parent Index = None + (VERB saw) + Parent Index = 0 + (NP (DET the) (NOUN dog)) + Parent Index = 1 + (DET the) + Parent Index = 0 + (NOUN dog) + Parent Index = 1 + +Note that ``ptree.parent().index(ptree)`` is *not* equivalent to +``ptree.parent_index()``. In particular, ``ptree.parent().index(ptree)`` +will return the index of the first child of ``ptree.parent()`` that is +equal to ``ptree`` (using ``==``); and that child may not be +``ptree``: + + >>> on_and_on = ParentedTree('CONJP', [ + ... ParentedTree('PREP', ['on']), + ... ParentedTree('COJN', ['and']), + ... ParentedTree('PREP', ['on'])]) + >>> second_on = on_and_on[2] + >>> print(second_on.parent_index()) + 2 + >>> print(second_on.parent().index(second_on)) + 0 + +The methods `left_sibling()` and `right_sibling()` can be used to get a +parented tree's siblings. If a tree does not have a left or right +sibling, then the corresponding method's value is ``None``: + + >>> for subtree in ptree.subtrees(): + ... print(subtree) + ... print(' Left Sibling = %s' % subtree.left_sibling()) + ... print(' Right Sibling = %s' % subtree.right_sibling()) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + Left Sibling = None + Right Sibling = None + (VERB saw) + Left Sibling = None + Right Sibling = (NP (DET the) (NOUN dog)) + (NP (DET the) (NOUN dog)) + Left Sibling = (VERB saw) + Right Sibling = None + (DET the) + Left Sibling = None + Right Sibling = (NOUN dog) + (NOUN dog) + Left Sibling = (DET the) + Right Sibling = None + +A parented tree's root tree can be accessed using the `root()` +method. 
This method follows the tree's parent pointers until it +finds a tree without a parent. If a tree does not have a parent, then +it is its own root: + + >>> for subtree in ptree.subtrees(): + ... print(subtree) + ... print(' Root = %s' % subtree.root()) + (VP (VERB saw) (NP (DET the) (NOUN dog))) + Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (VERB saw) + Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (NP (DET the) (NOUN dog)) + Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (DET the) + Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) + (NOUN dog) + Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) + +The `treeposition()` method can be used to find a tree's treeposition +relative to its root: + + >>> for subtree in ptree.subtrees(): + ... print(subtree) + ... print(' Tree Position = %s' % (subtree.treeposition(),)) + ... assert subtree.root()[subtree.treeposition()] is subtree + (VP (VERB saw) (NP (DET the) (NOUN dog))) + Tree Position = () + (VERB saw) + Tree Position = (0,) + (NP (DET the) (NOUN dog)) + Tree Position = (1,) + (DET the) + Tree Position = (1, 0) + (NOUN dog) + Tree Position = (1, 1) + +Whenever a parented tree is modified, all of the methods described +above (`parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, +`root()`, and `treeposition()`) are automatically updated. For example, +if we replace ``ptree``\ 's subtree for the word "dog" with a new +subtree for "cat," the method values for both the "dog" subtree and the +"cat" subtree get automatically updated: + + >>> # Replace the dog with a cat + >>> dog = ptree[1,1] + >>> cat = ParentedTree('NOUN', ['cat']) + >>> ptree[1,1] = cat + + >>> # the noun phrase is no longer the dog's parent: + >>> print(dog.parent(), dog.parent_index(), dog.left_sibling()) + None None None + >>> # dog is now its own root. + >>> print(dog.root()) + (NOUN dog) + >>> print(dog.treeposition()) + () + + >>> # the cat's parent is now the noun phrase: + >>> print(cat.parent()) + (NP (DET the) (NOUN cat)) + >>> print(cat.parent_index()) + 1 + >>> print(cat.left_sibling()) + (DET the) + >>> print(cat.root()) + (VP (VERB saw) (NP (DET the) (NOUN cat))) + >>> print(cat.treeposition()) + (1, 1) + +ParentedTree Regression Tests +----------------------------- +Keep track of all trees that we create (including subtrees) using this +variable: + + >>> all_ptrees = [] + +Define a helper function to create new parented trees: + + >>> def make_ptree(s): + ... ptree = ParentedTree.convert(Tree.fromstring(s)) + ... all_ptrees.extend(t for t in ptree.subtrees() + ... if isinstance(t, Tree)) + ... return ptree + +Define a test function that examines every subtree in all_ptrees; and +checks that all six of its methods are defined correctly. If any +ptrees are passed as arguments, then they are printed. + + >>> def pcheck(*print_ptrees): + ... for ptree in all_ptrees: + ... # Check ptree's methods. + ... if ptree.parent() is not None: + ... i = ptree.parent_index() + ... assert ptree.parent()[i] is ptree + ... if i > 0: + ... assert ptree.left_sibling() is ptree.parent()[i-1] + ... if i < (len(ptree.parent())-1): + ... assert ptree.right_sibling() is ptree.parent()[i+1] + ... assert len(ptree.treeposition()) > 0 + ... assert (ptree.treeposition() == + ... ptree.parent().treeposition() + (ptree.parent_index(),)) + ... assert ptree.root() is not ptree + ... assert ptree.root() is not None + ... assert ptree.root() is ptree.parent().root() + ... assert ptree.root()[ptree.treeposition()] is ptree + ... else: + ... 
assert ptree.parent_index() is None + ... assert ptree.left_sibling() is None + ... assert ptree.right_sibling() is None + ... assert ptree.root() is ptree + ... assert ptree.treeposition() == () + ... # Check ptree's children's methods: + ... for i, child in enumerate(ptree): + ... if isinstance(child, Tree): + ... # pcheck parent() & parent_index() methods + ... assert child.parent() is ptree + ... assert child.parent_index() == i + ... # pcheck sibling methods + ... if i == 0: + ... assert child.left_sibling() is None + ... else: + ... assert child.left_sibling() is ptree[i-1] + ... if i == len(ptree)-1: + ... assert child.right_sibling() is None + ... else: + ... assert child.right_sibling() is ptree[i+1] + ... if print_ptrees: + ... print('ok!', end=' ') + ... for ptree in print_ptrees: print(ptree) + ... else: + ... print('ok!') + +Run our test function on a variety of newly-created trees: + + >>> pcheck(make_ptree('(A)')) + ok! (A ) + >>> pcheck(make_ptree('(A (B (C (D) (E f)) g) h)')) + ok! (A (B (C (D ) (E f)) g) h) + >>> pcheck(make_ptree('(A (B) (C c) (D d d) (E e e e))')) + ok! (A (B ) (C c) (D d d) (E e e e)) + >>> pcheck(make_ptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) + ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) + +Run our test function after performing various tree-modification +operations: + +**__delitem__()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> e = ptree[0,0,1] + >>> del ptree[0,0,1]; pcheck(ptree); pcheck(e) + ok! (A (B (C (D ) (Q p)) g) h) + ok! (E f) + >>> del ptree[0,0,0]; pcheck(ptree) + ok! (A (B (C (Q p)) g) h) + >>> del ptree[0,1]; pcheck(ptree) + ok! (A (B (C (Q p))) h) + >>> del ptree[-1]; pcheck(ptree) + ok! (A (B (C (Q p)))) + >>> del ptree[-100] + Traceback (most recent call last): + . . . + IndexError: index out of range + >>> del ptree[()] + Traceback (most recent call last): + . . . + IndexError: The tree position () may not be deleted. + + >>> # With slices: + >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') + >>> b = ptree[0] + >>> del ptree[0:0]; pcheck(ptree) + ok! (A (B c) (D e) f g (H i) j (K l)) + >>> del ptree[:1]; pcheck(ptree); pcheck(b) + ok! (A (D e) f g (H i) j (K l)) + ok! (B c) + >>> del ptree[-2:]; pcheck(ptree) + ok! (A (D e) f g (H i)) + >>> del ptree[1:3]; pcheck(ptree) + ok! (A (D e) (H i)) + >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') + >>> del ptree[5:1000]; pcheck(ptree) + ok! (A (B c) (D e) f g (H i)) + >>> del ptree[-2:1000]; pcheck(ptree) + ok! (A (B c) (D e) f) + >>> del ptree[-100:1]; pcheck(ptree) + ok! (A (D e) f) + >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') + >>> del ptree[1:-2:2]; pcheck(ptree) + ok! (A (B c) f (H i) j (K l)) + +**__setitem__()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> d, e, q = ptree[0,0] + >>> ptree[0,0,0] = 'x'; pcheck(ptree); pcheck(d) + ok! (A (B (C x (E f) (Q p)) g) h) + ok! (D ) + >>> ptree[0,0,1] = make_ptree('(X (Y z))'); pcheck(ptree); pcheck(e) + ok! (A (B (C x (X (Y z)) (Q p)) g) h) + ok! (E f) + >>> ptree[1] = d; pcheck(ptree) + ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) + >>> ptree[-1] = 'x'; pcheck(ptree) + ok! (A (B (C x (X (Y z)) (Q p)) g) x) + >>> ptree[-100] = 'y' + Traceback (most recent call last): + . . . + IndexError: index out of range + >>> ptree[()] = make_ptree('(X y)') + Traceback (most recent call last): + . . . + IndexError: The tree position () may not be assigned to. 
+ + >>> # With slices: + >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') + >>> b = ptree[0] + >>> ptree[0:0] = ('x', make_ptree('(Y)')); pcheck(ptree) + ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) + >>> ptree[2:6] = (); pcheck(ptree); pcheck(b) + ok! (A x (Y ) (H i) j (K l)) + ok! (B c) + >>> ptree[-2:] = ('z', 'p'); pcheck(ptree) + ok! (A x (Y ) (H i) z p) + >>> ptree[1:3] = [make_ptree('(X)') for x in range(10)]; pcheck(ptree) + ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) + >>> ptree[5:1000] = []; pcheck(ptree) + ok! (A x (X ) (X ) (X ) (X )) + >>> ptree[-2:1000] = ['n']; pcheck(ptree) + ok! (A x (X ) (X ) n) + >>> ptree[-100:1] = [make_ptree('(U v)')]; pcheck(ptree) + ok! (A (U v) (X ) (X ) n) + >>> ptree[-1:] = (make_ptree('(X)') for x in range(3)); pcheck(ptree) + ok! (A (U v) (X ) (X ) (X ) (X ) (X )) + >>> ptree[1:-2:2] = ['x', 'y']; pcheck(ptree) + ok! (A (U v) x (X ) y (X ) (X )) + +**append()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> ptree.append('x'); pcheck(ptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x) + >>> ptree.append(make_ptree('(X (Y z))')); pcheck(ptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) + +**extend()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> ptree.extend(['x', 'y', make_ptree('(X (Y z))')]); pcheck(ptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) + >>> ptree.extend([]); pcheck(ptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) + >>> ptree.extend(make_ptree('(X)') for x in range(3)); pcheck(ptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) + +**insert()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> ptree.insert(0, make_ptree('(X (Y z))')); pcheck(ptree) + ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) + >>> ptree.insert(-1, make_ptree('(X (Y z))')); pcheck(ptree) + ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) + >>> ptree.insert(-4, make_ptree('(X (Y z))')); pcheck(ptree) + ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) + >>> # Note: as with ``list``, inserting at a negative index that + >>> # gives a position before the start of the list does *not* + >>> # raise an IndexError exception; it just inserts at 0. + >>> ptree.insert(-400, make_ptree('(X (Y z))')); pcheck(ptree) + ok! (A + (X (Y z)) + (X (Y z)) + (X (Y z)) + (B (C (D ) (E f) (Q p)) g) + (X (Y z)) + h) + +**pop()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> ptree[0,0].pop(1); pcheck(ptree) + ParentedTree('E', ['f']) + ok! (A (B (C (D ) (Q p)) g) h) + >>> ptree[0].pop(-1); pcheck(ptree) + 'g' + ok! (A (B (C (D ) (Q p))) h) + >>> ptree.pop(); pcheck(ptree) + 'h' + ok! (A (B (C (D ) (Q p)))) + >>> ptree.pop(-100) + Traceback (most recent call last): + . . . + IndexError: index out of range + +**remove()** + + >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> e = ptree[0,0,1] + >>> ptree[0,0].remove(ptree[0,0,1]); pcheck(ptree); pcheck(e) + ok! (A (B (C (D ) (Q p)) g) h) + ok! (E f) + >>> ptree[0,0].remove(make_ptree('(Q p)')); pcheck(ptree) + ok! (A (B (C (D )) g) h) + >>> ptree[0,0].remove(make_ptree('(Q p)')) + Traceback (most recent call last): + . . . + ValueError: ParentedTree('Q', ['p']) is not in list + >>> ptree.remove('h'); pcheck(ptree) + ok! (A (B (C (D )) g)) + >>> ptree.remove('h'); + Traceback (most recent call last): + . . . 
+ ValueError: 'h' is not in list + >>> # remove() removes the first subtree that is equal (==) to the + >>> # given tree, which may not be the identical tree we give it: + >>> ptree = make_ptree('(A (X x) (Y y) (X x))') + >>> x1, y, x2 = ptree + >>> ptree.remove(ptree[-1]); pcheck(ptree) + ok! (A (Y y) (X x)) + >>> print(x1.parent()); pcheck(x1) + None + ok! (X x) + >>> print(x2.parent()) + (A (Y y) (X x)) + +Test that a tree can not be given multiple parents: + + >>> ptree = make_ptree('(A (X x) (Y y) (Z z))') + >>> ptree[0] = ptree[1] + Traceback (most recent call last): + . . . + ValueError: Can not insert a subtree that already has a parent. + >>> pcheck() + ok! + +[more to be written] + +Shallow copying can be tricky for Tree and several of its subclasses. +For shallow copies of Tree, only the root node is reconstructed, while +all the children are shared between the two trees. Modify the children +of one tree - and the shallowly copied tree will also update. + + >>> from nltk.tree import Tree, ParentedTree, MultiParentedTree + >>> tree = Tree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") + >>> copy_tree = tree.copy(deep=False) + >>> tree == copy_tree # Ensure identical labels and nodes + True + >>> id(copy_tree[0]) == id(tree[0]) # Ensure shallow copy - the children are the same objects in memory + True + +For ParentedTree objects, this behaviour is not possible. With a shallow +copy, the children of the root node would be reused for both the original +and the shallow copy. For this to be possible, some children would need +to have multiple parents. As this is forbidden for ParentedTree objects, +attempting to make a shallow copy will cause a warning, and a deep copy +is made instead. + + >>> ptree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") + >>> copy_ptree = ptree.copy(deep=False) + >>> copy_ptree == ptree # Ensure identical labels and nodes + True + >>> id(copy_ptree[0]) != id(ptree[0]) # Shallow copying isn't supported - it defaults to deep copy. + True + +For MultiParentedTree objects, the issue of only allowing one parent that +can be seen for ParentedTree objects is no more. Shallow copying a +MultiParentedTree gives the children of the root node two parents: +the original and the newly copied root. + + >>> mptree = MultiParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") + >>> len(mptree[0].parents()) + 1 + >>> copy_mptree = mptree.copy(deep=False) + >>> copy_mptree == mptree # Ensure identical labels and nodes + True + >>> len(mptree[0].parents()) + 2 + >>> len(copy_mptree[0].parents()) + 2 + +Shallow copying a MultiParentedTree is similar to creating a second root +which is identically labeled as the root on which the copy method was called. + + +ImmutableParentedTree Regression Tests +-------------------------------------- + + >>> iptree = ImmutableParentedTree.convert(ptree) + >>> type(iptree) + + >>> del iptree[0] + Traceback (most recent call last): + . . . 
+ ValueError: ImmutableParentedTree may not be modified + >>> iptree.set_label('newnode') + Traceback (most recent call last): + . . . + ValueError: ImmutableParentedTree may not be modified + + +MultiParentedTree Regression Tests +---------------------------------- +Keep track of all trees that we create (including subtrees) using this +variable: + + >>> all_mptrees = [] + +Define a helper function to create new parented trees: + + >>> def make_mptree(s): + ... mptree = MultiParentedTree.convert(Tree.fromstring(s)) + ... all_mptrees.extend(t for t in mptree.subtrees() + ... if isinstance(t, Tree)) + ... return mptree + +Define a test function that examines every subtree in all_mptrees; and +checks that all six of its methods are defined correctly. If any +mptrees are passed as arguments, then they are printed. + + >>> def mpcheck(*print_mptrees): + ... def has(seq, val): # uses identity comparison + ... for item in seq: + ... if item is val: return True + ... return False + ... for mptree in all_mptrees: + ... # Check mptree's methods. + ... if len(mptree.parents()) == 0: + ... assert len(mptree.left_siblings()) == 0 + ... assert len(mptree.right_siblings()) == 0 + ... assert len(mptree.roots()) == 1 + ... assert mptree.roots()[0] is mptree + ... assert mptree.treepositions(mptree) == [()] + ... left_siblings = right_siblings = () + ... roots = {id(mptree): 1} + ... else: + ... roots = dict((id(r), 0) for r in mptree.roots()) + ... left_siblings = mptree.left_siblings() + ... right_siblings = mptree.right_siblings() + ... for parent in mptree.parents(): + ... for i in mptree.parent_indices(parent): + ... assert parent[i] is mptree + ... # check left siblings + ... if i > 0: + ... for j in range(len(left_siblings)): + ... if left_siblings[j] is parent[i-1]: + ... del left_siblings[j] + ... break + ... else: + ... assert 0, 'sibling not found!' + ... # check ight siblings + ... if i < (len(parent)-1): + ... for j in range(len(right_siblings)): + ... if right_siblings[j] is parent[i+1]: + ... del right_siblings[j] + ... break + ... else: + ... assert 0, 'sibling not found!' + ... # check roots + ... for root in parent.roots(): + ... assert id(root) in roots, 'missing root' + ... roots[id(root)] += 1 + ... # check that we don't have any unexplained values + ... assert len(left_siblings)==0, 'unexpected sibling' + ... assert len(right_siblings)==0, 'unexpected sibling' + ... for v in roots.values(): assert v>0, roots #'unexpected root' + ... # check treepositions + ... for root in mptree.roots(): + ... for treepos in mptree.treepositions(root): + ... assert root[treepos] is mptree + ... # Check mptree's children's methods: + ... for i, child in enumerate(mptree): + ... if isinstance(child, Tree): + ... # mpcheck parent() & parent_index() methods + ... assert has(child.parents(), mptree) + ... assert i in child.parent_indices(mptree) + ... # mpcheck sibling methods + ... if i > 0: + ... assert has(child.left_siblings(), mptree[i-1]) + ... if i < len(mptree)-1: + ... assert has(child.right_siblings(), mptree[i+1]) + ... if print_mptrees: + ... print('ok!', end=' ') + ... for mptree in print_mptrees: print(mptree) + ... else: + ... print('ok!') + +Run our test function on a variety of newly-created trees: + + >>> mpcheck(make_mptree('(A)')) + ok! (A ) + >>> mpcheck(make_mptree('(A (B (C (D) (E f)) g) h)')) + ok! (A (B (C (D ) (E f)) g) h) + >>> mpcheck(make_mptree('(A (B) (C c) (D d d) (E e e e))')) + ok! 
(A (B ) (C c) (D d d) (E e e e)) + >>> mpcheck(make_mptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) + ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) + >>> subtree = make_mptree('(A (B (C (D) (E f)) g) h)') + +Including some trees that contain multiple parents: + + >>> mpcheck(MultiParentedTree('Z', [subtree, subtree])) + ok! (Z (A (B (C (D ) (E f)) g) h) (A (B (C (D ) (E f)) g) h)) + +Run our test function after performing various tree-modification +operations (n.b., these are the same tests that we ran for +`ParentedTree`, above; thus, none of these trees actually *uses* +multiple parents.) + +**__delitem__()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> e = mptree[0,0,1] + >>> del mptree[0,0,1]; mpcheck(mptree); mpcheck(e) + ok! (A (B (C (D ) (Q p)) g) h) + ok! (E f) + >>> del mptree[0,0,0]; mpcheck(mptree) + ok! (A (B (C (Q p)) g) h) + >>> del mptree[0,1]; mpcheck(mptree) + ok! (A (B (C (Q p))) h) + >>> del mptree[-1]; mpcheck(mptree) + ok! (A (B (C (Q p)))) + >>> del mptree[-100] + Traceback (most recent call last): + . . . + IndexError: index out of range + >>> del mptree[()] + Traceback (most recent call last): + . . . + IndexError: The tree position () may not be deleted. + + >>> # With slices: + >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') + >>> b = mptree[0] + >>> del mptree[0:0]; mpcheck(mptree) + ok! (A (B c) (D e) f g (H i) j (K l)) + >>> del mptree[:1]; mpcheck(mptree); mpcheck(b) + ok! (A (D e) f g (H i) j (K l)) + ok! (B c) + >>> del mptree[-2:]; mpcheck(mptree) + ok! (A (D e) f g (H i)) + >>> del mptree[1:3]; mpcheck(mptree) + ok! (A (D e) (H i)) + >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') + >>> del mptree[5:1000]; mpcheck(mptree) + ok! (A (B c) (D e) f g (H i)) + >>> del mptree[-2:1000]; mpcheck(mptree) + ok! (A (B c) (D e) f) + >>> del mptree[-100:1]; mpcheck(mptree) + ok! (A (D e) f) + >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') + >>> del mptree[1:-2:2]; mpcheck(mptree) + ok! (A (B c) f (H i) j (K l)) + +**__setitem__()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> d, e, q = mptree[0,0] + >>> mptree[0,0,0] = 'x'; mpcheck(mptree); mpcheck(d) + ok! (A (B (C x (E f) (Q p)) g) h) + ok! (D ) + >>> mptree[0,0,1] = make_mptree('(X (Y z))'); mpcheck(mptree); mpcheck(e) + ok! (A (B (C x (X (Y z)) (Q p)) g) h) + ok! (E f) + >>> mptree[1] = d; mpcheck(mptree) + ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) + >>> mptree[-1] = 'x'; mpcheck(mptree) + ok! (A (B (C x (X (Y z)) (Q p)) g) x) + >>> mptree[-100] = 'y' + Traceback (most recent call last): + . . . + IndexError: index out of range + >>> mptree[()] = make_mptree('(X y)') + Traceback (most recent call last): + . . . + IndexError: The tree position () may not be assigned to. + + >>> # With slices: + >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') + >>> b = mptree[0] + >>> mptree[0:0] = ('x', make_mptree('(Y)')); mpcheck(mptree) + ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) + >>> mptree[2:6] = (); mpcheck(mptree); mpcheck(b) + ok! (A x (Y ) (H i) j (K l)) + ok! (B c) + >>> mptree[-2:] = ('z', 'p'); mpcheck(mptree) + ok! (A x (Y ) (H i) z p) + >>> mptree[1:3] = [make_mptree('(X)') for x in range(10)]; mpcheck(mptree) + ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) + >>> mptree[5:1000] = []; mpcheck(mptree) + ok! (A x (X ) (X ) (X ) (X )) + >>> mptree[-2:1000] = ['n']; mpcheck(mptree) + ok! (A x (X ) (X ) n) + >>> mptree[-100:1] = [make_mptree('(U v)')]; mpcheck(mptree) + ok! 
(A (U v) (X ) (X ) n) + >>> mptree[-1:] = (make_mptree('(X)') for x in range(3)); mpcheck(mptree) + ok! (A (U v) (X ) (X ) (X ) (X ) (X )) + >>> mptree[1:-2:2] = ['x', 'y']; mpcheck(mptree) + ok! (A (U v) x (X ) y (X ) (X )) + +**append()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> mptree.append('x'); mpcheck(mptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x) + >>> mptree.append(make_mptree('(X (Y z))')); mpcheck(mptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) + +**extend()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> mptree.extend(['x', 'y', make_mptree('(X (Y z))')]); mpcheck(mptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) + >>> mptree.extend([]); mpcheck(mptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) + >>> mptree.extend(make_mptree('(X)') for x in range(3)); mpcheck(mptree) + ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) + +**insert()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> mptree.insert(0, make_mptree('(X (Y z))')); mpcheck(mptree) + ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) + >>> mptree.insert(-1, make_mptree('(X (Y z))')); mpcheck(mptree) + ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) + >>> mptree.insert(-4, make_mptree('(X (Y z))')); mpcheck(mptree) + ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) + >>> # Note: as with ``list``, inserting at a negative index that + >>> # gives a position before the start of the list does *not* + >>> # raise an IndexError exception; it just inserts at 0. + >>> mptree.insert(-400, make_mptree('(X (Y z))')); mpcheck(mptree) + ok! (A + (X (Y z)) + (X (Y z)) + (X (Y z)) + (B (C (D ) (E f) (Q p)) g) + (X (Y z)) + h) + +**pop()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> mptree[0,0].pop(1); mpcheck(mptree) + MultiParentedTree('E', ['f']) + ok! (A (B (C (D ) (Q p)) g) h) + >>> mptree[0].pop(-1); mpcheck(mptree) + 'g' + ok! (A (B (C (D ) (Q p))) h) + >>> mptree.pop(); mpcheck(mptree) + 'h' + ok! (A (B (C (D ) (Q p)))) + >>> mptree.pop(-100) + Traceback (most recent call last): + . . . + IndexError: index out of range + +**remove()** + + >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') + >>> e = mptree[0,0,1] + >>> mptree[0,0].remove(mptree[0,0,1]); mpcheck(mptree); mpcheck(e) + ok! (A (B (C (D ) (Q p)) g) h) + ok! (E f) + >>> mptree[0,0].remove(make_mptree('(Q p)')); mpcheck(mptree) + ok! (A (B (C (D )) g) h) + >>> mptree[0,0].remove(make_mptree('(Q p)')) + Traceback (most recent call last): + . . . + ValueError: MultiParentedTree('Q', ['p']) is not in list + >>> mptree.remove('h'); mpcheck(mptree) + ok! (A (B (C (D )) g)) + >>> mptree.remove('h'); + Traceback (most recent call last): + . . . + ValueError: 'h' is not in list + >>> # remove() removes the first subtree that is equal (==) to the + >>> # given tree, which may not be the identical tree we give it: + >>> mptree = make_mptree('(A (X x) (Y y) (X x))') + >>> x1, y, x2 = mptree + >>> mptree.remove(mptree[-1]); mpcheck(mptree) + ok! (A (Y y) (X x)) + >>> print([str(p) for p in x1.parents()]) + [] + >>> print([str(p) for p in x2.parents()]) + ['(A (Y y) (X x))'] + + +ImmutableMultiParentedTree Regression Tests +------------------------------------------- + + >>> imptree = ImmutableMultiParentedTree.convert(mptree) + >>> type(imptree) + + >>> del imptree[0] + Traceback (most recent call last): + . . . 
+ ValueError: ImmutableMultiParentedTree may not be modified + >>> imptree.set_label('newnode') + Traceback (most recent call last): + . . . + ValueError: ImmutableMultiParentedTree may not be modified + + +ProbabilisticTree Regression Tests +---------------------------------- + + >>> prtree = ProbabilisticTree("S", [ProbabilisticTree("NP", ["N"], prob=0.3)], prob=0.6) + >>> print(prtree) + (S (NP N)) (p=0.6) + >>> import copy + >>> prtree == copy.deepcopy(prtree) == prtree.copy(deep=True) == prtree.copy() + True + >>> prtree[0] is prtree.copy()[0] + True + >>> prtree[0] is prtree.copy(deep=True)[0] + False + + >>> imprtree = ImmutableProbabilisticTree.convert(prtree) + >>> type(imprtree) + + >>> del imprtree[0] + Traceback (most recent call last): + . . . + ValueError: ImmutableProbabilisticTree may not be modified + >>> imprtree.set_label('newnode') + Traceback (most recent call last): + . . . + ValueError: ImmutableProbabilisticTree may not be modified + + +Squashed Bugs +============= + +This used to discard the ``(B b)`` subtree (fixed in svn 6270): + + >>> print(Tree.fromstring('((A a) (B b))')) + ( (A a) (B b)) + +Pickling ParentedTree instances didn't work for Python 3.7 onwards (See #2478) + + >>> import pickle + >>> tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))') + >>> print(tree) + (S (NN x) (NP x) (NN x)) + + >>> pickled = pickle.dumps(tree) + >>> tree_loaded = pickle.loads(pickled) + >>> print(tree_loaded) + (S (NN x) (NP x) (NN x)) + +ParentedTree used to be impossible to (deep)copy. (See #1324) + + >>> from nltk.tree import ParentedTree + >>> import copy + >>> tree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") + >>> tree == copy.deepcopy(tree) == copy.copy(tree) == tree.copy(deep=True) == tree.copy() + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/treeprettyprinter.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/treeprettyprinter.doctest new file mode 100644 index 0000000000000000000000000000000000000000..aeb879f0e21b696909e5fcff84fde7a5bfd3be27 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/treeprettyprinter.doctest @@ -0,0 +1,177 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +========================================================= + Unit tests for nltk.tree.prettyprinter.TreePrettyPrinter +========================================================= + + >>> from nltk.tree import Tree, TreePrettyPrinter + +Tree nr 2170 from nltk.corpus.treebank: + + >>> tree = Tree.fromstring( + ... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) ' + ... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. .))') + >>> tpp = TreePrettyPrinter(tree) + >>> print(tpp.text()) + S + __________________________|_____________________ + | VP | + | ____________________|___________ | + | | | PP-CLR | + | | | _____|_____ | + NP-SBJ | ADJP-PRD | NP | + | | _______|______ | | | + PRP VBP RB JJ IN PRP . + | | | | | | | + I feel pretty good about it . + + >>> print(tpp.text(unicodelines=True)) + S + ┌──────────────────────────┼─────────────────────┐ + │ VP │ + │ ┌─────────────┬──────┴───────────┐ │ + │ │ │ PP-CLR │ + │ │ │ ┌─────┴─────┐ │ + NP-SBJ │ ADJP-PRD │ NP │ + │ │ ┌───────┴──────┐ │ │ │ + PRP VBP RB JJ IN PRP . + │ │ │ │ │ │ │ + I feel pretty good about it . + +A tree with long labels: + + >>> tree = Tree.fromstring( + ... 
'(sentence (plural-noun-phrase (plural-noun Superconductors)) ' + ... '(verb-phrase (plural-verb conduct) ' + ... '(noun-phrase (singular-noun electricity))))') + >>> tpp = TreePrettyPrinter(tree) + >>> print(tpp.text(abbreviate=8, nodedist=2)) + sentence + __________|__________ + | verb-phr. + | __________|__________ + plural-n. | noun-phr. + | | | + plural-n. plural-v. singular. + | | | + Supercon. conduct electric. + + >>> print(tpp.text(maxwidth=8, nodedist=2)) + sentence + _________|________ + | verb- + | phrase + | ________|_________ + plural- | noun- + noun- | phrase + phrase | | + | | | + plural- plural- singular- + noun verb noun + | | | + Supercon conduct electric + ductors ity + +A discontinuous tree: + + >>> tree = Tree.fromstring( + ... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) ' + ... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) ' + ... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int) + >>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,' + ... ' zwemmen of terrassen .'.split()) + >>> tpp = TreePrettyPrinter(tree, sentence) + >>> print(tpp.text()) + top + _____|______________________________________________ + smain | | + _______________________________|_____ | | + | | inf | | + | | _____|____ | | + | | | inf | | + | | | ____|_____ | | + | | | | conj | | + | | _____ | ___ | _________|______ | __________________ | + | | inf | | | | | | | + | | _________|_____ | ___ | _________ | | | | | + | | pp | | | | | | | | + | | ____|____ | | | | | | | | + | | | np | | | | inf | inf | + | | | ____|____ | | | | | | | | + noun verb prep det noun verb verb verb punct verb vg verb punct + | | | | | | | | | | | | | + Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . + + >>> print(tpp.text(unicodelines=True)) + top + ┌─────┴──────────────────┬───────────────────────────┐ + smain │ │ + ┌────┬──────────────────────────┴─────┐ │ │ + │ │ inf │ │ + │ │ ┌─────┴────┐ │ │ + │ │ │ inf │ │ + │ │ │ ┌────┴─────┐ │ │ + │ │ │ │ conj │ │ + │ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┐ │ + │ │ inf │ │ │ │ │ │ │ + │ │ ┌─────────┴───── │ ─── │ ─────────┐ │ │ │ │ │ + │ │ pp │ │ │ │ │ │ │ │ + │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ + │ │ │ np │ │ │ │ inf │ inf │ + │ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ + noun verb prep det noun verb verb verb punct verb vg verb punct + │ │ │ │ │ │ │ │ │ │ │ │ │ + Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . 
+ +Importing TreePrettyPrinter +--------------------------- + +First of all, a simple tree will be constructed:: + + >>> from nltk.tree import Tree + >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') + +We'll use this sample tree to show that the method of importing `TreePrettyPrinter` work correctly: + +- Recommended:: + + >>> from nltk.tree import TreePrettyPrinter + >>> print(TreePrettyPrinter(tree).text()) + S + ____|____ + NP VP + | | + Mary walks + +- Alternative but valid options:: + + >>> from nltk import TreePrettyPrinter + >>> print(TreePrettyPrinter(tree).text()) + S + ____|____ + NP VP + | | + Mary walks + + >>> from nltk.tree.prettyprinter import TreePrettyPrinter + >>> print(TreePrettyPrinter(tree).text()) + S + ____|____ + NP VP + | | + Mary walks + +- Deprecated, do not use:: + + >>> from nltk.treeprettyprinter import TreePrettyPrinter + >>> print(TreePrettyPrinter(tree).text()) + S + ____|____ + NP VP + | | + Mary walks + + This method will throw a DeprecationWarning:: + + Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead. diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/treetransforms.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/treetransforms.doctest new file mode 100644 index 0000000000000000000000000000000000000000..7efe8b7a0bf947175c269fadc399ff7c58828c30 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/treetransforms.doctest @@ -0,0 +1,154 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +------------------------------------------- +Unit tests for the TreeTransformation class +------------------------------------------- + + >>> from copy import deepcopy + >>> from nltk.tree import Tree, collapse_unary, chomsky_normal_form, un_chomsky_normal_form + + >>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))" + + >>> tree = Tree.fromstring(tree_string) + >>> print(tree) + (TOP + (S + (S + (VP + (VBN Turned) + (ADVP (RB loose)) + (PP + (IN in) + (NP + (NP (NNP Shane) (NNP Longman) (POS 's)) + (NN trading) + (NN room))))) + (, ,) + (NP (DT the) (NN yuppie) (NNS dealers)) + (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) + (. .))) + +Make a copy of the original tree and collapse the subtrees with only one child + + >>> collapsedTree = deepcopy(tree) + >>> collapse_unary(collapsedTree) + >>> print(collapsedTree) + (TOP + (S + (S+VP + (VBN Turned) + (ADVP (RB loose)) + (PP + (IN in) + (NP + (NP (NNP Shane) (NNP Longman) (POS 's)) + (NN trading) + (NN room)))) + (, ,) + (NP (DT the) (NN yuppie) (NNS dealers)) + (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) + (. .))) + + >>> collapsedTree2 = deepcopy(tree) + >>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True) + >>> print(collapsedTree2) + (TOP+S + (S+VP + (VBN Turned) + (ADVP+RB loose) + (PP + (IN in) + (NP + (NP (NNP Shane) (NNP Longman) (POS 's)) + (NN trading) + (NN room)))) + (, ,) + (NP (DT the) (NN yuppie) (NNS dealers)) + (VP (AUX do) (NP (NP+RB little) (ADJP+RB right))) + (. .)) + +Convert the tree to Chomsky Normal Form i.e. each subtree has either two +subtree children or a single leaf value. This conversion can be performed +using either left- or right-factoring. 
+ + >>> cnfTree = deepcopy(collapsedTree) + >>> chomsky_normal_form(cnfTree, factor='left') + >>> print(cnfTree) + (TOP + (S + (S| + (S| + (S| + (S+VP + (S+VP| (VBN Turned) (ADVP (RB loose))) + (PP + (IN in) + (NP + (NP| + (NP + (NP| (NNP Shane) (NNP Longman)) + (POS 's)) + (NN trading)) + (NN room)))) + (, ,)) + (NP (NP| (DT the) (NN yuppie)) (NNS dealers))) + (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))) + (. .))) + + >>> cnfTree = deepcopy(collapsedTree) + >>> chomsky_normal_form(cnfTree, factor='right') + >>> print(cnfTree) + (TOP + (S + (S+VP + (VBN Turned) + (S+VP| + (ADVP (RB loose)) + (PP + (IN in) + (NP + (NP (NNP Shane) (NP| (NNP Longman) (POS 's))) + (NP| (NN trading) (NN room)))))) + (S|<,-NP-VP-.> + (, ,) + (S| + (NP (DT the) (NP| (NN yuppie) (NNS dealers))) + (S| + (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) + (. .)))))) + +Employ some Markov smoothing to make the artificial node labels a bit more +readable. See the treetransforms.py documentation for more details. + + >>> markovTree = deepcopy(collapsedTree) + >>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1) + >>> print(markovTree) + (TOP + (S^ + (S+VP^ + (VBN Turned) + (S+VP|^ + (ADVP^ (RB loose)) + (PP^ + (IN in) + (NP^ + (NP^ + (NNP Shane) + (NP|^ (NNP Longman) (POS 's))) + (NP|^ (NN trading) (NN room)))))) + (S|<,-NP>^ + (, ,) + (S|^ + (NP^ (DT the) (NP|^ (NN yuppie) (NNS dealers))) + (S|^ + (VP^ + (AUX do) + (NP^ (NP^ (RB little)) (ADJP^ (RB right)))) + (. .)))))) + +Convert the transformed tree back to its original form + + >>> un_chomsky_normal_form(markovTree) + >>> tree == markovTree + True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_models.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_models.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc70222f0ca837161b0982f1a123ab89891234d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_models.py @@ -0,0 +1,610 @@ +# Natural Language Toolkit: Language Model Unit Tests +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ilia Kurenkov +# URL: +# For license information, see LICENSE.TXT +import math +from operator import itemgetter + +import pytest + +from nltk.lm import ( + MLE, + AbsoluteDiscountingInterpolated, + KneserNeyInterpolated, + Laplace, + Lidstone, + StupidBackoff, + Vocabulary, + WittenBellInterpolated, +) +from nltk.lm.preprocessing import padded_everygrams + + +@pytest.fixture(scope="session") +def vocabulary(): + return Vocabulary(["a", "b", "c", "d", "z", "", ""], unk_cutoff=1) + + +@pytest.fixture(scope="session") +def training_data(): + return [["a", "b", "c", "d"], ["e", "g", "a", "d", "b", "e"]] + + +@pytest.fixture(scope="session") +def bigram_training_data(training_data): + return [list(padded_everygrams(2, sent)) for sent in training_data] + + +@pytest.fixture(scope="session") +def trigram_training_data(training_data): + return [list(padded_everygrams(3, sent)) for sent in training_data] + + +@pytest.fixture +def mle_bigram_model(vocabulary, bigram_training_data): + model = MLE(2, vocabulary=vocabulary) + model.fit(bigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + ("d", ["c"], 1), + # Unseen ngrams should yield 0 + ("d", ["e"], 0), + # Unigrams should also be 0 + 
("z", None, 0), + # N unigrams = 14 + # count('a') = 2 + ("a", None, 2.0 / 14), + # count('y') = 3 + ("y", None, 3.0 / 14), + ], +) +def test_mle_bigram_scores(mle_bigram_model, word, context, expected_score): + assert pytest.approx(mle_bigram_model.score(word, context), 1e-4) == expected_score + + +def test_mle_bigram_logscore_for_zero_score(mle_bigram_model): + assert math.isinf(mle_bigram_model.logscore("d", ["e"])) + + +def test_mle_bigram_entropy_perplexity_seen(mle_bigram_model): + # ngrams seen during training + trained = [ + ("", "a"), + ("a", "b"), + ("b", ""), + ("", "a"), + ("a", "d"), + ("d", ""), + ] + # Ngram = Log score + # , a = -1 + # a, b = -1 + # b, UNK = -1 + # UNK, a = -1.585 + # a, d = -1 + # d, = -1 + # TOTAL logscores = -6.585 + # - AVG logscores = 1.0975 + H = 1.0975 + perplexity = 2.1398 + assert pytest.approx(mle_bigram_model.entropy(trained), 1e-4) == H + assert pytest.approx(mle_bigram_model.perplexity(trained), 1e-4) == perplexity + + +def test_mle_bigram_entropy_perplexity_unseen(mle_bigram_model): + # In MLE, even one unseen ngram should make entropy and perplexity infinite + untrained = [("", "a"), ("a", "c"), ("c", "d"), ("d", "")] + + assert math.isinf(mle_bigram_model.entropy(untrained)) + assert math.isinf(mle_bigram_model.perplexity(untrained)) + + +def test_mle_bigram_entropy_perplexity_unigrams(mle_bigram_model): + # word = score, log score + # = 0.1429, -2.8074 + # a = 0.1429, -2.8074 + # c = 0.0714, -3.8073 + # UNK = 0.2143, -2.2224 + # d = 0.1429, -2.8074 + # c = 0.0714, -3.8073 + # = 0.1429, -2.8074 + # TOTAL logscores = -21.6243 + # - AVG logscores = 3.0095 + H = 3.0095 + perplexity = 8.0529 + + text = [("",), ("a",), ("c",), ("-",), ("d",), ("c",), ("",)] + + assert pytest.approx(mle_bigram_model.entropy(text), 1e-4) == H + assert pytest.approx(mle_bigram_model.perplexity(text), 1e-4) == perplexity + + +@pytest.fixture +def mle_trigram_model(trigram_training_data, vocabulary): + model = MLE(order=3, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # count(d | b, c) = 1 + # count(b, c) = 1 + ("d", ("b", "c"), 1), + # count(d | c) = 1 + # count(c) = 1 + ("d", ["c"], 1), + # total number of tokens is 18, of which "a" occurred 2 times + ("a", None, 2.0 / 18), + # in vocabulary but unseen + ("z", None, 0), + # out of vocabulary should use "UNK" score + ("y", None, 3.0 / 18), + ], +) +def test_mle_trigram_scores(mle_trigram_model, word, context, expected_score): + assert pytest.approx(mle_trigram_model.score(word, context), 1e-4) == expected_score + + +@pytest.fixture +def lidstone_bigram_model(bigram_training_data, vocabulary): + model = Lidstone(0.1, order=2, vocabulary=vocabulary) + model.fit(bigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # count(d | c) = 1 + # *count(d | c) = 1.1 + # Count(w | c for w in vocab) = 1 + # *Count(w | c for w in vocab) = 1.8 + ("d", ["c"], 1.1 / 1.8), + # Total unigrams: 14 + # Vocab size: 8 + # Denominator: 14 + 0.8 = 14.8 + # count("a") = 2 + # *count("a") = 2.1 + ("a", None, 2.1 / 14.8), + # in vocabulary but unseen + # count("z") = 0 + # *count("z") = 0.1 + ("z", None, 0.1 / 14.8), + # out of vocabulary should use "UNK" score + # count("") = 3 + # *count("") = 3.1 + ("y", None, 3.1 / 14.8), + ], +) +def test_lidstone_bigram_score(lidstone_bigram_model, word, context, expected_score): + assert ( + pytest.approx(lidstone_bigram_model.score(word, context), 1e-4) 
+ == expected_score + ) + + +def test_lidstone_entropy_perplexity(lidstone_bigram_model): + text = [ + ("", "a"), + ("a", "c"), + ("c", ""), + ("", "d"), + ("d", "c"), + ("c", ""), + ] + # Unlike MLE this should be able to handle completely novel ngrams + # Ngram = score, log score + # , a = 0.3929, -1.3479 + # a, c = 0.0357, -4.8074 + # c, UNK = 0.0(5), -4.1699 + # UNK, d = 0.0263, -5.2479 + # d, c = 0.0357, -4.8074 + # c, = 0.0(5), -4.1699 + # TOTAL logscore: −24.5504 + # - AVG logscore: 4.0917 + H = 4.0917 + perplexity = 17.0504 + assert pytest.approx(lidstone_bigram_model.entropy(text), 1e-4) == H + assert pytest.approx(lidstone_bigram_model.perplexity(text), 1e-4) == perplexity + + +@pytest.fixture +def lidstone_trigram_model(trigram_training_data, vocabulary): + model = Lidstone(0.1, order=3, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # Logic behind this is the same as for bigram model + ("d", ["c"], 1.1 / 1.8), + # if we choose a word that hasn't appeared after (b, c) + ("e", ["c"], 0.1 / 1.8), + # Trigram score now + ("d", ["b", "c"], 1.1 / 1.8), + ("e", ["b", "c"], 0.1 / 1.8), + ], +) +def test_lidstone_trigram_score(lidstone_trigram_model, word, context, expected_score): + assert ( + pytest.approx(lidstone_trigram_model.score(word, context), 1e-4) + == expected_score + ) + + +@pytest.fixture +def laplace_bigram_model(bigram_training_data, vocabulary): + model = Laplace(2, vocabulary=vocabulary) + model.fit(bigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # basic sanity-check: + # count(d | c) = 1 + # *count(d | c) = 2 + # Count(w | c for w in vocab) = 1 + # *Count(w | c for w in vocab) = 9 + ("d", ["c"], 2.0 / 9), + # Total unigrams: 14 + # Vocab size: 8 + # Denominator: 14 + 8 = 22 + # count("a") = 2 + # *count("a") = 3 + ("a", None, 3.0 / 22), + # in vocabulary but unseen + # count("z") = 0 + # *count("z") = 1 + ("z", None, 1.0 / 22), + # out of vocabulary should use "UNK" score + # count("") = 3 + # *count("") = 4 + ("y", None, 4.0 / 22), + ], +) +def test_laplace_bigram_score(laplace_bigram_model, word, context, expected_score): + assert ( + pytest.approx(laplace_bigram_model.score(word, context), 1e-4) == expected_score + ) + + +def test_laplace_bigram_entropy_perplexity(laplace_bigram_model): + text = [ + ("", "a"), + ("a", "c"), + ("c", ""), + ("", "d"), + ("d", "c"), + ("c", ""), + ] + # Unlike MLE this should be able to handle completely novel ngrams + # Ngram = score, log score + # , a = 0.2, -2.3219 + # a, c = 0.1, -3.3219 + # c, UNK = 0.(1), -3.1699 + # UNK, d = 0.(09), 3.4594 + # d, c = 0.1 -3.3219 + # c, = 0.(1), -3.1699 + # Total logscores: −18.7651 + # - AVG logscores: 3.1275 + H = 3.1275 + perplexity = 8.7393 + assert pytest.approx(laplace_bigram_model.entropy(text), 1e-4) == H + assert pytest.approx(laplace_bigram_model.perplexity(text), 1e-4) == perplexity + + +def test_laplace_gamma(laplace_bigram_model): + assert laplace_bigram_model.gamma == 1 + + +@pytest.fixture +def wittenbell_trigram_model(trigram_training_data, vocabulary): + model = WittenBellInterpolated(3, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # For unigram scores by default revert to regular MLE + # Total unigrams: 18 + # Vocab Size = 7 + # count('c'): 1 + ("c", None, 1.0 / 18), + # in vocabulary but unseen + # count("z") = 0 + ("z", 
None, 0 / 18), + # out of vocabulary should use "UNK" score + # count("") = 3 + ("y", None, 3.0 / 18), + # 2 words follow b and b occurred a total of 2 times + # gamma(['b']) = 2 / (2 + 2) = 0.5 + # mle.score('c', ['b']) = 0.5 + # mle('c') = 1 / 18 = 0.055 + # (1 - gamma) * mle + gamma * mle('c') ~= 0.27 + 0.055 + ("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18), + # building on that, let's try 'a b c' as the trigram + # 1 word follows 'a b' and 'a b' occurred 1 time + # gamma(['a', 'b']) = 1 / (1 + 1) = 0.5 + # mle("c", ["a", "b"]) = 1 + ("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), + # P(c|zb) + # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. + ("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), + ], +) +def test_wittenbell_trigram_score( + wittenbell_trigram_model, word, context, expected_score +): + assert ( + pytest.approx(wittenbell_trigram_model.score(word, context), 1e-4) + == expected_score + ) + + +############################################################################### +# Notation Explained # +############################################################################### +# For all subsequent calculations we use the following notation: +# 1. '*': Placeholder for any word/character. E.g. '*b' stands for +# all bigrams that end in 'b'. '*b*' stands for all trigrams that +# contain 'b' in the middle. +# 1. count(ngram): Count all instances (tokens) of an ngram. +# 1. unique(ngram): Count unique instances (types) of an ngram. + + +@pytest.fixture +def kneserney_trigram_model(trigram_training_data, vocabulary): + model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # P(c) = count('*c') / unique('**') + # = 1 / 14 + ("c", None, 1.0 / 14), + # P(z) = count('*z') / unique('**') + # = 0 / 14 + # 'z' is in the vocabulary, but it was not seen during training. + ("z", None, 0.0 / 14), + # P(y) + # Out of vocabulary should use "UNK" score. + # P(y) = P(UNK) = count('*UNK') / unique('**') + ("y", None, 3 / 14), + # We start with P(c|b) + # P(c|b) = alpha('bc') + gamma('b') * P(c) + # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*') + # = max(1 - 0.75, 0) / 2 + # = 0.125 + # gamma('b') = discount * unique('b*') / unique('*b*') + # = (0.75 * 2) / 2 + # = 0.75 + ("c", ["b"], (0.125 + 0.75 * (1 / 14))), + # Building on that, let's try P(c|ab). + # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) + # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') + # = max(1 - 0.75, 0) / 1 + # = 0.25 + # gamma('ab') = (discount * unique('ab*')) / count('ab*') + # = 0.75 * 1 / 1 + ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))), + # P(c|zb) + # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. 
+ ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))), + ], +) +def test_kneserney_trigram_score( + kneserney_trigram_model, word, context, expected_score +): + assert ( + pytest.approx(kneserney_trigram_model.score(word, context), 1e-4) + == expected_score + ) + + +@pytest.fixture +def absolute_discounting_trigram_model(trigram_training_data, vocabulary): + model = AbsoluteDiscountingInterpolated(order=3, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # For unigram scores revert to uniform + # P(c) = count('c') / count('**') + ("c", None, 1.0 / 18), + # in vocabulary but unseen + # count('z') = 0 + ("z", None, 0.0 / 18), + # out of vocabulary should use "UNK" score + # count('') = 3 + ("y", None, 3 / 18), + # P(c|b) = alpha('bc') + gamma('b') * P(c) + # alpha('bc') = max(count('bc') - discount, 0) / count('b*') + # = max(1 - 0.75, 0) / 2 + # = 0.125 + # gamma('b') = discount * unique('b*') / count('b*') + # = (0.75 * 2) / 2 + # = 0.75 + ("c", ["b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), + # Building on that, let's try P(c|ab). + # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) + # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') + # = max(1 - 0.75, 0) / 1 + # = 0.25 + # gamma('ab') = (discount * unique('ab*')) / count('ab*') + # = 0.75 * 1 / 1 + ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (2 / 2) * (1 / 18))), + # P(c|zb) + # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. + ("c", ["z", "b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), + ], +) +def test_absolute_discounting_trigram_score( + absolute_discounting_trigram_model, word, context, expected_score +): + assert ( + pytest.approx(absolute_discounting_trigram_model.score(word, context), 1e-4) + == expected_score + ) + + +@pytest.fixture +def stupid_backoff_trigram_model(trigram_training_data, vocabulary): + model = StupidBackoff(order=3, vocabulary=vocabulary) + model.fit(trigram_training_data) + return model + + +@pytest.mark.parametrize( + "word, context, expected_score", + [ + # For unigram scores revert to uniform + # total bigrams = 18 + ("c", None, 1.0 / 18), + # in vocabulary but unseen + # bigrams ending with z = 0 + ("z", None, 0.0 / 18), + # out of vocabulary should use "UNK" score + # count(''): 3 + ("y", None, 3 / 18), + # c follows 1 time out of 2 after b + ("c", ["b"], 1 / 2), + # c always follows ab + ("c", ["a", "b"], 1 / 1), + # The ngram 'z b c' was not seen, so we backoff to + # the score of the ngram 'b c' * smoothing factor + ("c", ["z", "b"], (0.4 * (1 / 2))), + ], +) +def test_stupid_backoff_trigram_score( + stupid_backoff_trigram_model, word, context, expected_score +): + assert ( + pytest.approx(stupid_backoff_trigram_model.score(word, context), 1e-4) + == expected_score + ) + + +############################################################################### +# Probability Distributions Should Sum up to Unity # +############################################################################### + + +@pytest.fixture(scope="session") +def kneserney_bigram_model(bigram_training_data, vocabulary): + model = KneserNeyInterpolated(order=2, vocabulary=vocabulary) + model.fit(bigram_training_data) + return model + + +@pytest.mark.parametrize( + "model_fixture", + [ + "mle_bigram_model", + "mle_trigram_model", + "lidstone_bigram_model", + "laplace_bigram_model", + "wittenbell_trigram_model", + "absolute_discounting_trigram_model", + "kneserney_bigram_model", + pytest.param( + 
"stupid_backoff_trigram_model", + marks=pytest.mark.xfail( + reason="Stupid Backoff is not a valid distribution" + ), + ), + ], +) +@pytest.mark.parametrize( + "context", + [("a",), ("c",), ("",), ("b",), ("",), ("d",), ("e",), ("r",), ("w",)], + ids=itemgetter(0), +) +def test_sums_to_1(model_fixture, context, request): + model = request.getfixturevalue(model_fixture) + scores_for_context = sum(model.score(w, context) for w in model.vocab) + assert pytest.approx(scores_for_context, 1e-7) == 1.0 + + +############################################################################### +# Generating Text # +############################################################################### + + +def test_generate_one_no_context(mle_trigram_model): + assert mle_trigram_model.generate(random_seed=3) == "" + + +def test_generate_one_from_limiting_context(mle_trigram_model): + # We don't need random_seed for contexts with only one continuation + assert mle_trigram_model.generate(text_seed=["c"]) == "d" + assert mle_trigram_model.generate(text_seed=["b", "c"]) == "d" + assert mle_trigram_model.generate(text_seed=["a", "c"]) == "d" + + +def test_generate_one_from_varied_context(mle_trigram_model): + # When context doesn't limit our options enough, seed the random choice + assert mle_trigram_model.generate(text_seed=("a", ""), random_seed=2) == "a" + + +def test_generate_cycle(mle_trigram_model): + # Add a cycle to the model: bd -> b, db -> d + more_training_text = [padded_everygrams(mle_trigram_model.order, list("bdbdbd"))] + + mle_trigram_model.fit(more_training_text) + # Test that we can escape the cycle + assert mle_trigram_model.generate(7, text_seed=("b", "d"), random_seed=5) == [ + "b", + "d", + "b", + "d", + "b", + "d", + "", + ] + + +def test_generate_with_text_seed(mle_trigram_model): + assert mle_trigram_model.generate(5, text_seed=("", "e"), random_seed=3) == [ + "", + "a", + "d", + "b", + "", + ] + + +def test_generate_oov_text_seed(mle_trigram_model): + assert mle_trigram_model.generate( + text_seed=("aliens",), random_seed=3 + ) == mle_trigram_model.generate(text_seed=("",), random_seed=3) + + +def test_generate_None_text_seed(mle_trigram_model): + # should crash with type error when we try to look it up in vocabulary + with pytest.raises(TypeError): + mle_trigram_model.generate(text_seed=(None,)) + + # This will work + assert mle_trigram_model.generate( + text_seed=None, random_seed=3 + ) == mle_trigram_model.generate(random_seed=3) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_preprocessing.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..4a4e236e6e4b4a7aa23cc431d186f192c28f46b8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_preprocessing.py @@ -0,0 +1,30 @@ +# Natural Language Toolkit: Language Model Unit Tests +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ilia Kurenkov +# URL: +# For license information, see LICENSE.TXT +import unittest + +from nltk.lm.preprocessing import padded_everygram_pipeline + + +class TestPreprocessing(unittest.TestCase): + def test_padded_everygram_pipeline(self): + expected_train = [ + [ + ("",), + ("", "a"), + ("a",), + ("a", "b"), + ("b",), + ("b", "c"), + ("c",), + ("c", ""), + ("",), + ] + ] + expected_vocab = ["", "a", "b", "c", ""] + train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]]) + self.assertEqual([list(sent) for sent in train_data], expected_train) + self.assertEqual(list(vocab_data), 
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_vocabulary.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_vocabulary.py
new file mode 100644
index 0000000000000000000000000000000000000000..4492ec97130726adcad8efc3a9403846c6007d56
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/lm/test_vocabulary.py
@@ -0,0 +1,156 @@
+# Natural Language Toolkit: Language Model Unit Tests
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import unittest
+from collections import Counter
+from timeit import timeit
+
+from nltk.lm import Vocabulary
+
+
+class NgramModelVocabularyTests(unittest.TestCase):
+    """Tests for the Vocabulary class."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.vocab = Vocabulary(
+            ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"],
+            unk_cutoff=2,
+        )
+
+    def test_truthiness(self):
+        self.assertTrue(self.vocab)
+
+    def test_cutoff_value_set_correctly(self):
+        self.assertEqual(self.vocab.cutoff, 2)
+
+    def test_unable_to_change_cutoff(self):
+        with self.assertRaises(AttributeError):
+            self.vocab.cutoff = 3
+
+    def test_cutoff_setter_checks_value(self):
+        with self.assertRaises(ValueError) as exc_info:
+            Vocabulary("abc", unk_cutoff=0)
+        expected_error_msg = "Cutoff value cannot be less than 1. Got: 0"
+        self.assertEqual(expected_error_msg, str(exc_info.exception))
+
+    def test_counts_set_correctly(self):
+        self.assertEqual(self.vocab.counts["a"], 2)
+        self.assertEqual(self.vocab.counts["b"], 2)
+        self.assertEqual(self.vocab.counts["c"], 1)
+
+    def test_membership_check_respects_cutoff(self):
+        # "a" was seen 2 times, so it should be considered part of the vocabulary
+        self.assertTrue("a" in self.vocab)
+        # "c" was seen only once, so it shouldn't be considered part of the vocab
+        self.assertFalse("c" in self.vocab)
+        # "z" was also seen only once, so it shouldn't be in the vocab either
+        self.assertFalse("z" in self.vocab)
+
+    def test_vocab_len_respects_cutoff(self):
+        # Vocab size is the number of unique tokens that occur at least as often
+        # as the cutoff value, plus 1 to account for unknown words.
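+        # Here "a", "b", "d" and "e" each occur twice, meeting the cutoff of 2,
+        # so the expected size is those 4 tokens plus the <UNK> label.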
+        self.assertEqual(5, len(self.vocab))
+
+    def test_vocab_iter_respects_cutoff(self):
+        vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
+        vocab_items = ["a", "b", "d", "e", "<UNK>"]
+
+        self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys()))
+        self.assertCountEqual(vocab_items, list(self.vocab))
+
+    def test_update_empty_vocab(self):
+        empty = Vocabulary(unk_cutoff=2)
+        self.assertEqual(len(empty), 0)
+        self.assertFalse(empty)
+        self.assertIn(empty.unk_label, empty)
+
+        empty.update(list("abcde"))
+        self.assertIn(empty.unk_label, empty)
+
+    def test_lookup(self):
+        self.assertEqual(self.vocab.lookup("a"), "a")
+        self.assertEqual(self.vocab.lookup("c"), "<UNK>")
+
+    def test_lookup_iterables(self):
+        self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b"))
+        self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b"))
+        self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "<UNK>"))
+        self.assertEqual(
+            self.vocab.lookup(map(str, range(3))), ("<UNK>", "<UNK>", "<UNK>")
+        )
+
+    def test_lookup_empty_iterables(self):
+        self.assertEqual(self.vocab.lookup(()), ())
+        self.assertEqual(self.vocab.lookup([]), ())
+        self.assertEqual(self.vocab.lookup(iter([])), ())
+        self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ())
+
+    def test_lookup_recursive(self):
+        self.assertEqual(
+            self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "<UNK>"))
+        )
+        self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>"))
+        self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),))
+
+    def test_lookup_None(self):
+        with self.assertRaises(TypeError):
+            self.vocab.lookup(None)
+        with self.assertRaises(TypeError):
+            list(self.vocab.lookup([None, None]))
+
+    def test_lookup_int(self):
+        with self.assertRaises(TypeError):
+            self.vocab.lookup(1)
+        with self.assertRaises(TypeError):
+            list(self.vocab.lookup([1, 2]))
+
+    def test_lookup_empty_str(self):
+        self.assertEqual(self.vocab.lookup(""), "<UNK>")
+
+    def test_equality(self):
+        v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
+        v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
+        v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
+        v4 = Vocabulary(["a", "b"], unk_cutoff=1)
+
+        self.assertEqual(v1, v2)
+        self.assertNotEqual(v1, v3)
+        self.assertNotEqual(v1, v4)
+
+    def test_str(self):
+        self.assertEqual(
+            str(self.vocab), "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
+        )
+
+    def test_creation_with_counter(self):
+        self.assertEqual(
+            self.vocab,
+            Vocabulary(
+                Counter(
+                    ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
+                ),
+                unk_cutoff=2,
+            ),
+        )
+
+    @unittest.skip(
+        reason="Test is known to be flaky as it compares (runtime) performance."
+    )
+    def test_len_is_constant(self):
+        # Given an obviously small and an obviously large vocabulary.
+        small_vocab = Vocabulary("abcde")
+        from nltk.corpus.europarl_raw import english
+
+        large_vocab = Vocabulary(english.words())
+
+        # If we time calling `len` on them.
+        small_vocab_len_time = timeit("len(small_vocab)", globals=locals())
+        large_vocab_len_time = timeit("len(large_vocab)", globals=locals())
+
+        # The timing should be the same order of magnitude.
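+        # That is, len() should not rescan all counted tokens on every call;
+        # otherwise the much larger vocabulary would be dramatically slower.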
+ self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_aline.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_aline.py new file mode 100644 index 0000000000000000000000000000000000000000..68cb55f74809ac3cb53e8dfd56be8706a656f0fb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_aline.py @@ -0,0 +1,48 @@ +""" +Test Aline algorithm for aligning phonetic sequences +""" +from nltk.metrics import aline + + +def test_aline(): + result = aline.align("θin", "tenwis") + expected = [[("θ", "t"), ("i", "e"), ("n", "n")]] + + assert result == expected + + result = aline.align("jo", "ʒə") + expected = [[("j", "ʒ"), ("o", "ə")]] + + assert result == expected + + result = aline.align("pematesiweni", "pematesewen") + expected = [ + [ + ("p", "p"), + ("e", "e"), + ("m", "m"), + ("a", "a"), + ("t", "t"), + ("e", "e"), + ("s", "s"), + ("i", "e"), + ("w", "w"), + ("e", "e"), + ("n", "n"), + ] + ] + + assert result == expected + + result = aline.align("tuwθ", "dentis") + expected = [[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")]] + + assert result == expected + + +def test_aline_delta(): + """ + Test aline for computing the difference between two segments + """ + assert aline.delta("p", "q") == 20.0 + assert aline.delta("a", "A") == 0.0 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_bllip.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_bllip.py new file mode 100644 index 0000000000000000000000000000000000000000..b134dd0dd1a217ecff53309614cd96529cc407e9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_bllip.py @@ -0,0 +1,42 @@ +import pytest + +from nltk.data import find +from nltk.parse.bllip import BllipParser +from nltk.tree import Tree + + +@pytest.fixture(scope="module") +def parser(): + model_dir = find("models/bllip_wsj_no_aux").path + return BllipParser.from_unified_model_dir(model_dir) + + +def setup_module(): + pytest.importorskip("bllipparser") + + +class TestBllipParser: + def test_parser_loads_a_valid_tree(self, parser): + parsed = parser.parse("I saw the man with the telescope") + tree = next(parsed) + + assert isinstance(tree, Tree) + assert ( + tree.pformat() + == """ +(S1 + (S + (NP (PRP I)) + (VP + (VBD saw) + (NP (DT the) (NN man)) + (PP (IN with) (NP (DT the) (NN telescope)))))) +""".strip() + ) + + def test_tagged_parse_finds_matching_element(self, parser): + parsed = parser.parse("I saw the man with the telescope") + tagged_tree = next(parser.tagged_parse([("telescope", "NN")])) + + assert isinstance(tagged_tree, Tree) + assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))" diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_brill.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_brill.py new file mode 100644 index 0000000000000000000000000000000000000000..cea8a854ea27b37bd9cadb4493e4dfc4ddb46cf5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_brill.py @@ -0,0 +1,34 @@ +""" +Tests for Brill tagger. 
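+
+Trains a BrillTagger on top of a UnigramTagger baseline and checks the
+resulting tags; see https://github.com/nltk/nltk/issues/769 for the example.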
+""" + +import unittest + +from nltk.corpus import treebank +from nltk.tag import UnigramTagger, brill, brill_trainer +from nltk.tbl import demo + + +class TestBrill(unittest.TestCase): + def test_pos_template(self): + train_sents = treebank.tagged_sents()[:1000] + tagger = UnigramTagger(train_sents) + trainer = brill_trainer.BrillTaggerTrainer( + tagger, [brill.Template(brill.Pos([-1]))] + ) + brill_tagger = trainer.train(train_sents) + # Example from https://github.com/nltk/nltk/issues/769 + result = brill_tagger.tag("This is a foo bar sentence".split()) + expected = [ + ("This", "DT"), + ("is", "VBZ"), + ("a", "DT"), + ("foo", None), + ("bar", "NN"), + ("sentence", None), + ] + self.assertEqual(result, expected) + + @unittest.skip("Should be tested in __main__ of nltk.tbl.demo") + def test_brill_demo(self): + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfd_mutation.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfd_mutation.py new file mode 100644 index 0000000000000000000000000000000000000000..8952f1f35fe7f78d96b92cc4c377dbd1d387aaf0 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfd_mutation.py @@ -0,0 +1,39 @@ +import unittest + +import pytest + +from nltk import ConditionalFreqDist, tokenize + + +class TestEmptyCondFreq(unittest.TestCase): + def test_tabulate(self): + empty = ConditionalFreqDist() + self.assertEqual(empty.conditions(), []) + with pytest.raises(ValueError): + empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added + self.assertEqual(empty.conditions(), []) + + def test_plot(self): + empty = ConditionalFreqDist() + self.assertEqual(empty.conditions(), []) + empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added + self.assertEqual(empty.conditions(), []) + + def test_increment(self): + # make sure that we can still mutate cfd normally + text = "cow cat mouse cat tiger" + cfd = ConditionalFreqDist() + + # create cfd with word length as condition + for word in tokenize.word_tokenize(text): + condition = len(word) + cfd[condition][word] += 1 + + self.assertEqual(cfd.conditions(), [3, 5]) + + # incrementing previously unseen key is still possible + cfd[2]["hi"] += 1 + self.assertCountEqual(cfd.conditions(), [3, 5, 2]) # new condition added + self.assertEqual( + cfd[2]["hi"], 1 + ) # key's frequency incremented from 0 (unseen) to 1 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfg2chomsky.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfg2chomsky.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9f24d245d5c9dfd4c4d507237651407d2cc444 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_cfg2chomsky.py @@ -0,0 +1,49 @@ +import unittest + +import nltk +from nltk.grammar import CFG + + +class ChomskyNormalFormForCFGTest(unittest.TestCase): + def test_simple(self): + grammar = CFG.fromstring( + """ + S -> NP VP + PP -> P NP + NP -> Det N | NP PP P + VP -> V NP | VP PP + VP -> Det + Det -> 'a' | 'the' + N -> 'dog' | 'cat' + V -> 'chased' | 'sat' + P -> 'on' | 'in' + """ + ) + self.assertFalse(grammar.is_flexible_chomsky_normal_form()) + self.assertFalse(grammar.is_chomsky_normal_form()) + grammar = grammar.chomsky_normal_form(flexible=True) + self.assertTrue(grammar.is_flexible_chomsky_normal_form()) + self.assertFalse(grammar.is_chomsky_normal_form()) + + grammar2 = CFG.fromstring( + """ + S -> NP VP + NP -> VP N P + VP -> P + N -> 'dog' | 'cat' + P -> 'on' | 'in' + """ + ) + self.assertFalse(grammar2.is_flexible_chomsky_normal_form()) + 
+        self.assertFalse(grammar2.is_chomsky_normal_form())
+        grammar2 = grammar2.chomsky_normal_form()
+        self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
+        self.assertTrue(grammar2.is_chomsky_normal_form())
+
+    def test_complex(self):
+        grammar = nltk.data.load("grammars/large_grammars/atis.cfg")
+        self.assertFalse(grammar.is_flexible_chomsky_normal_form())
+        self.assertFalse(grammar.is_chomsky_normal_form())
+        grammar = grammar.chomsky_normal_form(flexible=True)
+        self.assertTrue(grammar.is_flexible_chomsky_normal_form())
+        self.assertFalse(grammar.is_chomsky_normal_form())
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_chunk.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..60b56317f2b5cae224b906f0c71458144a74f6a8
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_chunk.py
@@ -0,0 +1,85 @@
+import unittest
+
+from nltk import RegexpParser
+
+
+class TestChunkRule(unittest.TestCase):
+    def test_tag_pattern2re_pattern_quantifier(self):
+        """Test for bug https://github.com/nltk/nltk/issues/1597
+
+        Ensures that curly bracket quantifiers can be used inside a chunk rule.
+        This type of quantifier has been used for the supplementary example
+        in https://www.nltk.org/book/ch07.html#exploring-text-corpora.
+        """
+        sent = [
+            ("The", "AT"),
+            ("September-October", "NP"),
+            ("term", "NN"),
+            ("jury", "NN"),
+            ("had", "HVD"),
+            ("been", "BEN"),
+            ("charged", "VBN"),
+            ("by", "IN"),
+            ("Fulton", "NP-TL"),
+            ("Superior", "JJ-TL"),
+            ("Court", "NN-TL"),
+            ("Judge", "NN-TL"),
+            ("Durwood", "NP"),
+            ("Pye", "NP"),
+            ("to", "TO"),
+            ("investigate", "VB"),
+            ("reports", "NNS"),
+            ("of", "IN"),
+            ("possible", "JJ"),
+            ("``", "``"),
+            ("irregularities", "NNS"),
+            ("''", "''"),
+            ("in", "IN"),
+            ("the", "AT"),
+            ("hard-fought", "JJ"),
+            ("primary", "NN"),
+            ("which", "WDT"),
+            ("was", "BEDZ"),
+            ("won", "VBN"),
+            ("by", "IN"),
+            ("Mayor-nominate", "NN-TL"),
+            ("Ivan", "NP"),
+            ("Allen", "NP"),
+            ("Jr.", "NP"),
+            (".", "."),
+        ]  # source: brown corpus
+        cp = RegexpParser("CHUNK: {<N.*>{4,}}")
+        tree = cp.parse(sent)
+        assert (
+            tree.pformat()
+            == """(S
+  The/AT
+  September-October/NP
+  term/NN
+  jury/NN
+  had/HVD
+  been/BEN
+  charged/VBN
+  by/IN
+  Fulton/NP-TL
+  Superior/JJ-TL
+  (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
+  to/TO
+  investigate/VB
+  reports/NNS
+  of/IN
+  possible/JJ
+  ``/``
+  irregularities/NNS
+  ''/''
+  in/IN
+  the/AT
+  hard-fought/JJ
+  primary/NN
+  which/WDT
+  was/BEDZ
+  won/VBN
+  by/IN
+  (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
+  ./.)"""
+        )
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_classify.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e21a6cf4aa119e6696a9d2ba618ff08220b0fac
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_classify.py
@@ -0,0 +1,49 @@
+"""
+Unit tests for nltk.classify.
See also: nltk/test/classify.doctest +""" +import pytest + +from nltk import classify + +TRAIN = [ + (dict(a=1, b=1, c=1), "y"), + (dict(a=1, b=1, c=1), "x"), + (dict(a=1, b=1, c=0), "y"), + (dict(a=0, b=1, c=1), "x"), + (dict(a=0, b=1, c=1), "y"), + (dict(a=0, b=0, c=1), "y"), + (dict(a=0, b=1, c=0), "x"), + (dict(a=0, b=0, c=0), "x"), + (dict(a=0, b=1, c=1), "y"), +] + +TEST = [ + (dict(a=1, b=0, c=1)), # unseen + (dict(a=1, b=0, c=0)), # unseen + (dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x + (dict(a=0, b=1, c=0)), # seen 1 time, label=x +] + +RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)] + + +def assert_classifier_correct(algorithm): + try: + classifier = classify.MaxentClassifier.train( + TRAIN, algorithm, trace=0, max_iter=1000 + ) + except (LookupError, AttributeError) as e: + pytest.skip(str(e)) + + for (px, py), featureset in zip(RESULTS, TEST): + pdist = classifier.prob_classify(featureset) + assert abs(pdist.prob("x") - px) < 1e-2, (pdist.prob("x"), px) + assert abs(pdist.prob("y") - py) < 1e-2, (pdist.prob("y"), py) + + +def test_megam(): + assert_classifier_correct("MEGAM") + + +def test_tadm(): + assert_classifier_correct("TADM") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_collocations.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_collocations.py new file mode 100644 index 0000000000000000000000000000000000000000..2351c61f42942f497d66cbcd4ac2fa845433c2db --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_collocations.py @@ -0,0 +1,120 @@ +from nltk.collocations import BigramCollocationFinder +from nltk.metrics import BigramAssocMeasures + +## Test bigram counters with discontinuous bigrams and repeated words + +_EPSILON = 1e-8 +SENT = "this this is is a a test test".split() + + +def close_enough(x, y): + """Verify that two sequences of n-gram association values are within + _EPSILON of each other. 
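+    Exact equality would be too strict for the floating-point (log-based PMI)
+    scores being compared.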
+ """ + + return all(abs(x1[1] - y1[1]) <= _EPSILON for x1, y1 in zip(x, y)) + + +def test_bigram2(): + b = BigramCollocationFinder.from_words(SENT) + + assert sorted(b.ngram_fd.items()) == [ + (("a", "a"), 1), + (("a", "test"), 1), + (("is", "a"), 1), + (("is", "is"), 1), + (("test", "test"), 1), + (("this", "is"), 1), + (("this", "this"), 1), + ] + assert sorted(b.word_fd.items()) == [("a", 2), ("is", 2), ("test", 2), ("this", 2)] + + assert len(SENT) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1 + assert close_enough( + sorted(b.score_ngrams(BigramAssocMeasures.pmi)), + [ + (("a", "a"), 1.0), + (("a", "test"), 1.0), + (("is", "a"), 1.0), + (("is", "is"), 1.0), + (("test", "test"), 1.0), + (("this", "is"), 1.0), + (("this", "this"), 1.0), + ], + ) + + +def test_bigram3(): + b = BigramCollocationFinder.from_words(SENT, window_size=3) + assert sorted(b.ngram_fd.items()) == sorted( + [ + (("a", "test"), 3), + (("is", "a"), 3), + (("this", "is"), 3), + (("a", "a"), 1), + (("is", "is"), 1), + (("test", "test"), 1), + (("this", "this"), 1), + ] + ) + + assert sorted(b.word_fd.items()) == sorted( + [("a", 2), ("is", 2), ("test", 2), ("this", 2)] + ) + + assert ( + len(SENT) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0 + ) + assert close_enough( + sorted(b.score_ngrams(BigramAssocMeasures.pmi)), + sorted( + [ + (("a", "test"), 1.584962500721156), + (("is", "a"), 1.584962500721156), + (("this", "is"), 1.584962500721156), + (("a", "a"), 0.0), + (("is", "is"), 0.0), + (("test", "test"), 0.0), + (("this", "this"), 0.0), + ] + ), + ) + + +def test_bigram5(): + b = BigramCollocationFinder.from_words(SENT, window_size=5) + assert sorted(b.ngram_fd.items()) == sorted( + [ + (("a", "test"), 4), + (("is", "a"), 4), + (("this", "is"), 4), + (("is", "test"), 3), + (("this", "a"), 3), + (("a", "a"), 1), + (("is", "is"), 1), + (("test", "test"), 1), + (("this", "this"), 1), + ] + ) + assert sorted(b.word_fd.items()) == sorted( + [("a", 2), ("is", 2), ("test", 2), ("this", 2)] + ) + n_word_fd = sum(b.word_fd.values()) + n_ngram_fd = (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0 + assert len(SENT) == n_word_fd == n_ngram_fd + assert close_enough( + sorted(b.score_ngrams(BigramAssocMeasures.pmi)), + sorted( + [ + (("a", "test"), 1.0), + (("is", "a"), 1.0), + (("this", "is"), 1.0), + (("is", "test"), 0.5849625007211562), + (("this", "a"), 0.5849625007211562), + (("a", "a"), -1.0), + (("is", "is"), -1.0), + (("test", "test"), -1.0), + (("this", "this"), -1.0), + ] + ), + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_concordance.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_concordance.py new file mode 100644 index 0000000000000000000000000000000000000000..02fc5f35a4901598617a71c266ef0448c27694c6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_concordance.py @@ -0,0 +1,98 @@ +import contextlib +import sys +import unittest +from io import StringIO + +from nltk.corpus import gutenberg +from nltk.text import Text + + +@contextlib.contextmanager +def stdout_redirect(where): + sys.stdout = where + try: + yield where + finally: + sys.stdout = sys.__stdout__ + + +class TestConcordance(unittest.TestCase): + """Text constructed using: https://www.nltk.org/book/ch01.html""" + + @classmethod + def setUpClass(cls): + cls.corpus = gutenberg.words("melville-moby_dick.txt") + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + self.text = Text(TestConcordance.corpus) + self.query = "monstrous" + self.maxDiff = None + self.list_out = 
[ + "ong the former , one was of a most monstrous size . ... This came towards us , ", + 'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r', + "ll over with a heathenish array of monstrous clubs and spears . Some were thick", + "d as you gazed , and wondered what monstrous cannibal and savage could ever hav", + "that has survived the flood ; most monstrous and most mountainous ! That Himmal", + "they might scout at Moby Dick as a monstrous fable , or still worse and more de", + "th of Radney .'\" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l", + "ing Scenes . In connexion with the monstrous pictures of whales , I am strongly", + "ere to enter upon those still more monstrous stories of them which are to be fo", + "ght have been rummaged out of this monstrous cabinet there is no telling . But ", + "of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u", + ] + + def tearDown(self): + pass + + def test_concordance_list(self): + concordance_out = self.text.concordance_list(self.query) + self.assertEqual(self.list_out, [c.line for c in concordance_out]) + + def test_concordance_width(self): + list_out = [ + "monstrous", + "monstrous", + "monstrous", + "monstrous", + "monstrous", + "monstrous", + "Monstrous", + "monstrous", + "monstrous", + "monstrous", + "monstrous", + ] + + concordance_out = self.text.concordance_list(self.query, width=0) + self.assertEqual(list_out, [c.query for c in concordance_out]) + + def test_concordance_lines(self): + concordance_out = self.text.concordance_list(self.query, lines=3) + self.assertEqual(self.list_out[:3], [c.line for c in concordance_out]) + + def test_concordance_print(self): + print_out = """Displaying 11 of 11 matches: + ong the former , one was of a most monstrous size . ... This came towards us , + ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r + ll over with a heathenish array of monstrous clubs and spears . Some were thick + d as you gazed , and wondered what monstrous cannibal and savage could ever hav + that has survived the flood ; most monstrous and most mountainous ! That Himmal + they might scout at Moby Dick as a monstrous fable , or still worse and more de + th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l + ing Scenes . In connexion with the monstrous pictures of whales , I am strongly + ere to enter upon those still more monstrous stories of them which are to be fo + ght have been rummaged out of this monstrous cabinet there is no telling . But + of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u + """ + + with stdout_redirect(StringIO()) as stdout: + self.text.concordance(self.query) + + def strip_space(raw_str): + return raw_str.replace(" ", "") + + self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue())) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corenlp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corenlp.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0024b11470c1bee501c898ff508622e783287f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corenlp.py @@ -0,0 +1,1436 @@ +""" +Mock test for Stanford CoreNLP wrappers. 
+""" + +from unittest import TestCase +from unittest.mock import MagicMock + +import pytest + +from nltk.parse import corenlp +from nltk.tree import Tree + + +def setup_module(module): + global server + + try: + server = corenlp.CoreNLPServer(port=9000) + except LookupError: + pytest.skip("Could not instantiate CoreNLPServer.") + + try: + server.start() + except corenlp.CoreNLPServerError as e: + pytest.skip( + "Skipping CoreNLP tests because the server could not be started. " + "Make sure that the 9000 port is free. " + "{}".format(e.strerror) + ) + + +def teardown_module(module): + server.stop() + + +class TestTokenizerAPI(TestCase): + def test_tokenize(self): + corenlp_tokenizer = corenlp.CoreNLPParser() + + api_return_value = { + "sentences": [ + { + "index": 0, + "tokens": [ + { + "after": " ", + "before": "", + "characterOffsetBegin": 0, + "characterOffsetEnd": 4, + "index": 1, + "originalText": "Good", + "word": "Good", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 5, + "characterOffsetEnd": 12, + "index": 2, + "originalText": "muffins", + "word": "muffins", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 13, + "characterOffsetEnd": 17, + "index": 3, + "originalText": "cost", + "word": "cost", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 18, + "characterOffsetEnd": 19, + "index": 4, + "originalText": "$", + "word": "$", + }, + { + "after": "\n", + "before": "", + "characterOffsetBegin": 19, + "characterOffsetEnd": 23, + "index": 5, + "originalText": "3.88", + "word": "3.88", + }, + { + "after": " ", + "before": "\n", + "characterOffsetBegin": 24, + "characterOffsetEnd": 26, + "index": 6, + "originalText": "in", + "word": "in", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 27, + "characterOffsetEnd": 30, + "index": 7, + "originalText": "New", + "word": "New", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 31, + "characterOffsetEnd": 35, + "index": 8, + "originalText": "York", + "word": "York", + }, + { + "after": " ", + "before": "", + "characterOffsetBegin": 35, + "characterOffsetEnd": 36, + "index": 9, + "originalText": ".", + "word": ".", + }, + ], + }, + { + "index": 1, + "tokens": [ + { + "after": " ", + "before": " ", + "characterOffsetBegin": 38, + "characterOffsetEnd": 44, + "index": 1, + "originalText": "Please", + "word": "Please", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 45, + "characterOffsetEnd": 48, + "index": 2, + "originalText": "buy", + "word": "buy", + }, + { + "after": "\n", + "before": " ", + "characterOffsetBegin": 49, + "characterOffsetEnd": 51, + "index": 3, + "originalText": "me", + "word": "me", + }, + { + "after": " ", + "before": "\n", + "characterOffsetBegin": 52, + "characterOffsetEnd": 55, + "index": 4, + "originalText": "two", + "word": "two", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 56, + "characterOffsetEnd": 58, + "index": 5, + "originalText": "of", + "word": "of", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 59, + "characterOffsetEnd": 63, + "index": 6, + "originalText": "them", + "word": "them", + }, + { + "after": "\n", + "before": "", + "characterOffsetBegin": 63, + "characterOffsetEnd": 64, + "index": 7, + "originalText": ".", + "word": ".", + }, + ], + }, + { + "index": 2, + "tokens": [ + { + "after": "", + "before": "\n", + "characterOffsetBegin": 65, + "characterOffsetEnd": 71, + "index": 1, + "originalText": "Thanks", + "word": "Thanks", + }, + { + "after": "", + 
"before": "", + "characterOffsetBegin": 71, + "characterOffsetEnd": 72, + "index": 2, + "originalText": ".", + "word": ".", + }, + ], + }, + ] + } + corenlp_tokenizer.api_call = MagicMock(return_value=api_return_value) + + input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." + + expected_output = [ + "Good", + "muffins", + "cost", + "$", + "3.88", + "in", + "New", + "York", + ".", + "Please", + "buy", + "me", + "two", + "of", + "them", + ".", + "Thanks", + ".", + ] + + tokenized_output = list(corenlp_tokenizer.tokenize(input_string)) + + corenlp_tokenizer.api_call.assert_called_once_with( + "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.", + properties={"annotators": "tokenize,ssplit"}, + ) + self.assertEqual(expected_output, tokenized_output) + + +class TestTaggerAPI(TestCase): + def test_pos_tagger(self): + corenlp_tagger = corenlp.CoreNLPParser(tagtype="pos") + + api_return_value = { + "sentences": [ + { + "basicDependencies": [ + { + "dep": "ROOT", + "dependent": 1, + "dependentGloss": "What", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "cop", + "dependent": 2, + "dependentGloss": "is", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "det", + "dependent": 3, + "dependentGloss": "the", + "governor": 4, + "governorGloss": "airspeed", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "airspeed", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "case", + "dependent": 5, + "dependentGloss": "of", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "det", + "dependent": 6, + "dependentGloss": "an", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "compound", + "dependent": 7, + "dependentGloss": "unladen", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "nmod", + "dependent": 8, + "dependentGloss": "swallow", + "governor": 4, + "governorGloss": "airspeed", + }, + { + "dep": "punct", + "dependent": 9, + "dependentGloss": "?", + "governor": 1, + "governorGloss": "What", + }, + ], + "enhancedDependencies": [ + { + "dep": "ROOT", + "dependent": 1, + "dependentGloss": "What", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "cop", + "dependent": 2, + "dependentGloss": "is", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "det", + "dependent": 3, + "dependentGloss": "the", + "governor": 4, + "governorGloss": "airspeed", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "airspeed", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "case", + "dependent": 5, + "dependentGloss": "of", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "det", + "dependent": 6, + "dependentGloss": "an", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "compound", + "dependent": 7, + "dependentGloss": "unladen", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "nmod:of", + "dependent": 8, + "dependentGloss": "swallow", + "governor": 4, + "governorGloss": "airspeed", + }, + { + "dep": "punct", + "dependent": 9, + "dependentGloss": "?", + "governor": 1, + "governorGloss": "What", + }, + ], + "enhancedPlusPlusDependencies": [ + { + "dep": "ROOT", + "dependent": 1, + "dependentGloss": "What", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "cop", + "dependent": 2, + "dependentGloss": "is", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "det", + "dependent": 3, + "dependentGloss": "the", + "governor": 4, + "governorGloss": 
"airspeed", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "airspeed", + "governor": 1, + "governorGloss": "What", + }, + { + "dep": "case", + "dependent": 5, + "dependentGloss": "of", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "det", + "dependent": 6, + "dependentGloss": "an", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "compound", + "dependent": 7, + "dependentGloss": "unladen", + "governor": 8, + "governorGloss": "swallow", + }, + { + "dep": "nmod:of", + "dependent": 8, + "dependentGloss": "swallow", + "governor": 4, + "governorGloss": "airspeed", + }, + { + "dep": "punct", + "dependent": 9, + "dependentGloss": "?", + "governor": 1, + "governorGloss": "What", + }, + ], + "index": 0, + "parse": "(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))", + "tokens": [ + { + "after": " ", + "before": "", + "characterOffsetBegin": 0, + "characterOffsetEnd": 4, + "index": 1, + "lemma": "what", + "originalText": "What", + "pos": "WP", + "word": "What", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 5, + "characterOffsetEnd": 7, + "index": 2, + "lemma": "be", + "originalText": "is", + "pos": "VBZ", + "word": "is", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 8, + "characterOffsetEnd": 11, + "index": 3, + "lemma": "the", + "originalText": "the", + "pos": "DT", + "word": "the", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 12, + "characterOffsetEnd": 20, + "index": 4, + "lemma": "airspeed", + "originalText": "airspeed", + "pos": "NN", + "word": "airspeed", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 21, + "characterOffsetEnd": 23, + "index": 5, + "lemma": "of", + "originalText": "of", + "pos": "IN", + "word": "of", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 24, + "characterOffsetEnd": 26, + "index": 6, + "lemma": "a", + "originalText": "an", + "pos": "DT", + "word": "an", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 27, + "characterOffsetEnd": 34, + "index": 7, + "lemma": "unladen", + "originalText": "unladen", + "pos": "JJ", + "word": "unladen", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 35, + "characterOffsetEnd": 42, + "index": 8, + "lemma": "swallow", + "originalText": "swallow", + "pos": "VB", + "word": "swallow", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 43, + "characterOffsetEnd": 44, + "index": 9, + "lemma": "?", + "originalText": "?", + "pos": ".", + "word": "?", + }, + ], + } + ] + } + corenlp_tagger.api_call = MagicMock(return_value=api_return_value) + + input_tokens = "What is the airspeed of an unladen swallow ?".split() + expected_output = [ + ("What", "WP"), + ("is", "VBZ"), + ("the", "DT"), + ("airspeed", "NN"), + ("of", "IN"), + ("an", "DT"), + ("unladen", "JJ"), + ("swallow", "VB"), + ("?", "."), + ] + tagged_output = corenlp_tagger.tag(input_tokens) + + corenlp_tagger.api_call.assert_called_once_with( + "What is the airspeed of an unladen swallow ?", + properties={ + "ssplit.isOneSentence": "true", + "annotators": "tokenize,ssplit,pos", + }, + ) + self.assertEqual(expected_output, tagged_output) + + def test_ner_tagger(self): + corenlp_tagger = corenlp.CoreNLPParser(tagtype="ner") + + api_return_value = { + "sentences": [ + { + "index": 0, + "tokens": [ + { + "after": " ", + "before": "", + "characterOffsetBegin": 0, + 
"characterOffsetEnd": 4, + "index": 1, + "lemma": "Rami", + "ner": "PERSON", + "originalText": "Rami", + "pos": "NNP", + "word": "Rami", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 5, + "characterOffsetEnd": 8, + "index": 2, + "lemma": "Eid", + "ner": "PERSON", + "originalText": "Eid", + "pos": "NNP", + "word": "Eid", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 9, + "characterOffsetEnd": 11, + "index": 3, + "lemma": "be", + "ner": "O", + "originalText": "is", + "pos": "VBZ", + "word": "is", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 12, + "characterOffsetEnd": 20, + "index": 4, + "lemma": "study", + "ner": "O", + "originalText": "studying", + "pos": "VBG", + "word": "studying", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 21, + "characterOffsetEnd": 23, + "index": 5, + "lemma": "at", + "ner": "O", + "originalText": "at", + "pos": "IN", + "word": "at", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 24, + "characterOffsetEnd": 29, + "index": 6, + "lemma": "Stony", + "ner": "ORGANIZATION", + "originalText": "Stony", + "pos": "NNP", + "word": "Stony", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 30, + "characterOffsetEnd": 35, + "index": 7, + "lemma": "Brook", + "ner": "ORGANIZATION", + "originalText": "Brook", + "pos": "NNP", + "word": "Brook", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 36, + "characterOffsetEnd": 46, + "index": 8, + "lemma": "University", + "ner": "ORGANIZATION", + "originalText": "University", + "pos": "NNP", + "word": "University", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 47, + "characterOffsetEnd": 49, + "index": 9, + "lemma": "in", + "ner": "O", + "originalText": "in", + "pos": "IN", + "word": "in", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 50, + "characterOffsetEnd": 52, + "index": 10, + "lemma": "NY", + "ner": "O", + "originalText": "NY", + "pos": "NNP", + "word": "NY", + }, + ], + } + ] + } + + corenlp_tagger.api_call = MagicMock(return_value=api_return_value) + + input_tokens = "Rami Eid is studying at Stony Brook University in NY".split() + expected_output = [ + ("Rami", "PERSON"), + ("Eid", "PERSON"), + ("is", "O"), + ("studying", "O"), + ("at", "O"), + ("Stony", "ORGANIZATION"), + ("Brook", "ORGANIZATION"), + ("University", "ORGANIZATION"), + ("in", "O"), + ("NY", "O"), + ] + tagged_output = corenlp_tagger.tag(input_tokens) + + corenlp_tagger.api_call.assert_called_once_with( + "Rami Eid is studying at Stony Brook University in NY", + properties={ + "ssplit.isOneSentence": "true", + "annotators": "tokenize,ssplit,ner", + }, + ) + self.assertEqual(expected_output, tagged_output) + + def test_unexpected_tagtype(self): + with self.assertRaises(ValueError): + corenlp_tagger = corenlp.CoreNLPParser(tagtype="test") + + +class TestParserAPI(TestCase): + def test_parse(self): + corenlp_parser = corenlp.CoreNLPParser() + + api_return_value = { + "sentences": [ + { + "basicDependencies": [ + { + "dep": "ROOT", + "dependent": 4, + "dependentGloss": "fox", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + "dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "dep", 
+ "dependent": 5, + "dependentGloss": "jumps", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "enhancedDependencies": [ + { + "dep": "ROOT", + "dependent": 4, + "dependentGloss": "fox", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + "dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "dep", + "dependent": 5, + "dependentGloss": "jumps", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod:over", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "enhancedPlusPlusDependencies": [ + { + "dep": "ROOT", + "dependent": 4, + "dependentGloss": "fox", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + "dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "dep", + "dependent": 5, + "dependentGloss": "jumps", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod:over", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "index": 0, + "parse": "(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))", + "tokens": [ + { + "after": " ", + "before": "", + "characterOffsetBegin": 0, + "characterOffsetEnd": 3, + "index": 1, + "lemma": "the", + "originalText": "The", + "pos": "DT", + "word": "The", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 4, + "characterOffsetEnd": 9, + "index": 2, + "lemma": "quick", + "originalText": "quick", + "pos": "JJ", + "word": "quick", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 10, + "characterOffsetEnd": 15, + "index": 3, + "lemma": "brown", + "originalText": "brown", + "pos": "JJ", + "word": "brown", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 16, + 
"characterOffsetEnd": 19, + "index": 4, + "lemma": "fox", + "originalText": "fox", + "pos": "NN", + "word": "fox", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 20, + "characterOffsetEnd": 25, + "index": 5, + "lemma": "jump", + "originalText": "jumps", + "pos": "VBZ", + "word": "jumps", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 26, + "characterOffsetEnd": 30, + "index": 6, + "lemma": "over", + "originalText": "over", + "pos": "IN", + "word": "over", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 31, + "characterOffsetEnd": 34, + "index": 7, + "lemma": "the", + "originalText": "the", + "pos": "DT", + "word": "the", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 35, + "characterOffsetEnd": 39, + "index": 8, + "lemma": "lazy", + "originalText": "lazy", + "pos": "JJ", + "word": "lazy", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 40, + "characterOffsetEnd": 43, + "index": 9, + "lemma": "dog", + "originalText": "dog", + "pos": "NN", + "word": "dog", + }, + ], + } + ] + } + + corenlp_parser.api_call = MagicMock(return_value=api_return_value) + + input_string = "The quick brown fox jumps over the lazy dog".split() + expected_output = Tree( + "ROOT", + [ + Tree( + "NP", + [ + Tree( + "NP", + [ + Tree("DT", ["The"]), + Tree("JJ", ["quick"]), + Tree("JJ", ["brown"]), + Tree("NN", ["fox"]), + ], + ), + Tree( + "NP", + [ + Tree("NP", [Tree("NNS", ["jumps"])]), + Tree( + "PP", + [ + Tree("IN", ["over"]), + Tree( + "NP", + [ + Tree("DT", ["the"]), + Tree("JJ", ["lazy"]), + Tree("NN", ["dog"]), + ], + ), + ], + ), + ], + ), + ], + ) + ], + ) + + parsed_data = next(corenlp_parser.parse(input_string)) + + corenlp_parser.api_call.assert_called_once_with( + "The quick brown fox jumps over the lazy dog", + properties={"ssplit.eolonly": "true"}, + ) + self.assertEqual(expected_output, parsed_data) + + def test_dependency_parser(self): + corenlp_parser = corenlp.CoreNLPDependencyParser() + + api_return_value = { + "sentences": [ + { + "basicDependencies": [ + { + "dep": "ROOT", + "dependent": 5, + "dependentGloss": "jumps", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + "dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "fox", + "governor": 5, + "governorGloss": "jumps", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "enhancedDependencies": [ + { + "dep": "ROOT", + "dependent": 5, + "dependentGloss": "jumps", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + 
"dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "fox", + "governor": 5, + "governorGloss": "jumps", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod:over", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "enhancedPlusPlusDependencies": [ + { + "dep": "ROOT", + "dependent": 5, + "dependentGloss": "jumps", + "governor": 0, + "governorGloss": "ROOT", + }, + { + "dep": "det", + "dependent": 1, + "dependentGloss": "The", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 2, + "dependentGloss": "quick", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "amod", + "dependent": 3, + "dependentGloss": "brown", + "governor": 4, + "governorGloss": "fox", + }, + { + "dep": "nsubj", + "dependent": 4, + "dependentGloss": "fox", + "governor": 5, + "governorGloss": "jumps", + }, + { + "dep": "case", + "dependent": 6, + "dependentGloss": "over", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "det", + "dependent": 7, + "dependentGloss": "the", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "amod", + "dependent": 8, + "dependentGloss": "lazy", + "governor": 9, + "governorGloss": "dog", + }, + { + "dep": "nmod:over", + "dependent": 9, + "dependentGloss": "dog", + "governor": 5, + "governorGloss": "jumps", + }, + ], + "index": 0, + "tokens": [ + { + "after": " ", + "before": "", + "characterOffsetBegin": 0, + "characterOffsetEnd": 3, + "index": 1, + "lemma": "the", + "originalText": "The", + "pos": "DT", + "word": "The", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 4, + "characterOffsetEnd": 9, + "index": 2, + "lemma": "quick", + "originalText": "quick", + "pos": "JJ", + "word": "quick", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 10, + "characterOffsetEnd": 15, + "index": 3, + "lemma": "brown", + "originalText": "brown", + "pos": "JJ", + "word": "brown", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 16, + "characterOffsetEnd": 19, + "index": 4, + "lemma": "fox", + "originalText": "fox", + "pos": "NN", + "word": "fox", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 20, + "characterOffsetEnd": 25, + "index": 5, + "lemma": "jump", + "originalText": "jumps", + "pos": "VBZ", + "word": "jumps", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 26, + "characterOffsetEnd": 30, + "index": 6, + "lemma": "over", + "originalText": "over", + "pos": "IN", + "word": "over", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 31, + "characterOffsetEnd": 34, + "index": 7, + "lemma": "the", + "originalText": "the", + "pos": "DT", + "word": "the", + }, + { + "after": " ", + "before": " ", + "characterOffsetBegin": 35, + "characterOffsetEnd": 39, + "index": 8, + "lemma": "lazy", + "originalText": "lazy", + "pos": "JJ", + "word": "lazy", + }, + { + "after": "", + "before": " ", + "characterOffsetBegin": 40, + "characterOffsetEnd": 43, + "index": 9, + "lemma": "dog", + "originalText": "dog", + "pos": "NN", + "word": "dog", + }, + ], + } + ] + } + + corenlp_parser.api_call = 
MagicMock(return_value=api_return_value) + + input_string = "The quick brown fox jumps over the lazy dog".split() + expected_output = Tree( + "jumps", + [ + Tree("fox", ["The", "quick", "brown"]), + Tree("dog", ["over", "the", "lazy"]), + ], + ) + + parsed_data = next(corenlp_parser.parse(input_string)) + + corenlp_parser.api_call.assert_called_once_with( + "The quick brown fox jumps over the lazy dog", + properties={"ssplit.eolonly": "true"}, + ) + self.assertEqual(expected_output, parsed_data.tree()) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpora.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpora.py new file mode 100644 index 0000000000000000000000000000000000000000..888dd20b5af2f798d966d0a138d4c453675ae0f6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpora.py @@ -0,0 +1,274 @@ +import unittest + +import pytest + +from nltk.corpus import ( # mwa_ppdb + cess_cat, + cess_esp, + conll2007, + floresta, + indian, + ptb, + sinica_treebank, + udhr, +) +from nltk.tree import Tree + + +class TestUdhr(unittest.TestCase): + def test_words(self): + for name in udhr.fileids(): + words = list(udhr.words(name)) + self.assertTrue(words) + + def test_raw_unicode(self): + for name in udhr.fileids(): + txt = udhr.raw(name) + assert not isinstance(txt, bytes), name + + def test_polish_encoding(self): + text_pl = udhr.raw("Polish-Latin2")[:164] + text_ppl = udhr.raw("Polish_Polski-Latin2")[:164] + expected = """POWSZECHNA DEKLARACJA PRAW CZŁOWIEKA +[Preamble] +Trzecia Sesja Ogólnego Zgromadzenia ONZ, obradująca w Paryżu, \ +uchwaliła 10 grudnia 1948 roku jednomyślnie Powszechną""" + assert text_pl == expected, "Polish-Latin2" + assert text_ppl == expected, "Polish_Polski-Latin2" + + +class TestIndian(unittest.TestCase): + def test_words(self): + words = indian.words()[:3] + self.assertEqual(words, ["মহিষের", "সন্তান", ":"]) + + def test_tagged_words(self): + tagged_words = indian.tagged_words()[:3] + self.assertEqual( + tagged_words, [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")] + ) + + +class TestCess(unittest.TestCase): + def test_catalan(self): + words = cess_cat.words()[:15] + txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial" + self.assertEqual(words, txt.split()) + self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs") + + def test_esp(self): + words = cess_esp.words()[:15] + txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del" + self.assertEqual(words, txt.split()) + self.assertEqual(cess_esp.words()[115], "años") + + +class TestFloresta(unittest.TestCase): + def test_words(self): + words = floresta.words()[:10] + txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a" + self.assertEqual(words, txt.split()) + + +class TestSinicaTreebank(unittest.TestCase): + def test_sents(self): + first_3_sents = sinica_treebank.sents()[:3] + self.assertEqual( + first_3_sents, [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]] + ) + + def test_parsed_sents(self): + parsed_sents = sinica_treebank.parsed_sents()[25] + self.assertEqual( + parsed_sents, + Tree( + "S", + [ + Tree("NP", [Tree("Nba", ["嘉珍"])]), + Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])]), + Tree("VA4", ["哭泣"]), + ], + ), + ) + + +class TestCoNLL2007(unittest.TestCase): + # Reading the CoNLL 2007 Dependency Treebanks + + def test_sents(self): + sents = conll2007.sents("esp.train")[0] + self.assertEqual( + sents[:6], ["El", "aumento", "del", "índice", "de", "desempleo"] 
+ ) + + def test_parsed_sents(self): + + parsed_sents = conll2007.parsed_sents("esp.train")[0] + + self.assertEqual( + parsed_sents.tree(), + Tree( + "fortaleció", + [ + Tree( + "aumento", + [ + "El", + Tree( + "del", + [ + Tree( + "índice", + [ + Tree( + "de", + [Tree("desempleo", ["estadounidense"])], + ) + ], + ) + ], + ), + ], + ), + "hoy", + "considerablemente", + Tree( + "al", + [ + Tree( + "euro", + [ + Tree( + "cotizaba", + [ + ",", + "que", + Tree("a", [Tree("15.35", ["las", "GMT"])]), + "se", + Tree( + "en", + [ + Tree( + "mercado", + [ + "el", + Tree("de", ["divisas"]), + Tree("de", ["Fráncfort"]), + ], + ) + ], + ), + Tree("a", ["0,9452_dólares"]), + Tree( + "frente_a", + [ + ",", + Tree( + "0,9349_dólares", + [ + "los", + Tree( + "de", + [ + Tree( + "mañana", + ["esta"], + ) + ], + ), + ], + ), + ], + ), + ], + ) + ], + ) + ], + ), + ".", + ], + ), + ) + + +@pytest.mark.skipif( + not ptb.fileids(), + reason="A full installation of the Penn Treebank is not available", +) +class TestPTB(unittest.TestCase): + def test_fileids(self): + self.assertEqual( + ptb.fileids()[:4], + [ + "BROWN/CF/CF01.MRG", + "BROWN/CF/CF02.MRG", + "BROWN/CF/CF03.MRG", + "BROWN/CF/CF04.MRG", + ], + ) + + def test_words(self): + self.assertEqual( + ptb.words("WSJ/00/WSJ_0003.MRG")[:7], + ["A", "form", "of", "asbestos", "once", "used", "*"], + ) + + def test_tagged_words(self): + self.assertEqual( + ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3], + [("A", "DT"), ("form", "NN"), ("of", "IN")], + ) + + def test_categories(self): + self.assertEqual( + ptb.categories(), + [ + "adventure", + "belles_lettres", + "fiction", + "humor", + "lore", + "mystery", + "news", + "romance", + "science_fiction", + ], + ) + + def test_news_fileids(self): + self.assertEqual( + ptb.fileids("news")[:3], + ["WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG", "WSJ/00/WSJ_0003.MRG"], + ) + + def test_category_words(self): + self.assertEqual( + ptb.words(categories=["humor", "fiction"])[:6], + ["Thirty-three", "Scotty", "did", "not", "go", "back"], + ) + + +@pytest.mark.skip("Skipping test for mwa_ppdb.") +class TestMWAPPDB(unittest.TestCase): + def test_fileids(self): + self.assertEqual( + mwa_ppdb.fileids(), ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"] + ) + + def test_entries(self): + self.assertEqual( + mwa_ppdb.entries()[:10], + [ + ("10/17/01", "17/10/2001"), + ("102,70", "102.70"), + ("13,53", "13.53"), + ("3.2.5.3.2.1", "3.2.5.3.2.1."), + ("53,76", "53.76"), + ("6.9.5", "6.9.5."), + ("7.7.6.3", "7.7.6.3."), + ("76,20", "76.20"), + ("79,85", "79.85"), + ("93,65", "93.65"), + ], + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpus_views.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpus_views.py new file mode 100644 index 0000000000000000000000000000000000000000..890825fa8d65b761f661417656f3ae37075cdc6f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_corpus_views.py @@ -0,0 +1,48 @@ +""" +Corpus View Regression Tests +""" +import unittest + +import nltk.data +from nltk.corpus.reader.util import ( + StreamBackedCorpusView, + read_line_block, + read_whitespace_block, +) + + +class TestCorpusViews(unittest.TestCase): + + linetok = nltk.LineTokenizer(blanklines="keep") + names = [ + "corpora/inaugural/README", # A very short file (160 chars) + "corpora/inaugural/1793-Washington.txt", # A relatively short file (791 chars) + "corpora/inaugural/1909-Taft.txt", # A longer file (32k chars) + ] + + def data(self): + for name in self.names: + f = nltk.data.find(name) + with f.open() as fp: + 
file_data = fp.read().decode("utf8") + yield f, file_data + + def test_correct_values(self): + # Check that corpus views produce the correct sequence of values. + + for f, file_data in self.data(): + v = StreamBackedCorpusView(f, read_whitespace_block) + self.assertEqual(list(v), file_data.split()) + + v = StreamBackedCorpusView(f, read_line_block) + self.assertEqual(list(v), self.linetok.tokenize(file_data)) + + def test_correct_length(self): + # Check that the corpus views report the correct lengths: + + for f, file_data in self.data(): + v = StreamBackedCorpusView(f, read_whitespace_block) + self.assertEqual(len(v), len(file_data.split())) + + v = StreamBackedCorpusView(f, read_line_block) + self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_data.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b05eea84bfaaca4e439f319057d56821764f75c7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_data.py @@ -0,0 +1,15 @@ +import pytest + +import nltk.data + + +def test_find_raises_exception(): + with pytest.raises(LookupError): + nltk.data.find("no_such_resource/foo") + + +def test_find_raises_exception_with_full_resource_name(): + no_such_thing = "no_such_thing/bar" + with pytest.raises(LookupError) as exc: + nltk.data.find(no_such_thing) + assert no_such_thing in str(exc) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_disagreement.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_disagreement.py new file mode 100644 index 0000000000000000000000000000000000000000..1f29add9058e25b2a2736ce513734655c85d4abf --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_disagreement.py @@ -0,0 +1,144 @@ +import unittest + +from nltk.metrics.agreement import AnnotationTask + + +class TestDisagreement(unittest.TestCase): + + """ + Class containing unit tests for nltk.metrics.agreement.Disagreement. + """ + + def test_easy(self): + """ + Simple test, based on + https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf. + """ + data = [ + ("coder1", "dress1", "YES"), + ("coder2", "dress1", "NO"), + ("coder3", "dress1", "NO"), + ("coder1", "dress2", "YES"), + ("coder2", "dress2", "NO"), + ("coder3", "dress3", "NO"), + ] + annotation_task = AnnotationTask(data) + self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) + + def test_easy2(self): + """ + Same simple test with 1 rating removed. + Removal of that rating should not matter: K-Apha ignores items with + only 1 rating. 
+ """ + data = [ + ("coder1", "dress1", "YES"), + ("coder2", "dress1", "NO"), + ("coder3", "dress1", "NO"), + ("coder1", "dress2", "YES"), + ("coder2", "dress2", "NO"), + ] + annotation_task = AnnotationTask(data) + self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) + + def test_advanced(self): + """ + More advanced test, based on + http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf + """ + data = [ + ("A", "1", "1"), + ("B", "1", "1"), + ("D", "1", "1"), + ("A", "2", "2"), + ("B", "2", "2"), + ("C", "2", "3"), + ("D", "2", "2"), + ("A", "3", "3"), + ("B", "3", "3"), + ("C", "3", "3"), + ("D", "3", "3"), + ("A", "4", "3"), + ("B", "4", "3"), + ("C", "4", "3"), + ("D", "4", "3"), + ("A", "5", "2"), + ("B", "5", "2"), + ("C", "5", "2"), + ("D", "5", "2"), + ("A", "6", "1"), + ("B", "6", "2"), + ("C", "6", "3"), + ("D", "6", "4"), + ("A", "7", "4"), + ("B", "7", "4"), + ("C", "7", "4"), + ("D", "7", "4"), + ("A", "8", "1"), + ("B", "8", "1"), + ("C", "8", "2"), + ("D", "8", "1"), + ("A", "9", "2"), + ("B", "9", "2"), + ("C", "9", "2"), + ("D", "9", "2"), + ("B", "10", "5"), + ("C", "10", "5"), + ("D", "10", "5"), + ("C", "11", "1"), + ("D", "11", "1"), + ("C", "12", "3"), + ] + annotation_task = AnnotationTask(data) + self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) + + def test_advanced2(self): + """ + Same more advanced example, but with 1 rating removed. + Again, removal of that 1 rating should not matter. + """ + data = [ + ("A", "1", "1"), + ("B", "1", "1"), + ("D", "1", "1"), + ("A", "2", "2"), + ("B", "2", "2"), + ("C", "2", "3"), + ("D", "2", "2"), + ("A", "3", "3"), + ("B", "3", "3"), + ("C", "3", "3"), + ("D", "3", "3"), + ("A", "4", "3"), + ("B", "4", "3"), + ("C", "4", "3"), + ("D", "4", "3"), + ("A", "5", "2"), + ("B", "5", "2"), + ("C", "5", "2"), + ("D", "5", "2"), + ("A", "6", "1"), + ("B", "6", "2"), + ("C", "6", "3"), + ("D", "6", "4"), + ("A", "7", "4"), + ("B", "7", "4"), + ("C", "7", "4"), + ("D", "7", "4"), + ("A", "8", "1"), + ("B", "8", "1"), + ("C", "8", "2"), + ("D", "8", "1"), + ("A", "9", "2"), + ("B", "9", "2"), + ("C", "9", "2"), + ("D", "9", "2"), + ("B", "10", "5"), + ("C", "10", "5"), + ("D", "10", "5"), + ("C", "11", "1"), + ("D", "11", "1"), + ("C", "12", "3"), + ] + annotation_task = AnnotationTask(data) + self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_distance.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..71e0bf06a6a5d3e75ceefa06670542303803d3eb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_distance.py @@ -0,0 +1,129 @@ +from typing import Tuple + +import pytest + +from nltk.metrics.distance import edit_distance + + +class TestEditDistance: + @pytest.mark.parametrize( + "left,right,substitution_cost,expecteds", + [ + # Allowing transpositions reduces the number of edits required. + # with transpositions: + # e.g. "abc" -T-> "cba" -D-> "ca": 2 steps + # + # without transpositions: + # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps + ("abc", "ca", 1, (2, 3)), + ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions + # Note, a substition_cost of higher than 2 doesn't make much + # sense, as a deletion + insertion is identical, and always + # costs 2. + # + # + # Transpositions don't always reduce the number of edits required: + # with or without transpositions: + # e.g. 
"wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps + ("wants", "wasp", 1, (3, 3)), + ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions + # + # + # Ought to have the same results with and without transpositions + # with or without transpositions: + # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps + # (but cost 5 if substitution_cost=2) + ("rain", "shine", 1, (3, 3)), + ("rain", "shine", 2, (5, 5)), # Does *require* substitutions + # + # + # Several potentially interesting typos + # with transpositions: + # e.g. "acbdef" -T-> "abcdef": 1 step + # + # without transpositions: + # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps + ("acbdef", "abcdef", 1, (1, 2)), + ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps + ("lnaguaeg", "language", 1, (2, 4)), + ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps + # (but one substitution, so a cost of 4 if substition_cost = 2) + ("lnaugage", "language", 1, (2, 3)), + ("lnaugage", "language", 2, (2, 4)), + # Does *require* substitutions if no transpositions + # + # + # with transpositions: + # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps + # without transpositions: + # e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps + ("lngauage", "language", 1, (2, 2)), + ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions + # + # + # with or without transpositions: + # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps + # + # with substitution_cost=2 and transpositions: + # e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" + # -I-> "swi" -I-> "swim": 6 steps + # + # with substitution_cost=2 and no transpositions: + # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" + # -I-> "swi" -I-> "swim": 7 steps + ("wants", "swim", 1, (5, 5)), + ("wants", "swim", 2, (6, 7)), + # + # + # with or without transpositions: + # e.g. "kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps + # (but cost 5 if substitution_cost=2) + ("kitten", "sitting", 1, (3, 3)), + ("kitten", "sitting", 2, (5, 5)), + # + # duplicated letter + # e.g. "duplicated" -D-> "duplicated" + ("duplicated", "duuplicated", 1, (1, 1)), + ("duplicated", "duuplicated", 2, (1, 1)), + ("very duplicated", "very duuplicateed", 2, (2, 2)), + ], + ) + def test_with_transpositions( + self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] + ): + """ + Test `edit_distance` between two strings, given some `substitution_cost`, + and whether transpositions are allowed. + + :param str left: First input string to `edit_distance`. + :param str right: Second input string to `edit_distance`. + :param int substitution_cost: The cost of a substitution action in `edit_distance`. + :param Tuple[int, int] expecteds: A tuple of expected outputs, such that `expecteds[0]` is + the expected output with `transpositions=True`, and `expecteds[1]` is + the expected output with `transpositions=False`. 
+ """ + # Test the input strings in both orderings + for s1, s2 in ((left, right), (right, left)): + # zip with [True, False] to get the transpositions value + for expected, transpositions in zip(expecteds, [True, False]): + predicted = edit_distance( + s1, + s2, + substitution_cost=substitution_cost, + transpositions=transpositions, + ) + assert predicted == expected diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_downloader.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_downloader.py new file mode 100644 index 0000000000000000000000000000000000000000..408372259142592003a3689cb35a0430e0bec190 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_downloader.py @@ -0,0 +1,19 @@ +from nltk import download + + +def test_downloader_using_existing_parent_download_dir(tmp_path): + """Test that download works properly when the parent folder of the download_dir exists""" + + download_dir = str(tmp_path.joinpath("another_dir")) + download_status = download("mwa_ppdb", download_dir) + assert download_status is True + + +def test_downloader_using_non_existing_parent_download_dir(tmp_path): + """Test that download works properly when the parent folder of the download_dir does not exist""" + + download_dir = str( + tmp_path.joinpath("non-existing-parent-folder", "another-non-existing-folder") + ) + download_status = download("mwa_ppdb", download_dir) + assert download_status is True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_hmm.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_hmm.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce5213230ae4c58ba58ff94872b7240f5884d79 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_hmm.py @@ -0,0 +1,82 @@ +import pytest + +from nltk.tag import hmm + + +def _wikipedia_example_hmm(): + # Example from wikipedia + # (https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm) + + states = ["rain", "no rain"] + symbols = ["umbrella", "no umbrella"] + + A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities + B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities + pi = [0.5, 0.5] # initial probabilities + + seq = ["umbrella", "umbrella", "no umbrella", "umbrella", "umbrella"] + seq = list(zip(seq, [None] * len(seq))) + + model = hmm._create_hmm_tagger(states, symbols, A, B, pi) + return model, states, symbols, seq + + +def test_forward_probability(): + from numpy.testing import assert_array_almost_equal + + # example from p. 
385, Huang et al + model, states, symbols = hmm._market_hmm_example() + seq = [("up", None), ("up", None)] + expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]] + + fp = 2 ** model._forward_probability(seq) + + assert_array_almost_equal(fp, expected) + + +def test_forward_probability2(): + from numpy.testing import assert_array_almost_equal + + model, states, symbols, seq = _wikipedia_example_hmm() + fp = 2 ** model._forward_probability(seq) + + # examples in wikipedia are normalized + fp = (fp.T / fp.sum(axis=1)).T + + wikipedia_results = [ + [0.8182, 0.1818], + [0.8834, 0.1166], + [0.1907, 0.8093], + [0.7308, 0.2692], + [0.8673, 0.1327], + ] + + assert_array_almost_equal(wikipedia_results, fp, 4) + + +def test_backward_probability(): + from numpy.testing import assert_array_almost_equal + + model, states, symbols, seq = _wikipedia_example_hmm() + + bp = 2 ** model._backward_probability(seq) + # examples in wikipedia are normalized + + bp = (bp.T / bp.sum(axis=1)).T + + wikipedia_results = [ + # Forward-backward algorithm doesn't need b0_5, + # so .backward_probability doesn't compute it. + # [0.6469, 0.3531], + [0.5923, 0.4077], + [0.3763, 0.6237], + [0.6533, 0.3467], + [0.6273, 0.3727], + [0.5, 0.5], + ] + + assert_array_almost_equal(wikipedia_results, bp, 4) + + +def setup_module(module): + pytest.importorskip("numpy") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json2csv_corpus.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json2csv_corpus.py new file mode 100644 index 0000000000000000000000000000000000000000..ba4cefa039e5365f8a8cd2397429e4e8ccd3c907 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json2csv_corpus.py @@ -0,0 +1,210 @@ +# Natural Language Toolkit: Twitter client +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + +""" +Regression tests for `json2csv()` and `json2csv_entities()` in Twitter +package. 
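+
+Each test flattens a sample of tweet JSON lines into a CSV file, selecting
+either top-level fields (via `json2csv`) or one row per entity (via
+`json2csv_entities`), and compares the output against a stored `.csv.ref`
+reference file.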
+""" +from pathlib import Path + +import pytest + +from nltk.corpus import twitter_samples +from nltk.twitter.common import json2csv, json2csv_entities + + +def files_are_identical(pathA, pathB): + """ + Compare two files, ignoring carriage returns, + leading whitespace, and trailing whitespace + """ + f1 = [l.strip() for l in pathA.read_bytes().splitlines()] + f2 = [l.strip() for l in pathB.read_bytes().splitlines()] + return f1 == f2 + + +subdir = Path(__file__).parent / "files" + + +@pytest.fixture +def infile(): + with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile: + return [next(infile) for x in range(100)] + + +def test_textoutput(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.text.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.text.csv" + json2csv(infile, outfn, ["text"], gzip_compress=False) + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_metadata(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.tweet.csv.ref" + fields = [ + "created_at", + "favorite_count", + "id", + "in_reply_to_status_id", + "in_reply_to_user_id", + "retweet_count", + "retweeted", + "text", + "truncated", + "user.id", + ] + + outfn = tmp_path / "tweets.20150430-223406.tweet.csv" + json2csv(infile, outfn, fields, gzip_compress=False) + assert files_are_identical(outfn, ref_fn) + + +def test_user_metadata(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.user.csv.ref" + fields = ["id", "text", "user.id", "user.followers_count", "user.friends_count"] + + outfn = tmp_path / "tweets.20150430-223406.user.csv" + json2csv(infile, outfn, fields, gzip_compress=False) + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_hashtag(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.hashtag.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.hashtag.csv" + json2csv_entities( + infile, + outfn, + ["id", "text"], + "hashtags", + ["text"], + gzip_compress=False, + ) + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_usermention(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.usermention.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.usermention.csv" + json2csv_entities( + infile, + outfn, + ["id", "text"], + "user_mentions", + ["id", "screen_name"], + gzip_compress=False, + ) + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_media(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.media.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.media.csv" + json2csv_entities( + infile, + outfn, + ["id"], + "media", + ["media_url", "url"], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_url(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.url.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.url.csv" + json2csv_entities( + infile, + outfn, + ["id"], + "urls", + ["url", "expanded_url"], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_userurl(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.userurl.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.userurl.csv" + json2csv_entities( + infile, + outfn, + ["id", "screen_name"], + "user.urls", + ["url", "expanded_url"], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_place(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.place.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.place.csv" + json2csv_entities( + infile, + outfn, + ["id", 
"text"], + "place", + ["name", "country"], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_tweet_place_boundingbox(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.placeboundingbox.csv" + json2csv_entities( + infile, + outfn, + ["id", "name"], + "place.bounding_box", + ["coordinates"], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_retweet_original_tweet(tmp_path, infile): + ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.retweet.csv" + json2csv_entities( + infile, + outfn, + ["id"], + "retweeted_status", + [ + "created_at", + "favorite_count", + "id", + "in_reply_to_status_id", + "in_reply_to_user_id", + "retweet_count", + "text", + "truncated", + "user.id", + ], + gzip_compress=False, + ) + + assert files_are_identical(outfn, ref_fn) + + +def test_file_is_wrong(tmp_path, infile): + """ + Sanity check that file comparison is not giving false positives. + """ + ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" + outfn = tmp_path / "tweets.20150430-223406.text.csv" + json2csv(infile, outfn, ["text"], gzip_compress=False) + assert not files_are_identical(outfn, ref_fn) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json_serialization.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9dc11b6eec30fe14e6cd419a853e8ce516cd62 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_json_serialization.py @@ -0,0 +1,95 @@ +import unittest + +from nltk.corpus import brown +from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder +from nltk.tag import ( + AffixTagger, + BigramTagger, + BrillTagger, + BrillTaggerTrainer, + DefaultTagger, + NgramTagger, + PerceptronTagger, + RegexpTagger, + TrigramTagger, + UnigramTagger, +) +from nltk.tag.brill import nltkdemo18 + + +class TestJSONSerialization(unittest.TestCase): + def setUp(self): + self.corpus = brown.tagged_sents()[:35] + self.decoder = JSONTaggedDecoder() + self.encoder = JSONTaggedEncoder() + self.default_tagger = DefaultTagger("NN") + + def test_default_tagger(self): + encoded = self.encoder.encode(self.default_tagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(repr(self.default_tagger), repr(decoded)) + self.assertEqual(self.default_tagger._tag, decoded._tag) + + def test_regexp_tagger(self): + tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger) + + encoded = self.encoder.encode(tagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(repr(tagger), repr(decoded)) + self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) + self.assertEqual(tagger._regexps, decoded._regexps) + + def test_affix_tagger(self): + tagger = AffixTagger(self.corpus, backoff=self.default_tagger) + + encoded = self.encoder.encode(tagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(repr(tagger), repr(decoded)) + self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) + self.assertEqual(tagger._affix_length, decoded._affix_length) + self.assertEqual(tagger._min_word_length, decoded._min_word_length) + self.assertEqual(tagger._context_to_tag, decoded._context_to_tag) + + def test_ngram_taggers(self): + unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger) + bitagger = BigramTagger(self.corpus, backoff=unitagger) + tritagger = 
TrigramTagger(self.corpus, backoff=bitagger) + ntagger = NgramTagger(4, self.corpus, backoff=tritagger) + + encoded = self.encoder.encode(ntagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(repr(ntagger), repr(decoded)) + self.assertEqual(repr(tritagger), repr(decoded.backoff)) + self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff)) + self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff)) + self.assertEqual( + repr(self.default_tagger), repr(decoded.backoff.backoff.backoff.backoff) + ) + + def test_perceptron_tagger(self): + tagger = PerceptronTagger(load=False) + tagger.train(self.corpus) + + encoded = self.encoder.encode(tagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(tagger.model.weights, decoded.model.weights) + self.assertEqual(tagger.tagdict, decoded.tagdict) + self.assertEqual(tagger.classes, decoded.classes) + + def test_brill_tagger(self): + trainer = BrillTaggerTrainer( + self.default_tagger, nltkdemo18(), deterministic=True + ) + tagger = trainer.train(self.corpus, max_rules=30) + + encoded = self.encoder.encode(tagger) + decoded = self.decoder.decode(encoded) + + self.assertEqual(repr(tagger._initial_tagger), repr(decoded._initial_tagger)) + self.assertEqual(tagger._rules, decoded._rules) + self.assertEqual(tagger._training_stats, decoded._training_stats) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_bleu.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_bleu.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa1e07903036885be24a23392ea68c16065dfde --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_bleu.py @@ -0,0 +1,405 @@ +""" +Tests for BLEU translation evaluation metric +""" + +import io +import unittest + +from nltk.data import find +from nltk.translate.bleu_score import ( + SmoothingFunction, + brevity_penalty, + closest_ref_length, + corpus_bleu, + modified_precision, + sentence_bleu, +) + + +class TestBLEU(unittest.TestCase): + def test_modified_precision(self): + """ + Examples from the original BLEU paper + https://www.aclweb.org/anthology/P02-1040.pdf + """ + # Example 1: the "the*" example. + # Reference sentences. + ref1 = "the cat is on the mat".split() + ref2 = "there is a cat on the mat".split() + # Hypothesis sentence(s). + hyp1 = "the the the the the the the".split() + + references = [ref1, ref2] + + # Testing modified unigram precision. + hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1)) + assert round(hyp1_unigram_precision, 4) == 0.2857 + # With assertAlmostEqual at 4 place precision. + self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4) + + # Testing modified bigram precision. + assert float(modified_precision(references, hyp1, n=2)) == 0.0 + + # Example 2: the "of the" example. + # Reference sentences + ref1 = str( + "It is a guide to action that ensures that the military " + "will forever heed Party commands" + ).split() + ref2 = str( + "It is the guiding principle which guarantees the military " + "forces always being under the command of the Party" + ).split() + ref3 = str( + "It is the practical guide for the army always to heed " + "the directions of the party" + ).split() + # Hypothesis sentence(s). + hyp1 = "of the".split() + + references = [ref1, ref2, ref3] + # Testing modified unigram precision. + assert float(modified_precision(references, hyp1, n=1)) == 1.0 + + # Testing modified bigram precision. 
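+ # (The only candidate bigram, "of the", also occurs in the references,
+ # so the clipped count equals the candidate count and precision is 1.0.)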
+ assert float(modified_precision(references, hyp1, n=2)) == 1.0
+
+ # Example 3: Proper MT outputs.
+ hyp1 = str(
+ "It is a guide to action which ensures that the military "
+ "always obeys the commands of the party"
+ ).split()
+ hyp2 = str(
+ "It is to insure the troops forever hearing the activity "
+ "guidebook that party direct"
+ ).split()
+
+ references = [ref1, ref2, ref3]
+
+ # Unigram precision.
+ hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
+ hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
+ # Test unigram precision with assertAlmostEqual at 4 place precision.
+ self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
+ self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
+ # Test unigram precision with rounding.
+ assert round(hyp1_unigram_precision, 4) == 0.9444
+ assert round(hyp2_unigram_precision, 4) == 0.5714
+
+ # Bigram precision.
+ hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
+ hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
+ # Test bigram precision with assertAlmostEqual at 4 place precision.
+ self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
+ self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
+ # Test bigram precision with rounding.
+ assert round(hyp1_bigram_precision, 4) == 0.5882
+ assert round(hyp2_bigram_precision, 4) == 0.0769
+
+ def test_brevity_penalty(self):
+ # Test case from the brevity_penalty_closest function in mteval-v13a.pl.
+ # Same test cases as in the doctest in nltk.translate.bleu_score.py
+ references = [["a"] * 11, ["a"] * 8]
+ hypothesis = ["a"] * 7
+ hyp_len = len(hypothesis)
+ closest_ref_len = closest_ref_length(references, hyp_len)
+ self.assertAlmostEqual(
+ brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
+ )
+
+ references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7]
+ hypothesis = ["a"] * 7
+ hyp_len = len(hypothesis)
+ closest_ref_len = closest_ref_length(references, hyp_len)
+ assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
+
+ def test_zero_matches(self):
+ # Test case where there are 0 matches.
+ references = ["The candidate has no alignment to any of the references".split()]
+ hypothesis = "John loves Mary".split()
+
+ # Test BLEU to nth order of n-grams, where n is len(hypothesis).
+ for n in range(1, len(hypothesis)):
+ weights = (1.0 / n,) * n # Uniform weights.
+ assert sentence_bleu(references, hypothesis, weights) == 0
+
+ def test_full_matches(self):
+ # Test case where there are 100% matches.
+ references = ["John loves Mary".split()]
+ hypothesis = "John loves Mary".split()
+
+ # Test BLEU to nth order of n-grams, where n is len(hypothesis).
+ for n in range(1, len(hypothesis)):
+ weights = (1.0 / n,) * n # Uniform weights.
+ assert sentence_bleu(references, hypothesis, weights) == 1.0
+
+ def test_partial_matches_hypothesis_longer_than_reference(self):
+ references = ["John loves Mary".split()]
+ hypothesis = "John loves Mary who loves Mike".split()
+ # Since no 4-gram matches are found, the result should be zero:
+ # exp(w_1 * log p_1 + w_2 * log p_2 + w_3 * log p_3 + w_4 * log p_4)
+ # = exp(-inf) = 0
+ self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
+ # Checks that the warning has been raised because len(reference) < 4.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
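+
+
+# A minimal sketch (not part of the original test suite) of the pattern the
+# tests above exercise: BLEU of order n with uniform weights is requested by
+# passing a weight tuple of length n that sums to 1.0. For example:
+#
+#     references = ["John loves Mary".split()]
+#     hypothesis = "John loves Mary".split()
+#     for n in range(1, 4):
+#         weights = (1.0 / n,) * n
+#         assert sentence_bleu(references, hypothesis, weights) == 1.0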
+
+
+# @unittest.skip("Skipping fringe cases for BLEU.")
+class TestBLEUFringeCases(unittest.TestCase):
+ def test_case_where_n_is_bigger_than_hypothesis_length(self):
+ # Test BLEU to nth order of n-grams, where n > len(hypothesis).
+ references = ["John loves Mary ?".split()]
+ hypothesis = "John loves Mary".split()
+ n = len(hypothesis) + 1
+ weights = (1.0 / n,) * n # Uniform weights.
+ # Since no n-gram matches are found, the result should be zero:
+ # exp(w_1 * log p_1 + ... + w_n * log p_n) = exp(-inf) = 0
+ self.assertAlmostEqual(
+ sentence_bleu(references, hypothesis, weights), 0.0, places=4
+ )
+ # Checks that the warning has been raised because len(hypothesis) < 4.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+
+ # Test case where n > len(hypothesis) and also n > len(reference), in
+ # the special case where reference == hypothesis.
+ references = ["John loves Mary".split()]
+ hypothesis = "John loves Mary".split()
+ # Since no 4-gram matches are found, the result should be zero:
+ # exp(w_1 * log p_1 + w_2 * log p_2 + w_3 * log p_3 + w_4 * log p_4)
+ # = exp(-inf) = 0
+ self.assertAlmostEqual(
+ sentence_bleu(references, hypothesis, weights), 0.0, places=4
+ )
+
+ def test_empty_hypothesis(self):
+ # Test case where the hypothesis is empty.
+ references = ["The candidate has no alignment to any of the references".split()]
+ hypothesis = []
+ assert sentence_bleu(references, hypothesis) == 0
+
+ def test_length_one_hypothesis(self):
+ # Test case where the hypothesis has length 1 and smoothing method 4 is used.
+ references = ["The candidate has no alignment to any of the references".split()]
+ hypothesis = ["Foo"]
+ method4 = SmoothingFunction().method4
+ try:
+ sentence_bleu(references, hypothesis, smoothing_function=method4)
+ except ValueError:
+ pass # Expected: method4 cannot smooth a hypothesis of length 1.
+
+ def test_empty_references(self):
+ # Test case where the reference is empty.
+ references = [[]]
+ hypothesis = "John loves Mary".split()
+ assert sentence_bleu(references, hypothesis) == 0
+
+ def test_empty_references_and_hypothesis(self):
+ # Test case where both the references and the hypothesis are empty.
+ references = [[]]
+ hypothesis = []
+ assert sentence_bleu(references, hypothesis) == 0
+
+ def test_reference_or_hypothesis_shorter_than_fourgrams(self):
+ # Test case where the length of the reference or hypothesis
+ # is shorter than 4.
+ references = ["let it go".split()]
+ hypothesis = "let go it".split()
+ # Checks that this hypothesis/reference pair evaluates to 0.0:
+ # exp(w_1 * log p_1 + w_2 * log p_2 + w_3 * log p_3 + w_4 * log p_4)
+ # = exp(-inf) = 0
+ self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
+ # Checks that the warning has been raised.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+
+
+class TestBLEUvsMteval13a(unittest.TestCase):
+ def test_corpus_bleu(self):
+ ref_file = find("models/wmt15_eval/ref.ru")
+ hyp_file = find("models/wmt15_eval/google.ru")
+ mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
+
+ # Reads the BLEU scores from the `mteval-13a.output` file.
+ # The order of the list corresponds to the order of the ngrams.
+ with open(mteval_output_file) as mteval_fin:
+ # The numbers are located in the second-to-last line of the file.
+ # The first and 2nd items in the list are the score and system names.
+ mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))
+
+ with open(ref_file, encoding="utf8") as ref_fin:
+ with open(hyp_file, encoding="utf8") as hyp_fin:
+ # Whitespace tokenize the file.
+ # Note: split() automatically strips whitespace, so no strip() is needed.
+ hypothesis = list(map(lambda x: x.split(), hyp_fin))
+ # Note that the corpus_bleu input is a list of lists of references.
+ references = list(map(lambda x: [x.split()], ref_fin))
+ # Without smoothing.
+ for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
+ nltk_bleu = corpus_bleu(
+ references, hypothesis, weights=(1.0 / i,) * i
+ )
+ # Check that the BLEU score difference is less than 0.005.
+ # Note: This is an approximate comparison; as much as
+ # +/- 0.01 BLEU might be "statistically significant",
+ # the actual translation quality might not be.
+ assert abs(mteval_bleu - nltk_bleu) < 0.005
+
+ # With the same smoothing method used in mteval-v13a.pl
+ chencherry = SmoothingFunction()
+ for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
+ nltk_bleu = corpus_bleu(
+ references,
+ hypothesis,
+ weights=(1.0 / i,) * i,
+ smoothing_function=chencherry.method3,
+ )
+ assert abs(mteval_bleu - nltk_bleu) < 0.005
+
+
+class TestBLEUWithBadSentence(unittest.TestCase):
+ def test_corpus_bleu_with_bad_sentence(self):
+ hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
+ ref = str(
+ "Their tasks include changing a pump on the faulty stokehold . "
+ "Likewise , two species that are very similar in morphology "
+ "were distinguished using genetics ."
+ )
+ references = [[ref.split()]]
+ hypotheses = [hyp.split()]
+ try: # Check that the warning is raised, since there are 0 counts of 2-gram overlaps.
+ with self.assertWarns(UserWarning):
+ # Verify that the BLEU score evaluates to 0 because there are
+ # no 2-gram (or higher) overlaps.
+ self.assertAlmostEqual(
+ corpus_bleu(references, hypotheses), 0.0, places=4
+ )
+ except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+ self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4) + + +class TestBLEUWithMultipleWeights(unittest.TestCase): + def test_corpus_bleu_with_multiple_weights(self): + hyp1 = [ + "It", + "is", + "a", + "guide", + "to", + "action", + "which", + "ensures", + "that", + "the", + "military", + "always", + "obeys", + "the", + "commands", + "of", + "the", + "party", + ] + ref1a = [ + "It", + "is", + "a", + "guide", + "to", + "action", + "that", + "ensures", + "that", + "the", + "military", + "will", + "forever", + "heed", + "Party", + "commands", + ] + ref1b = [ + "It", + "is", + "the", + "guiding", + "principle", + "which", + "guarantees", + "the", + "military", + "forces", + "always", + "being", + "under", + "the", + "command", + "of", + "the", + "Party", + ] + ref1c = [ + "It", + "is", + "the", + "practical", + "guide", + "for", + "the", + "army", + "always", + "to", + "heed", + "the", + "directions", + "of", + "the", + "party", + ] + hyp2 = [ + "he", + "read", + "the", + "book", + "because", + "he", + "was", + "interested", + "in", + "world", + "history", + ] + ref2a = [ + "he", + "was", + "interested", + "in", + "world", + "history", + "because", + "he", + "read", + "the", + "book", + ] + weight_1 = (1, 0, 0, 0) + weight_2 = (0.25, 0.25, 0.25, 0.25) + weight_3 = (0, 0, 0, 0, 1) + + bleu_scores = corpus_bleu( + list_of_references=[[ref1a, ref1b, ref1c], [ref2a]], + hypotheses=[hyp1, hyp2], + weights=[weight_1, weight_2, weight_3], + ) + assert bleu_scores[0] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1 + ) + assert bleu_scores[1] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2 + ) + assert bleu_scores[2] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3 + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_gdfa.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_gdfa.py new file mode 100644 index 0000000000000000000000000000000000000000..1824be45265762050ad4f61fb181a822d5aaa7a7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_gdfa.py @@ -0,0 +1,154 @@ +""" +Tests GDFA alignments +""" + +import unittest + +from nltk.translate.gdfa import grow_diag_final_and + + +class TestGDFA(unittest.TestCase): + def test_from_eflomal_outputs(self): + """ + Testing GDFA with first 10 eflomal outputs from issue #1829 + https://github.com/nltk/nltk/issues/1829 + """ + # Input. 
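+ # Each alignment string is a whitespace-separated list of "i-j" links,
+ # pairing source token i with target token j (0-indexed).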
+ forwards = [ + "0-0 1-2", + "0-0 1-1", + "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14", + "0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10", + "0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31", + "0-0 1-1 0-2 2-3", + "0-0 2-2 4-4", + "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20", + "3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14", + "1-0", + ] + backwards = [ + "0-0 1-2", + "0-0 1-1", + "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13", + "0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8", + "0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31", + "0-0 1-1 2-3", + "0-0 1-1 2-3 4-4", + "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18", + "0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10", + "1-0", + ] + source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18] + target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16] + # Expected Output. + expected = [ + [(0, 0), (1, 2)], + [(0, 0), (1, 1)], + [ + (0, 0), + (2, 1), + (3, 2), + (4, 3), + (5, 4), + (6, 5), + (7, 6), + (8, 7), + (10, 10), + (11, 12), + ], + [ + (0, 0), + (1, 1), + (1, 2), + (2, 3), + (3, 4), + (4, 5), + (4, 6), + (5, 7), + (6, 8), + (7, 5), + (8, 7), + (8, 9), + (9, 8), + (9, 10), + ], + [ + (0, 0), + (1, 8), + (2, 9), + (3, 10), + (4, 11), + (5, 8), + (6, 9), + (6, 11), + (7, 10), + (8, 11), + (31, 31), + ], + [(0, 0), (0, 2), (1, 1), (2, 3)], + [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)], + [ + (0, 0), + (1, 1), + (2, 3), + (3, 4), + (5, 5), + (7, 6), + (8, 7), + (9, 8), + (10, 9), + (11, 10), + (12, 11), + (13, 12), + (14, 13), + (15, 14), + (16, 16), + (17, 17), + (18, 18), + (19, 19), + ], + [ + (0, 0), + (1, 1), + (3, 0), + (3, 2), + (4, 1), + (5, 3), + (6, 2), + (6, 4), + (7, 5), + (8, 6), + (9, 7), + (9, 12), + (10, 8), + (10, 13), + (11, 9), + (12, 8), + (12, 14), + (13, 9), + (14, 8), + (15, 9), + (16, 10), + ], + [(1, 0)], + [ + (0, 0), + (1, 1), + (3, 2), + (4, 3), + (5, 4), + (6, 5), + (7, 6), + (9, 10), + (10, 12), + (11, 13), + (12, 14), + (13, 15), + ], + ] + + # Iterate through all 10 examples and check for expected outputs. 
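+ # grow_diag_final_and() starts from the intersection of the forward and
+ # backward alignments, grows it with neighbouring links from their union,
+ # and then adds the remaining links in the "final-and" step.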
+ for fw, bw, src_len, trg_len, expect in zip(
+ forwards, backwards, source_lens, target_lens, expected
+ ):
+ self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw))
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm1.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm1.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f32ef73cae1f789ee587b6d3d214cfeb0e70d2
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm1.py
@@ -0,0 +1,73 @@
+"""
+Tests for IBM Model 1 training methods
+"""
+
+import unittest
+from collections import defaultdict
+
+from nltk.translate import AlignedSent, IBMModel, IBMModel1
+from nltk.translate.ibm_model import AlignmentInfo
+
+
+class TestIBMModel1(unittest.TestCase):
+ def test_set_uniform_translation_probabilities(self):
+ # arrange
+ corpus = [
+ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
+ AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
+ ]
+ model1 = IBMModel1(corpus, 0)
+
+ # act
+ model1.set_uniform_probabilities(corpus)
+
+ # assert
+ # expected_prob = 1.0 / target vocab size (= 3: "ham", "eggs", "spam")
+ self.assertEqual(model1.translation_table["ham"]["eier"], 1.0 / 3)
+ self.assertEqual(model1.translation_table["eggs"][None], 1.0 / 3)
+
+ def test_set_uniform_translation_probabilities_of_non_domain_values(self):
+ # arrange
+ corpus = [
+ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
+ AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
+ ]
+ model1 = IBMModel1(corpus, 0)
+
+ # act
+ model1.set_uniform_probabilities(corpus)
+
+ # assert
+ # examine target words that are not in the training data domain
+ self.assertEqual(model1.translation_table["parrot"]["eier"], IBMModel.MIN_PROB)
+
+ def test_prob_t_a_given_s(self):
+ # arrange
+ src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
+ trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
+ corpus = [AlignedSent(trg_sentence, src_sentence)]
+ alignment_info = AlignmentInfo(
+ (0, 1, 4, 0, 2, 5, 5),
+ [None] + src_sentence,
+ ["UNUSED"] + trg_sentence,
+ None,
+ )
+
+ translation_table = defaultdict(lambda: defaultdict(float))
+ translation_table["i"]["ich"] = 0.98
+ translation_table["love"]["gern"] = 0.98
+ translation_table["to"][None] = 0.98
+ translation_table["eat"]["esse"] = 0.98
+ translation_table["smoked"]["räucherschinken"] = 0.98
+ translation_table["ham"]["räucherschinken"] = 0.98
+
+ model1 = IBMModel1(corpus, 0)
+ model1.translation_table = translation_table
+
+ # act
+ probability = model1.prob_t_a_given_s(alignment_info)
+
+ # assert
+ lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+ expected_probability = lexical_translation
+ self.assertEqual(round(probability, 4), round(expected_probability, 4))
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm2.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2194dde9aabd503489e4f961b85da550b56d7c2
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm2.py
@@ -0,0 +1,86 @@
+"""
+Tests for IBM Model 2 training methods
+"""
+
+import unittest
+from collections import defaultdict
+
+from nltk.translate import AlignedSent, IBMModel, IBMModel2
+from nltk.translate.ibm_model import AlignmentInfo
+
+
+class TestIBMModel2(unittest.TestCase):
+ def test_set_uniform_alignment_probabilities(self):
+ # arrange
+ corpus = [
+ AlignedSent(["ham",
"eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model2 = IBMModel2(corpus, 0) + + # act + model2.set_uniform_probabilities(corpus) + + # assert + # expected_prob = 1.0 / (length of source sentence + 1) + self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4) + self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3) + + def test_set_uniform_alignment_probabilities_of_non_domain_values(self): + # arrange + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model2 = IBMModel2(corpus, 0) + + # act + model2.set_uniform_probabilities(corpus) + + # assert + # examine i and j values that are not in the training data domain + self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB) + self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB) + + def test_prob_t_a_given_s(self): + # arrange + src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] + trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] + corpus = [AlignedSent(trg_sentence, src_sentence)] + alignment_info = AlignmentInfo( + (0, 1, 4, 0, 2, 5, 5), + [None] + src_sentence, + ["UNUSED"] + trg_sentence, + None, + ) + + translation_table = defaultdict(lambda: defaultdict(float)) + translation_table["i"]["ich"] = 0.98 + translation_table["love"]["gern"] = 0.98 + translation_table["to"][None] = 0.98 + translation_table["eat"]["esse"] = 0.98 + translation_table["smoked"]["räucherschinken"] = 0.98 + translation_table["ham"]["räucherschinken"] = 0.98 + + alignment_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) + ) + alignment_table[0][3][5][6] = 0.97 # None -> to + alignment_table[1][1][5][6] = 0.97 # ich -> i + alignment_table[2][4][5][6] = 0.97 # esse -> eat + alignment_table[4][2][5][6] = 0.97 # gern -> love + alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked + alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham + + model2 = IBMModel2(corpus, 0) + model2.translation_table = translation_table + model2.alignment_table = alignment_table + + # act + probability = model2.prob_t_a_given_s(alignment_info) + + # assert + lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 + alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96 + expected_probability = lexical_translation * alignment + self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm3.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm3.py new file mode 100644 index 0000000000000000000000000000000000000000..14d89d6d9857f0ff62bb4388cf1e8f04c2f90d46 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm3.py @@ -0,0 +1,105 @@ +""" +Tests for IBM Model 3 training methods +""" + +import unittest +from collections import defaultdict + +from nltk.translate import AlignedSent, IBMModel, IBMModel3 +from nltk.translate.ibm_model import AlignmentInfo + + +class TestIBMModel3(unittest.TestCase): + def test_set_uniform_distortion_probabilities(self): + # arrange + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model3 = IBMModel3(corpus, 0) + + # act + model3.set_uniform_probabilities(corpus) + + # assert + # expected_prob = 1.0 / length of target sentence + 
self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2) + self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4) + + def test_set_uniform_distortion_probabilities_of_non_domain_values(self): + # arrange + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model3 = IBMModel3(corpus, 0) + + # act + model3.set_uniform_probabilities(corpus) + + # assert + # examine i and j values that are not in the training data domain + self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB) + self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB) + self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB) + + def test_prob_t_a_given_s(self): + # arrange + src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] + trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] + corpus = [AlignedSent(trg_sentence, src_sentence)] + alignment_info = AlignmentInfo( + (0, 1, 4, 0, 2, 5, 5), + [None] + src_sentence, + ["UNUSED"] + trg_sentence, + [[3], [1], [4], [], [2], [5, 6]], + ) + + distortion_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) + ) + distortion_table[1][1][5][6] = 0.97 # i -> ich + distortion_table[2][4][5][6] = 0.97 # love -> gern + distortion_table[3][0][5][6] = 0.97 # to -> NULL + distortion_table[4][2][5][6] = 0.97 # eat -> esse + distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken + distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken + + translation_table = defaultdict(lambda: defaultdict(float)) + translation_table["i"]["ich"] = 0.98 + translation_table["love"]["gern"] = 0.98 + translation_table["to"][None] = 0.98 + translation_table["eat"]["esse"] = 0.98 + translation_table["smoked"]["räucherschinken"] = 0.98 + translation_table["ham"]["räucherschinken"] = 0.98 + + fertility_table = defaultdict(lambda: defaultdict(float)) + fertility_table[1]["ich"] = 0.99 + fertility_table[1]["esse"] = 0.99 + fertility_table[0]["ja"] = 0.99 + fertility_table[1]["gern"] = 0.99 + fertility_table[2]["räucherschinken"] = 0.999 + fertility_table[1][None] = 0.99 + + probabilities = { + "p1": 0.167, + "translation_table": translation_table, + "distortion_table": distortion_table, + "fertility_table": fertility_table, + "alignment_table": None, + } + + model3 = IBMModel3(corpus, 0, probabilities) + + # act + probability = model3.prob_t_a_given_s(alignment_info) + + # assert + null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) + fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 + lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 + distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97 + expected_probability = ( + null_generation * fertility * lexical_translation * distortion + ) + self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm4.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm4.py new file mode 100644 index 0000000000000000000000000000000000000000..674b2bc37aaae3a42711d13f05a9bd9d0b35a717 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm4.py @@ -0,0 +1,120 @@ +""" +Tests for IBM Model 4 training methods +""" + +import unittest +from collections import defaultdict + +from nltk.translate import AlignedSent, IBMModel, IBMModel4 +from nltk.translate.ibm_model import AlignmentInfo + + +class TestIBMModel4(unittest.TestCase): 
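+ """
+ Tests for IBM Model 4, which models distortion with word classes:
+ the head word of a cept is placed by displacement relative to the
+ centre of the previous cept, and non-head words by displacement
+ relative to the previous word of the same cept.
+ """
+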
+ def test_set_uniform_distortion_probabilities_of_max_displacements(self): + # arrange + src_classes = {"schinken": 0, "eier": 0, "spam": 1} + trg_classes = {"ham": 0, "eggs": 1, "spam": 2} + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model4 = IBMModel4(corpus, 0, src_classes, trg_classes) + + # act + model4.set_uniform_probabilities(corpus) + + # assert + # number of displacement values = + # 2 *(number of words in longest target sentence - 1) + expected_prob = 1.0 / (2 * (4 - 1)) + + # examine the boundary values for (displacement, src_class, trg_class) + self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob) + self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob) + self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob) + self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob) + + def test_set_uniform_distortion_probabilities_of_non_domain_values(self): + # arrange + src_classes = {"schinken": 0, "eier": 0, "spam": 1} + trg_classes = {"ham": 0, "eggs": 1, "spam": 2} + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model4 = IBMModel4(corpus, 0, src_classes, trg_classes) + + # act + model4.set_uniform_probabilities(corpus) + + # assert + # examine displacement values that are not in the training data domain + self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB) + self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB) + self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB) + self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB) + + def test_prob_t_a_given_s(self): + # arrange + src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] + trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] + src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} + trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} + corpus = [AlignedSent(trg_sentence, src_sentence)] + alignment_info = AlignmentInfo( + (0, 1, 4, 0, 2, 5, 5), + [None] + src_sentence, + ["UNUSED"] + trg_sentence, + [[3], [1], [4], [], [2], [5, 6]], + ) + + head_distortion_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(float)) + ) + head_distortion_table[1][None][3] = 0.97 # None, i + head_distortion_table[3][2][4] = 0.97 # ich, eat + head_distortion_table[-2][3][4] = 0.97 # esse, love + head_distortion_table[3][4][1] = 0.97 # gern, smoked + + non_head_distortion_table = defaultdict(lambda: defaultdict(float)) + non_head_distortion_table[1][0] = 0.96 # ham + + translation_table = defaultdict(lambda: defaultdict(float)) + translation_table["i"]["ich"] = 0.98 + translation_table["love"]["gern"] = 0.98 + translation_table["to"][None] = 0.98 + translation_table["eat"]["esse"] = 0.98 + translation_table["smoked"]["räucherschinken"] = 0.98 + translation_table["ham"]["räucherschinken"] = 0.98 + + fertility_table = defaultdict(lambda: defaultdict(float)) + fertility_table[1]["ich"] = 0.99 + fertility_table[1]["esse"] = 0.99 + fertility_table[0]["ja"] = 0.99 + fertility_table[1]["gern"] = 0.99 + fertility_table[2]["räucherschinken"] = 0.999 + fertility_table[1][None] = 0.99 + + probabilities = { + "p1": 0.167, + "translation_table": translation_table, + "head_distortion_table": head_distortion_table, + 
"non_head_distortion_table": non_head_distortion_table, + "fertility_table": fertility_table, + "alignment_table": None, + } + + model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities) + + # act + probability = model4.prob_t_a_given_s(alignment_info) + + # assert + null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) + fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 + lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 + distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 + expected_probability = ( + null_generation * fertility * lexical_translation * distortion + ) + self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm5.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm5.py new file mode 100644 index 0000000000000000000000000000000000000000..7c29c47de230c0e128cb969514787b2ded0451ef --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm5.py @@ -0,0 +1,160 @@ +""" +Tests for IBM Model 5 training methods +""" + +import unittest +from collections import defaultdict + +from nltk.translate import AlignedSent, IBMModel, IBMModel4, IBMModel5 +from nltk.translate.ibm_model import AlignmentInfo + + +class TestIBMModel5(unittest.TestCase): + def test_set_uniform_vacancy_probabilities_of_max_displacements(self): + # arrange + src_classes = {"schinken": 0, "eier": 0, "spam": 1} + trg_classes = {"ham": 0, "eggs": 1, "spam": 2} + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model5 = IBMModel5(corpus, 0, src_classes, trg_classes) + + # act + model5.set_uniform_probabilities(corpus) + + # assert + # number of vacancy difference values = + # 2 * number of words in longest target sentence + expected_prob = 1.0 / (2 * 4) + + # examine the boundary values for (dv, max_v, trg_class) + self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob) + self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob) + self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob) + self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob) + + def test_set_uniform_vacancy_probabilities_of_non_domain_values(self): + # arrange + src_classes = {"schinken": 0, "eier": 0, "spam": 1} + trg_classes = {"ham": 0, "eggs": 1, "spam": 2} + corpus = [ + AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), + AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), + ] + model5 = IBMModel5(corpus, 0, src_classes, trg_classes) + + # act + model5.set_uniform_probabilities(corpus) + + # assert + # examine dv and max_v values that are not in the training data domain + self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB) + self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) + self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB) + self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB) + self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) + + def test_prob_t_a_given_s(self): + # arrange + src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] + trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] + src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} + trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} + corpus = [AlignedSent(trg_sentence, 
src_sentence)] + alignment_info = AlignmentInfo( + (0, 1, 4, 0, 2, 5, 5), + [None] + src_sentence, + ["UNUSED"] + trg_sentence, + [[3], [1], [4], [], [2], [5, 6]], + ) + + head_vacancy_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(float)) + ) + head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i + head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat + head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love + head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked + + non_head_vacancy_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(float)) + ) + non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham + + translation_table = defaultdict(lambda: defaultdict(float)) + translation_table["i"]["ich"] = 0.98 + translation_table["love"]["gern"] = 0.98 + translation_table["to"][None] = 0.98 + translation_table["eat"]["esse"] = 0.98 + translation_table["smoked"]["räucherschinken"] = 0.98 + translation_table["ham"]["räucherschinken"] = 0.98 + + fertility_table = defaultdict(lambda: defaultdict(float)) + fertility_table[1]["ich"] = 0.99 + fertility_table[1]["esse"] = 0.99 + fertility_table[0]["ja"] = 0.99 + fertility_table[1]["gern"] = 0.99 + fertility_table[2]["räucherschinken"] = 0.999 + fertility_table[1][None] = 0.99 + + probabilities = { + "p1": 0.167, + "translation_table": translation_table, + "fertility_table": fertility_table, + "head_vacancy_table": head_vacancy_table, + "non_head_vacancy_table": non_head_vacancy_table, + "head_distortion_table": None, + "non_head_distortion_table": None, + "alignment_table": None, + } + + model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities) + + # act + probability = model5.prob_t_a_given_s(alignment_info) + + # assert + null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) + fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 + lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 + vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 + expected_probability = ( + null_generation * fertility * lexical_translation * vacancy + ) + self.assertEqual(round(probability, 4), round(expected_probability, 4)) + + def test_prune(self): + # arrange + alignment_infos = [ + AlignmentInfo((1, 1), None, None, None), + AlignmentInfo((1, 2), None, None, None), + AlignmentInfo((2, 1), None, None, None), + AlignmentInfo((2, 2), None, None, None), + AlignmentInfo((0, 0), None, None, None), + ] + min_factor = IBMModel5.MIN_SCORE_FACTOR + best_score = 0.9 + scores = { + (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold + (1, 2): best_score, + (2, 1): min_factor * best_score, # at threshold + (2, 2): min_factor * best_score * 0.5, # low score + (0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold + } + corpus = [AlignedSent(["a"], ["b"])] + original_prob_function = IBMModel4.model4_prob_t_a_given_s + # mock static method + IBMModel4.model4_prob_t_a_given_s = staticmethod( + lambda a, model: scores[a.alignment] + ) + model5 = IBMModel5(corpus, 0, None, None) + + # act + pruned_alignments = model5.prune(alignment_infos) + + # assert + self.assertEqual(len(pruned_alignments), 3) + + # restore static method + IBMModel4.model4_prob_t_a_given_s = original_prob_function diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm_model.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2707fc6e8c8825c9e1c042cfcb28b3edacff3e87 --- /dev/null +++ 
b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_ibm_model.py @@ -0,0 +1,269 @@ +""" +Tests for common methods of IBM translation models +""" + +import unittest +from collections import defaultdict + +from nltk.translate import AlignedSent, IBMModel +from nltk.translate.ibm_model import AlignmentInfo + + +class TestIBMModel(unittest.TestCase): + __TEST_SRC_SENTENCE = ["j'", "aime", "bien", "jambon"] + __TEST_TRG_SENTENCE = ["i", "love", "ham"] + + def test_vocabularies_are_initialized(self): + parallel_corpora = [ + AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"]), + AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"]), + AlignedSent([], ["sept"]), + ] + + ibm_model = IBMModel(parallel_corpora) + self.assertEqual(len(ibm_model.src_vocab), 8) + self.assertEqual(len(ibm_model.trg_vocab), 6) + + def test_vocabularies_are_initialized_even_with_empty_corpora(self): + parallel_corpora = [] + + ibm_model = IBMModel(parallel_corpora) + self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token + self.assertEqual(len(ibm_model.trg_vocab), 0) + + def test_best_model2_alignment(self): + # arrange + sentence_pair = AlignedSent( + TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE + ) + # None and 'bien' have zero fertility + translation_table = { + "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, + "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, + "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, + } + alignment_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) + ) + + ibm_model = IBMModel([]) + ibm_model.translation_table = translation_table + ibm_model.alignment_table = alignment_table + + # act + a_info = ibm_model.best_model2_alignment(sentence_pair) + + # assert + self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused + self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]]) + + def test_best_model2_alignment_does_not_change_pegged_alignment(self): + # arrange + sentence_pair = AlignedSent( + TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE + ) + translation_table = { + "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, + "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, + "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, + } + alignment_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) + ) + + ibm_model = IBMModel([]) + ibm_model.translation_table = translation_table + ibm_model.alignment_table = alignment_table + + # act: force 'love' to be pegged to 'jambon' + a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4) + # assert + self.assertEqual(a_info.alignment[1:], (1, 4, 4)) + self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]]) + + def test_best_model2_alignment_handles_fertile_words(self): + # arrange + sentence_pair = AlignedSent( + ["i", "really", ",", "really", "love", "ham"], + TestIBMModel.__TEST_SRC_SENTENCE, + ) + # 'bien' produces 2 target words: 'really' and another 'really' + translation_table = { + "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, + "really": {"j'": 0, "aime": 0, "bien": 0.9, "jambon": 0.01, None: 0.09}, + ",": {"j'": 0, "aime": 0, "bien": 0.3, "jambon": 0, None: 0.7}, + "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, + "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, 
None: 0}, + } + alignment_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) + ) + + ibm_model = IBMModel([]) + ibm_model.translation_table = translation_table + ibm_model.alignment_table = alignment_table + + # act + a_info = ibm_model.best_model2_alignment(sentence_pair) + + # assert + self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4)) + self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]]) + + def test_best_model2_alignment_handles_empty_src_sentence(self): + # arrange + sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, []) + ibm_model = IBMModel([]) + + # act + a_info = ibm_model.best_model2_alignment(sentence_pair) + + # assert + self.assertEqual(a_info.alignment[1:], (0, 0, 0)) + self.assertEqual(a_info.cepts, [[1, 2, 3]]) + + def test_best_model2_alignment_handles_empty_trg_sentence(self): + # arrange + sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE) + ibm_model = IBMModel([]) + + # act + a_info = ibm_model.best_model2_alignment(sentence_pair) + + # assert + self.assertEqual(a_info.alignment[1:], ()) + self.assertEqual(a_info.cepts, [[], [], [], [], []]) + + def test_neighboring_finds_neighbor_alignments(self): + # arrange + a_info = AlignmentInfo( + (0, 3, 2), + (None, "des", "œufs", "verts"), + ("UNUSED", "green", "eggs"), + [[], [], [2], [1]], + ) + ibm_model = IBMModel([]) + + # act + neighbors = ibm_model.neighboring(a_info) + + # assert + neighbor_alignments = set() + for neighbor in neighbors: + neighbor_alignments.add(neighbor.alignment) + expected_alignments = { + # moves + (0, 0, 2), + (0, 1, 2), + (0, 2, 2), + (0, 3, 0), + (0, 3, 1), + (0, 3, 3), + # swaps + (0, 2, 3), + # original alignment + (0, 3, 2), + } + self.assertEqual(neighbor_alignments, expected_alignments) + + def test_neighboring_sets_neighbor_alignment_info(self): + # arrange + a_info = AlignmentInfo( + (0, 3, 2), + (None, "des", "œufs", "verts"), + ("UNUSED", "green", "eggs"), + [[], [], [2], [1]], + ) + ibm_model = IBMModel([]) + + # act + neighbors = ibm_model.neighboring(a_info) + + # assert: select a few particular alignments + for neighbor in neighbors: + if neighbor.alignment == (0, 2, 2): + moved_alignment = neighbor + elif neighbor.alignment == (0, 3, 2): + swapped_alignment = neighbor + + self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []]) + self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]]) + + def test_neighboring_returns_neighbors_with_pegged_alignment(self): + # arrange + a_info = AlignmentInfo( + (0, 3, 2), + (None, "des", "œufs", "verts"), + ("UNUSED", "green", "eggs"), + [[], [], [2], [1]], + ) + ibm_model = IBMModel([]) + + # act: peg 'eggs' to align with 'œufs' + neighbors = ibm_model.neighboring(a_info, 2) + + # assert + neighbor_alignments = set() + for neighbor in neighbors: + neighbor_alignments.add(neighbor.alignment) + expected_alignments = { + # moves + (0, 0, 2), + (0, 1, 2), + (0, 2, 2), + # no swaps + # original alignment + (0, 3, 2), + } + self.assertEqual(neighbor_alignments, expected_alignments) + + def test_hillclimb(self): + # arrange + initial_alignment = AlignmentInfo((0, 3, 2), None, None, None) + + def neighboring_mock(a, j): + if a.alignment == (0, 3, 2): + return { + AlignmentInfo((0, 2, 2), None, None, None), + AlignmentInfo((0, 1, 1), None, None, None), + } + elif a.alignment == (0, 2, 2): + return { + AlignmentInfo((0, 3, 3), None, None, None), + AlignmentInfo((0, 4, 4), None, None, None), + } + return set() + + def prob_t_a_given_s_mock(a): + prob_values 
= {
+                (0, 3, 2): 0.5,
+                (0, 2, 2): 0.6,
+                (0, 1, 1): 0.4,
+                (0, 3, 3): 0.6,
+                (0, 4, 4): 0.7,
+            }
+            return prob_values.get(a.alignment, 0.01)
+
+        ibm_model = IBMModel([])
+        ibm_model.neighboring = neighboring_mock
+        ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock
+
+        # act
+        best_alignment = ibm_model.hillclimb(initial_alignment)
+
+        # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
+        self.assertEqual(best_alignment.alignment, (0, 4, 4))
+
+    def test_sample(self):
+        # arrange
+        sentence_pair = AlignedSent(
+            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
+        )
+        ibm_model = IBMModel([])
+        ibm_model.prob_t_a_given_s = lambda x: 0.001
+
+        # act
+        samples, best_alignment = ibm_model.sample(sentence_pair)
+
+        # assert
+        self.assertEqual(len(samples), 61)
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_meteor.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_meteor.py
new file mode 100644
index 0000000000000000000000000000000000000000..13d8e311c9337266a9cdc1b2ecfd67ef58cfb5b2
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_meteor.py
@@ -0,0 +1,20 @@
+import unittest
+
+from nltk.translate.meteor_score import meteor_score
+
+
+class TestMETEOR(unittest.TestCase):
+    # Note: the second reference previously read ["this", "is" "test"]; the
+    # missing comma made Python concatenate the strings into "istest". The
+    # expected score is unaffected, since the first (exact-match) reference
+    # yields the maximum either way.
+    reference = [["this", "is", "a", "test"], ["this", "is", "test"]]
+    candidate = ["THIS", "Is", "a", "tEST"]
+
+    def test_meteor(self):
+        score = meteor_score(self.reference, self.candidate, preprocess=str.lower)
+        assert score == 0.9921875
+
+    def test_reference_type_check(self):
+        str_reference = [" ".join(ref) for ref in self.reference]
+        self.assertRaises(TypeError, meteor_score, str_reference, self.candidate)
+
+    def test_candidate_type_check(self):
+        str_candidate = " ".join(self.candidate)
+        self.assertRaises(TypeError, meteor_score, self.reference, str_candidate)
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_nist.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_nist.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bb8829abaf40e892680f83b283bcb23884f386b
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_nist.py
@@ -0,0 +1,36 @@
+"""
+Tests for NIST translation evaluation metric
+"""
+
+import io
+import unittest
+
+from nltk.data import find
+from nltk.translate.nist_score import corpus_nist
+
+
+class TestNIST(unittest.TestCase):
+    def test_sentence_nist(self):
+        ref_file = find("models/wmt15_eval/ref.ru")
+        hyp_file = find("models/wmt15_eval/google.ru")
+        mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
+
+        # Reads the NIST scores from the `mteval-13a.output` file.
+        # The order of the list corresponds to the order of the ngrams.
+        with open(mteval_output_file) as mteval_fin:
+            # The scores are on the 4th line from the end of the file.
+            # The first and second items in the list are the score and system names.
+            mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])
+
+        with open(ref_file, encoding="utf8") as ref_fin:
+            with open(hyp_file, encoding="utf8") as hyp_fin:
+                # Whitespace tokenize the files.
+                # Note: split() automatically strips surrounding whitespace.
+                hypotheses = list(map(lambda x: x.split(), hyp_fin))
+                # Note that the corpus_nist input is a list of lists of references.
+                references = list(map(lambda x: [x.split()], ref_fin))
+                # Without smoothing.
+                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
+                    nltk_nist = corpus_nist(references, hypotheses, i)
+                    # Check that the NIST scores difference is less than 0.05
+                    assert abs(mteval_nist - nltk_nist) < 0.05
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_stack_decoder.py b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_stack_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e55ed3f4125f0b98c1e41bd5ff6ecc6247b4eac7
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/translate/test_stack_decoder.py
@@ -0,0 +1,294 @@
+# Natural Language Toolkit: Stack decoder
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Tah Wei Hoon
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+Tests for stack decoder
+"""
+
+import unittest
+from collections import defaultdict
+from math import log
+
+from nltk.translate import PhraseTable, StackDecoder
+from nltk.translate.stack_decoder import _Hypothesis, _Stack
+
+
+class TestStackDecoder(unittest.TestCase):
+    def test_find_all_src_phrases(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        stack_decoder = StackDecoder(phrase_table, None)
+        sentence = ("my", "hovercraft", "is", "full", "of", "eels")
+
+        # act
+        src_phrase_spans = stack_decoder.find_all_src_phrases(sentence)
+
+        # assert
+        self.assertEqual(src_phrase_spans[0], [2])  # 'my hovercraft'
+        self.assertEqual(src_phrase_spans[1], [2])  # 'hovercraft'
+        self.assertEqual(src_phrase_spans[2], [3])  # 'is'
+        self.assertEqual(src_phrase_spans[3], [5, 6])  # 'full of', 'full of eels'
+        self.assertFalse(src_phrase_spans[4])  # no entry starting with 'of'
+        self.assertEqual(src_phrase_spans[5], [6])  # 'eels'
+
+    def test_distortion_score(self):
+        # arrange
+        stack_decoder = StackDecoder(None, None)
+        stack_decoder.distortion_factor = 0.5
+        hypothesis = _Hypothesis()
+        hypothesis.src_phrase_span = (3, 5)
+
+        # act
+        score = stack_decoder.distortion_score(hypothesis, (8, 10))
+
+        # assert
+        expected_score = log(stack_decoder.distortion_factor) * (8 - 5)
+        self.assertEqual(score, expected_score)
+
+    def test_distortion_score_of_first_expansion(self):
+        # arrange
+        stack_decoder = StackDecoder(None, None)
+        stack_decoder.distortion_factor = 0.5
+        hypothesis = _Hypothesis()
+
+        # act
+        score = stack_decoder.distortion_score(hypothesis, (8, 10))
+
+        # assert
+        # expansion from empty hypothesis always has zero distortion cost
+        self.assertEqual(score, 0.0)
+
+    def test_compute_future_costs(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        language_model = TestStackDecoder.create_fake_language_model()
+        stack_decoder = StackDecoder(phrase_table, language_model)
+        sentence = ("my", "hovercraft", "is", "full", "of", "eels")
+
+        # act
+        future_scores = stack_decoder.compute_future_scores(sentence)
+
+        # assert
+        self.assertEqual(
+            future_scores[1][2],
+            (
+                phrase_table.translations_for(("hovercraft",))[0].log_prob
+                + language_model.probability(("hovercraft",))
+            ),
+        )
+        self.assertEqual(
+            future_scores[0][2],
+            (
+                phrase_table.translations_for(("my", "hovercraft"))[0].log_prob
+                + language_model.probability(("my", "hovercraft"))
+            ),
+        )
+
+    def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        language_model = TestStackDecoder.create_fake_language_model()
+        stack_decoder = StackDecoder(phrase_table, language_model)
+        sentence = ("my", "hovercraft", "is", "full", "of", "eels")
+
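+        # 'hovercraft is' is absent from the phrase table, so its future cost
+        # is expected to back off to the sum of its sub-spans' costs (asserted below).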
+ # act + future_scores = stack_decoder.compute_future_scores(sentence) + + # assert + self.assertEqual( + future_scores[1][3], # 'hovercraft is' is not in phrase table + future_scores[1][2] + future_scores[2][3], + ) # backoff + + def test_future_score(self): + # arrange: sentence with 8 words; words 2, 3, 4 already translated + hypothesis = _Hypothesis() + hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock + future_score_table = defaultdict(lambda: defaultdict(float)) + future_score_table[0][2] = 0.4 + future_score_table[5][8] = 0.5 + stack_decoder = StackDecoder(None, None) + + # act + future_score = stack_decoder.future_score(hypothesis, future_score_table, 8) + + # assert + self.assertEqual(future_score, 0.4 + 0.5) + + def test_valid_phrases(self): + # arrange + hypothesis = _Hypothesis() + # mock untranslated_spans method + hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)] + all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]] + + # act + phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis) + + # assert + self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)]) + + @staticmethod + def create_fake_phrase_table(): + phrase_table = PhraseTable() + phrase_table.add(("hovercraft",), ("",), 0.8) + phrase_table.add(("my", "hovercraft"), ("", ""), 0.7) + phrase_table.add(("my", "cheese"), ("", ""), 0.7) + phrase_table.add(("is",), ("",), 0.8) + phrase_table.add(("is",), ("",), 0.5) + phrase_table.add(("full", "of"), ("", ""), 0.01) + phrase_table.add(("full", "of", "eels"), ("", "", ""), 0.5) + phrase_table.add(("full", "of", "spam"), ("", ""), 0.5) + phrase_table.add(("eels",), ("",), 0.5) + phrase_table.add(("spam",), ("",), 0.5) + return phrase_table + + @staticmethod + def create_fake_language_model(): + # nltk.model should be used here once it is implemented + language_prob = defaultdict(lambda: -999.0) + language_prob[("my",)] = log(0.1) + language_prob[("hovercraft",)] = log(0.1) + language_prob[("is",)] = log(0.1) + language_prob[("full",)] = log(0.1) + language_prob[("of",)] = log(0.1) + language_prob[("eels",)] = log(0.1) + language_prob[("my", "hovercraft")] = log(0.3) + language_model = type( + "", (object,), {"probability": lambda _, phrase: language_prob[phrase]} + )() + return language_model + + +class TestHypothesis(unittest.TestCase): + def setUp(self): + root = _Hypothesis() + child = _Hypothesis( + raw_score=0.5, + src_phrase_span=(3, 7), + trg_phrase=("hello", "world"), + previous=root, + ) + grandchild = _Hypothesis( + raw_score=0.4, + src_phrase_span=(1, 2), + trg_phrase=("and", "goodbye"), + previous=child, + ) + self.hypothesis_chain = grandchild + + def test_translation_so_far(self): + # act + translation = self.hypothesis_chain.translation_so_far() + + # assert + self.assertEqual(translation, ["hello", "world", "and", "goodbye"]) + + def test_translation_so_far_for_empty_hypothesis(self): + # arrange + hypothesis = _Hypothesis() + + # act + translation = hypothesis.translation_so_far() + + # assert + self.assertEqual(translation, []) + + def test_total_translated_words(self): + # act + total_translated_words = self.hypothesis_chain.total_translated_words() + + # assert + self.assertEqual(total_translated_words, 5) + + def test_translated_positions(self): + # act + translated_positions = self.hypothesis_chain.translated_positions() + + # assert + translated_positions.sort() + self.assertEqual(translated_positions, [1, 3, 4, 5, 6]) + + def test_untranslated_spans(self): + # act + untranslated_spans = 
self.hypothesis_chain.untranslated_spans(10) + + # assert + self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)]) + + def test_untranslated_spans_for_empty_hypothesis(self): + # arrange + hypothesis = _Hypothesis() + + # act + untranslated_spans = hypothesis.untranslated_spans(10) + + # assert + self.assertEqual(untranslated_spans, [(0, 10)]) + + +class TestStack(unittest.TestCase): + def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self): + # arrange + stack = _Stack(3) + poor_hypothesis = _Hypothesis(0.01) + + # act + stack.push(_Hypothesis(0.2)) + stack.push(poor_hypothesis) + stack.push(_Hypothesis(0.1)) + stack.push(_Hypothesis(0.3)) + + # assert + self.assertFalse(poor_hypothesis in stack) + + def test_push_removes_hypotheses_that_fall_below_beam_threshold(self): + # arrange + stack = _Stack(3, 0.5) + poor_hypothesis = _Hypothesis(0.01) + worse_hypothesis = _Hypothesis(0.009) + + # act + stack.push(poor_hypothesis) + stack.push(worse_hypothesis) + stack.push(_Hypothesis(0.9)) # greatly superior hypothesis + + # assert + self.assertFalse(poor_hypothesis in stack) + self.assertFalse(worse_hypothesis in stack) + + def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self): + # arrange + stack = _Stack(3, 0.5) + poor_hypothesis = _Hypothesis(0.01) + + # act + stack.push(_Hypothesis(0.9)) # greatly superior hypothesis + stack.push(poor_hypothesis) + + # assert + self.assertFalse(poor_hypothesis in stack) + + def test_best_returns_the_best_hypothesis(self): + # arrange + stack = _Stack(3) + best_hypothesis = _Hypothesis(0.99) + + # act + stack.push(_Hypothesis(0.0)) + stack.push(best_hypothesis) + stack.push(_Hypothesis(0.5)) + + # assert + self.assertEqual(stack.best(), best_hypothesis) + + def test_best_returns_none_when_stack_is_empty(self): + # arrange + stack = _Stack(3) + + # assert + self.assertEqual(stack.best(), None) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/util.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/util.doctest new file mode 100644 index 0000000000000000000000000000000000000000..0f8783f05fdaf4fdfec3c097dee8e12fbe5631bd --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/util.doctest @@ -0,0 +1,47 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. For license information, see LICENSE.TXT + +================= +Utility functions +================= + + >>> from nltk.util import * + >>> from nltk.tree import Tree + + >>> print_string("This is a long string, therefore it should break", 25) + This is a long string, + therefore it should break + + >>> re_show("[a-z]+", "sdf123") + {sdf}123 + + >>> tree = Tree(5, + ... [Tree(4, [Tree(2, [1, 3])]), + ... Tree(8, [Tree(6, [7]), 9])]) + >>> for x in breadth_first(tree): + ... if isinstance(x, int): print(x) + ... else: print(x.label()) + 5 + 4 + 8 + 2 + 6 + 9 + 1 + 3 + 7 + >>> for x in breadth_first(tree, maxdepth=2): + ... if isinstance(x, int): print(x) + ... else: print(x.label()) + 5 + 4 + 8 + 2 + 6 + 9 + + >>> invert_dict({1: 2}) + defaultdict(<... 'list'>, {2: 1}) + + >>> invert_dict({1: [3, 4, 5]}) + defaultdict(<... 'list'>, {3: [1], 4: [1], 5: [1]}) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet.doctest new file mode 100644 index 0000000000000000000000000000000000000000..1ab67f4cc89350dab40a201d3f7e126400823f72 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet.doctest @@ -0,0 +1,828 @@ +.. Copyright (C) 2001-2022 NLTK Project +.. 
For license information, see LICENSE.TXT + +================= +WordNet Interface +================= + +WordNet is just another NLTK corpus reader, and can be imported like this: + + >>> from nltk.corpus import wordnet + +For more compact code, we recommend: + + >>> from nltk.corpus import wordnet as wn + +----- +Words +----- + +Look up a word using ``synsets()``; this function has an optional ``pos`` argument +which lets you constrain the part of speech of the word: + + >>> wn.synsets('dog') + [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), + Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')] + >>> wn.synsets('dog', pos=wn.VERB) + [Synset('chase.v.01')] + +The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``. +A synset is identified with a 3-part name of the form: word.pos.nn: + + >>> wn.synset('dog.n.01') + Synset('dog.n.01') + >>> print(wn.synset('dog.n.01').definition()) + a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds + >>> len(wn.synset('dog.n.01').examples()) + 1 + >>> print(wn.synset('dog.n.01').examples()[0]) + the dog barked all night + >>> wn.synset('dog.n.01').lemmas() + [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')] + >>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()] + ['dog', 'domestic_dog', 'Canis_familiaris'] + >>> wn.lemma('dog.n.01.dog').synset() + Synset('dog.n.01') + +The WordNet corpus reader gives access to the Open Multilingual +WordNet, using ISO-639 language codes. These languages are not +loaded by default, but only lazily, when needed. + + >>> wn.langs() + ['eng'] + + >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn') + [Synset('dog.n.01'), Synset('spy.n.01')] + + >>> wn.synset('spy.n.01').lemma_names('jpn') + ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', + '工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員', + '諜報員', '諜者', '間者', '間諜', '隠密'] + + >>> sorted(wn.langs()) + ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', + 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', + 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk', + 'slv', 'spa', 'swe', 'tha', 'zsm'] + + >>> wn.synset('dog.n.01').lemma_names('ita') + ['Canis_familiaris', 'cane'] + >>> wn.lemmas('cane', lang='ita') + [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), + Lemma('incompetent.n.01.cane')] + >>> sorted(wn.synset('dog.n.01').lemmas('dan')) + [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'), + Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')] + + >>> sorted(wn.synset('dog.n.01').lemmas('por')) + [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')] + + >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por') + >>> dog_lemma + Lemma('dog.n.01.c\xe3o') + >>> dog_lemma.lang() + 'por' + >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn'))) + 66031 + +The synonyms of a word are returned as a nested list of synonyms of the different senses of +the input word in the given language, since these different senses are not mutual synonyms: + + >>> wn.synonyms('car') + [['auto', 'automobile', 'machine', 'motorcar'], ['railcar', 'railroad_car', 'railway_car'], ['gondola'], ['elevator_car'], ['cable_car']] + >>> wn.synonyms('coche', lang='spa') + [['auto', 'automóvil', 'carro', 'máquina', 
'turismo', 'vehículo'], ['automotor', 'vagón'], ['vagón', 'vagón_de_pasajeros']] + + +------- +Synsets +------- + +`Synset`: a set of synonyms that share a common meaning. + + >>> dog = wn.synset('dog.n.01') + >>> dog.hypernyms() + [Synset('canine.n.02'), Synset('domestic_animal.n.01')] + >>> dog.hyponyms() + [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...] + >>> dog.member_holonyms() + [Synset('canis.n.01'), Synset('pack.n.06')] + >>> dog.root_hypernyms() + [Synset('entity.n.01')] + >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')) + [Synset('carnivore.n.01')] + +Each synset contains one or more lemmas, which represent a specific +sense of a specific word. + +Note that some relations are defined by WordNet only over Lemmas: + + >>> good = wn.synset('good.a.01') + >>> good.antonyms() + Traceback (most recent call last): + File "", line 1, in + AttributeError: 'Synset' object has no attribute 'antonyms' + >>> good.lemmas()[0].antonyms() + [Lemma('bad.a.01.bad')] + +The relations that are currently defined in this way are `antonyms`, +`derivationally_related_forms` and `pertainyms`. + +If you know the byte offset used to identify a synset in the original +Princeton WordNet data file, you can use that to instantiate the synset +in NLTK: + + >>> wn.synset_from_pos_and_offset('n', 4543158) + Synset('wagon.n.01') + +Likewise, instantiate a synset from a known sense key: + >>> wn.synset_from_sense_key("driving%1:04:03::") + Synset('drive.n.06') + + +------ +Lemmas +------ + + >>> eat = wn.lemma('eat.v.03.eat') + >>> eat + Lemma('feed.v.06.eat') + >>> print(eat.key()) + eat%2:34:02:: + >>> eat.count() + 4 + >>> wn.lemma_from_key(eat.key()) + Lemma('feed.v.06.eat') + >>> wn.lemma_from_key(eat.key()).synset() + Synset('feed.v.06') + >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00') + Lemma('backward.s.03.feebleminded') + >>> for lemma in wn.synset('eat.v.03').lemmas(): + ... print(lemma, lemma.count()) + ... + Lemma('feed.v.06.feed') 3 + Lemma('feed.v.06.eat') 4 + >>> for lemma in wn.lemmas('eat', 'v'): + ... print(lemma, lemma.count()) + ... + Lemma('eat.v.01.eat') 61 + Lemma('eat.v.02.eat') 13 + Lemma('feed.v.06.eat') 4 + Lemma('eat.v.04.eat') 0 + Lemma('consume.v.05.eat') 0 + Lemma('corrode.v.01.eat') 0 + >>> wn.lemma('jump.v.11.jump') + Lemma('jump.v.11.jump') + +Lemmas can also have relations between them: + + >>> vocal = wn.lemma('vocal.a.01.vocal') + >>> vocal.derivationally_related_forms() + [Lemma('vocalize.v.02.vocalize')] + >>> vocal.pertainyms() + [Lemma('voice.n.02.voice')] + >>> vocal.antonyms() + [Lemma('instrumental.a.01.instrumental')] + +The three relations above exist only on lemmas, not on synsets. + +----------- +Verb Frames +----------- + + >>> wn.synset('think.v.01').frame_ids() + [5, 9] + >>> for lemma in wn.synset('think.v.01').lemmas(): + ... print(lemma, lemma.frame_ids()) + ... print(" | ".join(lemma.frame_strings())) + ... + Lemma('think.v.01.think') [5, 9] + Something think something Adjective/Noun | Somebody think somebody + Lemma('think.v.01.believe') [5, 9] + Something believe something Adjective/Noun | Somebody believe somebody + Lemma('think.v.01.consider') [5, 9] + Something consider something Adjective/Noun | Somebody consider somebody + Lemma('think.v.01.conceive') [5, 9] + Something conceive something Adjective/Noun | Somebody conceive somebody + >>> wn.synset('stretch.v.02').frame_ids() + [8] + >>> for lemma in wn.synset('stretch.v.02').lemmas(): + ... 
print(lemma, lemma.frame_ids())
+    ...         print(" | ".join(lemma.frame_strings()))
+    ...
+    Lemma('stretch.v.02.stretch') [8, 2]
+    Somebody stretch something | Somebody stretch
+    Lemma('stretch.v.02.extend') [8]
+    Somebody extend something
+
+
+----------
+Similarity
+----------
+
+    >>> dog = wn.synset('dog.n.01')
+    >>> cat = wn.synset('cat.n.01')
+
+    >>> hit = wn.synset('hit.v.01')
+    >>> slap = wn.synset('slap.v.01')
+
+
+``synset1.path_similarity(synset2):``
+Return a score denoting how similar two word senses are, based on the
+shortest path that connects the senses in the is-a (hypernym/hyponym)
+taxonomy. The score is in the range 0 to 1. By default, there is now
+a fake root node added to verbs so for cases where previously a path
+could not be found---and None was returned---it should return a value.
+The old behavior can be achieved by setting simulate_root to be False.
+A score of 1 represents identity, i.e. comparing a sense with itself
+will return 1.
+
+    >>> dog.path_similarity(cat)
+    0.2...
+
+    >>> hit.path_similarity(slap)
+    0.142...
+
+    >>> wn.path_similarity(hit, slap)
+    0.142...
+
+    >>> print(hit.path_similarity(slap, simulate_root=False))
+    None
+
+    >>> print(wn.path_similarity(hit, slap, simulate_root=False))
+    None
+
+``synset1.lch_similarity(synset2):``
+Leacock-Chodorow Similarity:
+Return a score denoting how similar two word senses are, based on the
+shortest path that connects the senses (as above) and the maximum depth
+of the taxonomy in which the senses occur. The relationship is given
+as -log(p/2d) where p is the shortest path length and d the taxonomy
+depth.
+
+    >>> dog.lch_similarity(cat)
+    2.028...
+
+    >>> hit.lch_similarity(slap)
+    1.312...
+
+    >>> wn.lch_similarity(hit, slap)
+    1.312...
+
+    >>> print(hit.lch_similarity(slap, simulate_root=False))
+    None
+
+    >>> print(wn.lch_similarity(hit, slap, simulate_root=False))
+    None
+
+``synset1.wup_similarity(synset2):``
+Wu-Palmer Similarity:
+Return a score denoting how similar two word senses are, based on the
+depth of the two senses in the taxonomy and that of their Least Common
+Subsumer (most specific ancestor node). Note that at this time the
+scores given do **not** always agree with those given by Pedersen's Perl
+implementation of Wordnet Similarity.
+
+The LCS does not necessarily feature in the shortest path connecting the
+two senses, as it is by definition the common ancestor deepest in the
+taxonomy, not closest to the two senses. Typically, however, it will so
+feature. Where multiple candidates for the LCS exist, the one whose
+shortest path to the root node is the longest will be selected. Where
+the LCS has multiple paths to the root, the longer path is used for
+the purposes of the calculation.
+
+    >>> dog.wup_similarity(cat)
+    0.857...
+
+    >>> hit.wup_similarity(slap)
+    0.25
+
+    >>> wn.wup_similarity(hit, slap)
+    0.25
+
+    >>> print(hit.wup_similarity(slap, simulate_root=False))
+    None
+
+    >>> print(wn.wup_similarity(hit, slap, simulate_root=False))
+    None
+
+``wordnet_ic``
+Information Content:
+Load an information content file from the wordnet_ic corpus.
+
+    >>> from nltk.corpus import wordnet_ic
+    >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
+    >>> semcor_ic = wordnet_ic.ic('ic-semcor.dat')
+
+Or you can create an information content dictionary from a corpus (or
+anything that has a words() method).
+ + >>> from nltk.corpus import genesis + >>> genesis_ic = wn.ic(genesis, False, 0.0) + +``synset1.res_similarity(synset2, ic):`` +Resnik Similarity: +Return a score denoting how similar two word senses are, based on the +Information Content (IC) of the Least Common Subsumer (most specific +ancestor node). Note that for any similarity measure that uses +information content, the result is dependent on the corpus used to +generate the information content and the specifics of how the +information content was created. + + >>> dog.res_similarity(cat, brown_ic) + 7.911... + >>> dog.res_similarity(cat, genesis_ic) + 7.204... + +``synset1.jcn_similarity(synset2, ic):`` +Jiang-Conrath Similarity +Return a score denoting how similar two word senses are, based on the +Information Content (IC) of the Least Common Subsumer (most specific +ancestor node) and that of the two input Synsets. The relationship is +given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). + + >>> dog.jcn_similarity(cat, brown_ic) + 0.449... + >>> dog.jcn_similarity(cat, genesis_ic) + 0.285... + +``synset1.lin_similarity(synset2, ic):`` +Lin Similarity: +Return a score denoting how similar two word senses are, based on the +Information Content (IC) of the Least Common Subsumer (most specific +ancestor node) and that of the two input Synsets. The relationship is +given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). + + >>> dog.lin_similarity(cat, semcor_ic) + 0.886... + + +--------------------- +Access to all Synsets +--------------------- + +Iterate over all the noun synsets: + + >>> for synset in list(wn.all_synsets('n'))[:10]: + ... print(synset) + ... + Synset('entity.n.01') + Synset('physical_entity.n.01') + Synset('abstraction.n.06') + Synset('thing.n.12') + Synset('object.n.01') + Synset('whole.n.02') + Synset('congener.n.03') + Synset('living_thing.n.01') + Synset('organism.n.01') + Synset('benthos.n.02') + +Get all synsets for this word, possibly restricted by POS: + + >>> wn.synsets('dog') + [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...] + >>> wn.synsets('dog', pos='v') + [Synset('chase.v.01')] + +Walk through the noun synsets looking at their hypernyms: + + >>> from itertools import islice + >>> for synset in islice(wn.all_synsets('n'), 5): + ... print(synset, synset.hypernyms()) + ... 
+    Synset('entity.n.01') []
+    Synset('physical_entity.n.01') [Synset('entity.n.01')]
+    Synset('abstraction.n.06') [Synset('entity.n.01')]
+    Synset('thing.n.12') [Synset('physical_entity.n.01')]
+    Synset('object.n.01') [Synset('physical_entity.n.01')]
+
+
+------
+Morphy
+------
+
+Look up forms not in WordNet, with the help of Morphy:
+
+    >>> wn.morphy('denied', wn.NOUN)
+    >>> print(wn.morphy('denied', wn.VERB))
+    deny
+    >>> wn.synsets('denied', wn.NOUN)
+    []
+    >>> wn.synsets('denied', wn.VERB)
+    [Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'),
+    Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')]
+
+Morphy uses a combination of inflectional ending rules and exception
+lists to handle a variety of different possibilities:
+
+    >>> print(wn.morphy('dogs'))
+    dog
+    >>> print(wn.morphy('churches'))
+    church
+    >>> print(wn.morphy('aardwolves'))
+    aardwolf
+    >>> print(wn.morphy('abaci'))
+    abacus
+    >>> print(wn.morphy('book', wn.NOUN))
+    book
+    >>> wn.morphy('hardrock', wn.ADV)
+    >>> wn.morphy('book', wn.ADJ)
+    >>> wn.morphy('his', wn.NOUN)
+    >>>
+
+---------------
+Synset Closures
+---------------
+
+Compute transitive closures of synsets:
+
+    >>> dog = wn.synset('dog.n.01')
+    >>> hypo = lambda s: s.hyponyms()
+    >>> hyper = lambda s: s.hypernyms()
+    >>> list(dog.closure(hypo, depth=1)) == dog.hyponyms()
+    True
+    >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()
+    True
+    >>> list(dog.closure(hypo))
+    [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'),
+    Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'),
+    Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'),
+    Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
+    Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
+    >>> list(dog.closure(hyper))
+    [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
+    Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
+    Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
+    Synset('physical_entity.n.01'), Synset('entity.n.01')]
+
+
+----------------
+Regression Tests
+----------------
+
+Bug 85: morphy returns the base form of a word, if its input is given
+as a base form for a POS for which that word is not defined:
+
+    >>> wn.synsets('book', wn.NOUN)
+    [Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')]
+    >>> wn.synsets('book', wn.ADJ)
+    []
+    >>> wn.morphy('book', wn.NOUN)
+    'book'
+    >>> wn.morphy('book', wn.ADJ)
+    >>>
+
+Bug 160: wup_similarity breaks when the two synsets have no common hypernym
+
+    >>> t = wn.synsets('picasso')[0]
+    >>> m = wn.synsets('male')[1]
+    >>> t.wup_similarity(m)
+    0.631...
+
+Issue #2278: wup_similarity not commutative when comparing a noun and a verb.
+Patch #2650 resolved this error. As a result, the output of the following use of wup_similarity no longer returns None.
+
+    >>> t = wn.synsets('titan')[1]
+    >>> s = wn.synsets('say', wn.VERB)[0]
+    >>> t.wup_similarity(s)
+    0.142...
+
+Bug 21: "instance of" not included in LCS (very similar to bug 160)
+
+    >>> a = wn.synsets("writings")[0]
+    >>> b = wn.synsets("scripture")[0]
+    >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
+    >>> a.jcn_similarity(b, brown_ic)
+    0.175...
+ +Bug 221: Verb root IC is zero + + >>> from nltk.corpus.reader.wordnet import information_content + >>> s = wn.synsets('say', wn.VERB)[0] + >>> information_content(s, brown_ic) + 4.623... + +Bug 161: Comparison between WN keys/lemmas should not be case sensitive + + >>> k = wn.synsets("jefferson")[0].lemmas()[0].key() + >>> wn.lemma_from_key(k) + Lemma('jefferson.n.01.Jefferson') + >>> wn.lemma_from_key(k.upper()) + Lemma('jefferson.n.01.Jefferson') + +Bug 99: WordNet root_hypernyms gives incorrect results + + >>> from nltk.corpus import wordnet as wn + >>> for s in wn.all_synsets(wn.NOUN): + ... if s.root_hypernyms()[0] != wn.synset('entity.n.01'): + ... print(s, s.root_hypernyms()) + ... + >>> + +Bug 382: JCN Division by zero error + + >>> tow = wn.synset('tow.v.01') + >>> shlep = wn.synset('shlep.v.02') + >>> from nltk.corpus import wordnet_ic + >>> brown_ic = wordnet_ic.ic('ic-brown.dat') + >>> tow.jcn_similarity(shlep, brown_ic) + 1...e+300 + +Bug 428: Depth is zero for instance nouns + + >>> s = wn.synset("lincoln.n.01") + >>> s.max_depth() > 0 + True + +Bug 429: Information content smoothing used old reference to all_synsets + + >>> genesis_ic = wn.ic(genesis, True, 1.0) + +Bug 430: all_synsets used wrong pos lookup when synsets were cached + + >>> for ii in wn.all_synsets(): pass + >>> for ii in wn.all_synsets(): pass + +Bug 470: shortest_path_distance ignored instance hypernyms + + >>> google = wordnet.synsets("google")[0] + >>> earth = wordnet.synsets("earth")[0] + >>> google.wup_similarity(earth) + 0.1... + +Bug 484: similarity metrics returned -1 instead of None for no LCS + + >>> t = wn.synsets('fly', wn.VERB)[0] + >>> s = wn.synsets('say', wn.VERB)[0] + >>> print(s.shortest_path_distance(t)) + None + >>> print(s.path_similarity(t, simulate_root=False)) + None + >>> print(s.lch_similarity(t, simulate_root=False)) + None + >>> print(s.wup_similarity(t, simulate_root=False)) + None + +Bug 427: "pants" does not return all the senses it should + + >>> from nltk.corpus import wordnet + >>> wordnet.synsets("pants",'n') + [Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')] + +Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize + + >>> from nltk.stem.wordnet import WordNetLemmatizer + >>> WordNetLemmatizer().lemmatize("eggs", pos="n") + 'egg' + >>> WordNetLemmatizer().lemmatize("legs", pos="n") + 'leg' + +Bug 284: instance hypernyms not used in similarity calculations + + >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) + 1.335... + >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) + 0.571... + >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) + 2.224... + >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) + 0.075... + >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) + 0.252... 
+ >>> wn.synset('john.n.02').hypernym_paths() + [[Synset('entity.n.01'), ..., Synset('john.n.02')]] + +Issue 541: add domains to wordnet + + >>> wn.synset('code.n.03').topic_domains() + [Synset('computer_science.n.01')] + >>> wn.synset('pukka.a.01').region_domains() + [Synset('india.n.01')] + >>> wn.synset('freaky.a.01').usage_domains() + [Synset('slang.n.02')] + +Issue 629: wordnet failures when python run with -O optimizations + + >>> # Run the test suite with python -O to check this + >>> wn.synsets("brunch") + [Synset('brunch.n.01'), Synset('brunch.v.01')] + +Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman + + >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) + [Synset('person.n.01')] + +Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets() + + >>> wn.lemmas('Londres', lang='fra') + [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] + >>> wn.lemmas('londres', lang='fra') + [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] + +Patch-1 https://github.com/nltk/nltk/pull/2065 Adding 3 functions (relations) to WordNet class + + >>> wn.synsets("computer_science")[0].in_topic_domains()[2] + Synset('access_time.n.01') + >>> wn.synsets("France")[0].in_region_domains()[18] + Synset('french.n.01') + >>> wn.synsets("slang")[1].in_usage_domains()[18] + Synset('can-do.s.01') + +Issue 2721: WordNetCorpusReader.ic() does not add smoothing to N + + >>> class FakeCorpus: + ... def words(self): return ['word'] + ... + >>> fake_ic = wn.ic(FakeCorpus(), False, 1.0) + >>> word = wn.synset('word.n.01') + >>> information_content(word, fake_ic) > 0 + True + +Issue 3077: Incorrect part-of-speech filtering in all_synsets + + >>> next(wn.all_synsets(pos="a")) + Synset('able.a.01') + >>> next(wn.all_synsets(pos="s")) + Synset('emergent.s.02') + >>> wn.add_omw() + >>> next(wn.all_synsets(lang="hrv")) + Synset('able.a.01') + >>> next(wn.all_synsets(lang="hrv", pos="n")) + Synset('entity.n.01') + >>> next(wn.all_synsets(lang="hrv", pos="v")) + Synset('breathe.v.01') + >>> next(wn.all_synsets(lang="hrv", pos="s")) + Synset('ideological.s.02') + >>> next(wn.all_synsets(lang="hrv", pos="a")) + Synset('able.a.01') + + +------------------------------------------------ +Endlessness vs. intractability in relation trees +------------------------------------------------ + +1. Endlessness +-------------- + +Until NLTK v. 3.5, the ``tree()`` function looped forever on symmetric +relations (verb_groups, attributes, and most also_sees). 
But in
+the current version, ``tree()`` now detects and discards these cycles:
+
+    >>> from pprint import pprint
+    >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees()))
+    [Synset('bound.a.01'),
+     [Synset('unfree.a.02'),
+      [Synset('confined.a.02'),
+       [Synset('restricted.a.01'), [Synset('classified.a.02')]]],
+      [Synset('dependent.a.01')],
+      [Synset('restricted.a.01'),
+       [Synset('classified.a.02')],
+       [Synset('confined.a.02')]]]]
+
+Specifying the "cut_mark" parameter increases verbosity, so that the cycles
+are mentioned in the output, together with the level where they occur:
+
+    >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees(),cut_mark='...'))
+    [Synset('bound.a.01'),
+     [Synset('unfree.a.02'),
+      "Cycle(Synset('bound.a.01'),-3,...)",
+      [Synset('confined.a.02'),
+       [Synset('restricted.a.01'),
+        [Synset('classified.a.02')],
+        "Cycle(Synset('confined.a.02'),-5,...)",
+        "Cycle(Synset('unfree.a.02'),-5,...)"],
+       "Cycle(Synset('unfree.a.02'),-4,...)"],
+      [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"],
+      [Synset('restricted.a.01'),
+       [Synset('classified.a.02')],
+       [Synset('confined.a.02'),
+        "Cycle(Synset('restricted.a.01'),-5,...)",
+        "Cycle(Synset('unfree.a.02'),-5,...)"],
+       "Cycle(Synset('unfree.a.02'),-4,...)"]]]
+
+
+2. Intractability
+-----------------
+
+However, even after discarding the infinite cycles, some trees can remain
+intractable, due to combinatorial explosion in a relation. This happens in
+WordNet, because the ``also_sees()`` relation has a big Strongly Connected
+Component (_SCC_) consisting of 758 synsets, where any member node is
+transitively connected by the same relation to all other members of the
+same SCC. This produces intractable relation trees for each of these 758
+synsets, i.e. trees that are too big to compute or display on any computer.
+
+For example, the synset 'concrete.a.01' is a member of the largest SCC,
+so its ``also_sees()`` tree is intractable, and can normally only be handled
+by limiting the ``depth`` parameter to display a small number of levels:
+
+    >>> from pprint import pprint
+    >>> pprint(wn.synset('concrete.a.01').tree(lambda s:s.also_sees(),cut_mark='...',depth=2))
+    [Synset('concrete.a.01'),
+     [Synset('practical.a.01'),
+      "Cycle(Synset('concrete.a.01'),0,...)",
+      [Synset('possible.a.01'), '...'],
+      [Synset('realistic.a.01'), '...'],
+      [Synset('serviceable.a.01'), '...']],
+     [Synset('real.a.01'),
+      "Cycle(Synset('concrete.a.01'),0,...)",
+      [Synset('genuine.a.01'), '...'],
+      [Synset('realistic.a.01'), '...'],
+      [Synset('sincere.a.01'), '...']],
+     [Synset('tangible.a.01'), "Cycle(Synset('concrete.a.01'),0,...)"]]
+
+
+2.1 First solution: ``acyclic_tree()``
+......................................
+
+On the other hand, the new ``acyclic_tree()`` function is also able to handle
+the intractable cases. The ``also_sees()`` acyclic tree of 'concrete.a.01' is
+several hundred lines long, so here is a simpler example, concerning a much
+smaller SCC: with only five members, the SCC that includes 'bound.a.01'
+is tractable with the normal ``tree()`` function, as seen above.
+
+But while ``tree()`` only prunes redundancy within local branches, ``acyclic_tree()``
+prunes the tree globally, thus discarding any additional redundancy, and
+produces a tree that includes all reachable nodes (i.e., a **spanning tree**).
+This tree is **minimal** because it includes the reachable nodes only once,
+but it is not necessarily a **Minimum Spanning Tree** (MST), because the
+Depth-first search strategy does not guarantee that nodes are reached
+through the lowest number of links (as Breadth-first search would).
+
+    >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees()))
+    [Synset('bound.a.01'),
+     [Synset('unfree.a.02'),
+      [Synset('confined.a.02'),
+       [Synset('restricted.a.01'), [Synset('classified.a.02')]]],
+      [Synset('dependent.a.01')]]]
+
+Again, specifying the ``cut_mark`` parameter increases verbosity, so that the
+cycles are mentioned in the output, together with the level where they occur:
+
+    >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees(),cut_mark='...'))
+    [Synset('bound.a.01'),
+     [Synset('unfree.a.02'),
+      "Cycle(Synset('bound.a.01'),-3,...)",
+      [Synset('confined.a.02'),
+       [Synset('restricted.a.01'),
+        [Synset('classified.a.02')],
+        "Cycle(Synset('confined.a.02'),-5,...)",
+        "Cycle(Synset('unfree.a.02'),-5,...)"],
+       "Cycle(Synset('unfree.a.02'),-4,...)"],
+      [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"],
+      "Cycle(Synset('restricted.a.01'),-3,...)"]]
+
+
+2.2 Better solution: mst()
+..........................
+
+A Minimum Spanning Tree (MST) spans all the nodes of a relation subgraph once,
+while guaranteeing that each node is reached through the shortest path possible.
+In unweighted relation graphs like WordNet, an MST can be computed very efficiently
+in linear time, using Breadth-First Search (BFS). Like acyclic_tree(), the new
+``unweighted_minimum_spanning_tree()`` function (imported in the Wordnet
+module as ``mst``) handles intractable trees, such as the example discussed above:
+``wn.synset('concrete.a.01').mst(lambda s:s.also_sees())``.
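+
+As a rough illustration (a minimal sketch, not NLTK's actual implementation),
+a BFS spanning tree over an arbitrary relation function can be written as
+follows; ``bfs_spanning_tree`` and its ``relation`` argument are hypothetical
+names, where ``relation`` stands for any callable such as
+``lambda s: s.also_sees()``::
+
+    from collections import deque
+
+    def bfs_spanning_tree(root, relation):
+        # Visit every reachable node exactly once, so the running time is
+        # linear in the number of nodes plus relation edges.
+        seen = {root}
+        tree = [root]  # same nested-list shape as tree() / mst() output
+        queue = deque([(root, tree)])
+        while queue:
+            node, subtree = queue.popleft()
+            for child in relation(node):
+                if child not in seen:  # discard cycles and repeated nodes
+                    seen.add(child)
+                    branch = [child]
+                    subtree.append(branch)
+                    queue.append((child, branch))
+        return tree
+
+Because the queue is first-in-first-out, every node is attached to the tree
+the first time it is seen, which is necessarily via a shortest chain of links.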
+
+But, while the also_sees() acyclic_tree of 'bound.a.01' reaches
+'classified.a.02' through four links, using depth-first search as seen above
+(bound.a.01 > unfree.a.02 > confined.a.02 > restricted.a.01 > classified.a.02),
+in the following MST, the path to 'classified.a.02' is the shortest possible,
+consisting of only three links (bound.a.01 > unfree.a.02 > restricted.a.01 >
+classified.a.02):
+
+    >>> pprint(wn.synset('bound.a.01').mst(lambda s:s.also_sees()))
+    [Synset('bound.a.01'),
+     [Synset('unfree.a.02'),
+      [Synset('confined.a.02')],
+      [Synset('dependent.a.01')],
+      [Synset('restricted.a.01'), [Synset('classified.a.02')]]]]
+
+
+----------------------------------------------------------------
+Loading alternative Wordnet versions
+----------------------------------------------------------------
+
+    >>> print("Wordnet {}".format(wn.get_version()))
+    Wordnet 3.0
+
+    >>> from nltk.corpus import wordnet31 as wn31
+    >>> print("Wordnet {}".format(wn31.get_version()))
+    Wordnet 3.1
+
+    >>> print(wn.synset('restrain.v.01').hyponyms())
+    [Synset('confine.v.03'), Synset('control.v.02'), Synset('hold.v.36'), Synset('inhibit.v.04')]
+
+    >>> print(wn31.synset('restrain.v.01').hyponyms())
+    [Synset('enchain.v.01'), Synset('fetter.v.01'), Synset('ground.v.02'), Synset('impound.v.02'), Synset('pen_up.v.01'), Synset('pinion.v.01'), Synset('pound.v.06'), Synset('tie_down.v.01')]
+
+    >>> print(wn31.synset('restrain.v.04').hyponyms())
+    [Synset('baffle.v.03'), Synset('confine.v.02'), Synset('control.v.02'), Synset('hold.v.36'), Synset('rule.v.07'), Synset('swallow.v.06'), Synset('wink.v.04')]
+
+
+-------------
+Teardown test
+-------------
+
+    >>> from nltk.corpus import wordnet
+    >>> wordnet._unload()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet_lch.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet_lch.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..d6be36367999fd181957f8bc9321581ebfa00b00
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wordnet_lch.doctest
@@ -0,0 +1,53 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+===============================
+WordNet Lowest Common Hypernyms
+===============================
+
+Wordnet's lowest_common_hypernyms() method is used to locate the
+lowest single hypernym that is shared by two given words:
+
+    >>> from nltk.corpus import wordnet as wn
+    >>> wn.synset('kin.n.01').lowest_common_hypernyms(wn.synset('mother.n.01'))
+    [Synset('relative.n.01')]
+
+    >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
+    [Synset('person.n.01')]
+
+This method generally returns a single result, but in some cases, more than one
+valid LCH is possible:
+
+    >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'))
+    [Synset('attribute.n.02'), Synset('measure.n.02')]
+
+In some cases, lowest_common_hypernyms() can return one of the synsets which was
+passed to it as an argument:
+
+    >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'))
+    [Synset('woman.n.01')]
+
+In NLTK 3.0a2 the behavior of lowest_common_hypernyms() was changed to give more
+accurate results in a small set of cases, generally when dealing with nouns describing
+social roles or jobs.
To emulate the pre v3.0a2 behavior, you can set the use_min_depth=True
+flag:
+
+    >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
+    [Synset('person.n.01')]
+    >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'), use_min_depth=True)
+    [Synset('organism.n.01')]
+
+In some cases use_min_depth=True may return more or fewer results than the default
+behavior:
+
+    >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'))
+    [Synset('woman.n.01')]
+    >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'), use_min_depth=True)
+    [Synset('organism.n.01'), Synset('woman.n.01')]
+
+In the general case, however, they tend to return the same results:
+
+    >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'))
+    [Synset('attribute.n.02'), Synset('measure.n.02')]
+    >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'), use_min_depth=True)
+    [Synset('attribute.n.02'), Synset('measure.n.02')]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/test/wsd.doctest b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wsd.doctest
new file mode 100644
index 0000000000000000000000000000000000000000..e9dab2196eca90c4f613c451bbb1b3dc661b60dd
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/test/wsd.doctest
@@ -0,0 +1,68 @@
+.. Copyright (C) 2001-2022 NLTK Project
+.. For license information, see LICENSE.TXT
+
+.. -*- coding: utf-8 -*-
+
+=========================
+Word Sense Disambiguation
+=========================
+
+
+Lesk Algorithm
+--------------
+
+
+Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using
+the definitions of the ambiguous word.
+
+Given an ambiguous word and the context in which the word occurs, Lesk returns
+a Synset with the highest number of overlapping words between the context
+sentence and different definitions from each Synset.
+
+    >>> from nltk.wsd import lesk
+    >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
+
+    >>> print(lesk(sent, 'bank', 'n'))
+    Synset('savings_bank.n.02')
+
+    >>> print(lesk(sent, 'bank'))
+    Synset('savings_bank.n.02')
+
+The definitions for "bank" are:
+
+    >>> from nltk.corpus import wordnet as wn
+    >>> for ss in wn.synsets('bank'):
+    ...     print(ss, ss.definition())
+    ...
+    Synset('bank.n.01') sloping land (especially the slope beside a body of water)
+    Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
+    Synset('bank.n.03') a long ridge or pile
+    Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
+    Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
+    Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
+    Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
+    Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
+    Synset('bank.n.09') a building in which the business of banking transacted
+    Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
+    Synset('bank.v.01') tip laterally
+    Synset('bank.v.02') enclose with a bank
+    Synset('bank.v.03') do business with a bank or keep an account at a bank
+    Synset('bank.v.04') act as the banker in a game or in gambling
+    Synset('bank.v.05') be in the banking business
+    Synset('deposit.v.02') put into a bank account
+    Synset('bank.v.07') cover with ashes so to control the rate of burning
+    Synset('trust.v.01') have confidence or faith in
+
+Test disambiguation of POS tagged `able`.
+
+    >>> [(s, s.pos()) for s in wn.synsets('able')]
+    [(Synset('able.a.01'), 'a'), (Synset('able.s.02'), 's'), (Synset('able.s.03'), 's'), (Synset('able.s.04'), 's')]
+    >>> sent = 'people should be able to marry a person of their choice'.split()
+    >>> lesk(sent, 'able')
+    Synset('able.s.04')
+    >>> lesk(sent, 'able', pos='a')
+    Synset('able.a.01')
+
+Test behavior if there are no matching senses.
+
+    >>> lesk('John loves Mary'.split(), 'loves', synsets=[])
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5468ea8a5a1f734f091c4cf52d6d3458990b3ef3
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/__init__.py
@@ -0,0 +1,132 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+#         Steven Bird (minor additions)
+# Contributors: matthewmc, clouds56
+# URL:
+# For license information, see LICENSE.TXT
+
+r"""
+NLTK Tokenizer Package
+
+Tokenizers divide strings into lists of substrings. For example,
+tokenizers can be used to find the words and punctuation in a string:
+
+    >>> from nltk.tokenize import word_tokenize
+    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
+    ... two of them.\n\nThanks.'''
+    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+This particular tokenizer requires the Punkt sentence tokenization
+models to be installed.
+NLTK also provides a simpler,
+regular-expression based tokenizer, which splits text on whitespace
+and punctuation:
+
+    >>> from nltk.tokenize import wordpunct_tokenize
+    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+We can also operate at the level of sentences, using the sentence
+tokenizer directly as follows:
+
+    >>> from nltk.tokenize import sent_tokenize, word_tokenize
+    >>> sent_tokenize(s)
+    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
+    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
+    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
+    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
+
+Caution: when tokenizing a Unicode string, make sure you are not
+using an encoded version of the string (it may be necessary to
+decode it first, e.g. with ``s.decode("utf8")``).
+
+NLTK tokenizers can produce token-spans, represented as tuples of integers
+having the same semantics as string slices, to support efficient comparison
+of tokenizers. (These methods are implemented as generators.)
+
+    >>> from nltk.tokenize import WhitespaceTokenizer
+    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
+    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
+    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
+
+There are numerous ways to tokenize text. If you need more control over
+tokenization, see the other methods provided in this package.
+
+For further information, please see Chapter 3 of the NLTK book.
+"""
+
+import re
+
+from nltk.data import load
+from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
+from nltk.tokenize.destructive import NLTKWordTokenizer
+from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
+from nltk.tokenize.mwe import MWETokenizer
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.regexp import (
+    BlanklineTokenizer,
+    RegexpTokenizer,
+    WhitespaceTokenizer,
+    WordPunctTokenizer,
+    blankline_tokenize,
+    regexp_tokenize,
+    wordpunct_tokenize,
+)
+from nltk.tokenize.repp import ReppTokenizer
+from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
+from nltk.tokenize.simple import (
+    LineTokenizer,
+    SpaceTokenizer,
+    TabTokenizer,
+    line_tokenize,
+)
+from nltk.tokenize.sonority_sequencing import SyllableTokenizer
+from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+from nltk.tokenize.texttiling import TextTilingTokenizer
+from nltk.tokenize.toktok import ToktokTokenizer
+from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
+from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
+
+
+# Standard sentence tokenizer.
+def sent_tokenize(text, language="english"):
+    """
+    Return a sentence-tokenized copy of *text*,
+    using NLTK's recommended sentence tokenizer
+    (currently :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    :param text: text to split into sentences
+    :param language: the model name in the Punkt corpus
+    """
+    tokenizer = load(f"tokenizers/punkt/{language}.pickle")
+    return tokenizer.tokenize(text)
+
+
+# Standard word tokenizer.
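+# (word_tokenize() below first splits the text into sentences with
+# sent_tokenize() unless preserve_line=True, then runs NLTKWordTokenizer
+# over each sentence.)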
+_treebank_word_tokenizer = NLTKWordTokenizer() + + +def word_tokenize(text, language="english", preserve_line=False): + """ + Return a tokenized copy of *text*, + using NLTK's recommended word tokenizer + (currently an improved :class:`.TreebankWordTokenizer` + along with :class:`.PunktSentenceTokenizer` + for the specified language). + + :param text: text to split into words + :type text: str + :param language: the model name in the Punkt corpus + :type language: str + :param preserve_line: A flag to decide whether to sentence tokenize the text or not. + :type preserve_line: bool + """ + sentences = [text] if preserve_line else sent_tokenize(text, language) + return [ + token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent) + ] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/api.py new file mode 100644 index 0000000000000000000000000000000000000000..5c5c2c21d919846ba10a50d1488a1f8f38604f09 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/api.py @@ -0,0 +1,83 @@ +# Natural Language Toolkit: Tokenizer Interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Tokenizer Interface +""" + +from abc import ABC, abstractmethod +from typing import Iterator, List, Tuple + +from nltk.internals import overridden +from nltk.tokenize.util import string_span_tokenize + + +class TokenizerI(ABC): + """ + A processing interface for tokenizing a string. + Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both). + """ + + @abstractmethod + def tokenize(self, s: str) -> List[str]: + """ + Return a tokenized copy of *s*. + + :rtype: List[str] + """ + if overridden(self.tokenize_sents): + return self.tokenize_sents([s])[0] + + def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]: + """ + Identify the tokens using integer offsets ``(start_i, end_i)``, + where ``s[start_i:end_i]`` is the corresponding token. + + :rtype: Iterator[Tuple[int, int]] + """ + raise NotImplementedError() + + def tokenize_sents(self, strings: List[str]) -> List[List[str]]: + """ + Apply ``self.tokenize()`` to each element of ``strings``. I.e.: + + return [self.tokenize(s) for s in strings] + + :rtype: List[List[str]] + """ + return [self.tokenize(s) for s in strings] + + def span_tokenize_sents( + self, strings: List[str] + ) -> Iterator[List[Tuple[int, int]]]: + """ + Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.: + + return [self.span_tokenize(s) for s in strings] + + :yield: List[Tuple[int, int]] + """ + for s in strings: + yield list(self.span_tokenize(s)) + + +class StringTokenizer(TokenizerI): + """A tokenizer that divides a string into substrings by splitting + on the specified string (defined in subclasses). 
+ """ + + @property + @abstractmethod + def _string(self): + raise NotImplementedError + + def tokenize(self, s): + return s.split(self._string) + + def span_tokenize(self, s): + yield from string_span_tokenize(s, self._string) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/casual.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/casual.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc58e05e401a0575343acb7327cf3e1d2ae9939 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/casual.py @@ -0,0 +1,458 @@ +# +# Natural Language Toolkit: Twitter Tokenizer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Christopher Potts +# Ewan Klein (modifications) +# Pierpaolo Pantone <> (modifications) +# Tom Aarsen <> (modifications) +# URL: +# For license information, see LICENSE.TXT +# + + +""" +Twitter-aware tokenizer, designed to be flexible and easy to adapt to new +domains and tasks. The basic logic is this: + +1. The tuple REGEXPS defines a list of regular expression + strings. + +2. The REGEXPS strings are put, in order, into a compiled + regular expression object called WORD_RE, under the TweetTokenizer + class. + +3. The tokenization is done by WORD_RE.findall(s), where s is the + user-supplied string, inside the tokenize() method of the class + TweetTokenizer. + +4. When instantiating Tokenizer objects, there are several options: + * preserve_case. By default, it is set to True. If it is set to + False, then the tokenizer will downcase everything except for + emoticons. + * reduce_len. By default, it is set to False. It specifies whether + to replace repeated character sequences of length 3 or greater + with sequences of length 3. + * strip_handles. By default, it is set to False. It specifies + whether to remove Twitter handles of text used in the + `tokenize` method. + * match_phone_numbers. By default, it is set to True. It indicates + whether the `tokenize` method should look for phone numbers. +""" + + +###################################################################### + +import html +from typing import List + +import regex # https://github.com/nltk/nltk/issues/2409 + +from nltk.tokenize.api import TokenizerI + +###################################################################### +# The following strings are components in the regular expression +# that is used for tokenizing. It's important that phone_number +# appears first in the final regex (since it can contain whitespace). +# It also could matter that tags comes after emoticons, due to the +# possibility of having text like +# +# <:| and some text >:) +# +# Most importantly, the final element should always be last, since it +# does a last ditch whitespace-based tokenization of whatever is left. + +# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ? + +# This particular element is used in a couple ways, so we define it +# with a name: +EMOTICONS = r""" + (?: + [<>]? + [:;=8] # eyes + [\-o\*\']? # optional nose + [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth + | + [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth + [\-o\*\']? # optional nose + [:;=8] # eyes + [<>]? + | + {}\[\]]+ # Run of non-space, non-()<>{}[] + | # or + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) + | + \([^\s]+?\) # balanced parens, non-recursive: (...) + )+ + (?: # End with: + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) + | + \([^\s]+?\) # balanced parens, non-recursive: (...) 
+ | # or + [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars + ) + | # OR, the following to match naked domains: + (?: + (?\s]+>""", + # ASCII Arrows + r"""[\-]+>|<[\-]+""", + # Twitter username: + r"""(?:@[\w_]+)""", + # Twitter hashtags: + r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", + # email addresses + r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", + # Zero-Width-Joiner and Skin tone modifier emojis + """.(?: + [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+ + | + [\U0001F3FB-\U0001F3FF] + )""", + # flags + FLAGS, + # Remaining word types: + r""" + (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. + | + (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. + | + (?:[\w_]+) # Words without apostrophes or dashes. + | + (?:\.(?:\s*\.){1,}) # Ellipsis dots. + | + (?:\S) # Everything else that isn't whitespace. + """, +) + +# Take the main components and add a phone regex as the second parameter +REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:]) + +###################################################################### +# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent +# the core tokenizing regexes. They are compiled lazily. + +# WORD_RE performs poorly on these patterns: +HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") + +# The emoticon string gets its own regex so that we can preserve case for +# them as needed: +EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE) + +# These are for regularizing HTML entities to Unicode: +ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") + +# For stripping away handles from a tweet: +HANDLES_RE = regex.compile( + r"(?>> from nltk.tokenize.casual import _replace_html_entities + >>> _replace_html_entities(b'Price: £100') + 'Price: \\xa3100' + >>> print(_replace_html_entities(b'Price: £100')) + Price: £100 + >>> + """ + + def _convert_entity(match): + entity_body = match.group(3) + if match.group(1): + try: + if match.group(2): + number = int(entity_body, 16) + else: + number = int(entity_body, 10) + # Numeric character references in the 80-9F range are typically + # interpreted by browsers as representing the characters mapped + # to bytes 80-9F in the Windows-1252 encoding. For more info + # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets + if 0x80 <= number <= 0x9F: + return bytes((number,)).decode("cp1252") + except ValueError: + number = None + else: + if entity_body in keep: + return match.group(0) + number = html.entities.name2codepoint.get(entity_body) + if number is not None: + try: + return chr(number) + except (ValueError, OverflowError): + pass + + return "" if remove_illegal else match.group(0) + + return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) + + +###################################################################### + + +class TweetTokenizer(TokenizerI): + r""" + Tokenizer for tweets. + + >>> from nltk.tokenize import TweetTokenizer + >>> tknzr = TweetTokenizer() + >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" + >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE + ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', + '<--'] + + Examples using `strip_handles` and `reduce_len parameters`: + + >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) + >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!' 
+ >>> tknzr.tokenize(s1) + [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] + """ + + # Values used to lazily compile WORD_RE and PHONE_WORD_RE, + # which are the core tokenizing regexes. + _WORD_RE = None + _PHONE_WORD_RE = None + + ###################################################################### + + def __init__( + self, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, + ): + """ + Create a `TweetTokenizer` instance with settings for use in the `tokenize` method. + + :param preserve_case: Flag indicating whether to preserve the casing (capitalisation) + of text used in the `tokenize` method. Defaults to True. + :type preserve_case: bool + :param reduce_len: Flag indicating whether to replace repeated character sequences + of length 3 or greater with sequences of length 3. Defaults to False. + :type reduce_len: bool + :param strip_handles: Flag indicating whether to remove Twitter handles of text used + in the `tokenize` method. Defaults to False. + :type strip_handles: bool + :param match_phone_numbers: Flag indicating whether the `tokenize` method should look + for phone numbers. Defaults to True. + :type match_phone_numbers: bool + """ + self.preserve_case = preserve_case + self.reduce_len = reduce_len + self.strip_handles = strip_handles + self.match_phone_numbers = match_phone_numbers + + def tokenize(self, text: str) -> List[str]: + """Tokenize the input text. + + :param text: str + :rtype: list(str) + :return: a tokenized list of strings; joining this list returns\ + the original string if `preserve_case=False`. + """ + # Fix HTML character entities: + text = _replace_html_entities(text) + # Remove username handles + if self.strip_handles: + text = remove_handles(text) + # Normalize word lengthening + if self.reduce_len: + text = reduce_lengthening(text) + # Shorten problematic sequences of characters + safe_text = HANG_RE.sub(r"\1\1\1", text) + # Recognise phone numbers during tokenization + if self.match_phone_numbers: + words = self.PHONE_WORD_RE.findall(safe_text) + else: + words = self.WORD_RE.findall(safe_text) + # Possibly alter the case, but avoid changing emoticons like :D into :d: + if not self.preserve_case: + words = list( + map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words) + ) + return words + + @property + def WORD_RE(self) -> "regex.Pattern": + """Core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._WORD_RE: + type(self)._WORD_RE = regex.compile( + f"({'|'.join(REGEXPS)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._WORD_RE + + @property + def PHONE_WORD_RE(self) -> "regex.Pattern": + """Secondary core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._PHONE_WORD_RE: + type(self)._PHONE_WORD_RE = regex.compile( + f"({'|'.join(REGEXPS_PHONE)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._PHONE_WORD_RE + + +###################################################################### +# Normalization Functions +###################################################################### + + +def reduce_lengthening(text): + """ + Replace repeated character sequences of length 3 or greater with sequences + of length 3. + """ + pattern = regex.compile(r"(.)\1{2,}") + return pattern.sub(r"\1\1\1", text) + + +def remove_handles(text): + """ + Remove Twitter username handles from text. 
+ """ + # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly + return HANDLES_RE.sub(" ", text) + + +###################################################################### +# Tokenization Function +###################################################################### + + +def casual_tokenize( + text, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, +): + """ + Convenience function for wrapping the tokenizer. + """ + return TweetTokenizer( + preserve_case=preserve_case, + reduce_len=reduce_len, + strip_handles=strip_handles, + match_phone_numbers=match_phone_numbers, + ).tokenize(text) + + +############################################################################### diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/destructive.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/destructive.py new file mode 100644 index 0000000000000000000000000000000000000000..a788b3d2d782dbac9f6806128facbef631d1c8f7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/destructive.py @@ -0,0 +1,233 @@ +# Natural Language Toolkit: NLTK's very own tokenizer. +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Liling Tan +# Tom Aarsen <> (modifications) +# URL: +# For license information, see LICENSE.TXT + + +import re +import warnings +from typing import Iterator, List, Tuple + +from nltk.tokenize.api import TokenizerI +from nltk.tokenize.util import align_tokens + + +class MacIntyreContractions: + """ + List of contractions adapted from Robert MacIntyre's tokenizer. + """ + + CONTRACTIONS2 = [ + r"(?i)\b(can)(?#X)(not)\b", + r"(?i)\b(d)(?#X)('ye)\b", + r"(?i)\b(gim)(?#X)(me)\b", + r"(?i)\b(gon)(?#X)(na)\b", + r"(?i)\b(got)(?#X)(ta)\b", + r"(?i)\b(lem)(?#X)(me)\b", + r"(?i)\b(more)(?#X)('n)\b", + r"(?i)\b(wan)(?#X)(na)(?=\s)", + ] + CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"] + CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"] + + +class NLTKWordTokenizer(TokenizerI): + """ + The NLTK tokenizer that has improved upon the TreebankWordTokenizer. + + This is the method that is invoked by ``word_tokenize()``. It assumes that the + text has already been segmented into sentences, e.g. using ``sent_tokenize()``. + + The tokenizer is "destructive" such that the regexes applied will munge the + input string to a state beyond re-construction. It is possible to apply + `TreebankWordDetokenizer.detokenize` to the tokenized outputs of + `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to + revert to the original string. + """ + + # Starting quotes. + STARTING_QUOTES = [ + (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "), + (re.compile(r"^\""), r"``"), + (re.compile(r"(``)"), r" \1 "), + (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), + (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"), + ] + + # Ending quotes. + ENDING_QUOTES = [ + (re.compile("([»”’])", re.U), r" \1 "), + (re.compile(r"''"), " '' "), + (re.compile(r'"'), " '' "), + (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), + (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), + ] + + # For improvements for starting/closing quotes from TreebankWordTokenizer, + # see discussion on https://github.com/nltk/nltk/pull/1437 + # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on + # - chervon quotes u'\xab' and u'\xbb' . 
+ # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d' + # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608 + # Also, behavior of splitting on clitics now follows Stanford CoreNLP + # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b + + # Punctuation. + PUNCTUATION = [ + (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "), + (re.compile(r"([:,])([^\d])"), r" \1 \2"), + (re.compile(r"([:,])$"), r" \1 "), + ( + re.compile(r"\.{2,}", re.U), + r" \g<0> ", + ), # See https://github.com/nltk/nltk/pull/2322 + (re.compile(r"[;@#$%&]"), r" \g<0> "), + ( + re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), + r"\1 \2\3 ", + ), # Handles the final period. + (re.compile(r"[?!]"), r" \g<0> "), + (re.compile(r"([^'])' "), r"\1 ' "), + ( + re.compile(r"[*]", re.U), + r" \g<0> ", + ), # See https://github.com/nltk/nltk/pull/2322 + ] + + # Pads parentheses + PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") + + # Optionally: Convert parentheses, brackets and converts them to PTB symbols. + CONVERT_PARENTHESES = [ + (re.compile(r"\("), "-LRB-"), + (re.compile(r"\)"), "-RRB-"), + (re.compile(r"\["), "-LSB-"), + (re.compile(r"\]"), "-RSB-"), + (re.compile(r"\{"), "-LCB-"), + (re.compile(r"\}"), "-RCB-"), + ] + + DOUBLE_DASHES = (re.compile(r"--"), r" -- ") + + # List of contractions adapted from Robert MacIntyre's tokenizer. + _contractions = MacIntyreContractions() + CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) + CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) + + def tokenize( + self, text: str, convert_parentheses: bool = False, return_str: bool = False + ) -> List[str]: + r"""Return a tokenized copy of `text`. + + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', + 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', + 'of', 'them.', 'Thanks', '.'] + >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', + 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', + 'of', 'them.', 'Thanks', '.'] + + + :param text: A string with a sentence or sentences. + :type text: str + :param convert_parentheses: if True, replace parentheses to PTB symbols, + e.g. `(` to `-LRB-`. Defaults to False. + :type convert_parentheses: bool, optional + :param return_str: If True, return tokens as space-separated string, + defaults to False. + :type return_str: bool, optional + :return: List of tokens from `text`. + :rtype: List[str] + """ + if return_str: + warnings.warn( + "Parameter 'return_str' has been deprecated and should no " + "longer be used.", + category=DeprecationWarning, + stacklevel=2, + ) + + for regexp, substitution in self.STARTING_QUOTES: + text = regexp.sub(substitution, text) + + for regexp, substitution in self.PUNCTUATION: + text = regexp.sub(substitution, text) + + # Handles parentheses. + regexp, substitution = self.PARENS_BRACKETS + text = regexp.sub(substitution, text) + # Optionally convert parentheses + if convert_parentheses: + for regexp, substitution in self.CONVERT_PARENTHESES: + text = regexp.sub(substitution, text) + + # Handles double dash. 
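+        # (e.g. "well--known" is padded to "well -- known", following the
+        # Penn Treebank convention)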
+ regexp, substitution = self.DOUBLE_DASHES + text = regexp.sub(substitution, text) + + # add extra space to make things easier + text = " " + text + " " + + for regexp, substitution in self.ENDING_QUOTES: + text = regexp.sub(substitution, text) + + for regexp in self.CONTRACTIONS2: + text = regexp.sub(r" \1 \2 ", text) + for regexp in self.CONTRACTIONS3: + text = regexp.sub(r" \1 \2 ", text) + + # We are not using CONTRACTIONS4 since + # they are also commented out in the SED scripts + # for regexp in self._contractions.CONTRACTIONS4: + # text = regexp.sub(r' \1 \2 \3 ', text) + + return text.split() + + def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: + r""" + Returns the spans of the tokens in ``text``. + Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. + + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + + :param text: A string with a sentence or sentences. + :type text: str + :yield: Tuple[int, int] + """ + raw_tokens = self.tokenize(text) + + # Convert converted quotes back to original double quotes + # Do this only if original text contains double quote(s) or double + # single-quotes (because '' might be transformed to `` if it is + # treated as starting quotes). + if ('"' in text) or ("''" in text): + # Find double quotes and converted quotes + matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] + + # Replace converted quotes back to double quotes + tokens = [ + matched.pop(0) if tok in ['"', "``", "''"] else tok + for tok in raw_tokens + ] + else: + tokens = raw_tokens + + yield from align_tokens(tokens, text) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/legality_principle.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/legality_principle.py new file mode 100644 index 0000000000000000000000000000000000000000..a09ce2eac7f8607e0ad96cb8f3be0232a607f20a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/legality_principle.py @@ -0,0 +1,147 @@ +# Natural Language Toolkit: Tokenizers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Christopher Hench +# Alex Estes +# URL: +# For license information, see LICENSE.TXT + +""" +The Legality Principle is a language agnostic principle maintaining that syllable +onsets and codas (the beginning and ends of syllables not including the vowel) +are only legal if they are found as word onsets or codas in the language. The English +word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found +word-initially in the English language (Bartlett et al.). This principle was first proposed +in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''. 
+
+Kahn further argues that there is a ''strong tendency to syllabify in such a way that
+initial clusters are of maximal length, consistent with the general constraints on
+word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
+the longest legal onset is preferable---''Onset Maximization''.
+
+The default implementation assumes an English vowel set, but the `vowels` attribute
+can be set to IPA or any other alphabet's vowel set to suit the use-case.
+Both a valid set of vowels as well as a text corpus of words in the language
+are necessary to determine legal onsets and subsequently syllabify words.
+
+The legality principle with onset maximization is a universal syllabification algorithm,
+but that does not mean it performs equally well across languages. Bartlett et al. (2009)
+is a good benchmark for English accuracy if utilizing IPA (pg. 311).
+
+References:
+
+- Otto Jespersen. 1904. Lehrbuch der Phonetik.
+  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
+- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
+- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
+- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
+  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
+  Cambridge, MIT Press. pp. 107-136.
+- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409–436.
+- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
+  In HLT-NAACL. pp. 308-316.
+- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
+"""
+
+from collections import Counter
+
+from nltk.tokenize.api import TokenizerI
+
+
+class LegalitySyllableTokenizer(TokenizerI):
+    """
+    Syllabifies words based on the Legality Principle and Onset Maximization.
+
+    >>> from nltk.tokenize import LegalitySyllableTokenizer
+    >>> from nltk import word_tokenize
+    >>> from nltk.corpus import words
+    >>> text = "This is a wonderful sentence."
+    >>> text_words = word_tokenize(text)
+    >>> LP = LegalitySyllableTokenizer(words.words())
+    >>> [LP.tokenize(word) for word in text_words]
+    [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
+    """
+
+    def __init__(
+        self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
+    ):
+        """
+        :param tokenized_source_text: List of valid tokens in the language
+        :type tokenized_source_text: list(str)
+        :param vowels: Valid vowels in the language, or an IPA representation
+        :type vowels: str
+        :param legal_frequency_threshold: Lowest relative frequency at which an onset is considered legal
+        :type legal_frequency_threshold: float
+        """
+        self.legal_frequency_threshold = legal_frequency_threshold
+        self.vowels = vowels
+        self.legal_onsets = self.find_legal_onsets(tokenized_source_text)
+
+    def find_legal_onsets(self, words):
+        """
+        Gathers all onsets and returns only those above the frequency threshold.
+
+        :param words: List of words in a language
+        :type words: list(str)
+        :return: Set of legal onsets
+        :rtype: set(str)
+        """
+        onsets = [self.onset(word) for word in words]
+        legal_onsets = [
+            k
+            for k, v in Counter(onsets).items()
+            if (v / len(onsets)) > self.legal_frequency_threshold
+        ]
+        return set(legal_onsets)
+
+    def onset(self, word):
+        """
+        Returns the consonant cluster of a word, i.e. all characters up to the first vowel.
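+        For example, with the default vowel set the onset of "strong" is
+        "str", while a vowel-initial word like "onset" has the empty onset "".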
+ + :param word: Single word or token + :type word: str + :return: String of characters of onset + :rtype: str + """ + onset = "" + for c in word.lower(): + if c in self.vowels: + return onset + else: + onset += c + return onset + + def tokenize(self, token): + """ + Apply the Legality Principle in combination with + Onset Maximization to return a list of syllables. + + :param token: Single word or token + :type token: str + :return syllable_list: Single word or token broken up into syllables. + :rtype: list(str) + """ + syllables = [] + syllable, current_onset = "", "" + vowel, onset = False, False + for char in token[::-1]: + char_lower = char.lower() + if not vowel: + syllable += char + vowel = bool(char_lower in self.vowels) + else: + if char_lower + current_onset[::-1] in self.legal_onsets: + syllable += char + current_onset += char_lower + onset = True + elif char_lower in self.vowels and not onset: + syllable += char + current_onset += char_lower + else: + syllables.append(syllable) + syllable = char + current_onset = "" + vowel = bool(char_lower in self.vowels) + syllables.append(syllable) + syllables_ordered = [syllable[::-1] for syllable in syllables][::-1] + return syllables_ordered diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/mwe.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/mwe.py new file mode 100644 index 0000000000000000000000000000000000000000..f351ce64ce99c1666a3ff3cf8aa2dc73708ff956 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/mwe.py @@ -0,0 +1,124 @@ +# Multi-Word Expression tokenizer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Rob Malouf +# URL: +# For license information, see LICENSE.TXT + +""" +Multi-Word Expression Tokenizer + +A ``MWETokenizer`` takes a string which has already been divided into tokens and +retokenizes it, merging multi-word expressions into single tokens, using a lexicon +of MWEs: + + + >>> from nltk.tokenize import MWETokenizer + + >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')]) + >>> tokenizer.add_mwe(('in', 'spite', 'of')) + + >>> tokenizer.tokenize('Testing testing testing one two three'.split()) + ['Testing', 'testing', 'testing', 'one', 'two', 'three'] + + >>> tokenizer.tokenize('This is a test in spite'.split()) + ['This', 'is', 'a', 'test', 'in', 'spite'] + + >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split()) + ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of'] + +""" +from nltk.tokenize.api import TokenizerI +from nltk.util import Trie + + +class MWETokenizer(TokenizerI): + """A tokenizer that processes tokenized text and merges multi-word expressions + into single tokens. + """ + + def __init__(self, mwes=None, separator="_"): + """Initialize the multi-word tokenizer with a list of expressions and a + separator + + :type mwes: list(list(str)) + :param mwes: A sequence of multi-word expressions to be merged, where + each MWE is a sequence of strings. + :type separator: str + :param separator: String that should be inserted between words in a multi-word + expression token. (Default is '_') + + """ + if not mwes: + mwes = [] + self._mwes = Trie(mwes) + self._separator = separator + + def add_mwe(self, mwe): + """Add a multi-word expression to the lexicon (stored as a word trie) + + We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. + The key True marks the end of a valid MWE. 
+ + :param mwe: The multi-word expression we're adding into the word trie + :type mwe: tuple(str) or list(str) + + :Example: + + >>> tokenizer = MWETokenizer() + >>> tokenizer.add_mwe(('a', 'b')) + >>> tokenizer.add_mwe(('a', 'b', 'c')) + >>> tokenizer.add_mwe(('a', 'x')) + >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}} + >>> tokenizer._mwes == expected + True + + """ + self._mwes.insert(mwe) + + def tokenize(self, text): + """ + + :param text: A list containing tokenized text + :type text: list(str) + :return: A list of the tokenized text with multi-words merged together + :rtype: list(str) + + :Example: + + >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') + >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split()) + ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] + + """ + i = 0 + n = len(text) + result = [] + + while i < n: + if text[i] in self._mwes: + # possible MWE match + j = i + trie = self._mwes + last_match = -1 + while j < n and text[j] in trie: # and len(trie[text[j]]) > 0 : + trie = trie[text[j]] + j = j + 1 + if Trie.LEAF in trie: + last_match = j + else: + if last_match > -1: + j = last_match + + if Trie.LEAF in trie or last_match > -1: + # success! + result.append(self._separator.join(text[i:j])) + i = j + else: + # no match, so backtrack + result.append(text[i]) + i += 1 + else: + result.append(text[i]) + i += 1 + return result diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/nist.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/nist.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e13dad28b81d91891a838d89bcdf5a0c1ad086 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/nist.py @@ -0,0 +1,179 @@ +# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer. +# +# Copyright (C) 2001-2015 NLTK Project +# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl) +# Contributors: Ozan Caglayan, Wiktor Stribizew +# +# URL: +# For license information, see LICENSE.TXT + +""" +This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script, +https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926 +which was also ported into Python in +https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162 +""" + + +import io +import re + +from nltk.corpus import perluniprops +from nltk.tokenize.api import TokenizerI +from nltk.tokenize.util import xml_unescape + + +class NISTTokenizer(TokenizerI): + """ + This NIST tokenizer is sentence-based instead of the original + paragraph-based tokenization from mteval-14.pl; The sentence-based + tokenization is consistent with the other tokenizers available in NLTK. + + >>> from nltk.tokenize.nist import NISTTokenizer + >>> nist = NISTTokenizer() + >>> s = "Good muffins cost $3.88 in New York." + >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.'] + >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.'] + >>> nist.tokenize(s, lowercase=False) == expected_cased + True + >>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased. + True + + The international_tokenize() is the preferred function when tokenizing + non-european text, e.g. + + >>> from nltk.tokenize.nist import NISTTokenizer + >>> nist = NISTTokenizer() + + # Input strings. + >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) us a Chinese e-commerce company...' + >>> amz = u'Amazon.com, Inc. 
(/ˈæməzɒn/) is an American electronic commerce...'
+    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
+
+    # Expected tokens.
+    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
+    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
+    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
+
+    >>> nist.international_tokenize(albb)[:10] == expected_albb
+    True
+    >>> nist.international_tokenize(amz)[:10] == expected_amz
+    True
+    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
+    True
+
+    # Doctest for patching issue #1926
+    >>> sent = u'this is a foo\u2604sentence.'
+    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
+    >>> nist.international_tokenize(sent) == expected_sent
+    True
+    """
+
+    # Strip "skipped" tags.
+    STRIP_SKIP = re.compile("<skipped>"), ""
+    # Strip end-of-line hyphenation and join lines.
+    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
+    # Tokenize punctuation.
+    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
+    # Tokenize periods and commas unless preceded by a digit.
+    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
+    # Tokenize periods and commas unless followed by a digit.
+    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
+    # Tokenize dashes when preceded by a digit.
+    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
+
+    LANG_DEPENDENT_REGEXES = [
+        PUNCT,
+        PERIOD_COMMA_PRECEED,
+        PERIOD_COMMA_FOLLOW,
+        DASH_PRECEED_DIGIT,
+    ]
+
+    # Perluniprops characters used in NIST tokenizer.
+    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
+    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
+    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}
+
+    # Python regexes need to escape some special symbols;
+    # see https://stackoverflow.com/q/45670950/610569
+    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
+    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
+    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
+
+    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
+    # (i) strip trailing and heading spaces and
+    # (ii) de-duplicate spaces.
+    # In Python, this would do: ' '.join(str.strip().split())
+    # Thus, the next two lines were commented out.
+    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator')))  # i.e. \p{Zl}
+    # Separator = str(''.join(perluniprops.chars('Separator')))  # i.e. \p{Z}
+
+    # Pads runs of ASCII characters with spaces, separating them from
+    # adjacent non-ASCII text.
+    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
+    # Tokenize any punctuation unless followed AND preceded by a digit.
+    PUNCT_1 = (
+        re.compile(f"([{number_regex}])([{punct_regex}])"),
+        "\\1 \\2 ",
+    )
+    PUNCT_2 = (
+        re.compile(f"([{punct_regex}])([{number_regex}])"),
+        " \\1 \\2",
+    )
+    # Tokenize symbols
+    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "
+
+    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
+
+    def lang_independent_sub(self, text):
+        """Performs the language-independent string substitutions."""
+        # It's a strange order of regexes:
+        # it would be better to unescape after STRIP_EOL_HYPHEN,
+        # but let's keep it close to the original NIST implementation.
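+        # Concretely, the order applied below is: (1) drop <skipped> tags,
+        # (2) unescape XML entities, (3) replace U+2028 line separators
+        # with spaces, mirroring mteval-v14.pl.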
+ regexp, substitution = self.STRIP_SKIP + text = regexp.sub(substitution, text) + text = xml_unescape(text) + regexp, substitution = self.STRIP_EOL_HYPHEN + text = regexp.sub(substitution, text) + return text + + def tokenize(self, text, lowercase=False, western_lang=True, return_str=False): + text = str(text) + # Language independent regex. + text = self.lang_independent_sub(text) + # Language dependent regex. + if western_lang: + # Pad string with whitespace. + text = " " + text + " " + if lowercase: + text = text.lower() + for regexp, substitution in self.LANG_DEPENDENT_REGEXES: + text = regexp.sub(substitution, text) + # Remove contiguous whitespaces. + text = " ".join(text.split()) + # Finally, strips heading and trailing spaces + # and converts output string into unicode. + text = str(text.strip()) + return text if return_str else text.split() + + def international_tokenize( + self, text, lowercase=False, split_non_ascii=True, return_str=False + ): + text = str(text) + # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied + # first before unescaping. + regexp, substitution = self.STRIP_SKIP + text = regexp.sub(substitution, text) + regexp, substitution = self.STRIP_EOL_HYPHEN + text = regexp.sub(substitution, text) + text = xml_unescape(text) + + if lowercase: + text = text.lower() + + for regexp, substitution in self.INTERNATIONAL_REGEXES: + text = regexp.sub(substitution, text) + + # Make sure that there's only one space only between words. + # Strip leading and trailing spaces. + text = " ".join(text.strip().split()) + return text if return_str else text.split() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/punkt.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/punkt.py new file mode 100644 index 0000000000000000000000000000000000000000..47273fa08da8a650fda371cfa7cc4a8913935b89 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/punkt.py @@ -0,0 +1,1767 @@ +# Natural Language Toolkit: Punkt sentence tokenizer +# +# Copyright (C) 2001-2022 NLTK Project +# Algorithm: Kiss & Strunk (2006) +# Author: Willy (original Python port) +# Steven Bird (additions) +# Edward Loper (rewrite) +# Joel Nothman (almost rewrite) +# Arthur Darcet (fixes) +# Tom Aarsen <> (tackle ReDoS & performance issues) +# URL: +# For license information, see LICENSE.TXT + +r""" +Punkt Sentence Tokenizer + +This tokenizer divides a text into a list of sentences +by using an unsupervised algorithm to build a model for abbreviation +words, collocations, and words that start sentences. It must be +trained on a large collection of plaintext in the target language +before it can be used. + +The NLTK data package includes a pre-trained Punkt tokenizer for +English. + + >>> import nltk.data + >>> text = ''' + ... Punkt knows that the periods in Mr. Smith and Johann S. Bach + ... do not mark sentence boundaries. And sometimes sentences + ... can start with non-capitalized words. i is a good variable + ... name. + ... ''' + >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') + >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip()))) + Punkt knows that the periods in Mr. Smith and Johann S. Bach + do not mark sentence boundaries. + ----- + And sometimes sentences + can start with non-capitalized words. + ----- + i is a good variable + name. + +(Note that whitespace from the original text, including newlines, is +retained in the output.) + +Punctuation following sentences is also included by default +(from NLTK 3.0 onwards). 
It can be excluded with the realign_boundaries +flag. + + >>> text = ''' + ... (How does it deal with this parenthesis?) "It should be part of the + ... previous sentence." "(And the same with this one.)" ('And this one!') + ... "('(And (this)) '?)" [(and this. )] + ... ''' + >>> print('\n-----\n'.join( + ... sent_detector.tokenize(text.strip()))) + (How does it deal with this parenthesis?) + ----- + "It should be part of the + previous sentence." + ----- + "(And the same with this one.)" + ----- + ('And this one!') + ----- + "('(And (this)) '?)" + ----- + [(and this. )] + >>> print('\n-----\n'.join( + ... sent_detector.tokenize(text.strip(), realign_boundaries=False))) + (How does it deal with this parenthesis? + ----- + ) "It should be part of the + previous sentence. + ----- + " "(And the same with this one. + ----- + )" ('And this one! + ----- + ') + "('(And (this)) '? + ----- + )" [(and this. + ----- + )] + +However, Punkt is designed to learn parameters (a list of abbreviations, etc.) +unsupervised from a corpus similar to the target domain. The pre-packaged models +may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn +parameters from the given text. + +:class:`.PunktTrainer` learns parameters such as a list of abbreviations +(without supervision) from portions of text. Using a ``PunktTrainer`` directly +allows for incremental training and modification of the hyper-parameters used +to decide what is considered an abbreviation, etc. + +The algorithm for this tokenizer is described in:: + + Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence + Boundary Detection. Computational Linguistics 32: 485-525. +""" + +# TODO: Make orthographic heuristic less susceptible to overtraining +# TODO: Frequent sentence starters optionally exclude always-capitalised words +# FIXME: Problem with ending string with e.g. '!!!' -> '!! !' + +import math +import re +import string +from collections import defaultdict +from typing import Any, Dict, Iterator, List, Match, Optional, Tuple, Union + +from nltk.probability import FreqDist +from nltk.tokenize.api import TokenizerI + +###################################################################### +# { Orthographic Context Constants +###################################################################### +# The following constants are used to describe the orthographic +# contexts in which a word can occur. BEG=beginning, MID=middle, +# UNK=unknown, UC=uppercase, LC=lowercase, NC=no case. 
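+# A word type's ortho_context entry is the bitwise OR of these flags; for
+# example, a type seen sentence-initially in upper case and mid-sentence in
+# lower case accumulates _ORTHO_BEG_UC | _ORTHO_MID_LC == 0b100010.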
+ +_ORTHO_BEG_UC = 1 << 1 +"""Orthographic context: beginning of a sentence with upper case.""" + +_ORTHO_MID_UC = 1 << 2 +"""Orthographic context: middle of a sentence with upper case.""" + +_ORTHO_UNK_UC = 1 << 3 +"""Orthographic context: unknown position in a sentence with upper case.""" + +_ORTHO_BEG_LC = 1 << 4 +"""Orthographic context: beginning of a sentence with lower case.""" + +_ORTHO_MID_LC = 1 << 5 +"""Orthographic context: middle of a sentence with lower case.""" + +_ORTHO_UNK_LC = 1 << 6 +"""Orthographic context: unknown position in a sentence with lower case.""" + +_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC +"""Orthographic context: occurs with upper case.""" + +_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC +"""Orthographic context: occurs with lower case.""" + +_ORTHO_MAP = { + ("initial", "upper"): _ORTHO_BEG_UC, + ("internal", "upper"): _ORTHO_MID_UC, + ("unknown", "upper"): _ORTHO_UNK_UC, + ("initial", "lower"): _ORTHO_BEG_LC, + ("internal", "lower"): _ORTHO_MID_LC, + ("unknown", "lower"): _ORTHO_UNK_LC, +} +"""A map from context position and first-letter case to the +appropriate orthographic context flag.""" + +# } (end orthographic context constants) +###################################################################### + +###################################################################### +# { Decision reasons for debugging +###################################################################### + +REASON_DEFAULT_DECISION = "default decision" +REASON_KNOWN_COLLOCATION = "known collocation (both words)" +REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic" +REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter" +REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" +REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" +REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = ( + "initial + special orthographic heuristic" +) + + +# } (end decision reasons for debugging) +###################################################################### + +###################################################################### +# { Language-dependent variables +###################################################################### + + +class PunktLanguageVars: + """ + Stores variables, mostly regular expressions, which may be + language-dependent for correct application of the algorithm. + An extension of this class may modify its properties to suit + a language other than English; an instance can then be passed + as an argument to PunktSentenceTokenizer and PunktTrainer + constructors. + """ + + __slots__ = ("_re_period_context", "_re_word_tokenizer") + + def __getstate__(self): + # All modifications to the class are performed by inheritance. + # Non-default parameters to be pickled must be defined in the inherited + # class. + return 1 + + def __setstate__(self, state): + return 1 + + sent_end_chars = (".", "?", "!") + """Characters which are candidates for sentence boundaries""" + + @property + def _re_sent_end_chars(self): + return "[%s]" % re.escape("".join(self.sent_end_chars)) + + internal_punctuation = ",:;" # might want to extend this.. 
+ """sentence internal punctuation, which indicates an abbreviation if + preceded by a period-final token.""" + + re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE) + """Used to realign punctuation that should be included in a sentence + although it follows the period (or ?, !).""" + + _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]" + """Excludes some characters from starting word tokens""" + + @property + def _re_non_word_chars(self): + return r"(?:[)\";}\]\*:@\'\({\[%s])" % re.escape( + "".join(set(self.sent_end_chars) - {"."}) + ) + + """Characters that cannot appear within words""" + + _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)" + """Hyphen and ellipsis are multi-character punctuation""" + + _word_tokenize_fmt = r"""( + %(MultiChar)s + | + (?=%(WordStart)s)\S+? # Accept word characters until end is found + (?= # Sequences marking a word's end + \s| # White-space + $| # End-of-string + %(NonWord)s|%(MultiChar)s| # Punctuation + ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word + ) + | + \S + )""" + """Format of a regular expression to split punctuation from words, + excluding period.""" + + def _word_tokenizer_re(self): + """Compiles and returns a regular expression for word tokenization""" + try: + return self._re_word_tokenizer + except AttributeError: + self._re_word_tokenizer = re.compile( + self._word_tokenize_fmt + % { + "NonWord": self._re_non_word_chars, + "MultiChar": self._re_multi_char_punct, + "WordStart": self._re_word_start, + }, + re.UNICODE | re.VERBOSE, + ) + return self._re_word_tokenizer + + def word_tokenize(self, s): + """Tokenize a string to split off punctuation other than periods""" + return self._word_tokenizer_re().findall(s) + + _period_context_fmt = r""" + %(SentEndChars)s # a potential sentence ending + (?=(?P + %(NonWord)s # either other punctuation + | + \s+(?P\S+) # or whitespace and some other token + ))""" + """Format of a regular expression to find contexts including possible + sentence boundaries. Matches token which the possible sentence boundary + ends, and matches the following token within a lookahead expression.""" + + def period_context_re(self): + """Compiles and returns a regular expression to find contexts + including possible sentence boundaries.""" + try: + return self._re_period_context + except: + self._re_period_context = re.compile( + self._period_context_fmt + % { + "NonWord": self._re_non_word_chars, + "SentEndChars": self._re_sent_end_chars, + }, + re.UNICODE | re.VERBOSE, + ) + return self._re_period_context + + +_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE) +"""Matches token types that are not merely punctuation. (Types for +numeric tokens are changed to ##number## and hence contain alpha.)""" + + +# } +###################################################################### + + +# //////////////////////////////////////////////////////////// +# { Helper Functions +# //////////////////////////////////////////////////////////// + + +def _pair_iter(iterator): + """ + Yields pairs of tokens from the given iterator such that each input + token will appear as the first element in a yielded tuple. The last + pair will have None as its second element. 
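+    For example, iterating over [1, 2, 3] yields (1, 2), (2, 3), (3, None).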
+ """ + iterator = iter(iterator) + try: + prev = next(iterator) + except StopIteration: + return + for el in iterator: + yield (prev, el) + prev = el + yield (prev, None) + + +###################################################################### +# { Punkt Parameters +###################################################################### + + +class PunktParameters: + """Stores data used to perform sentence boundary detection with Punkt.""" + + def __init__(self): + self.abbrev_types = set() + """A set of word types for known abbreviations.""" + + self.collocations = set() + """A set of word type tuples for known common collocations + where the first word ends in a period. E.g., ('S.', 'Bach') + is a common collocation in a text that discusses 'Johann + S. Bach'. These count as negative evidence for sentence + boundaries.""" + + self.sent_starters = set() + """A set of word types for words that often appear at the + beginning of sentences.""" + + self.ortho_context = defaultdict(int) + """A dictionary mapping word types to the set of orthographic + contexts that word type appears in. Contexts are represented + by adding orthographic context flags: ...""" + + def clear_abbrevs(self): + self.abbrev_types = set() + + def clear_collocations(self): + self.collocations = set() + + def clear_sent_starters(self): + self.sent_starters = set() + + def clear_ortho_context(self): + self.ortho_context = defaultdict(int) + + def add_ortho_context(self, typ, flag): + self.ortho_context[typ] |= flag + + def _debug_ortho_context(self, typ): + context = self.ortho_context[typ] + if context & _ORTHO_BEG_UC: + yield "BEG-UC" + if context & _ORTHO_MID_UC: + yield "MID-UC" + if context & _ORTHO_UNK_UC: + yield "UNK-UC" + if context & _ORTHO_BEG_LC: + yield "BEG-LC" + if context & _ORTHO_MID_LC: + yield "MID-LC" + if context & _ORTHO_UNK_LC: + yield "UNK-LC" + + +###################################################################### +# { PunktToken +###################################################################### + + +class PunktToken: + """Stores a token of text with annotations produced during + sentence boundary detection.""" + + _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"] + __slots__ = ["tok", "type", "period_final"] + _properties + + def __init__(self, tok, **params): + self.tok = tok + self.type = self._get_type(tok) + self.period_final = tok.endswith(".") + + for prop in self._properties: + setattr(self, prop, None) + for k in params: + setattr(self, k, params[k]) + + # //////////////////////////////////////////////////////////// + # { Regular expressions for properties + # //////////////////////////////////////////////////////////// + # Note: [A-Za-z] is approximated by [^\W\d] in the general case. + _RE_ELLIPSIS = re.compile(r"\.\.+$") + _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$") + _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE) + _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE) + + # //////////////////////////////////////////////////////////// + # { Derived properties + # //////////////////////////////////////////////////////////// + + def _get_type(self, tok): + """Returns a case-normalized representation of the token.""" + return self._RE_NUMERIC.sub("##number##", tok.lower()) + + @property + def type_no_period(self): + """ + The type with its final period removed if it has one. 
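+        (For instance, the type "etc." maps to "etc".)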
+ """ + if len(self.type) > 1 and self.type[-1] == ".": + return self.type[:-1] + return self.type + + @property + def type_no_sentperiod(self): + """ + The type with its final period removed if it is marked as a + sentence break. + """ + if self.sentbreak: + return self.type_no_period + return self.type + + @property + def first_upper(self): + """True if the token's first character is uppercase.""" + return self.tok[0].isupper() + + @property + def first_lower(self): + """True if the token's first character is lowercase.""" + return self.tok[0].islower() + + @property + def first_case(self): + if self.first_lower: + return "lower" + if self.first_upper: + return "upper" + return "none" + + @property + def is_ellipsis(self): + """True if the token text is that of an ellipsis.""" + return self._RE_ELLIPSIS.match(self.tok) + + @property + def is_number(self): + """True if the token text is that of a number.""" + return self.type.startswith("##number##") + + @property + def is_initial(self): + """True if the token text is that of an initial.""" + return self._RE_INITIAL.match(self.tok) + + @property + def is_alpha(self): + """True if the token text is all alphabetic.""" + return self._RE_ALPHA.match(self.tok) + + @property + def is_non_punct(self): + """True if the token is either a number or is alphabetic.""" + return _re_non_punct.search(self.type) + + # //////////////////////////////////////////////////////////// + # { String representation + # //////////////////////////////////////////////////////////// + + def __repr__(self): + """ + A string representation of the token that can reproduce it + with eval(), which lists all the token's non-default + annotations. + """ + typestr = " type=%s," % repr(self.type) if self.type != self.tok else "" + + propvals = ", ".join( + f"{p}={repr(getattr(self, p))}" + for p in self._properties + if getattr(self, p) + ) + + return "{}({},{} {})".format( + self.__class__.__name__, + repr(self.tok), + typestr, + propvals, + ) + + def __str__(self): + """ + A string representation akin to that used by Kiss and Strunk. + """ + res = self.tok + if self.abbr: + res += "" + if self.ellipsis: + res += "" + if self.sentbreak: + res += "" + return res + + +###################################################################### +# { Punkt base class +###################################################################### + + +class PunktBaseClass: + """ + Includes common components of PunktTrainer and PunktSentenceTokenizer. + """ + + def __init__(self, lang_vars=None, token_cls=PunktToken, params=None): + if lang_vars is None: + lang_vars = PunktLanguageVars() + if params is None: + params = PunktParameters() + self._params = params + self._lang_vars = lang_vars + self._Token = token_cls + """The collection of parameters that determines the behavior + of the punkt tokenizer.""" + + # //////////////////////////////////////////////////////////// + # { Word tokenization + # //////////////////////////////////////////////////////////// + + def _tokenize_words(self, plaintext): + """ + Divide the given text into tokens, using the punkt word + segmentation regular expression, and generate the resulting list + of tokens augmented as three-tuples with two boolean values for whether + the given token occurs at the start of a paragraph or a new line, + respectively. 
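+        (In this implementation the augmented tokens are PunktToken instances
+        whose ``parastart`` and ``linestart`` attributes carry the two flags.)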
+ """ + parastart = False + for line in plaintext.split("\n"): + if line.strip(): + line_toks = iter(self._lang_vars.word_tokenize(line)) + + try: + tok = next(line_toks) + except StopIteration: + continue + + yield self._Token(tok, parastart=parastart, linestart=True) + parastart = False + + for tok in line_toks: + yield self._Token(tok) + else: + parastart = True + + # //////////////////////////////////////////////////////////// + # { Annotation Procedures + # //////////////////////////////////////////////////////////// + + def _annotate_first_pass( + self, tokens: Iterator[PunktToken] + ) -> Iterator[PunktToken]: + """ + Perform the first pass of annotation, which makes decisions + based purely based on the word type of each word: + + - '?', '!', and '.' are marked as sentence breaks. + - sequences of two or more periods are marked as ellipsis. + - any word ending in '.' that's a known abbreviation is + marked as an abbreviation. + - any other word ending in '.' is marked as a sentence break. + + Return these annotations as a tuple of three sets: + + - sentbreak_toks: The indices of all sentence breaks. + - abbrev_toks: The indices of all abbreviations. + - ellipsis_toks: The indices of all ellipsis marks. + """ + for aug_tok in tokens: + self._first_pass_annotation(aug_tok) + yield aug_tok + + def _first_pass_annotation(self, aug_tok: PunktToken) -> None: + """ + Performs type-based annotation on a single token. + """ + + tok = aug_tok.tok + + if tok in self._lang_vars.sent_end_chars: + aug_tok.sentbreak = True + elif aug_tok.is_ellipsis: + aug_tok.ellipsis = True + elif aug_tok.period_final and not tok.endswith(".."): + if ( + tok[:-1].lower() in self._params.abbrev_types + or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types + ): + + aug_tok.abbr = True + else: + aug_tok.sentbreak = True + + return + + +###################################################################### +# { Punkt Trainer +###################################################################### + + +class PunktTrainer(PunktBaseClass): + """Learns parameters used in Punkt sentence boundary detection.""" + + def __init__( + self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken + ): + + PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) + + self._type_fdist = FreqDist() + """A frequency distribution giving the frequency of each + case-normalized token type in the training data.""" + + self._num_period_toks = 0 + """The number of words ending in period in the training data.""" + + self._collocation_fdist = FreqDist() + """A frequency distribution giving the frequency of all + bigrams in the training data where the first word ends in a + period. Bigrams are encoded as tuples of word types. + Especially common collocations are extracted from this + frequency distribution, and stored in + ``_params``.``collocations ``.""" + + self._sent_starter_fdist = FreqDist() + """A frequency distribution giving the frequency of all words + that occur at the training data at the beginning of a sentence + (after the first pass of annotation). Especially common + sentence starters are extracted from this frequency + distribution, and stored in ``_params.sent_starters``. 
+ """ + + self._sentbreak_count = 0 + """The total number of sentence breaks identified in training, used for + calculating the frequent sentence starter heuristic.""" + + self._finalized = True + """A flag as to whether the training has been finalized by finding + collocations and sentence starters, or whether finalize_training() + still needs to be called.""" + + if train_text: + self.train(train_text, verbose, finalize=True) + + def get_params(self): + """ + Calculates and returns parameters for sentence boundary detection as + derived from training.""" + if not self._finalized: + self.finalize_training() + return self._params + + # //////////////////////////////////////////////////////////// + # { Customization Variables + # //////////////////////////////////////////////////////////// + + ABBREV = 0.3 + """cut-off value whether a 'token' is an abbreviation""" + + IGNORE_ABBREV_PENALTY = False + """allows the disabling of the abbreviation penalty heuristic, which + exponentially disadvantages words that are found at times without a + final period.""" + + ABBREV_BACKOFF = 5 + """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" + + COLLOCATION = 7.88 + """minimal log-likelihood value that two tokens need to be considered + as a collocation""" + + SENT_STARTER = 30 + """minimal log-likelihood value that a token requires to be considered + as a frequent sentence starter""" + + INCLUDE_ALL_COLLOCS = False + """this includes as potential collocations all word pairs where the first + word ends in a period. It may be useful in corpora where there is a lot + of variation that makes abbreviations like Mr difficult to identify.""" + + INCLUDE_ABBREV_COLLOCS = False + """this includes as potential collocations all word pairs where the first + word is an abbreviation. Such collocations override the orthographic + heuristic, but not the sentence starter heuristic. This is overridden by + INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials + and ordinals are considered.""" + """""" + + MIN_COLLOC_FREQ = 1 + """this sets a minimum bound on the number of times a bigram needs to + appear before it can be considered a collocation, in addition to log + likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" + + # //////////////////////////////////////////////////////////// + # { Training.. + # //////////////////////////////////////////////////////////// + + def train(self, text, verbose=False, finalize=True): + """ + Collects training data from a given text. If finalize is True, it + will determine all the parameters for sentence boundary detection. If + not, this will be delayed until get_params() or finalize_training() is + called. If verbose is True, abbreviations found will be listed. + """ + # Break the text into tokens; record which token indices correspond to + # line starts and paragraph starts; and determine their types. + self._train_tokens(self._tokenize_words(text), verbose) + if finalize: + self.finalize_training(verbose) + + def train_tokens(self, tokens, verbose=False, finalize=True): + """ + Collects training data from a given list of tokens. + """ + self._train_tokens((self._Token(t) for t in tokens), verbose) + if finalize: + self.finalize_training(verbose) + + def _train_tokens(self, tokens, verbose): + self._finalized = False + + # Ensure tokens are a list + tokens = list(tokens) + + # Find the frequency of each case-normalized type. (Don't + # strip off final periods.) 
+        # Also keep track of the number of tokens that end in periods.
+        for aug_tok in tokens:
+            self._type_fdist[aug_tok.type] += 1
+            if aug_tok.period_final:
+                self._num_period_toks += 1
+
+        # Look for new abbreviations, and for types that are no longer
+        # abbreviations.
+        unique_types = self._unique_types(tokens)
+        for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
+            if score >= self.ABBREV:
+                if is_add:
+                    self._params.abbrev_types.add(abbr)
+                    if verbose:
+                        print(f"  Abbreviation: [{score:6.4f}] {abbr}")
+            else:
+                if not is_add:
+                    self._params.abbrev_types.remove(abbr)
+                    if verbose:
+                        print(f"  Removed abbreviation: [{score:6.4f}] {abbr}")
+
+        # Make a preliminary pass through the document, marking likely
+        # sentence breaks, abbreviations, and ellipsis tokens.
+        tokens = list(self._annotate_first_pass(tokens))
+
+        # Check what contexts each word type can appear in, given the
+        # case of its first letter.
+        self._get_orthography_data(tokens)
+
+        # We need total number of sentence breaks to find sentence starters
+        self._sentbreak_count += self._get_sentbreak_count(tokens)
+
+        # The remaining heuristics relate to pairs of tokens where the first
+        # ends in a period.
+        for aug_tok1, aug_tok2 in _pair_iter(tokens):
+            if not aug_tok1.period_final or not aug_tok2:
+                continue
+
+            # Is the first token a rare abbreviation?
+            if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
+                self._params.abbrev_types.add(aug_tok1.type_no_period)
+                if verbose:
+                    print("  Rare Abbrev: %s" % aug_tok1.type)
+
+            # Does second token have a high likelihood of starting a sentence?
+            if self._is_potential_sent_starter(aug_tok2, aug_tok1):
+                self._sent_starter_fdist[aug_tok2.type] += 1
+
+            # Is this bigram a potential collocation?
+            if self._is_potential_collocation(aug_tok1, aug_tok2):
+                self._collocation_fdist[
+                    (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)
+                ] += 1
+
+    def _unique_types(self, tokens):
+        return {aug_tok.type for aug_tok in tokens}
+
+    def finalize_training(self, verbose=False):
+        """
+        Uses data that has been gathered in training to determine likely
+        collocations and sentence starters.
+        """
+        self._params.clear_sent_starters()
+        for typ, log_likelihood in self._find_sent_starters():
+            self._params.sent_starters.add(typ)
+            if verbose:
+                print(f"  Sent Starter: [{log_likelihood:6.4f}] {typ!r}")
+
+        self._params.clear_collocations()
+        for (typ1, typ2), log_likelihood in self._find_collocations():
+            self._params.collocations.add((typ1, typ2))
+            if verbose:
+                print(f"  Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}")
+
+        self._finalized = True
+
+    # ////////////////////////////////////////////////////////////
+    # { Overhead reduction
+    # ////////////////////////////////////////////////////////////
+
+    def freq_threshold(
+        self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2
+    ):
+        """
+        Allows memory use to be reduced after much training by removing data
+        about rare tokens that are unlikely to have a statistical effect with
+        further training.  Entries occurring at or above the given thresholds
+        will be retained.
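+
+        For example, to prune entries seen only once between training
+        batches (an illustrative sketch on a hypothetical trainer):
+
+            >>> trainer.freq_threshold(ortho_thresh=2, type_thresh=2,
+            ...                        colloc_thres=2, sentstart_thresh=2)  # doctest: +SKIP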
+ """ + if ortho_thresh > 1: + old_oc = self._params.ortho_context + self._params.clear_ortho_context() + for tok in self._type_fdist: + count = self._type_fdist[tok] + if count >= ortho_thresh: + self._params.ortho_context[tok] = old_oc[tok] + + self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh) + self._collocation_fdist = self._freq_threshold( + self._collocation_fdist, colloc_thres + ) + self._sent_starter_fdist = self._freq_threshold( + self._sent_starter_fdist, sentstart_thresh + ) + + def _freq_threshold(self, fdist, threshold): + """ + Returns a FreqDist containing only data with counts below a given + threshold, as well as a mapping (None -> count_removed). + """ + # We assume that there is more data below the threshold than above it + # and so create a new FreqDist rather than working in place. + res = FreqDist() + num_removed = 0 + for tok in fdist: + count = fdist[tok] + if count < threshold: + num_removed += 1 + else: + res[tok] += count + res[None] += num_removed + return res + + # //////////////////////////////////////////////////////////// + # { Orthographic data + # //////////////////////////////////////////////////////////// + + def _get_orthography_data(self, tokens): + """ + Collect information about whether each token type occurs + with different case patterns (i) overall, (ii) at + sentence-initial positions, and (iii) at sentence-internal + positions. + """ + # 'initial' or 'internal' or 'unknown' + context = "internal" + tokens = list(tokens) + + for aug_tok in tokens: + # If we encounter a paragraph break, then it's a good sign + # that it's a sentence break. But err on the side of + # caution (by not positing a sentence break) if we just + # saw an abbreviation. + if aug_tok.parastart and context != "unknown": + context = "initial" + + # If we're at the beginning of a line, then we can't decide + # between 'internal' and 'initial'. + if aug_tok.linestart and context == "internal": + context = "unknown" + + # Find the case-normalized type of the token. If it's a + # sentence-final token, strip off the period. + typ = aug_tok.type_no_sentperiod + + # Update the orthographic context table. + flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0) + if flag: + self._params.add_ortho_context(typ, flag) + + # Decide whether the next word is at a sentence boundary. + if aug_tok.sentbreak: + if not (aug_tok.is_number or aug_tok.is_initial): + context = "initial" + else: + context = "unknown" + elif aug_tok.ellipsis or aug_tok.abbr: + context = "unknown" + else: + context = "internal" + + # //////////////////////////////////////////////////////////// + # { Abbreviations + # //////////////////////////////////////////////////////////// + + def _reclassify_abbrev_types(self, types): + """ + (Re)classifies each given token if + - it is period-final and not a known abbreviation; or + - it is not period-final and is otherwise a known abbreviation + by checking whether its previous classification still holds according + to the heuristics of section 3. + Yields triples (abbr, score, is_add) where abbr is the type in question, + score is its log-likelihood with penalties applied, and is_add specifies + whether the present type is a candidate for inclusion or exclusion as an + abbreviation, such that: + - (is_add and score >= 0.3) suggests a new abbreviation; and + - (not is_add and score < 0.3) suggests excluding an abbreviation. 
+ """ + # (While one could recalculate abbreviations from all .-final tokens at + # every iteration, in cases requiring efficiency, the number of tokens + # in the present training document will be much less.) + + for typ in types: + # Check some basic conditions, to rule out words that are + # clearly not abbrev_types. + if not _re_non_punct.search(typ) or typ == "##number##": + continue + + if typ.endswith("."): + if typ in self._params.abbrev_types: + continue + typ = typ[:-1] + is_add = True + else: + if typ not in self._params.abbrev_types: + continue + is_add = False + + # Count how many periods & nonperiods are in the + # candidate. + num_periods = typ.count(".") + 1 + num_nonperiods = len(typ) - num_periods + 1 + + # Let be the candidate without the period, and + # be the period. Find a log likelihood ratio that + # indicates whether occurs as a single unit (high + # value of log_likelihood), or as two independent units and + # (low value of log_likelihood). + count_with_period = self._type_fdist[typ + "."] + count_without_period = self._type_fdist[typ] + log_likelihood = self._dunning_log_likelihood( + count_with_period + count_without_period, + self._num_period_toks, + count_with_period, + self._type_fdist.N(), + ) + + # Apply three scaling factors to 'tweak' the basic log + # likelihood ratio: + # F_length: long word -> less likely to be an abbrev + # F_periods: more periods -> more likely to be an abbrev + # F_penalty: penalize occurrences w/o a period + f_length = math.exp(-num_nonperiods) + f_periods = num_periods + f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow( + num_nonperiods, -count_without_period + ) + score = log_likelihood * f_length * f_periods * f_penalty + + yield typ, score, is_add + + def find_abbrev_types(self): + """ + Recalculates abbreviations given type frequencies, despite no prior + determination of abbreviations. + This fails to include abbreviations otherwise found as "rare". + """ + self._params.clear_abbrevs() + tokens = (typ for typ in self._type_fdist if typ and typ.endswith(".")) + for abbr, score, _is_add in self._reclassify_abbrev_types(tokens): + if score >= self.ABBREV: + self._params.abbrev_types.add(abbr) + + # This function combines the work done by the original code's + # functions `count_orthography_context`, `get_orthography_count`, + # and `get_rare_abbreviations`. + def _is_rare_abbrev_type(self, cur_tok, next_tok): + """ + A word type is counted as a rare abbreviation if... + - it's not already marked as an abbreviation + - it occurs fewer than ABBREV_BACKOFF times + - either it is followed by a sentence-internal punctuation + mark, *or* it is followed by a lower-case word that + sometimes appears with upper case, but never occurs with + lower case at the beginning of sentences. + """ + if cur_tok.abbr or not cur_tok.sentbreak: + return False + + # Find the case-normalized type of the token. If it's + # a sentence-final token, strip off the period. + typ = cur_tok.type_no_sentperiod + + # Proceed only if the type hasn't been categorized as an + # abbreviation already, and is sufficiently rare... + count = self._type_fdist[typ] + self._type_fdist[typ[:-1]] + if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF: + return False + + # Record this token as an abbreviation if the next + # token is a sentence-internal punctuation mark. + # [XX] :1 or check the whole thing?? + if next_tok.tok[:1] in self._lang_vars.internal_punctuation: + return True + + # Record this type as an abbreviation if the next + # token... 
+        # (i) starts with a lower case letter,
+        # (ii) sometimes occurs with an uppercase letter,
+        # and (iii) never occurs with an uppercase letter
+        # sentence-internally.
+        # [xx] should the check for (ii) be modified??
+        if next_tok.first_lower:
+            typ2 = next_tok.type_no_sentperiod
+            typ2ortho_context = self._params.ortho_context[typ2]
+            if (typ2ortho_context & _ORTHO_BEG_UC) and not (
+                typ2ortho_context & _ORTHO_MID_UC
+            ):
+                return True
+
+        return False
+
+    # ////////////////////////////////////////////////////////////
+    # { Log Likelihoods
+    # ////////////////////////////////////////////////////////////
+
+    # helper for _reclassify_abbrev_types:
+    @staticmethod
+    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
+        """
+        A function that calculates the modified Dunning log-likelihood
+        ratio scores for abbreviation candidates.  The details of how
+        this works are available in the paper.
+        """
+        p1 = count_b / N
+        p2 = 0.99
+
+        null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)
+        alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)
+
+        likelihood = null_hypo - alt_hypo
+
+        return -2.0 * likelihood
+
+    @staticmethod
+    def _col_log_likelihood(count_a, count_b, count_ab, N):
+        """
+        A function that computes the log-likelihood estimate described in
+        algorithms 6 and 7 of the original paper.
+
+        This *should* be the original Dunning log-likelihood value, unlike
+        _dunning_log_likelihood above, which uses a modified Dunning
+        log-likelihood value.
+        """
+        p = count_b / N
+        p1 = count_ab / count_a
+        try:
+            p2 = (count_b - count_ab) / (N - count_a)
+        except ZeroDivisionError:
+            p2 = 1
+
+        try:
+            summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
+        except ValueError:
+            summand1 = 0
+
+        try:
+            summand2 = (count_b - count_ab) * math.log(p) + (
+                N - count_a - count_b + count_ab
+            ) * math.log(1.0 - p)
+        except ValueError:
+            summand2 = 0
+
+        if count_a == count_ab or p1 <= 0 or p1 >= 1:
+            summand3 = 0
+        else:
+            summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log(
+                1.0 - p1
+            )
+
+        if count_b == count_ab or p2 <= 0 or p2 >= 1:
+            summand4 = 0
+        else:
+            summand4 = (count_b - count_ab) * math.log(p2) + (
+                N - count_a - count_b + count_ab
+            ) * math.log(1.0 - p2)
+
+        likelihood = summand1 + summand2 - summand3 - summand4
+
+        return -2.0 * likelihood
+
+    # ////////////////////////////////////////////////////////////
+    # { Collocation Finder
+    # ////////////////////////////////////////////////////////////
+
+    def _is_potential_collocation(self, aug_tok1, aug_tok2):
+        """
+        Returns True if the pair of tokens may form a collocation given
+        log-likelihood statistics.
+        """
+        return (
+            (
+                self.INCLUDE_ALL_COLLOCS
+                or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr)
+                or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial))
+            )
+            and aug_tok1.is_non_punct
+            and aug_tok2.is_non_punct
+        )
+
+    def _find_collocations(self):
+        """
+        Generates likely collocations and their log-likelihood.
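+
+        An illustrative sketch (hypothetical trainer state; the
+        log-likelihood value shown is made up):
+
+            >>> list(trainer._find_collocations())  # doctest: +SKIP
+            [(('s', 'bach'), 8.21)]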
+ """ + for types in self._collocation_fdist: + try: + typ1, typ2 = types + except TypeError: + # types may be None after calling freq_threshold() + continue + if typ2 in self._params.sent_starters: + continue + + col_count = self._collocation_fdist[types] + typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."] + typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."] + if ( + typ1_count > 1 + and typ2_count > 1 + and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count) + ): + + log_likelihood = self._col_log_likelihood( + typ1_count, typ2_count, col_count, self._type_fdist.N() + ) + # Filter out the not-so-collocative + if log_likelihood >= self.COLLOCATION and ( + self._type_fdist.N() / typ1_count > typ2_count / col_count + ): + yield (typ1, typ2), log_likelihood + + # //////////////////////////////////////////////////////////// + # { Sentence-Starter Finder + # //////////////////////////////////////////////////////////// + + def _is_potential_sent_starter(self, cur_tok, prev_tok): + """ + Returns True given a token and the token that precedes it if it + seems clear that the token is beginning a sentence. + """ + # If a token (i) is preceded by a sentece break that is + # not a potential ordinal number or initial, and (ii) is + # alphabetic, then it is a a sentence-starter. + return ( + prev_tok.sentbreak + and not (prev_tok.is_number or prev_tok.is_initial) + and cur_tok.is_alpha + ) + + def _find_sent_starters(self): + """ + Uses collocation heuristics for each candidate token to + determine if it frequently starts sentences. + """ + for typ in self._sent_starter_fdist: + if not typ: + continue + + typ_at_break_count = self._sent_starter_fdist[typ] + typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."] + if typ_count < typ_at_break_count: + # needed after freq_threshold + continue + + log_likelihood = self._col_log_likelihood( + self._sentbreak_count, + typ_count, + typ_at_break_count, + self._type_fdist.N(), + ) + + if ( + log_likelihood >= self.SENT_STARTER + and self._type_fdist.N() / self._sentbreak_count + > typ_count / typ_at_break_count + ): + yield typ, log_likelihood + + def _get_sentbreak_count(self, tokens): + """ + Returns the number of sentence breaks marked in a given set of + augmented tokens. + """ + return sum(1 for aug_tok in tokens if aug_tok.sentbreak) + + +###################################################################### +# { Punkt Sentence Tokenizer +###################################################################### + + +class PunktSentenceTokenizer(PunktBaseClass, TokenizerI): + """ + A sentence tokenizer which uses an unsupervised algorithm to build + a model for abbreviation words, collocations, and words that start + sentences; and then uses that model to find sentence boundaries. + This approach has been shown to work well for many European + languages. + """ + + def __init__( + self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken + ): + """ + train_text can either be the sole training text for this sentence + boundary detector, or can be a PunktParameters object. + """ + PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) + + if train_text: + self._params = self.train(train_text, verbose) + + def train(self, train_text, verbose=False): + """ + Derives parameters from a given training text, or uses the parameters + given. Repeated calls to this method destroy previous parameters. For + incremental training, instantiate a separate PunktTrainer instance. 
+ """ + if not isinstance(train_text, str): + return train_text + return PunktTrainer( + train_text, lang_vars=self._lang_vars, token_cls=self._Token + ).get_params() + + # //////////////////////////////////////////////////////////// + # { Tokenization + # //////////////////////////////////////////////////////////// + + def tokenize(self, text: str, realign_boundaries: bool = True) -> List[str]: + """ + Given a text, returns a list of the sentences in that text. + """ + return list(self.sentences_from_text(text, realign_boundaries)) + + def debug_decisions(self, text: str) -> Iterator[Dict[str, Any]]: + """ + Classifies candidate periods as sentence breaks, yielding a dict for + each that may be used to understand why the decision was made. + + See format_debug_decision() to help make this output readable. + """ + + for match, decision_text in self._match_potential_end_contexts(text): + tokens = self._tokenize_words(decision_text) + tokens = list(self._annotate_first_pass(tokens)) + while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars): + tokens.pop(0) + yield { + "period_index": match.end() - 1, + "text": decision_text, + "type1": tokens[0].type, + "type2": tokens[1].type, + "type1_in_abbrs": bool(tokens[0].abbr), + "type1_is_initial": bool(tokens[0].is_initial), + "type2_is_sent_starter": tokens[1].type_no_sentperiod + in self._params.sent_starters, + "type2_ortho_heuristic": self._ortho_heuristic(tokens[1]), + "type2_ortho_contexts": set( + self._params._debug_ortho_context(tokens[1].type_no_sentperiod) + ), + "collocation": ( + tokens[0].type_no_sentperiod, + tokens[1].type_no_sentperiod, + ) + in self._params.collocations, + "reason": self._second_pass_annotation(tokens[0], tokens[1]) + or REASON_DEFAULT_DECISION, + "break_decision": tokens[0].sentbreak, + } + + def span_tokenize( + self, text: str, realign_boundaries: bool = True + ) -> Iterator[Tuple[int, int]]: + """ + Given a text, generates (start, end) spans of sentences + in the text. + """ + slices = self._slices_from_text(text) + if realign_boundaries: + slices = self._realign_boundaries(text, slices) + for sentence in slices: + yield (sentence.start, sentence.stop) + + def sentences_from_text( + self, text: str, realign_boundaries: bool = True + ) -> List[str]: + """ + Given a text, generates the sentences in that text by only + testing candidate sentence breaks. If realign_boundaries is + True, includes in the sentence closing punctuation that + follows the period. + """ + return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)] + + def _get_last_whitespace_index(self, text: str) -> int: + """ + Given a text, find the index of the *last* occurrence of *any* + whitespace character, i.e. " ", "\n", "\t", "\r", etc. + If none is found, return 0. + """ + for i in range(len(text) - 1, -1, -1): + if text[i] in string.whitespace: + return i + return 0 + + def _match_potential_end_contexts(self, text: str) -> Iterator[Tuple[Match, str]]: + """ + Given a text, find the matches of potential sentence breaks, + alongside the contexts surrounding these sentence breaks. + + Since the fix for the ReDOS discovered in issue #2866, we no longer match + the word before a potential end of sentence token. Instead, we use a separate + regex for this. As a consequence, `finditer`'s desire to find non-overlapping + matches no longer aids us in finding the single longest match. + Where previously, we could use:: + + >>> pst = PunktSentenceTokenizer() + >>> text = "Very bad acting!!! I promise." 
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP
+            [<re.Match object; span=(9, 18), match='acting!!!'>]
+
+        Now we have to find the word before (i.e. 'acting') separately, and `finditer`
+        returns::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE
+            [<re.Match object; span=(15, 16), match='!'>,
+             <re.Match object; span=(16, 17), match='!'>,
+             <re.Match object; span=(17, 18), match='!'>]
+
+        So, we need to find the word before the match from right to left, and then manually remove
+        the overlaps.  That is what this method does::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._match_potential_end_contexts(text))
+            [(<re.Match object; span=(17, 18), match='!'>, 'acting!!! I')]
+
+        :param text: String of one or more sentences
+        :type text: str
+        :return: Generator of match-context tuples.
+        :rtype: Iterator[Tuple[Match, str]]
+        """
+        previous_slice = slice(0, 0)
+        previous_match = None
+        for match in self._lang_vars.period_context_re().finditer(text):
+
+            # Get the slice of the previous word
+            before_text = text[previous_slice.stop : match.start()]
+            index_after_last_space = self._get_last_whitespace_index(before_text)
+            if index_after_last_space:
+                # + 1 to exclude the space itself
+                index_after_last_space += previous_slice.stop + 1
+            else:
+                index_after_last_space = previous_slice.start
+            prev_word_slice = slice(index_after_last_space, match.start())
+
+            # If the previous slice does not overlap with this slice, then
+            # we can yield the previous match and slice.  If there is an overlap,
+            # then we do not yield the previous match and slice.
+            if previous_match and previous_slice.stop <= prev_word_slice.start:
+                yield (
+                    previous_match,
+                    text[previous_slice]
+                    + previous_match.group()
+                    + previous_match.group("after_tok"),
+                )
+            previous_match = match
+            previous_slice = prev_word_slice
+
+        # Yield the last match and context, if it exists
+        if previous_match:
+            yield (
+                previous_match,
+                text[previous_slice]
+                + previous_match.group()
+                + previous_match.group("after_tok"),
+            )
+
+    def _slices_from_text(self, text: str) -> Iterator[slice]:
+        last_break = 0
+        for match, context in self._match_potential_end_contexts(text):
+            if self.text_contains_sentbreak(context):
+                yield slice(last_break, match.end())
+                if match.group("next_tok"):
+                    # next sentence starts after whitespace
+                    last_break = match.start("next_tok")
+                else:
+                    # next sentence starts at following punctuation
+                    last_break = match.end()
+        # The last sentence should not contain trailing whitespace.
+        yield slice(last_break, len(text.rstrip()))
+
+    def _realign_boundaries(
+        self, text: str, slices: Iterator[slice]
+    ) -> Iterator[slice]:
+        """
+        Attempts to realign punctuation that falls after the period but
+        should otherwise be included in the same sentence.
+
+        For example: "(Sent1.) Sent2." will otherwise be split as::
+
+            ["(Sent1.", ") Sent2."].
+
+        This method will produce::
+
+            ["(Sent1.)", "Sent2."].
+        """
+        realign = 0
+        for sentence1, sentence2 in _pair_iter(slices):
+            sentence1 = slice(sentence1.start + realign, sentence1.stop)
+            if not sentence2:
+                if text[sentence1]:
+                    yield sentence1
+                continue
+
+            m = self._lang_vars.re_boundary_realignment.match(text[sentence2])
+            if m:
+                yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip()))
+                realign = m.end()
+            else:
+                realign = 0
+                if text[sentence1]:
+                    yield sentence1
+
+    def text_contains_sentbreak(self, text: str) -> bool:
+        """
+        Returns True if the given text includes a sentence break.
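+
+        A minimal sketch (untrained tokenizer, so the first-pass rules
+        treat 'arrived.' as a sentence break):
+
+            >>> pst = PunktSentenceTokenizer()  # doctest: +SKIP
+            >>> pst.text_contains_sentbreak("He arrived. He sat down.")  # doctest: +SKIP
+            True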
+ """ + found = False # used to ignore last token + for tok in self._annotate_tokens(self._tokenize_words(text)): + if found: + return True + if tok.sentbreak: + found = True + return False + + def sentences_from_text_legacy(self, text: str) -> Iterator[str]: + """ + Given a text, generates the sentences in that text. Annotates all + tokens, rather than just those with possible sentence breaks. Should + produce the same results as ``sentences_from_text``. + """ + tokens = self._annotate_tokens(self._tokenize_words(text)) + return self._build_sentence_list(text, tokens) + + def sentences_from_tokens( + self, tokens: Iterator[PunktToken] + ) -> Iterator[PunktToken]: + """ + Given a sequence of tokens, generates lists of tokens, each list + corresponding to a sentence. + """ + tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens)) + sentence = [] + for aug_tok in tokens: + sentence.append(aug_tok.tok) + if aug_tok.sentbreak: + yield sentence + sentence = [] + if sentence: + yield sentence + + def _annotate_tokens(self, tokens: Iterator[PunktToken]) -> Iterator[PunktToken]: + """ + Given a set of tokens augmented with markers for line-start and + paragraph-start, returns an iterator through those tokens with full + annotation including predicted sentence breaks. + """ + # Make a preliminary pass through the document, marking likely + # sentence breaks, abbreviations, and ellipsis tokens. + tokens = self._annotate_first_pass(tokens) + + # Make a second pass through the document, using token context + # information to change our preliminary decisions about where + # sentence breaks, abbreviations, and ellipsis occurs. + tokens = self._annotate_second_pass(tokens) + + ## [XX] TESTING + # tokens = list(tokens) + # self.dump(tokens) + + return tokens + + def _build_sentence_list( + self, text: str, tokens: Iterator[PunktToken] + ) -> Iterator[str]: + """ + Given the original text and the list of augmented word tokens, + construct and return a tokenized list of sentence strings. + """ + # Most of the work here is making sure that we put the right + # pieces of whitespace back in all the right places. + + # Our position in the source text, used to keep track of which + # whitespace to add: + pos = 0 + + # A regular expression that finds pieces of whitespace: + white_space_regexp = re.compile(r"\s*") + + sentence = "" + for aug_tok in tokens: + tok = aug_tok.tok + + # Find the whitespace before this token, and update pos. + white_space = white_space_regexp.match(text, pos).group() + pos += len(white_space) + + # Some of the rules used by the punkt word tokenizer + # strip whitespace out of the text, resulting in tokens + # that contain whitespace in the source text. If our + # token doesn't match, see if adding whitespace helps. + # If so, then use the version with whitespace. + if text[pos : pos + len(tok)] != tok: + pat = r"\s*".join(re.escape(c) for c in tok) + m = re.compile(pat).match(text, pos) + if m: + tok = m.group() + + # Move our position pointer to the end of the token. + assert text[pos : pos + len(tok)] == tok + pos += len(tok) + + # Add this token. If it's not at the beginning of the + # sentence, then include any whitespace that separated it + # from the previous token. + if sentence: + sentence += white_space + sentence += tok + + # If we're at a sentence break, then start a new sentence. + if aug_tok.sentbreak: + yield sentence + sentence = "" + + # If the last sentence is empty, discard it. 
+        if sentence:
+            yield sentence
+
+    # [XX] TESTING
+    def dump(self, tokens: Iterator[PunktToken]) -> None:
+        print("writing to /tmp/punkt.new...")
+        with open("/tmp/punkt.new", "w") as outfile:
+            for aug_tok in tokens:
+                if aug_tok.parastart:
+                    outfile.write("\n\n")
+                elif aug_tok.linestart:
+                    outfile.write("\n")
+                else:
+                    outfile.write(" ")
+
+                outfile.write(str(aug_tok))
+
+    # ////////////////////////////////////////////////////////////
+    # { Customization Variables
+    # ////////////////////////////////////////////////////////////
+
+    PUNCTUATION = tuple(";:,.!?")
+
+    # ////////////////////////////////////////////////////////////
+    # { Annotation Procedures
+    # ////////////////////////////////////////////////////////////
+
+    def _annotate_second_pass(
+        self, tokens: Iterator[PunktToken]
+    ) -> Iterator[PunktToken]:
+        """
+        Performs a token-based classification (section 4) over the given
+        tokens, making use of the orthographic heuristic (4.1.1), collocation
+        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
+        """
+        for token1, token2 in _pair_iter(tokens):
+            self._second_pass_annotation(token1, token2)
+            yield token1
+
+    def _second_pass_annotation(
+        self, aug_tok1: PunktToken, aug_tok2: Optional[PunktToken]
+    ) -> Optional[str]:
+        """
+        Performs token-based classification over a pair of contiguous tokens
+        updating the first.
+        """
+        # Is it the last token?  We can't do anything then.
+        if not aug_tok2:
+            return
+
+        if not aug_tok1.period_final:
+            # We only care about words ending in periods.
+            return
+        typ = aug_tok1.type_no_period
+        next_typ = aug_tok2.type_no_sentperiod
+        tok_is_initial = aug_tok1.is_initial
+
+        # [4.1.2. Collocation Heuristic] If there's a
+        # collocation between the words before and after the
+        # period, then label tok as an abbreviation and NOT
+        # a sentence break.  Note that collocations with
+        # frequent sentence starters as their second word are
+        # excluded in training.
+        if (typ, next_typ) in self._params.collocations:
+            aug_tok1.sentbreak = False
+            aug_tok1.abbr = True
+            return REASON_KNOWN_COLLOCATION
+
+        # [4.2. Token-Based Reclassification of Abbreviations] If
+        # the token is an abbreviation or an ellipsis, then decide
+        # whether we should *also* classify it as a sentbreak.
+        if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial):
+            # [4.1.1. Orthographic Heuristic] Check if there's
+            # orthographic evidence about whether the next word
+            # starts a sentence or not.
+            is_sent_starter = self._ortho_heuristic(aug_tok2)
+            if is_sent_starter == True:
+                aug_tok1.sentbreak = True
+                return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC
+
+            # [4.1.3. Frequent Sentence Starter Heuristic] If the
+            # next word is capitalized, and is a member of the
+            # frequent-sentence-starters list, then label tok as a
+            # sentence break.
+            if aug_tok2.first_upper and next_typ in self._params.sent_starters:
+                aug_tok1.sentbreak = True
+                return REASON_ABBR_WITH_SENTENCE_STARTER
+
+        # [4.3. Token-Based Detection of Initials and Ordinals]
+        # Check if any initials or ordinals tokens that are marked
+        # as sentbreaks should be reclassified as abbreviations.
+        if tok_is_initial or typ == "##number##":
+
+            # [4.1.1. Orthographic Heuristic] Check if there's
+            # orthographic evidence about whether the next word
+            # starts a sentence or not.
+            is_sent_starter = self._ortho_heuristic(aug_tok2)
+
+            if is_sent_starter == False:
+                aug_tok1.sentbreak = False
+                aug_tok1.abbr = True
+                if tok_is_initial:
+                    return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
+                return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
+
+            # Special heuristic for initials: if the orthographic
+            # heuristic is unknown, and the next word is always
+            # capitalized, then mark as abbrev (e.g. J. Bach).
+            if (
+                is_sent_starter == "unknown"
+                and tok_is_initial
+                and aug_tok2.first_upper
+                and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
+            ):
+                aug_tok1.sentbreak = False
+                aug_tok1.abbr = True
+                return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC
+
+        return
+
+    def _ortho_heuristic(self, aug_tok: PunktToken) -> Union[bool, str]:
+        """
+        Decide whether the given token is the first token in a sentence.
+        """
+        # Sentences don't start with punctuation marks:
+        if aug_tok.tok in self.PUNCTUATION:
+            return False
+
+        ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
+
+        # If the word is capitalized, occurs at least once with a
+        # lower case first letter, and never occurs with an upper case
+        # first letter sentence-internally, then it's a sentence starter.
+        if (
+            aug_tok.first_upper
+            and (ortho_context & _ORTHO_LC)
+            and not (ortho_context & _ORTHO_MID_UC)
+        ):
+            return True
+
+        # If the word is lower case, and either (a) we've seen it used
+        # with upper case, or (b) we've never seen it used
+        # sentence-initially with lower case, then it's not a sentence
+        # starter.
+        if aug_tok.first_lower and (
+            (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC)
+        ):
+            return False
+
+        # Otherwise, we're not sure.
+        return "unknown"
+
+
+DEBUG_DECISION_FMT = """Text: {text!r} (at offset {period_index})
+Sentence break? {break_decision} ({reason})
+Collocation? {collocation}
+{type1!r}:
+    known abbreviation: {type1_in_abbrs}
+    is initial: {type1_is_initial}
+{type2!r}:
+    known sentence starter: {type2_is_sent_starter}
+    orthographic heuristic suggests is a sentence starter? {type2_ortho_heuristic}
+    orthographic contexts in training: {type2_ortho_contexts}
+"""
+
+
+def format_debug_decision(d):
+    return DEBUG_DECISION_FMT.format(**d)
+
+
+def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
+    """Builds a punkt model and applies it to the same text"""
+    cleanup = (
+        lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ")
+    )
+    trainer = train_cls()
+    trainer.INCLUDE_ALL_COLLOCS = True
+    trainer.train(text)
+    sbd = tok_cls(trainer.get_params())
+    for sentence in sbd.sentences_from_text(text):
+        print(cleanup(sentence))
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/regexp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/regexp.py
new file mode 100644
index 0000000000000000000000000000000000000000..09be6785141cc440a74d552bb6008d4a41b57a5a
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/regexp.py
@@ -0,0 +1,220 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+#         Steven Bird
+#         Trevor Cohn
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+r"""
+Regular-Expression Tokenizers
+
+A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
+For example, the following tokenizer forms tokens out of alphabetic sequences,
+money expressions, and any other non-whitespace sequences:
+
+    >>> from nltk.tokenize import RegexpTokenizer
+    >>> s = "Good muffins cost $3.88\nin New York.
Please buy me\ntwo of them.\n\nThanks." + >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') + >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', + 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + +A ``RegexpTokenizer`` can use its regexp to match delimiters instead: + + >>> tokenizer = RegexpTokenizer(r'\s+', gaps=True) + >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', + 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] + +Note that empty tokens are not returned when the delimiter appears at +the start or end of the string. + +The material between the tokens is discarded. For example, +the following tokenizer selects just the capitalized words: + + >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+') + >>> capword_tokenizer.tokenize(s) + ['Good', 'New', 'York', 'Please', 'Thanks'] + +This module contains several subclasses of ``RegexpTokenizer`` +that use pre-defined regular expressions. + + >>> from nltk.tokenize import BlanklineTokenizer + >>> # Uses '\s*\n\s*\n\s*': + >>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', + 'Thanks.'] + +All of the regular expression tokenizers are also available as functions: + + >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize + >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', + 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', + '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + >>> blankline_tokenize(s) + ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] + +Caution: The function ``regexp_tokenize()`` takes the text as its +first argument, and the regular expression pattern as its second +argument. This differs from the conventions used by Python's +``re`` functions, where the pattern is always the first argument. +(This is for consistency with the other NLTK tokenizers.) +""" + +import re + +from nltk.tokenize.api import TokenizerI +from nltk.tokenize.util import regexp_span_tokenize + + +class RegexpTokenizer(TokenizerI): + r""" + A tokenizer that splits a string using a regular expression, which + matches either the tokens or the separators between tokens. + + >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') + + :type pattern: str + :param pattern: The pattern used to build this tokenizer. + (This pattern must not contain capturing parentheses; + Use non-capturing parentheses, e.g. (?:...), instead) + :type gaps: bool + :param gaps: True if this tokenizer's pattern should be used + to find separators between tokens; False if this + tokenizer's pattern should be used to find the tokens + themselves. + :type discard_empty: bool + :param discard_empty: True if any empty tokens `''` + generated by the tokenizer should be discarded. Empty + tokens can only be generated if `_gaps == True`. + :type flags: int + :param flags: The regexp flags used to compile this + tokenizer's pattern. By default, the following flags are + used: `re.UNICODE | re.MULTILINE | re.DOTALL`. 
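+
+    A short sketch of both matching modes (the outputs follow from the
+    ``gaps`` semantics described above):
+
+    >>> RegexpTokenizer(r'\w+').tokenize("cost-effective plans")
+    ['cost', 'effective', 'plans']
+    >>> RegexpTokenizer(r'-', gaps=True).tokenize("cost-effective")
+    ['cost', 'effective']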
+ + """ + + def __init__( + self, + pattern, + gaps=False, + discard_empty=True, + flags=re.UNICODE | re.MULTILINE | re.DOTALL, + ): + # If they gave us a regexp object, extract the pattern. + pattern = getattr(pattern, "pattern", pattern) + + self._pattern = pattern + self._gaps = gaps + self._discard_empty = discard_empty + self._flags = flags + self._regexp = None + + def _check_regexp(self): + if self._regexp is None: + self._regexp = re.compile(self._pattern, self._flags) + + def tokenize(self, text): + self._check_regexp() + # If our regexp matches gaps, use re.split: + if self._gaps: + if self._discard_empty: + return [tok for tok in self._regexp.split(text) if tok] + else: + return self._regexp.split(text) + + # If our regexp matches tokens, use re.findall: + else: + return self._regexp.findall(text) + + def span_tokenize(self, text): + self._check_regexp() + + if self._gaps: + for left, right in regexp_span_tokenize(text, self._regexp): + if not (self._discard_empty and left == right): + yield left, right + else: + for m in re.finditer(self._regexp, text): + yield m.span() + + def __repr__(self): + return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format( + self.__class__.__name__, + self._pattern, + self._gaps, + self._discard_empty, + self._flags, + ) + + +class WhitespaceTokenizer(RegexpTokenizer): + r""" + Tokenize a string on whitespace (space, tab, newline). + In general, users should use the string ``split()`` method instead. + + >>> from nltk.tokenize import WhitespaceTokenizer + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." + >>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', + 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] + """ + + def __init__(self): + RegexpTokenizer.__init__(self, r"\s+", gaps=True) + + +class BlanklineTokenizer(RegexpTokenizer): + """ + Tokenize a string, treating any sequence of blank lines as a delimiter. + Blank lines are defined as lines containing no characters, except for + space or tab characters. + """ + + def __init__(self): + RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True) + + +class WordPunctTokenizer(RegexpTokenizer): + r""" + Tokenize a text into a sequence of alphabetic and + non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``. + + >>> from nltk.tokenize import WordPunctTokenizer + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." + >>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE + ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', + '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + """ + + def __init__(self): + RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+") + + +###################################################################### +# { Tokenization Functions +###################################################################### + + +def regexp_tokenize( + text, + pattern, + gaps=False, + discard_empty=True, + flags=re.UNICODE | re.MULTILINE | re.DOTALL, +): + """ + Return a tokenized copy of *text*. See :class:`.RegexpTokenizer` + for descriptions of the arguments. 
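+
+    A quick example (equivalent to ``RegexpTokenizer(pattern).tokenize(text)``,
+    per the body below):
+
+    >>> regexp_tokenize("Hello world!", pattern=r"\w+")
+    ['Hello', 'world']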
+ """ + tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags) + return tokenizer.tokenize(text) + + +blankline_tokenize = BlanklineTokenizer().tokenize +wordpunct_tokenize = WordPunctTokenizer().tokenize diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/repp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/repp.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0740a94645f14ec6162814cdb3c92167f503bb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/repp.py @@ -0,0 +1,149 @@ +# Natural Language Toolkit: Interface to the Repp Tokenizer +# +# Copyright (C) 2001-2015 NLTK Project +# Authors: Rebecca Dridan and Stephan Oepen +# Contributors: Liling Tan +# +# URL: +# For license information, see LICENSE.TXT + +import os +import re +import subprocess +import sys +import tempfile + +from nltk.data import ZipFilePathPointer +from nltk.internals import find_dir +from nltk.tokenize.api import TokenizerI + + +class ReppTokenizer(TokenizerI): + """ + A class for word tokenization using the REPP parser described in + Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a + Long Solved Problem - A Survey, Contrastive Experiment, Recommendations, + and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406 + + >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' , + ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' , + ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.' + ... ] + >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP + >>> for sent in sents: # doctest: +SKIP + ... tokenizer.tokenize(sent) # doctest: +SKIP + ... + (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') + (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') + (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') + + >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP + ... print(sent) # doctest: +SKIP + ... + (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') + (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') + (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') + >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP + ... print(sent) # doctest: +SKIP + ... 
+    [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
+    [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
+    [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
+    """
+
+    def __init__(self, repp_dir, encoding="utf8"):
+        self.repp_dir = self.find_repptokenizer(repp_dir)
+        # Set a directory to store the temporary files.
+        self.working_dir = tempfile.gettempdir()
+        # Set an encoding for the input strings.
+        self.encoding = encoding
+
+    def tokenize(self, sentence):
+        """
+        Use Repp to tokenize a single sentence.
+
+        :param sentence: A single sentence string.
+        :type sentence: str
+        :return: A tuple of tokens.
+        :rtype: tuple(str)
+        """
+        return next(self.tokenize_sents([sentence]))
+
+    def tokenize_sents(self, sentences, keep_token_positions=False):
+        """
+        Tokenize multiple sentences using Repp.
+
+        :param sentences: A list of sentence strings.
+        :type sentences: list(str)
+        :return: An iterator over tuples of tokens.
+        :rtype: iter(tuple(str))
+        """
+        with tempfile.NamedTemporaryFile(
+            prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
+        ) as input_file:
+            # Write sentences to temporary input file.
+            for sent in sentences:
+                input_file.write(str(sent) + "\n")
+            input_file.close()
+        # Generate command to run REPP.
+        cmd = self.generate_repp_command(input_file.name)
+        # Decode the stdout and strip the trailing newline.
+        repp_output = self._execute(cmd).decode(self.encoding).strip()
+        for tokenized_sent in self.parse_repp_outputs(repp_output):
+            if not keep_token_positions:
+                # Remove token position information.
+                tokenized_sent, starts, ends = zip(*tokenized_sent)
+            yield tokenized_sent
+
+    def generate_repp_command(self, inputfilename):
+        """
+        This method generates the REPP command to be run at the terminal.
+
+        :param inputfilename: path to the input file
+        :type inputfilename: str
+        """
+        cmd = [self.repp_dir + "/src/repp"]
+        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
+        cmd += ["--format", "triple"]
+        cmd += [inputfilename]
+        return cmd
+
+    @staticmethod
+    def _execute(cmd):
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+        return stdout
+
+    @staticmethod
+    def parse_repp_outputs(repp_output):
+        """
+        This method parses the tri-tuple format that REPP outputs using the
+        "--format triple" option and returns a generator of tuples of string
+        tokens.
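+
+        For example, a section line such as ``(0, 12, Tokenization)`` in
+        REPP's triple output yields the token ``'Tokenization'`` with start
+        offset 0 and end offset 12 (an illustrative line; see the regex
+        below).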
+
+        :param repp_output: The raw output of the REPP tokenizer.
+        :type repp_output: str
+        :return: an iterable of the tokenized sentences as tuples of strings
+        :rtype: iter(tuple)
+        """
+        line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
+        for section in repp_output.split("\n\n"):
+            words_with_positions = [
+                (token, int(start), int(end))
+                for start, end, token in line_regex.findall(section)
+            ]
+            words = tuple(t[2] for t in words_with_positions)
+            yield words_with_positions
+
+    def find_repptokenizer(self, repp_dirname):
+        """
+        A method to find the REPP tokenizer binary and its *repp.set* config file.
+        """
+        if os.path.exists(repp_dirname):  # If a full path is given.
+            _repp_dir = repp_dirname
+        else:  # Try to find path to REPP directory in environment variables.
+            _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
+        # Checks for the REPP binary and erg/repp.set config file.
+        assert os.path.exists(_repp_dir + "/src/repp")
+        assert os.path.exists(_repp_dir + "/erg/repp.set")
+        return _repp_dir
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sonority_sequencing.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sonority_sequencing.py
new file mode 100644
index 0000000000000000000000000000000000000000..6930210bc3807541ff2be29d5be9cb885b6777df
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/sonority_sequencing.py
@@ -0,0 +1,194 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Christopher Hench
+#         Alex Estes
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
+by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
+openness of the lips. Syllable breaks occur before troughs in sonority. For more
+on the SSP see Selkirk (1984).
+
+The default implementation uses the English alphabet, but the `sonority_hierarchy`
+can be modified to IPA or any other alphabet for the use-case. The SSP is a
+universal syllabification algorithm, but that does not mean it performs equally
+across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
+if utilizing IPA (pg. 311).
+
+Importantly, if a custom hierarchy is supplied and vowels span across more than
+one level, they should be given separately to the `vowels` class attribute.
+
+References:
+
+- Otto Jespersen. 1904. Lehrbuch der Phonetik.
+  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
+- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
+  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
+  Cambridge, MIT Press. pp. 107-136.
+- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
+  In HLT-NAACL. pp. 308-316.
+"""
+
+import re
+import warnings
+from string import punctuation
+
+from nltk.tokenize.api import TokenizerI
+from nltk.util import ngrams
+
+
+class SyllableTokenizer(TokenizerI):
+    """
+    Syllabifies words based on the Sonority Sequencing Principle (SSP).
+
+    >>> from nltk.tokenize import SyllableTokenizer
+    >>> from nltk import word_tokenize
+    >>> SSP = SyllableTokenizer()
+    >>> SSP.tokenize('justification')
+    ['jus', 'ti', 'fi', 'ca', 'tion']
+    >>> text = "This is a foobar-like sentence."
+ >>> [SSP.tokenize(token) for token in word_tokenize(text)]
+ [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
+ """
+
+ def __init__(self, lang="en", sonority_hierarchy=False):
+ """
+ :param lang: Language parameter, default is English, 'en'
+ :type lang: str
+ :param sonority_hierarchy: Sonority hierarchy according to the
+ Sonority Sequencing Principle.
+ :type sonority_hierarchy: list(str)
+ """
+ # The sonority hierarchy should be provided in descending order.
+ # If vowels are spread across multiple levels, they should also be
+ # assigned to self.vowels; otherwise the first level of the
+ # hierarchy is taken to be the vowels.
+ if not sonority_hierarchy and lang == "en":
+ sonority_hierarchy = [
+ "aeiouy", # vowels.
+ "lmnrw", # nasals, liquids and glides.
+ "zvsf", # fricatives.
+ "bcdgtkpqxhj", # stops.
+ ]
+
+ self.vowels = sonority_hierarchy[0]
+ self.phoneme_map = {}
+ for i, level in enumerate(sonority_hierarchy):
+ for c in level:
+ sonority_level = len(sonority_hierarchy) - i
+ self.phoneme_map[c] = sonority_level
+ self.phoneme_map[c.upper()] = sonority_level
+
+ def assign_values(self, token):
+ """
+ Assigns each phoneme its value from the sonority hierarchy.
+ Note: Sentence/text has to be tokenized first.
+
+ :param token: Single word or token
+ :type token: str
+ :return: List of tuples, first element is character/phoneme and
+ second is the sonority value.
+ :rtype: list(tuple(str, int))
+ """
+ syllables_values = []
+ for c in token:
+ try:
+ syllables_values.append((c, self.phoneme_map[c]))
+ except KeyError:
+ if c not in "0123456789" and c not in punctuation:
+ warnings.warn(
+ "Character not defined in sonority_hierarchy,"
+ " assigning as vowel: '{}'".format(c)
+ )
+ syllables_values.append((c, max(self.phoneme_map.values())))
+ if c not in self.vowels:
+ self.vowels += c
+ else: # If it's punctuation or a digit, assign -1.
+ syllables_values.append((c, -1))
+ return syllables_values
+
+ def validate_syllables(self, syllable_list):
+ """
+ Ensures each syllable has at least one vowel.
+ If the following syllable doesn't have a vowel, add it to the current one.
+
+ :param syllable_list: Single word or token broken up into syllables.
+ :type syllable_list: list(str)
+ :return: Single word or token broken up into syllables
+ (with added syllables if necessary)
+ :rtype: list(str)
+ """
+ valid_syllables = []
+ front = ""
+ vowel_pattern = re.compile("|".join(self.vowels))
+ for i, syllable in enumerate(syllable_list):
+ if syllable in punctuation:
+ valid_syllables.append(syllable)
+ continue
+ if not vowel_pattern.search(syllable):
+ if len(valid_syllables) == 0:
+ front += syllable
+ else:
+ valid_syllables = valid_syllables[:-1] + [
+ valid_syllables[-1] + syllable
+ ]
+ else:
+ if len(valid_syllables) == 0:
+ valid_syllables.append(front + syllable)
+ else:
+ valid_syllables.append(syllable)
+
+ return valid_syllables
+
+ def tokenize(self, token):
+ """
+ Apply the SSP to return a list of syllables.
+ Note: Sentence/text has to be tokenized first.
+
+ :param token: Single word or token
+ :type token: str
+ :return syllable_list: Single word or token broken up into syllables.
+ :rtype: list(str) + """ + # assign values from hierarchy + syllables_values = self.assign_values(token) + + # if only one vowel return word + if sum(token.count(x) for x in self.vowels) <= 1: + return [token] + + syllable_list = [] + syllable = syllables_values[0][0] # start syllable with first phoneme + for trigram in ngrams(syllables_values, n=3): + phonemes, values = zip(*trigram) + # Sonority of previous, focal and following phoneme + prev_value, focal_value, next_value = values + # Focal phoneme. + focal_phoneme = phonemes[1] + + # These cases trigger syllable break. + if focal_value == -1: # If it's a punctuation, just break. + syllable_list.append(syllable) + syllable_list.append(focal_phoneme) + syllable = "" + elif prev_value >= focal_value == next_value: + syllable += focal_phoneme + syllable_list.append(syllable) + syllable = "" + + elif prev_value > focal_value < next_value: + syllable_list.append(syllable) + syllable = "" + syllable += focal_phoneme + + # no syllable break + else: + syllable += focal_phoneme + + syllable += syllables_values[-1][0] # append last phoneme + syllable_list.append(syllable) + + return self.validate_syllables(syllable_list) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford.py new file mode 100644 index 0000000000000000000000000000000000000000..b2514706aa97d0f31de58c23160f676185eb5be8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/stanford.py @@ -0,0 +1,115 @@ +# Natural Language Toolkit: Interface to the Stanford Tokenizer +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Xu +# +# URL: +# For license information, see LICENSE.TXT + +import json +import os +import tempfile +import warnings +from subprocess import PIPE + +from nltk.internals import _java_options, config_java, find_jar, java +from nltk.parse.corenlp import CoreNLPParser +from nltk.tokenize.api import TokenizerI + +_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml" + + +class StanfordTokenizer(TokenizerI): + r""" + Interface to the Stanford Tokenizer + + >>> from nltk.tokenize.stanford import StanfordTokenizer + >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." + >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + >>> s = "The colour of the wall is blue." + >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP + ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] + """ + + _JAR = "stanford-postagger.jar" + + def __init__( + self, + path_to_jar=None, + encoding="utf8", + options=None, + verbose=False, + java_options="-mx1000m", + ): + # Raise deprecation warning. 
+ warnings.warn( + str( + "\nThe StanfordTokenizer will " + "be deprecated in version 3.2.5.\n" + "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'" + ), + DeprecationWarning, + stacklevel=2, + ) + + self._stanford_jar = find_jar( + self._JAR, + path_to_jar, + env_vars=("STANFORD_POSTAGGER",), + searchpath=(), + url=_stanford_url, + verbose=verbose, + ) + + self._encoding = encoding + self.java_options = java_options + + options = {} if options is None else options + self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items()) + + @staticmethod + def _parse_tokenized_output(s): + return s.splitlines() + + def tokenize(self, s): + """ + Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. + """ + cmd = ["edu.stanford.nlp.process.PTBTokenizer"] + return self._parse_tokenized_output(self._execute(cmd, s)) + + def _execute(self, cmd, input_, verbose=False): + encoding = self._encoding + cmd.extend(["-charset", encoding]) + _options_cmd = self._options_cmd + if _options_cmd: + cmd.extend(["-options", self._options_cmd]) + + default_options = " ".join(_java_options) + + # Configure java. + config_java(options=self.java_options, verbose=verbose) + + # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. + with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: + # Write the actual sentences to the temporary input file + if isinstance(input_, str) and encoding: + input_ = input_.encode(encoding) + input_file.write(input_) + input_file.flush() + + cmd.append(input_file.name) + + # Run the tagger and get the output. + stdout, stderr = java( + cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE + ) + stdout = stdout.decode(encoding) + + os.unlink(input_file.name) + + # Return java configurations to their default values. + config_java(options=default_options, verbose=False) + + return stdout diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/texttiling.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/texttiling.py new file mode 100644 index 0000000000000000000000000000000000000000..cbcdc3291522e210f7da71f70bf31d85c2d8b395 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tokenize/texttiling.py @@ -0,0 +1,475 @@ +# Natural Language Toolkit: TextTiling +# +# Copyright (C) 2001-2022 NLTK Project +# Author: George Boutsioukis +# +# URL: +# For license information, see LICENSE.TXT + +import math +import re + +try: + import numpy +except ImportError: + pass + +from nltk.tokenize.api import TokenizerI + +BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1 +LC, HC = 0, 1 +DEFAULT_SMOOTHING = [0] + + +class TextTilingTokenizer(TokenizerI): + """Tokenize a document into topical sections using the TextTiling algorithm. + This algorithm detects subtopic shifts based on the analysis of lexical + co-occurrence patterns. + + The process starts by tokenizing the text into pseudosentences of + a fixed size w. Then, depending on the method used, similarity + scores are assigned at sentence gaps. The algorithm proceeds by + detecting the peak differences between these scores and marking + them as boundaries. The boundaries are normalized to the closest + paragraph break and the segmented text is returned. + + :param w: Pseudosentence size + :type w: int + :param k: Size (in sentences) of the block used in the block comparison method + :type k: int + :param similarity_method: The method used for determining similarity scores: + `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`. 
+ :type similarity_method: constant + :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus) + :type stopwords: list(str) + :param smoothing_method: The method used for smoothing the score plot: + `DEFAULT_SMOOTHING` (default) + :type smoothing_method: constant + :param smoothing_width: The width of the window used by the smoothing method + :type smoothing_width: int + :param smoothing_rounds: The number of smoothing passes + :type smoothing_rounds: int + :param cutoff_policy: The policy used to determine the number of boundaries: + `HC` (default) or `LC` + :type cutoff_policy: constant + + >>> from nltk.corpus import brown + >>> tt = TextTilingTokenizer(demo_mode=True) + >>> text = brown.raw()[:4000] + >>> s, ss, d, b = tt.tokenize(text) + >>> b + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0] + """ + + def __init__( + self, + w=20, + k=10, + similarity_method=BLOCK_COMPARISON, + stopwords=None, + smoothing_method=DEFAULT_SMOOTHING, + smoothing_width=2, + smoothing_rounds=1, + cutoff_policy=HC, + demo_mode=False, + ): + + if stopwords is None: + from nltk.corpus import stopwords + + stopwords = stopwords.words("english") + self.__dict__.update(locals()) + del self.__dict__["self"] + + def tokenize(self, text): + """Return a tokenized copy of *text*, where each "token" represents + a separate topic.""" + + lowercase_text = text.lower() + paragraph_breaks = self._mark_paragraph_breaks(text) + text_length = len(lowercase_text) + + # Tokenization step starts here + + # Remove punctuation + nopunct_text = "".join( + c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c) + ) + nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text) + + tokseqs = self._divide_to_tokensequences(nopunct_text) + + # The morphological stemming step mentioned in the TextTile + # paper is not implemented. A comment in the original C + # implementation states that it offers no benefit to the + # process. It might be interesting to test the existing + # stemmers though. 
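+ # A possible hook (a sketch only, not part of the algorithm; it
+ # assumes nltk.stem.porter.PorterStemmer) would be to stem each word
+ # inside the stopword-filtering loop below, mapping every
+ # (word, index) pair wi to (PorterStemmer().stem(wi[0]), wi[1]).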
+ # words = _stem_words(words) + + # Filter stopwords + for ts in tokseqs: + ts.wrdindex_list = [ + wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords + ] + + token_table = self._create_token_table(tokseqs, nopunct_par_breaks) + # End of the Tokenization step + + # Lexical score determination + if self.similarity_method == BLOCK_COMPARISON: + gap_scores = self._block_comparison(tokseqs, token_table) + elif self.similarity_method == VOCABULARY_INTRODUCTION: + raise NotImplementedError("Vocabulary introduction not implemented") + else: + raise ValueError( + f"Similarity method {self.similarity_method} not recognized" + ) + + if self.smoothing_method == DEFAULT_SMOOTHING: + smooth_scores = self._smooth_scores(gap_scores) + else: + raise ValueError(f"Smoothing method {self.smoothing_method} not recognized") + # End of Lexical score Determination + + # Boundary identification + depth_scores = self._depth_scores(smooth_scores) + segment_boundaries = self._identify_boundaries(depth_scores) + + normalized_boundaries = self._normalize_boundaries( + text, segment_boundaries, paragraph_breaks + ) + # End of Boundary Identification + segmented_text = [] + prevb = 0 + + for b in normalized_boundaries: + if b == 0: + continue + segmented_text.append(text[prevb:b]) + prevb = b + + if prevb < text_length: # append any text that may be remaining + segmented_text.append(text[prevb:]) + + if not segmented_text: + segmented_text = [text] + + if self.demo_mode: + return gap_scores, smooth_scores, depth_scores, segment_boundaries + return segmented_text + + def _block_comparison(self, tokseqs, token_table): + """Implements the block comparison method""" + + def blk_frq(tok, block): + ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences) + freq = sum(tsocc[1] for tsocc in ts_occs) + return freq + + gap_scores = [] + numgaps = len(tokseqs) - 1 + + for curr_gap in range(numgaps): + score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0 + score = 0.0 + # adjust window size for boundary conditions + if curr_gap < self.k - 1: + window_size = curr_gap + 1 + elif curr_gap > numgaps - self.k: + window_size = numgaps - curr_gap + else: + window_size = self.k + + b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]] + b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]] + + for t in token_table: + score_dividend += blk_frq(t, b1) * blk_frq(t, b2) + score_divisor_b1 += blk_frq(t, b1) ** 2 + score_divisor_b2 += blk_frq(t, b2) ** 2 + try: + score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2) + except ZeroDivisionError: + pass # score += 0.0 + + gap_scores.append(score) + + return gap_scores + + def _smooth_scores(self, gap_scores): + "Wraps the smooth function from the SciPy Cookbook" + return list( + smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1) + ) + + def _mark_paragraph_breaks(self, text): + """Identifies indented text or line breaks as the beginning of + paragraphs""" + MIN_PARAGRAPH = 100 + pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*") + matches = pattern.finditer(text) + + last_break = 0 + pbreaks = [0] + for pb in matches: + if pb.start() - last_break < MIN_PARAGRAPH: + continue + else: + pbreaks.append(pb.start()) + last_break = pb.start() + + return pbreaks + + def _divide_to_tokensequences(self, text): + "Divides the text into pseudosentences of fixed size" + w = self.w + wrdindex_list = [] + matches = re.finditer(r"\w+", text) + for match in matches: + 
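+ # Keep each word together with its character offset in the text.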
wrdindex_list.append((match.group(), match.start())) + return [ + TokenSequence(i / w, wrdindex_list[i : i + w]) + for i in range(0, len(wrdindex_list), w) + ] + + def _create_token_table(self, token_sequences, par_breaks): + "Creates a table of TokenTableFields" + token_table = {} + current_par = 0 + current_tok_seq = 0 + pb_iter = par_breaks.__iter__() + current_par_break = next(pb_iter) + if current_par_break == 0: + try: + current_par_break = next(pb_iter) # skip break at 0 + except StopIteration as e: + raise ValueError( + "No paragraph breaks were found(text too short perhaps?)" + ) from e + for ts in token_sequences: + for word, index in ts.wrdindex_list: + try: + while index > current_par_break: + current_par_break = next(pb_iter) + current_par += 1 + except StopIteration: + # hit bottom + pass + + if word in token_table: + token_table[word].total_count += 1 + + if token_table[word].last_par != current_par: + token_table[word].last_par = current_par + token_table[word].par_count += 1 + + if token_table[word].last_tok_seq != current_tok_seq: + token_table[word].last_tok_seq = current_tok_seq + token_table[word].ts_occurences.append([current_tok_seq, 1]) + else: + token_table[word].ts_occurences[-1][1] += 1 + else: # new word + token_table[word] = TokenTableField( + first_pos=index, + ts_occurences=[[current_tok_seq, 1]], + total_count=1, + par_count=1, + last_par=current_par, + last_tok_seq=current_tok_seq, + ) + + current_tok_seq += 1 + + return token_table + + def _identify_boundaries(self, depth_scores): + """Identifies boundaries at the peaks of similarity score + differences""" + + boundaries = [0 for x in depth_scores] + + avg = sum(depth_scores) / len(depth_scores) + stdev = numpy.std(depth_scores) + + if self.cutoff_policy == LC: + cutoff = avg - stdev + else: + cutoff = avg - stdev / 2.0 + + depth_tuples = sorted(zip(depth_scores, range(len(depth_scores)))) + depth_tuples.reverse() + hp = list(filter(lambda x: x[0] > cutoff, depth_tuples)) + + for dt in hp: + boundaries[dt[1]] = 1 + for dt2 in hp: # undo if there is a boundary close already + if ( + dt[1] != dt2[1] + and abs(dt2[1] - dt[1]) < 4 + and boundaries[dt2[1]] == 1 + ): + boundaries[dt[1]] = 0 + return boundaries + + def _depth_scores(self, scores): + """Calculates the depth of each gap, i.e. the average difference + between the left and right peaks and the gap's score""" + + depth_scores = [0 for x in scores] + # clip boundaries: this holds on the rule of thumb(my thumb) + # that a section shouldn't be smaller than at least 2 + # pseudosentences for small texts and around 5 for larger ones. 
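+ # len(scores) // 10 scales the clip with the text length, bounded to [2, 5].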
+ + clip = min(max(len(scores) // 10, 2), 5) + index = clip + + for gapscore in scores[clip:-clip]: + lpeak = gapscore + for score in scores[index::-1]: + if score >= lpeak: + lpeak = score + else: + break + rpeak = gapscore + for score in scores[index:]: + if score >= rpeak: + rpeak = score + else: + break + depth_scores[index] = lpeak + rpeak - 2 * gapscore + index += 1 + + return depth_scores + + def _normalize_boundaries(self, text, boundaries, paragraph_breaks): + """Normalize the boundaries identified to the original text's + paragraph breaks""" + + norm_boundaries = [] + char_count, word_count, gaps_seen = 0, 0, 0 + seen_word = False + + for char in text: + char_count += 1 + if char in " \t\n" and seen_word: + seen_word = False + word_count += 1 + if char not in " \t\n" and not seen_word: + seen_word = True + if gaps_seen < len(boundaries) and word_count > ( + max(gaps_seen * self.w, self.w) + ): + if boundaries[gaps_seen] == 1: + # find closest paragraph break + best_fit = len(text) + for br in paragraph_breaks: + if best_fit > abs(br - char_count): + best_fit = abs(br - char_count) + bestbr = br + else: + break + if bestbr not in norm_boundaries: # avoid duplicates + norm_boundaries.append(bestbr) + gaps_seen += 1 + + return norm_boundaries + + +class TokenTableField: + """A field in the token table holding parameters for each token, + used later in the process""" + + def __init__( + self, + first_pos, + ts_occurences, + total_count=1, + par_count=1, + last_par=0, + last_tok_seq=None, + ): + self.__dict__.update(locals()) + del self.__dict__["self"] + + +class TokenSequence: + "A token list with its original length and its index" + + def __init__(self, index, wrdindex_list, original_length=None): + original_length = original_length or len(wrdindex_list) + self.__dict__.update(locals()) + del self.__dict__["self"] + + +# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth +def smooth(x, window_len=11, window="flat"): + """smooth the data using a window with requested size. + + This method is based on the convolution of a scaled window with the signal. + The signal is prepared by introducing reflected copies of the signal + (with the window size) in both ends so that transient parts are minimized + in the beginning and end part of the output signal. + + :param x: the input signal + :param window_len: the dimension of the smoothing window; should be an odd integer + :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman' + flat window will produce a moving average smoothing. + + :return: the smoothed signal + + example:: + + t=linspace(-2,2,0.1) + x=sin(t)+randn(len(t))*0.1 + y=smooth(x) + + :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve, + scipy.signal.lfilter + + TODO: the window parameter could be the window itself if an array instead of a string + """ + + if x.ndim != 1: + raise ValueError("smooth only accepts 1 dimension arrays.") + + if x.size < window_len: + raise ValueError("Input vector needs to be bigger than window size.") + + if window_len < 3: + return x + + if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]: + raise ValueError( + "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" + ) + + s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]] + + # print(len(s)) + if window == "flat": # moving average + w = numpy.ones(window_len, "d") + else: + w = eval("numpy." 
+ window + "(window_len)") + + y = numpy.convolve(w / w.sum(), s, mode="same") + + return y[window_len - 1 : -window_len + 1] + + +def demo(text=None): + from matplotlib import pylab + + from nltk.corpus import brown + + tt = TextTilingTokenizer(demo_mode=True) + if text is None: + text = brown.raw()[:10000] + s, ss, d, b = tt.tokenize(text) + pylab.xlabel("Sentence Gap index") + pylab.ylabel("Gap Scores") + pylab.plot(range(len(s)), s, label="Gap Scores") + pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores") + pylab.plot(range(len(d)), d, label="Depth scores") + pylab.stem(range(len(b)), b) + pylab.legend() + pylab.show() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8c4cd316638b8e61e6a56e3a47c3e06858fbb4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/__init__.py @@ -0,0 +1,32 @@ +# Natural Language Toolkit: Machine Translation +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird , Tah Wei Hoon +# URL: +# For license information, see LICENSE.TXT + +""" +Experimental features for machine translation. +These interfaces are prone to change. + +isort:skip_file +""" + +from nltk.translate.api import AlignedSent, Alignment, PhraseTable +from nltk.translate.ibm_model import IBMModel +from nltk.translate.ibm1 import IBMModel1 +from nltk.translate.ibm2 import IBMModel2 +from nltk.translate.ibm3 import IBMModel3 +from nltk.translate.ibm4 import IBMModel4 +from nltk.translate.ibm5 import IBMModel5 +from nltk.translate.bleu_score import sentence_bleu as bleu +from nltk.translate.ribes_score import sentence_ribes as ribes +from nltk.translate.meteor_score import meteor_score as meteor +from nltk.translate.metrics import alignment_error_rate +from nltk.translate.stack_decoder import StackDecoder +from nltk.translate.nist_score import sentence_nist as nist +from nltk.translate.chrf_score import sentence_chrf as chrf +from nltk.translate.gale_church import trace +from nltk.translate.gdfa import grow_diag_final_and +from nltk.translate.gleu_score import sentence_gleu as gleu +from nltk.translate.phrase_based import extract diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/api.py new file mode 100644 index 0000000000000000000000000000000000000000..10d967b81b64f470f6f5cf7abfdfa617f201bb76 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/api.py @@ -0,0 +1,334 @@ +# Natural Language Toolkit: API for alignment and translation objects +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Will Zhang +# Guan Gui +# Steven Bird +# Tah Wei Hoon +# URL: +# For license information, see LICENSE.TXT + +import subprocess +from collections import namedtuple + + +class AlignedSent: + """ + Return an aligned sentence object, which encapsulates two sentences + along with an ``Alignment`` between them. + + Typically used in machine translation to represent a sentence and + its translation. + + >>> from nltk.translate import AlignedSent, Alignment + >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], + ... 
['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
+ >>> algnsent.words
+ ['klein', 'ist', 'das', 'Haus']
+ >>> algnsent.mots
+ ['the', 'house', 'is', 'small']
+ >>> algnsent.alignment
+ Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
+ >>> from nltk.corpus import comtrans
+ >>> print(comtrans.aligned_sents()[54])
+ <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
+ >>> print(comtrans.aligned_sents()[54].alignment)
+ 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
+
+ :param words: Words in the target language sentence
+ :type words: list(str)
+ :param mots: Words in the source language sentence
+ :type mots: list(str)
+ :param alignment: Word-level alignments between ``words`` and ``mots``.
+ Each alignment is represented as a 2-tuple (words_index, mots_index).
+ :type alignment: Alignment
+ """
+
+ def __init__(self, words, mots, alignment=None):
+ self._words = words
+ self._mots = mots
+ if alignment is None:
+ self.alignment = Alignment([])
+ else:
+ assert type(alignment) is Alignment
+ self.alignment = alignment
+
+ @property
+ def words(self):
+ return self._words
+
+ @property
+ def mots(self):
+ return self._mots
+
+ def _get_alignment(self):
+ return self._alignment
+
+ def _set_alignment(self, alignment):
+ _check_alignment(len(self.words), len(self.mots), alignment)
+ self._alignment = alignment
+
+ alignment = property(_get_alignment, _set_alignment)
+
+ def __repr__(self):
+ """
+ Return a string representation for this ``AlignedSent``.
+
+ :rtype: str
+ """
+ words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
+ mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
+
+ return f"AlignedSent({words}, {mots}, {self._alignment!r})"
+
+ def _to_dot(self):
+ """
+ Dot representation of the aligned sentence
+ """
+ s = "graph align {\n"
+ s += "node[shape=plaintext]\n"
+
+ # Declare node
+ for w in self._words:
+ s += f'"{w}_source" [label="{w}"] \n'
+
+ for w in self._mots:
+ s += f'"{w}_target" [label="{w}"] \n'
+
+ # Alignment
+ for u, v in self._alignment:
+ s += f'"{self._words[u]}_source" -- "{self._mots[v]}_target" \n'
+
+ # Connect the source words
+ for i in range(len(self._words) - 1):
+ s += '"{}_source" -- "{}_source" [style=invis]\n'.format(
+ self._words[i],
+ self._words[i + 1],
+ )
+
+ # Connect the target words
+ for i in range(len(self._mots) - 1):
+ s += '"{}_target" -- "{}_target" [style=invis]\n'.format(
+ self._mots[i],
+ self._mots[i + 1],
+ )
+
+ # Put it in the same rank
+ s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
+ s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))
+
+ s += "}"
+
+ return s
+
+ def _repr_svg_(self):
+ """
+ IPython magic: show the SVG representation of this ``AlignedSent``.
+ """
+ dot_string = self._to_dot().encode("utf8")
+ output_format = "svg"
+ try:
+ process = subprocess.Popen(
+ ["dot", "-T%s" % output_format],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ except OSError as e:
+ raise Exception("Cannot find the dot binary from Graphviz package") from e
+ out, err = process.communicate(dot_string)
+
+ return out.decode("utf8")
+
+ def __str__(self):
+ """
+ Return a human-readable string representation for this ``AlignedSent``.
+
+ :rtype: str
+ """
+ source = " ".join(self._words)[:20] + "..."
+ target = " ".join(self._mots)[:20] + "..."
+ return f"<AlignedSent: '{source}' -> '{target}'>"
+
+ def invert(self):
+ """
+ Return the aligned sentence pair, reversing the directionality.
+
+ :rtype: AlignedSent
+ """
+ return AlignedSent(self._mots, self._words, self._alignment.invert())
+
+
+class Alignment(frozenset):
+ """
+ A storage class for representing alignment between two sequences, s1, s2.
+ In general, an alignment is a set of tuples of the form (i, j, ...)
+ representing an alignment between the i-th element of s1 and the
+ j-th element of s2. Tuples are extensible (they might contain
+ additional data, such as a boolean to indicate sure vs possible alignments).
+
+ >>> from nltk.translate import Alignment
+ >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
+ >>> a.invert()
+ Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
+ >>> print(a.invert())
+ 0-0 1-0 2-1 2-2
+ >>> a[0]
+ [(0, 1), (0, 0)]
+ >>> a.invert()[2]
+ [(2, 1), (2, 2)]
+ >>> b = Alignment([(0, 0), (0, 1)])
+ >>> b.issubset(a)
+ True
+ >>> c = Alignment.fromstring('0-0 0-1')
+ >>> b == c
+ True
+ """
+
+ def __new__(cls, pairs):
+ self = frozenset.__new__(cls, pairs)
+ self._len = max(p[0] for p in self) if self != frozenset([]) else 0
+ self._index = None
+ return self
+
+ @classmethod
+ def fromstring(cls, s):
+ """
+ Read a giza-formatted string and return an Alignment object.
+
+ >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
+ Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
+
+ :type s: str
+ :param s: the positional alignments in giza format
+ :rtype: Alignment
+ :return: An Alignment object corresponding to the string representation ``s``.
+ """
+
+ return Alignment([_giza2pair(a) for a in s.split()])
+
+ def __getitem__(self, key):
+ """
+ Look up the alignments that map from a given index or slice.
+ """
+ if not self._index:
+ self._build_index()
+ return self._index.__getitem__(key)
+
+ def invert(self):
+ """
+ Return an Alignment object, being the inverted mapping.
+ """
+ return Alignment(((p[1], p[0]) + p[2:]) for p in self)
+
+ def range(self, positions=None):
+ """
+ Work out the range of the mapping from the given positions.
+ If no positions are specified, compute the range of the entire mapping.
+ """
+ image = set()
+ if not self._index:
+ self._build_index()
+ if not positions:
+ positions = list(range(len(self._index)))
+ for p in positions:
+ image.update(f for _, f in self._index[p])
+ return sorted(image)
+
+ def __repr__(self):
+ """
+ Produce an unambiguous string representation of the alignment.
+ """
+ return "Alignment(%r)" % sorted(self)
+
+ def __str__(self):
+ """
+ Produce a Giza-formatted string representing the alignment.
+ """
+ return " ".join("%d-%d" % p[:2] for p in sorted(self))
+
+ def _build_index(self):
+ """
+ Build a list self._index such that self._index[i] is a list
+ of the alignments originating from word i.
+ """
+ self._index = [[] for _ in range(self._len + 1)]
+ for p in self:
+ self._index[p[0]].append(p)
+
+
+def _giza2pair(pair_string):
+ i, j = pair_string.split("-")
+ return int(i), int(j)
+
+
+def _naacl2pair(pair_string):
+ i, j, p = pair_string.split("-")
+ return int(i), int(j)
+
+
+def _check_alignment(num_words, num_mots, alignment):
+ """
+ Check whether the alignments are legal.
+ + :param num_words: the number of source language words + :type num_words: int + :param num_mots: the number of target language words + :type num_mots: int + :param alignment: alignment to be checked + :type alignment: Alignment + :raise IndexError: if alignment falls outside the sentence + """ + + assert type(alignment) is Alignment + + if not all(0 <= pair[0] < num_words for pair in alignment): + raise IndexError("Alignment is outside boundary of words") + if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment): + raise IndexError("Alignment is outside boundary of mots") + + +PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"]) + + +class PhraseTable: + """ + In-memory store of translations for a given phrase, and the log + probability of the those translations + """ + + def __init__(self): + self.src_phrases = dict() + + def translations_for(self, src_phrase): + """ + Get the translations for a source language phrase + + :param src_phrase: Source language phrase of interest + :type src_phrase: tuple(str) + + :return: A list of target language phrases that are translations + of ``src_phrase``, ordered in decreasing order of + likelihood. Each list element is a tuple of the target + phrase and its log probability. + :rtype: list(PhraseTableEntry) + """ + return self.src_phrases[src_phrase] + + def add(self, src_phrase, trg_phrase, log_prob): + """ + :type src_phrase: tuple(str) + :type trg_phrase: tuple(str) + + :param log_prob: Log probability that given ``src_phrase``, + ``trg_phrase`` is its translation + :type log_prob: float + """ + entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob) + if src_phrase not in self.src_phrases: + self.src_phrases[src_phrase] = [] + self.src_phrases[src_phrase].append(entry) + self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True) + + def __contains__(self, src_phrase): + return src_phrase in self.src_phrases diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/chrf_score.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/chrf_score.py new file mode 100644 index 0000000000000000000000000000000000000000..520f45380ab191cbb5396a611aa0dbdc7f840911 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/chrf_score.py @@ -0,0 +1,222 @@ +# Natural Language Toolkit: ChrF score +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: Maja Popovic +# Contributors: Liling Tan, Aleš Tamchyna (Memsource) +# URL: +# For license information, see LICENSE.TXT + +""" ChrF score implementation """ +import re +from collections import Counter, defaultdict + +from nltk.util import ngrams + + +def sentence_chrf( + reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True +): + """ + Calculates the sentence level CHRF (Character n-gram F-score) described in + - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation. + In Proceedings of the 10th Workshop on Machine Translation. + https://www.statmt.org/wmt15/pdf/WMT49.pdf + - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights. + In Proceedings of the 1st Conference on Machine Translation. + https://www.statmt.org/wmt16/pdf/W16-2341.pdf + + This implementation of CHRF only supports a single reference at the moment. 
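+ For each n-gram order the f-score is computed as
+ (1 + beta**2) * prec * rec / (beta**2 * prec + rec), so the default
+ beta=3 weights recall three times as heavily as precision.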
+ + For details not reported in the paper, consult Maja Popovic's original + implementation: https://github.com/m-popovic/chrF + + The code should output results equivalent to running CHRF++ with the + following options: -nw 0 -b 3 + + An example from the original BLEU paper + https://www.aclweb.org/anthology/P02-1040.pdf + + >>> ref1 = str('It is a guide to action that ensures that the military ' + ... 'will forever heed Party commands').split() + >>> hyp1 = str('It is a guide to action which ensures that the military ' + ... 'always obeys the commands of the party').split() + >>> hyp2 = str('It is to insure the troops forever hearing the activity ' + ... 'guidebook that party direct').split() + >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS + 0.6349... + >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS + 0.3330... + + The infamous "the the the ... " example + + >>> ref = 'the cat is on the mat'.split() + >>> hyp = 'the the the the the the the'.split() + >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS + 0.1468... + + An example to show that this function allows users to use strings instead of + tokens, i.e. list(str) as inputs. + + >>> ref1 = str('It is a guide to action that ensures that the military ' + ... 'will forever heed Party commands') + >>> hyp1 = str('It is a guide to action which ensures that the military ' + ... 'always obeys the commands of the party') + >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS + 0.6349... + >>> type(ref1) == type(hyp1) == str + True + >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS + 0.6349... + + To skip the unigrams and only use 2- to 3-grams: + + >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS + 0.6617... + + :param references: reference sentence + :type references: list(str) / str + :param hypothesis: a hypothesis sentence + :type hypothesis: list(str) / str + :param min_len: The minimum order of n-gram this function should extract. + :type min_len: int + :param max_len: The maximum order of n-gram this function should extract. + :type max_len: int + :param beta: the parameter to assign more importance to recall over precision + :type beta: float + :param ignore_whitespace: ignore whitespace characters in scoring + :type ignore_whitespace: bool + :return: the sentence level CHRF score. + :rtype: float + """ + return corpus_chrf( + [reference], + [hypothesis], + min_len, + max_len, + beta=beta, + ignore_whitespace=ignore_whitespace, + ) + + +def _preprocess(sent, ignore_whitespace): + if type(sent) != str: + # turn list of tokens into a string + sent = " ".join(sent) + + if ignore_whitespace: + sent = re.sub(r"\s+", "", sent) + return sent + + +def chrf_precision_recall_fscore_support( + reference, hypothesis, n, beta=3.0, epsilon=1e-16 +): + """ + This function computes the precision, recall and fscore from the ngram + overlaps. It returns the `support` which is the true positive score. + + By underspecifying the input type, the function will be agnostic as to how + it computes the ngrams and simply take the whichever element in the list; + it could be either token or character. + + :param reference: The reference sentence. + :type reference: list + :param hypothesis: The hypothesis sentence. + :type hypothesis: list + :param n: Extract up to the n-th order ngrams + :type n: int + :param beta: The parameter to assign more importance to recall over precision. + :type beta: float + :param epsilon: The fallback value if the hypothesis or reference is empty. 
+ :type epsilon: float + :return: Returns the precision, recall and f-score and support (true positive). + :rtype: tuple(float) + """ + ref_ngrams = Counter(ngrams(reference, n)) + hyp_ngrams = Counter(ngrams(hypothesis, n)) + + # calculate the number of ngram matches + overlap_ngrams = ref_ngrams & hyp_ngrams + tp = sum(overlap_ngrams.values()) # True positives. + tpfp = sum(hyp_ngrams.values()) # True positives + False positives. + tpfn = sum(ref_ngrams.values()) # True positives + False negatives. + + try: + prec = tp / tpfp # precision + rec = tp / tpfn # recall + factor = beta**2 + fscore = (1 + factor) * (prec * rec) / (factor * prec + rec) + except ZeroDivisionError: + prec = rec = fscore = epsilon + return prec, rec, fscore, tp + + +def corpus_chrf( + references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True +): + """ + Calculates the corpus level CHRF (Character n-gram F-score), it is the + macro-averaged value of the sentence/segment level CHRF score. + + This implementation of CHRF only supports a single reference at the moment. + + >>> ref1 = str('It is a guide to action that ensures that the military ' + ... 'will forever heed Party commands').split() + >>> ref2 = str('It is the guiding principle which guarantees the military ' + ... 'forces always being under the command of the Party').split() + >>> + >>> hyp1 = str('It is a guide to action which ensures that the military ' + ... 'always obeys the commands of the party').split() + >>> hyp2 = str('It is to insure the troops forever hearing the activity ' + ... 'guidebook that party direct') + >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS + 0.3910... + + :param references: a corpus of list of reference sentences, w.r.t. hypotheses + :type references: list(list(str)) + :param hypotheses: a list of hypothesis sentences + :type hypotheses: list(list(str)) + :param min_len: The minimum order of n-gram this function should extract. + :type min_len: int + :param max_len: The maximum order of n-gram this function should extract. + :type max_len: int + :param beta: the parameter to assign more importance to recall over precision + :type beta: float + :param ignore_whitespace: ignore whitespace characters in scoring + :type ignore_whitespace: bool + :return: the sentence level CHRF score. + :rtype: float + """ + + assert len(references) == len( + hypotheses + ), "The number of hypotheses and their references should be the same" + num_sents = len(hypotheses) + + # Keep f-scores for each n-gram order separate + ngram_fscores = defaultdict(lambda: list()) + + # Iterate through each hypothesis and their corresponding references. + for reference, hypothesis in zip(references, hypotheses): + + # preprocess both reference and hypothesis + reference = _preprocess(reference, ignore_whitespace) + hypothesis = _preprocess(hypothesis, ignore_whitespace) + + # Calculate f-scores for each sentence and for each n-gram order + # separately. + for n in range(min_len, max_len + 1): + # Compute the precision, recall, fscore and support. 
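+ # Only the f-score feeds the macro-average below; tp is unused here.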
+ prec, rec, fscore, tp = chrf_precision_recall_fscore_support( + reference, hypothesis, n, beta=beta + ) + ngram_fscores[n].append(fscore) + + # how many n-gram sizes + num_ngram_sizes = len(ngram_fscores) + + # sum of f-scores over all sentences for each n-gram order + total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()] + + # macro-average over n-gram orders and over all sentences + return (sum(total_scores) / num_ngram_sizes) / num_sents diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gale_church.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gale_church.py new file mode 100644 index 0000000000000000000000000000000000000000..ebdcae581300248d62aa97b68906757b05873c05 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gale_church.py @@ -0,0 +1,263 @@ +# Natural Language Toolkit: Gale-Church Aligner +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Torsten Marek +# Contributor: Cassidy Laidlaw, Liling Tan +# URL: +# For license information, see LICENSE.TXT + +""" + +A port of the Gale-Church Aligner. + +Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora. +https://aclweb.org/anthology/J93-1004.pdf + +""" + +import math + +try: + from norm import logsf as norm_logsf + from scipy.stats import norm +except ImportError: + + def erfcc(x): + """Complementary error function.""" + z = abs(x) + t = 1 / (1 + 0.5 * z) + r = t * math.exp( + -z * z + - 1.26551223 + + t + * ( + 1.00002368 + + t + * ( + 0.37409196 + + t + * ( + 0.09678418 + + t + * ( + -0.18628806 + + t + * ( + 0.27886807 + + t + * ( + -1.13520398 + + t + * (1.48851587 + t * (-0.82215223 + t * 0.17087277)) + ) + ) + ) + ) + ) + ) + ) + if x >= 0.0: + return r + else: + return 2.0 - r + + def norm_cdf(x): + """Return the area under the normal distribution from M{-∞..x}.""" + return 1 - 0.5 * erfcc(x / math.sqrt(2)) + + def norm_logsf(x): + try: + return math.log(1 - norm_cdf(x)) + except ValueError: + return float("-inf") + + +LOG2 = math.log(2) + + +class LanguageIndependent: + # These are the language-independent probabilities and parameters + # given in Gale & Church + + # for the computation, l_1 is always the language with less characters + PRIORS = { + (1, 0): 0.0099, + (0, 1): 0.0099, + (1, 1): 0.89, + (2, 1): 0.089, + (1, 2): 0.089, + (2, 2): 0.011, + } + + AVERAGE_CHARACTERS = 1 + VARIANCE_CHARACTERS = 6.8 + + +def trace(backlinks, source_sents_lens, target_sents_lens): + """ + Traverse the alignment cost from the tracebacks and retrieves + appropriate sentence pairs. + + :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS) + :type backlinks: dict + :param source_sents_lens: A list of target sentences' lengths + :type source_sents_lens: list(int) + :param target_sents_lens: A list of target sentences' lengths + :type target_sents_lens: list(int) + """ + links = [] + position = (len(source_sents_lens), len(target_sents_lens)) + while position != (0, 0) and all(p >= 0 for p in position): + try: + s, t = backlinks[position] + except TypeError: + position = (position[0] - 1, position[1] - 1) + continue + for i in range(s): + for j in range(t): + links.append((position[0] - i - 1, position[1] - j - 1)) + position = (position[0] - s, position[1] - t) + + return links[::-1] + + +def align_log_prob(i, j, source_sents, target_sents, alignment, params): + """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]} + being aligned with a specific C{alignment}. 
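+ The combined character lengths l_s and l_t of the sentences on either
+ side are turned into a normal deviate
+ delta = (l_s * c - l_t) / sqrt(m * s2), with c = params.AVERAGE_CHARACTERS,
+ s2 = params.VARIANCE_CHARACTERS and m = (l_s + l_t / c) / 2; the deviate
+ is scored with norm_logsf and combined with the alignment prior.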
+ + @param i: The offset of the source sentence. + @param j: The offset of the target sentence. + @param source_sents: The list of source sentence lengths. + @param target_sents: The list of target sentence lengths. + @param alignment: The alignment type, a tuple of two integers. + @param params: The sentence alignment parameters. + + @returns: The log probability of a specific alignment between the two sentences, given the parameters. + """ + l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0])) + l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1])) + try: + # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C + # reference implementation. With l_s in the denominator, insertions are impossible. + m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2 + delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt( + m * params.VARIANCE_CHARACTERS + ) + except ZeroDivisionError: + return float("-inf") + + return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment])) + + +def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent): + """Return the sentence alignment of two text blocks (usually paragraphs). + + >>> align_blocks([5,5,5], [7,7,7]) + [(0, 0), (1, 1), (2, 2)] + >>> align_blocks([10,5,5], [12,20]) + [(0, 0), (1, 1), (2, 1)] + >>> align_blocks([12,20], [10,5,5]) + [(0, 0), (1, 1), (1, 2)] + >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12]) + [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)] + + @param source_sents_lens: The list of source sentence lengths. + @param target_sents_lens: The list of target sentence lengths. + @param params: the sentence alignment parameters. + @return: The sentence alignments, a list of index pairs. + """ + + alignment_types = list(params.PRIORS.keys()) + + # there are always three rows in the history (with the last of them being filled) + D = [[]] + + backlinks = {} + + for i in range(len(source_sents_lens) + 1): + for j in range(len(target_sents_lens) + 1): + min_dist = float("inf") + min_align = None + for a in alignment_types: + prev_i = -1 - a[0] + prev_j = j - a[1] + if prev_i < -len(D) or prev_j < 0: + continue + p = D[prev_i][prev_j] + align_log_prob( + i, j, source_sents_lens, target_sents_lens, a, params + ) + if p < min_dist: + min_dist = p + min_align = a + + if min_dist == float("inf"): + min_dist = 0 + + backlinks[(i, j)] = min_align + D[-1].append(min_dist) + + if len(D) > 2: + D.pop(0) + D.append([]) + + return trace(backlinks, source_sents_lens, target_sents_lens) + + +def align_texts(source_blocks, target_blocks, params=LanguageIndependent): + """Creates the sentence alignment of two texts. + + Texts can consist of several blocks. Block boundaries cannot be crossed by sentence + alignment links. + + Each block consists of a list that contains the lengths (in characters) of the sentences + in this block. + + @param source_blocks: The list of blocks in the source text. + @param target_blocks: The list of blocks in the target text. + @param params: the sentence alignment parameters. + + @returns: A list of sentence alignment lists + """ + if len(source_blocks) != len(target_blocks): + raise ValueError( + "Source and target texts do not have the same number of blocks." 
+ )
+
+ return [
+ align_blocks(source_block, target_block, params)
+ for source_block, target_block in zip(source_blocks, target_blocks)
+ ]
+
+
+# File I/O functions; may belong in a corpus reader
+
+
+def split_at(it, split_value):
+ """Splits an iterator C{it} at values of C{split_value}.
+
+ Each instance of C{split_value} is swallowed. The iterator produces
+ subiterators which need to be consumed fully before the next subiterator
+ can be used.
+ """
+
+ def _chunk_iterator(first):
+ v = first
+ while v != split_value:
+ yield v
+ try:
+ v = next(it) # next(it) replaces the Python 2 it.next()
+ except StopIteration:
+ return
+
+ while True:
+ try:
+ yield _chunk_iterator(next(it))
+ except StopIteration:
+ return
+
+
+def parse_token_stream(stream, soft_delimiter, hard_delimiter):
+ """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
+ and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
+ """
+ return [
+ [
+ sum(len(token) for token in sentence_it)
+ for sentence_it in split_at(block_it, soft_delimiter)
+ ]
+ for block_it in split_at(stream, hard_delimiter)
+ ]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gdfa.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gdfa.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ddcba9fb1b03db064e6b46911777853ac506bf
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gdfa.py
@@ -0,0 +1,138 @@
+# Natural Language Toolkit: GDFA word alignment symmetrization
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Authors: Liling Tan
+# URL:
+# For license information, see LICENSE.TXT
+
+from collections import defaultdict
+
+
+def grow_diag_final_and(srclen, trglen, e2f, f2e):
+ """
+ Symmetrizes the source-to-target and target-to-source word alignment
+ outputs, producing the grow-diag-final-and (GDFA) alignment (Koehn, 2005).
+
+ Step 1: Find the intersection of the bidirectional alignments.
+
+ Step 2: Search for additional neighbor alignment points to be added, given
+ these criteria: (i) neighbor alignment points are not in the
+ intersection and (ii) neighbor alignment points are in the union.
+
+ Step 3: Add all other alignment points that are not in the intersection and
+ not among the neighboring alignments that met the criteria, but are
+ in the original forward/backward alignment outputs.
+
+ >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
+ ... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
+ >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
+ ... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
+ ... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
+ >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 "
+ ... "は L と 共 に 不連続 に 増加 する こと が "
+ ... "期待 さ れる こと を 示し た 。")
+ >>> trgtext = ("Therefore , we expect that the luminosity function "
+ ... "of such halo white dwarfs increases discontinuously "
+ ... "with the luminosity .")
+ >>> srclen = len(srctext.split())
+ >>> trglen = len(trgtext.split())
+ >>>
+ >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
+ >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
+ ... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
+ ... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
+ ... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
+ ... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
+ ... 12), (11, 6), (12, 8)]))
+ True
+
+ References:
+ Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
+ 2005. Edinburgh System Description for the 2005 IWSLT Speech
+ Translation Evaluation. In MT Eval Workshop.
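+ Both input alignments are given in pharaoh format: whitespace-separated
+ ``i-j`` pairs of 0-based source and target token indices, as in ``forw``
+ and ``back`` above.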
+ + :type srclen: int + :param srclen: the number of tokens in the source language + :type trglen: int + :param trglen: the number of tokens in the target language + :type e2f: str + :param e2f: the forward word alignment outputs from source-to-target + language (in pharaoh output format) + :type f2e: str + :param f2e: the backward word alignment outputs from target-to-source + language (in pharaoh output format) + :rtype: set(tuple(int)) + :return: the symmetrized alignment points from the GDFA algorithm + """ + + # Converts pharaoh text format into list of tuples. + e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()] + f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()] + + neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)] + alignment = set(e2f).intersection(set(f2e)) # Find the intersection. + union = set(e2f).union(set(f2e)) + + # *aligned* is used to check if neighbors are aligned in grow_diag() + aligned = defaultdict(set) + for i, j in alignment: + aligned["e"].add(i) + aligned["f"].add(j) + + def grow_diag(): + """ + Search for the neighbor points and them to the intersected alignment + points if criteria are met. + """ + prev_len = len(alignment) - 1 + # iterate until no new points added + while prev_len < len(alignment): + no_new_points = True + # for english word e = 0 ... en + for e in range(srclen): + # for foreign word f = 0 ... fn + for f in range(trglen): + # if ( e aligned with f) + if (e, f) in alignment: + # for each neighboring point (e-new, f-new) + for neighbor in neighbors: + neighbor = tuple(i + j for i, j in zip((e, f), neighbor)) + e_new, f_new = neighbor + # if ( ( e-new not aligned and f-new not aligned) + # and (e-new, f-new in union(e2f, f2e) ) + if ( + e_new not in aligned and f_new not in aligned + ) and neighbor in union: + alignment.add(neighbor) + aligned["e"].add(e_new) + aligned["f"].add(f_new) + prev_len += 1 + no_new_points = False + # iterate until no new points added + if no_new_points: + break + + def final_and(a): + """ + Adds remaining points that are not in the intersection, not in the + neighboring alignments but in the original *e2f* and *f2e* alignments + """ + # for english word e = 0 ... en + for e_new in range(srclen): + # for foreign word f = 0 ... fn + for f_new in range(trglen): + # if ( ( e-new not aligned and f-new not aligned) + # and (e-new, f-new in union(e2f, f2e) ) + if ( + e_new not in aligned + and f_new not in aligned + and (e_new, f_new) in union + ): + alignment.add((e_new, f_new)) + aligned["e"].add(e_new) + aligned["f"].add(f_new) + + grow_diag() + final_and(e2f) + final_and(f2e) + return sorted(alignment) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gleu_score.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gleu_score.py new file mode 100644 index 0000000000000000000000000000000000000000..0873ca58f4c292f0aaed78592164173a167eb920 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/gleu_score.py @@ -0,0 +1,190 @@ +# Natural Language Toolkit: GLEU Score +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: +# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan +# URL: +# For license information, see LICENSE.TXT + +""" GLEU score implementation. """ + +from collections import Counter + +from nltk.util import everygrams, ngrams + + +def sentence_gleu(references, hypothesis, min_len=1, max_len=4): + """ + Calculates the sentence level GLEU (Google-BLEU) score described in + + Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. 
Le, Mohammad Norouzi, + Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, + Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, + Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, + George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, + Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, + Jeffrey Dean. (2016) Google’s Neural Machine Translation System: + Bridging the Gap between Human and Machine Translation. + eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf + Retrieved on 27 Oct 2016. + + From Wu et al. (2016): + "The BLEU score has some undesirable properties when used for single + sentences, as it was designed to be a corpus measure. We therefore + use a slightly different score for our RL experiments which we call + the 'GLEU score'. For the GLEU score, we record all sub-sequences of + 1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then + compute a recall, which is the ratio of the number of matching n-grams + to the number of total n-grams in the target (ground truth) sequence, + and a precision, which is the ratio of the number of matching n-grams + to the number of total n-grams in the generated output sequence. Then + GLEU score is simply the minimum of recall and precision. This GLEU + score's range is always between 0 (no matches) and 1 (all match) and + it is symmetrical when switching output and target. According to + our experiments, GLEU score correlates quite well with the BLEU + metric on a corpus level but does not have its drawbacks for our per + sentence reward objective." + + Note: The initial implementation only allowed a single reference, but now + a list of references is required (which is consistent with + bleu_score.sentence_bleu()). + + The infamous "the the the ... " example + + >>> ref = 'the cat is on the mat'.split() + >>> hyp = 'the the the the the the the'.split() + >>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS + 0.0909... + + An example to evaluate normal machine translation outputs + + >>> ref1 = str('It is a guide to action that ensures that the military ' + ... 'will forever heed Party commands').split() + >>> hyp1 = str('It is a guide to action which ensures that the military ' + ... 'always obeys the commands of the party').split() + >>> hyp2 = str('It is to insure the troops forever hearing the activity ' + ... 'guidebook that party direct').split() + >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS + 0.4393... + >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS + 0.1206... + + :param references: a list of reference sentences + :type references: list(list(str)) + :param hypothesis: a hypothesis sentence + :type hypothesis: list(str) + :param min_len: The minimum order of n-gram this function should extract. + :type min_len: int + :param max_len: The maximum order of n-gram this function should extract. + :type max_len: int + :return: the sentence level GLEU score. + :rtype: float + """ + return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len) + + +def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4): + """ + Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all + the hypotheses and their respective references. + + Instead of averaging the sentence level GLEU scores (i.e. macro-average + precision), Wu et al. (2016) sum up the matching tokens and the max of + hypothesis and reference tokens for each sentence, then compute using the + aggregate values. 
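+ That is, with per-sentence counts (n_match_i, n_all_i) the corpus score
+ is sum(n_match_i) / sum(n_all_i), a micro-average, rather than the mean
+ of the per-sentence ratios.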
+
+    From Mike Schuster (via email):
+    "For the corpus, we just add up the two statistics n_match and
+    n_all = max(n_all_output, n_all_target) for all sentences, then
+    calculate gleu_score = n_match / n_all, so it is not just a mean of
+    the sentence gleu scores (in our case, longer sentences count more,
+    which I think makes sense as they are more difficult to translate)."
+
+    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...         'ensures', 'that', 'the', 'military', 'always',
+    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...          'heed', 'Party', 'commands']
+    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...          'guarantees', 'the', 'military', 'forces', 'always',
+    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...          'of', 'the', 'party']
+
+    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+    ...         'interested', 'in', 'world', 'history']
+    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+    ...          'because', 'he', 'read', 'the', 'book']
+
+    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+    >>> hypotheses = [hyp1, hyp2]
+    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
+    0.5673...
+
+    The example below shows that corpus_gleu() is different from averaging
+    sentence_gleu() for hypotheses.
+
+    >>> score1 = sentence_gleu([ref1a], hyp1)
+    >>> score2 = sentence_gleu([ref2a], hyp2)
+    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+    0.6144...
+
+    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param min_len: The minimum order of n-gram this function should extract.
+    :type min_len: int
+    :param max_len: The maximum order of n-gram this function should extract.
+    :type max_len: int
+    :return: The corpus-level GLEU score.
+    :rtype: float
+    """
+    # sanity check
+    assert len(list_of_references) == len(
+        hypotheses
+    ), "The number of hypotheses and their reference(s) should be the same"
+
+    # sum matches and max-token-lengths over all sentences
+    corpus_n_match = 0
+    corpus_n_all = 0
+
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
+
+        hyp_counts = []
+        for reference in references:
+            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.
+
+            overlap_ngrams = ref_ngrams & hyp_ngrams
+            tp = sum(overlap_ngrams.values())  # True positives.
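+            # Note: ``&`` on Counters keeps the elementwise minimum of
+            # the two counts, so ``tp`` is the standard "clipped" n-gram
+            # match count.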
+
+            # While GLEU is defined as the minimum of precision and
+            # recall, we can reduce the number of division operations by one by
+            # instead finding the maximum of the denominators for the precision
+            # and recall formulae, since the numerators are the same:
+            #     precision = tp / tpfp
+            #     recall = tp / tpfn
+            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
+            n_all = max(tpfp, tpfn)
+
+            if n_all > 0:
+                hyp_counts.append((tp, n_all))
+
+        # use the reference yielding the highest score
+        if hyp_counts:
+            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
+            corpus_n_match += n_match
+            corpus_n_all += n_all
+
+    # corner case: empty corpus or empty references---don't divide by zero!
+    if corpus_n_all == 0:
+        gleu_score = 0.0
+    else:
+        gleu_score = corpus_n_match / corpus_n_all
+
+    return gleu_score
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm1.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm1.py
new file mode 100644
index 0000000000000000000000000000000000000000..badb896968633d0db99f9b8fb2a7679b65d9a534
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm1.py
@@ -0,0 +1,251 @@
+# Natural Language Toolkit: IBM Model 1
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Author: Chin Yee Lee
+#         Hengfeng Li
+#         Ruxin Hou
+#         Calvin Tanujaya Lim
+# Based on earlier version by:
+#         Will Zhang
+#         Guan Gui
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+Lexical translation model that ignores word order.
+
+In IBM Model 1, word order is ignored for simplicity. As long as the
+word alignments are equivalent, it doesn't matter where the word occurs
+in the source or target sentence. Thus, the following three alignments
+are equally likely::
+
+    Source: je mange du jambon
+    Target: i eat some ham
+    Alignment: (0,0) (1,1) (2,2) (3,3)
+
+    Source: je mange du jambon
+    Target: some ham eat i
+    Alignment: (0,2) (1,3) (2,1) (3,0)
+
+    Source: du jambon je mange
+    Target: eat i some ham
+    Alignment: (0,3) (1,2) (2,0) (3,1)
+
+Note that an alignment is represented here as
+(word_index_in_target, word_index_in_source).
+
+The EM algorithm used in Model 1 is:
+
+:E step: In the training data, count how many times a source language
+         word is translated into a target language word, weighted by
+         the prior probability of the translation.
+
+:M step: Estimate the new probability of translation based on the
+         counts from the Expectation step.
+
+Notations
+---------
+
+:i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+:j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+:s: A word in the source language
+:t: A word in the target language
+
+References
+----------
+
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
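+
+A rough numeric sketch of one E step (illustrative values only): if the
+translation probabilities are still uniform at 0.25 and a target word t
+has three source candidates {NULL, s1, s2}, each candidate receives a
+normalized count of 0.25 / (3 * 0.25) = 1/3 for this sentence pair; the
+M step then renormalizes the accumulated counts into new translation
+probabilities.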
+""" + +import warnings +from collections import defaultdict + +from nltk.translate import AlignedSent, Alignment, IBMModel +from nltk.translate.ibm_model import Counts + + +class IBMModel1(IBMModel): + """ + Lexical translation model that ignores word order + + >>> bitext = [] + >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) + >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) + >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) + >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) + + >>> ibm1 = IBMModel1(bitext, 5) + + >>> print(round(ibm1.translation_table['buch']['book'], 3)) + 0.889 + >>> print(round(ibm1.translation_table['das']['book'], 3)) + 0.062 + >>> print(round(ibm1.translation_table['buch'][None], 3)) + 0.113 + >>> print(round(ibm1.translation_table['ja'][None], 3)) + 0.073 + + >>> test_sentence = bitext[2] + >>> test_sentence.words + ['das', 'buch', 'ist', 'ja', 'klein'] + >>> test_sentence.mots + ['the', 'book', 'is', 'small'] + >>> test_sentence.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) + + """ + + def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): + """ + Train on ``sentence_aligned_corpus`` and create a lexical + translation model. + + Translation direction is from ``AlignedSent.mots`` to + ``AlignedSent.words``. + + :param sentence_aligned_corpus: Sentence-aligned parallel corpus + :type sentence_aligned_corpus: list(AlignedSent) + + :param iterations: Number of iterations to run training algorithm + :type iterations: int + + :param probability_tables: Optional. Use this to pass in custom + probability values. If not specified, probabilities will be + set to a uniform distribution, or some other sensible value. + If specified, the following entry must be present: + ``translation_table``. + See ``IBMModel`` for the type and purpose of this table. + :type probability_tables: dict[str]: object + """ + super().__init__(sentence_aligned_corpus) + + if probability_tables is None: + self.set_uniform_probabilities(sentence_aligned_corpus) + else: + # Set user-defined probabilities + self.translation_table = probability_tables["translation_table"] + + for n in range(0, iterations): + self.train(sentence_aligned_corpus) + + self.align_all(sentence_aligned_corpus) + + def set_uniform_probabilities(self, sentence_aligned_corpus): + initial_prob = 1 / len(self.trg_vocab) + if initial_prob < IBMModel.MIN_PROB: + warnings.warn( + "Target language vocabulary is too large (" + + str(len(self.trg_vocab)) + + " words). " + "Results may be less accurate." 
+ ) + + for t in self.trg_vocab: + self.translation_table[t] = defaultdict(lambda: initial_prob) + + def train(self, parallel_corpus): + counts = Counts() + for aligned_sentence in parallel_corpus: + trg_sentence = aligned_sentence.words + src_sentence = [None] + aligned_sentence.mots + + # E step (a): Compute normalization factors to weigh counts + total_count = self.prob_all_alignments(src_sentence, trg_sentence) + + # E step (b): Collect counts + for t in trg_sentence: + for s in src_sentence: + count = self.prob_alignment_point(s, t) + normalized_count = count / total_count[t] + counts.t_given_s[t][s] += normalized_count + counts.any_t_given_s[s] += normalized_count + + # M step: Update probabilities with maximum likelihood estimate + self.maximize_lexical_translation_probabilities(counts) + + def prob_all_alignments(self, src_sentence, trg_sentence): + """ + Computes the probability of all possible word alignments, + expressed as a marginal distribution over target words t + + Each entry in the return value represents the contribution to + the total alignment probability by the target word t. + + To obtain probability(alignment | src_sentence, trg_sentence), + simply sum the entries in the return value. + + :return: Probability of t for all s in ``src_sentence`` + :rtype: dict(str): float + """ + alignment_prob_for_t = defaultdict(lambda: 0.0) + for t in trg_sentence: + for s in src_sentence: + alignment_prob_for_t[t] += self.prob_alignment_point(s, t) + return alignment_prob_for_t + + def prob_alignment_point(self, s, t): + """ + Probability that word ``t`` in the target sentence is aligned to + word ``s`` in the source sentence + """ + return self.translation_table[t][s] + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + """ + prob = 1.0 + + for j, i in enumerate(alignment_info.alignment): + if j == 0: + continue # skip the dummy zeroeth element + trg_word = alignment_info.trg_sentence[j] + src_word = alignment_info.src_sentence[i] + prob *= self.translation_table[trg_word][src_word] + + return max(prob, IBMModel.MIN_PROB) + + def align_all(self, parallel_corpus): + for sentence_pair in parallel_corpus: + self.align(sentence_pair) + + def align(self, sentence_pair): + """ + Determines the best word alignment for one sentence pair from + the corpus that the model was trained on. + + The best alignment will be set in ``sentence_pair`` when the + method returns. In contrast with the internal implementation of + IBM models, the word indices in the ``Alignment`` are zero- + indexed, not one-indexed. 
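+
+        When several source words tie for the highest translation
+        probability, the later source position wins; see the ``>=``
+        comparison in the loop below.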
+ + :param sentence_pair: A sentence in the source language and its + counterpart sentence in the target language + :type sentence_pair: AlignedSent + """ + best_alignment = [] + + for j, trg_word in enumerate(sentence_pair.words): + # Initialize trg_word to align with the NULL token + best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB) + best_alignment_point = None + for i, src_word in enumerate(sentence_pair.mots): + align_prob = self.translation_table[trg_word][src_word] + if align_prob >= best_prob: # prefer newer word in case of tie + best_prob = align_prob + best_alignment_point = i + + best_alignment.append((j, best_alignment_point)) + + sentence_pair.alignment = Alignment(best_alignment) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm2.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm2.py new file mode 100644 index 0000000000000000000000000000000000000000..0b3ff375f045f4a809778ea8d3221e6b62e5e2ad --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm2.py @@ -0,0 +1,319 @@ +# Natural Language Toolkit: IBM Model 2 +# +# Copyright (C) 2001-2013 NLTK Project +# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim +# URL: +# For license information, see LICENSE.TXT + +""" +Lexical translation model that considers word order. + +IBM Model 2 improves on Model 1 by accounting for word order. +An alignment probability is introduced, a(i | j,l,m), which predicts +a source word position, given its aligned target word's position. + +The EM algorithm used in Model 2 is: + +:E step: In the training data, collect counts, weighted by prior + probabilities. + + - (a) count how many times a source language word is translated + into a target language word + - (b) count how many times a particular position in the source + sentence is aligned to a particular position in the target + sentence + +:M step: Estimate new probabilities based on the counts from the E step + +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language + +References +---------- + +Philipp Koehn. 2010. Statistical Machine Translation. +Cambridge University Press, New York. + +Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and +Robert L. Mercer. 1993. The Mathematics of Statistical Machine +Translation: Parameter Estimation. Computational Linguistics, 19 (2), +263-311. 
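+
+As a small illustration of the added alignment term (values follow from
+the uniform initialization used below): for a sentence pair with l = 4
+and m = 4, training starts from a(i | j,4,4) = 1 / (l + 1) = 0.2 for
+every source position i in {0, ..., 4}, so Model 2 initially behaves
+like Model 1 and only the EM updates introduce positional preferences.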
+""" + +import warnings +from collections import defaultdict + +from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1 +from nltk.translate.ibm_model import Counts + + +class IBMModel2(IBMModel): + """ + Lexical translation model that considers word order + + >>> bitext = [] + >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) + >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) + >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) + >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) + + >>> ibm2 = IBMModel2(bitext, 5) + + >>> print(round(ibm2.translation_table['buch']['book'], 3)) + 1.0 + >>> print(round(ibm2.translation_table['das']['book'], 3)) + 0.0 + >>> print(round(ibm2.translation_table['buch'][None], 3)) + 0.0 + >>> print(round(ibm2.translation_table['ja'][None], 3)) + 0.0 + + >>> print(round(ibm2.alignment_table[1][1][2][2], 3)) + 0.939 + >>> print(round(ibm2.alignment_table[1][2][2][2], 3)) + 0.0 + >>> print(round(ibm2.alignment_table[2][2][4][5], 3)) + 1.0 + + >>> test_sentence = bitext[2] + >>> test_sentence.words + ['das', 'buch', 'ist', 'ja', 'klein'] + >>> test_sentence.mots + ['the', 'book', 'is', 'small'] + >>> test_sentence.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) + + """ + + def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): + """ + Train on ``sentence_aligned_corpus`` and create a lexical + translation model and an alignment model. + + Translation direction is from ``AlignedSent.mots`` to + ``AlignedSent.words``. + + :param sentence_aligned_corpus: Sentence-aligned parallel corpus + :type sentence_aligned_corpus: list(AlignedSent) + + :param iterations: Number of iterations to run training algorithm + :type iterations: int + + :param probability_tables: Optional. Use this to pass in custom + probability values. If not specified, probabilities will be + set to a uniform distribution, or some other sensible value. + If specified, all the following entries must be present: + ``translation_table``, ``alignment_table``. + See ``IBMModel`` for the type and purpose of these tables. 
+ :type probability_tables: dict[str]: object + """ + super().__init__(sentence_aligned_corpus) + + if probability_tables is None: + # Get translation probabilities from IBM Model 1 + # Run more iterations of training for Model 1, since it is + # faster than Model 2 + ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations) + self.translation_table = ibm1.translation_table + self.set_uniform_probabilities(sentence_aligned_corpus) + else: + # Set user-defined probabilities + self.translation_table = probability_tables["translation_table"] + self.alignment_table = probability_tables["alignment_table"] + + for n in range(0, iterations): + self.train(sentence_aligned_corpus) + + self.align_all(sentence_aligned_corpus) + + def set_uniform_probabilities(self, sentence_aligned_corpus): + # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m + l_m_combinations = set() + for aligned_sentence in sentence_aligned_corpus: + l = len(aligned_sentence.mots) + m = len(aligned_sentence.words) + if (l, m) not in l_m_combinations: + l_m_combinations.add((l, m)) + initial_prob = 1 / (l + 1) + if initial_prob < IBMModel.MIN_PROB: + warnings.warn( + "A source sentence is too long (" + + str(l) + + " words). Results may be less accurate." + ) + + for i in range(0, l + 1): + for j in range(1, m + 1): + self.alignment_table[i][j][l][m] = initial_prob + + def train(self, parallel_corpus): + counts = Model2Counts() + for aligned_sentence in parallel_corpus: + src_sentence = [None] + aligned_sentence.mots + trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed + l = len(aligned_sentence.mots) + m = len(aligned_sentence.words) + + # E step (a): Compute normalization factors to weigh counts + total_count = self.prob_all_alignments(src_sentence, trg_sentence) + + # E step (b): Collect counts + for j in range(1, m + 1): + t = trg_sentence[j] + for i in range(0, l + 1): + s = src_sentence[i] + count = self.prob_alignment_point(i, j, src_sentence, trg_sentence) + normalized_count = count / total_count[t] + + counts.update_lexical_translation(normalized_count, s, t) + counts.update_alignment(normalized_count, i, j, l, m) + + # M step: Update probabilities with maximum likelihood estimates + self.maximize_lexical_translation_probabilities(counts) + self.maximize_alignment_probabilities(counts) + + def maximize_alignment_probabilities(self, counts): + MIN_PROB = IBMModel.MIN_PROB + for i, j_s in counts.alignment.items(): + for j, src_sentence_lengths in j_s.items(): + for l, trg_sentence_lengths in src_sentence_lengths.items(): + for m in trg_sentence_lengths: + estimate = ( + counts.alignment[i][j][l][m] + / counts.alignment_for_any_i[j][l][m] + ) + self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB) + + def prob_all_alignments(self, src_sentence, trg_sentence): + """ + Computes the probability of all possible word alignments, + expressed as a marginal distribution over target words t + + Each entry in the return value represents the contribution to + the total alignment probability by the target word t. + + To obtain probability(alignment | src_sentence, trg_sentence), + simply sum the entries in the return value. 
+ + :return: Probability of t for all s in ``src_sentence`` + :rtype: dict(str): float + """ + alignment_prob_for_t = defaultdict(lambda: 0.0) + for j in range(1, len(trg_sentence)): + t = trg_sentence[j] + for i in range(0, len(src_sentence)): + alignment_prob_for_t[t] += self.prob_alignment_point( + i, j, src_sentence, trg_sentence + ) + return alignment_prob_for_t + + def prob_alignment_point(self, i, j, src_sentence, trg_sentence): + """ + Probability that position j in ``trg_sentence`` is aligned to + position i in the ``src_sentence`` + """ + l = len(src_sentence) - 1 + m = len(trg_sentence) - 1 + s = src_sentence[i] + t = trg_sentence[j] + return self.translation_table[t][s] * self.alignment_table[i][j][l][m] + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + """ + prob = 1.0 + l = len(alignment_info.src_sentence) - 1 + m = len(alignment_info.trg_sentence) - 1 + + for j, i in enumerate(alignment_info.alignment): + if j == 0: + continue # skip the dummy zeroeth element + trg_word = alignment_info.trg_sentence[j] + src_word = alignment_info.src_sentence[i] + prob *= ( + self.translation_table[trg_word][src_word] + * self.alignment_table[i][j][l][m] + ) + + return max(prob, IBMModel.MIN_PROB) + + def align_all(self, parallel_corpus): + for sentence_pair in parallel_corpus: + self.align(sentence_pair) + + def align(self, sentence_pair): + """ + Determines the best word alignment for one sentence pair from + the corpus that the model was trained on. + + The best alignment will be set in ``sentence_pair`` when the + method returns. In contrast with the internal implementation of + IBM models, the word indices in the ``Alignment`` are zero- + indexed, not one-indexed. + + :param sentence_pair: A sentence in the source language and its + counterpart sentence in the target language + :type sentence_pair: AlignedSent + """ + best_alignment = [] + + l = len(sentence_pair.mots) + m = len(sentence_pair.words) + + for j, trg_word in enumerate(sentence_pair.words): + # Initialize trg_word to align with the NULL token + best_prob = ( + self.translation_table[trg_word][None] + * self.alignment_table[0][j + 1][l][m] + ) + best_prob = max(best_prob, IBMModel.MIN_PROB) + best_alignment_point = None + for i, src_word in enumerate(sentence_pair.mots): + align_prob = ( + self.translation_table[trg_word][src_word] + * self.alignment_table[i + 1][j + 1][l][m] + ) + if align_prob >= best_prob: + best_prob = align_prob + best_alignment_point = i + + best_alignment.append((j, best_alignment_point)) + + sentence_pair.alignment = Alignment(best_alignment) + + +class Model2Counts(Counts): + """ + Data object to store counts of various parameters during training. + Includes counts for alignment. 
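+    Alignment counts are keyed as ``alignment[i][j][l][m]``; the
+    marginal over i is kept in ``alignment_for_any_i[j][l][m]``,
+    mirroring the parameters of a(i | j,l,m).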
+ """ + + def __init__(self): + super().__init__() + self.alignment = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) + ) + self.alignment_for_any_i = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) + ) + + def update_lexical_translation(self, count, s, t): + self.t_given_s[t][s] += count + self.any_t_given_s[s] += count + + def update_alignment(self, count, i, j, l, m): + self.alignment[i][j][l][m] += count + self.alignment_for_any_i[j][l][m] += count diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm4.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm4.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f06e441ec422243ffe195ab480600819428f96 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/ibm4.py @@ -0,0 +1,490 @@ +# Natural Language Toolkit: IBM Model 4 +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Tah Wei Hoon +# URL: +# For license information, see LICENSE.TXT + +""" +Translation model that reorders output words based on their type and +distance from other related words in the output sentence. + +IBM Model 4 improves the distortion model of Model 3, motivated by the +observation that certain words tend to be re-ordered in a predictable +way relative to one another. For example, in English +usually has its order flipped as in French. + +Model 4 requires words in the source and target vocabularies to be +categorized into classes. This can be linguistically driven, like parts +of speech (adjective, nouns, prepositions, etc). Word classes can also +be obtained by statistical methods. The original IBM Model 4 uses an +information theoretic approach to group words into 50 classes for each +vocabulary. + +Terminology +----------- + +:Cept: + A source word with non-zero fertility i.e. aligned to one or more + target words. +:Tablet: + The set of target word(s) aligned to a cept. +:Head of cept: + The first word of the tablet of that cept. +:Center of cept: + The average position of the words in that cept's tablet. If the + value is not an integer, the ceiling is taken. + For example, for a tablet with words in positions 2, 5, 6 in the + target sentence, the center of the corresponding cept is + ceil((2 + 5 + 6) / 3) = 5 +:Displacement: + For a head word, defined as (position of head word - position of + previous cept's center). Can be positive or negative. + For a non-head word, defined as (position of non-head word - + position of previous word in the same tablet). Always positive, + because successive words in a tablet are assumed to appear to the + right of the previous word. + +In contrast to Model 3 which reorders words in a tablet independently of +other words, Model 4 distinguishes between three cases. + +1. Words generated by NULL are distributed uniformly. +2. For a head word t, its position is modeled by the probability + d_head(displacement | word_class_s(s),word_class_t(t)), + where s is the previous cept, and word_class_s and word_class_t maps + s and t to a source and target language word class respectively. +3. For a non-head word t, its position is modeled by the probability + d_non_head(displacement | word_class_t(t)) + +The EM algorithm used in Model 4 is: + +:E step: In the training data, collect counts, weighted by prior + probabilities. 
+ + - (a) count how many times a source language word is translated + into a target language word + - (b) for a particular word class, count how many times a head + word is located at a particular displacement from the + previous cept's center + - (c) for a particular word class, count how many times a + non-head word is located at a particular displacement from + the previous target word + - (d) count how many times a source word is aligned to phi number + of target words + - (e) count how many times NULL is aligned to a target word + +:M step: Estimate new probabilities based on the counts from the E step + +Like Model 3, there are too many possible alignments to consider. Thus, +a hill climbing approach is used to sample good candidates. + +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 +:dj: Displacement, Δj + +References +---------- + +Philipp Koehn. 2010. Statistical Machine Translation. +Cambridge University Press, New York. + +Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and +Robert L. Mercer. 1993. The Mathematics of Statistical Machine +Translation: Parameter Estimation. Computational Linguistics, 19 (2), +263-311. +""" + +import warnings +from collections import defaultdict +from math import factorial + +from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3 +from nltk.translate.ibm_model import Counts, longest_target_sentence_length + + +class IBMModel4(IBMModel): + """ + Translation model that reorders output words based on their type and + their distance from other related words in the output sentence + + >>> bitext = [] + >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) + >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) + >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) + >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) + >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) + >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) + >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) + >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) + >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } + >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } + + >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes) + + >>> print(round(ibm4.translation_table['buch']['book'], 3)) + 1.0 + >>> print(round(ibm4.translation_table['das']['book'], 3)) + 0.0 + >>> print(round(ibm4.translation_table['ja'][None], 3)) + 1.0 + + >>> print(round(ibm4.head_distortion_table[1][0][1], 3)) + 
1.0 + >>> print(round(ibm4.head_distortion_table[2][0][1], 3)) + 0.0 + >>> print(round(ibm4.non_head_distortion_table[3][6], 3)) + 0.5 + + >>> print(round(ibm4.fertility_table[2]['summarize'], 3)) + 1.0 + >>> print(round(ibm4.fertility_table[1]['book'], 3)) + 1.0 + + >>> print(round(ibm4.p1, 3)) + 0.033 + + >>> test_sentence = bitext[2] + >>> test_sentence.words + ['das', 'buch', 'ist', 'ja', 'klein'] + >>> test_sentence.mots + ['the', 'book', 'is', 'small'] + >>> test_sentence.alignment + Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) + + """ + + def __init__( + self, + sentence_aligned_corpus, + iterations, + source_word_classes, + target_word_classes, + probability_tables=None, + ): + """ + Train on ``sentence_aligned_corpus`` and create a lexical + translation model, distortion models, a fertility model, and a + model for generating NULL-aligned words. + + Translation direction is from ``AlignedSent.mots`` to + ``AlignedSent.words``. + + :param sentence_aligned_corpus: Sentence-aligned parallel corpus + :type sentence_aligned_corpus: list(AlignedSent) + + :param iterations: Number of iterations to run training algorithm + :type iterations: int + + :param source_word_classes: Lookup table that maps a source word + to its word class, the latter represented by an integer id + :type source_word_classes: dict[str]: int + + :param target_word_classes: Lookup table that maps a target word + to its word class, the latter represented by an integer id + :type target_word_classes: dict[str]: int + + :param probability_tables: Optional. Use this to pass in custom + probability values. If not specified, probabilities will be + set to a uniform distribution, or some other sensible value. + If specified, all the following entries must be present: + ``translation_table``, ``alignment_table``, + ``fertility_table``, ``p1``, ``head_distortion_table``, + ``non_head_distortion_table``. See ``IBMModel`` and + ``IBMModel4`` for the type and purpose of these tables. + :type probability_tables: dict[str]: object + """ + super().__init__(sentence_aligned_corpus) + self.reset_probabilities() + self.src_classes = source_word_classes + self.trg_classes = target_word_classes + + if probability_tables is None: + # Get probabilities from IBM model 3 + ibm3 = IBMModel3(sentence_aligned_corpus, iterations) + self.translation_table = ibm3.translation_table + self.alignment_table = ibm3.alignment_table + self.fertility_table = ibm3.fertility_table + self.p1 = ibm3.p1 + self.set_uniform_probabilities(sentence_aligned_corpus) + else: + # Set user-defined probabilities + self.translation_table = probability_tables["translation_table"] + self.alignment_table = probability_tables["alignment_table"] + self.fertility_table = probability_tables["fertility_table"] + self.p1 = probability_tables["p1"] + self.head_distortion_table = probability_tables["head_distortion_table"] + self.non_head_distortion_table = probability_tables[ + "non_head_distortion_table" + ] + + for n in range(0, iterations): + self.train(sentence_aligned_corpus) + + def reset_probabilities(self): + super().reset_probabilities() + self.head_distortion_table = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) + ) + """ + dict[int][int][int]: float. Probability(displacement of head + word | word class of previous cept,target word class). + Values accessed as ``distortion_table[dj][src_class][trg_class]``. 
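+        For example, a head word placed two positions to the right of
+        the previous cept's center is recorded under dj = +2.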
+ """ + + self.non_head_distortion_table = defaultdict( + lambda: defaultdict(lambda: self.MIN_PROB) + ) + """ + dict[int][int]: float. Probability(displacement of non-head + word | target word class). + Values accessed as ``distortion_table[dj][trg_class]``. + """ + + def set_uniform_probabilities(self, sentence_aligned_corpus): + """ + Set distortion probabilities uniformly to + 1 / cardinality of displacement values + """ + max_m = longest_target_sentence_length(sentence_aligned_corpus) + + # The maximum displacement is m-1, when a word is in the last + # position m of the target sentence and the previously placed + # word is in the first position. + # Conversely, the minimum displacement is -(m-1). + # Thus, the displacement range is (m-1) - (-(m-1)). Note that + # displacement cannot be zero and is not included in the range. + if max_m <= 1: + initial_prob = IBMModel.MIN_PROB + else: + initial_prob = 1 / (2 * (max_m - 1)) + if initial_prob < IBMModel.MIN_PROB: + warnings.warn( + "A target sentence is too long (" + + str(max_m) + + " words). Results may be less accurate." + ) + + for dj in range(1, max_m): + self.head_distortion_table[dj] = defaultdict( + lambda: defaultdict(lambda: initial_prob) + ) + self.head_distortion_table[-dj] = defaultdict( + lambda: defaultdict(lambda: initial_prob) + ) + self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob) + self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob) + + def train(self, parallel_corpus): + counts = Model4Counts() + for aligned_sentence in parallel_corpus: + m = len(aligned_sentence.words) + + # Sample the alignment space + sampled_alignments, best_alignment = self.sample(aligned_sentence) + # Record the most probable alignment + aligned_sentence.alignment = Alignment( + best_alignment.zero_indexed_alignment() + ) + + # E step (a): Compute normalization factors to weigh counts + total_count = self.prob_of_alignments(sampled_alignments) + + # E step (b): Collect counts + for alignment_info in sampled_alignments: + count = self.prob_t_a_given_s(alignment_info) + normalized_count = count / total_count + + for j in range(1, m + 1): + counts.update_lexical_translation( + normalized_count, alignment_info, j + ) + counts.update_distortion( + normalized_count, + alignment_info, + j, + self.src_classes, + self.trg_classes, + ) + + counts.update_null_generation(normalized_count, alignment_info) + counts.update_fertility(normalized_count, alignment_info) + + # M step: Update probabilities with maximum likelihood estimates + # If any probability is less than MIN_PROB, clamp it to MIN_PROB + existing_alignment_table = self.alignment_table + self.reset_probabilities() + self.alignment_table = existing_alignment_table # don't retrain + + self.maximize_lexical_translation_probabilities(counts) + self.maximize_distortion_probabilities(counts) + self.maximize_fertility_probabilities(counts) + self.maximize_null_generation_probabilities(counts) + + def maximize_distortion_probabilities(self, counts): + head_d_table = self.head_distortion_table + for dj, src_classes in counts.head_distortion.items(): + for s_cls, trg_classes in src_classes.items(): + for t_cls in trg_classes: + estimate = ( + counts.head_distortion[dj][s_cls][t_cls] + / counts.head_distortion_for_any_dj[s_cls][t_cls] + ) + head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB) + + non_head_d_table = self.non_head_distortion_table + for dj, trg_classes in counts.non_head_distortion.items(): + for t_cls in trg_classes: + estimate = ( + 
counts.non_head_distortion[dj][t_cls] + / counts.non_head_distortion_for_any_dj[t_cls] + ) + non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB) + + def prob_t_a_given_s(self, alignment_info): + """ + Probability of target sentence and an alignment given the + source sentence + """ + return IBMModel4.model4_prob_t_a_given_s(alignment_info, self) + + @staticmethod # exposed for Model 5 to use + def model4_prob_t_a_given_s(alignment_info, ibm_model): + probability = 1.0 + MIN_PROB = IBMModel.MIN_PROB + + def null_generation_term(): + # Binomial distribution: B(m - null_fertility, p1) + value = 1.0 + p1 = ibm_model.p1 + p0 = 1 - p1 + null_fertility = alignment_info.fertility_of_i(0) + m = len(alignment_info.trg_sentence) - 1 + value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) + if value < MIN_PROB: + return MIN_PROB + + # Combination: (m - null_fertility) choose null_fertility + for i in range(1, null_fertility + 1): + value *= (m - null_fertility - i + 1) / i + return value + + def fertility_term(): + value = 1.0 + src_sentence = alignment_info.src_sentence + for i in range(1, len(src_sentence)): + fertility = alignment_info.fertility_of_i(i) + value *= ( + factorial(fertility) + * ibm_model.fertility_table[fertility][src_sentence[i]] + ) + if value < MIN_PROB: + return MIN_PROB + return value + + def lexical_translation_term(j): + t = alignment_info.trg_sentence[j] + i = alignment_info.alignment[j] + s = alignment_info.src_sentence[i] + return ibm_model.translation_table[t][s] + + def distortion_term(j): + t = alignment_info.trg_sentence[j] + i = alignment_info.alignment[j] + if i == 0: + # case 1: t is aligned to NULL + return 1.0 + if alignment_info.is_head_word(j): + # case 2: t is the first word of a tablet + previous_cept = alignment_info.previous_cept(j) + src_class = None + if previous_cept is not None: + previous_s = alignment_info.src_sentence[previous_cept] + src_class = ibm_model.src_classes[previous_s] + trg_class = ibm_model.trg_classes[t] + dj = j - alignment_info.center_of_cept(previous_cept) + return ibm_model.head_distortion_table[dj][src_class][trg_class] + + # case 3: t is a subsequent word of a tablet + previous_position = alignment_info.previous_in_tablet(j) + trg_class = ibm_model.trg_classes[t] + dj = j - previous_position + return ibm_model.non_head_distortion_table[dj][trg_class] + + # end nested functions + + # Abort computation whenever probability falls below MIN_PROB at + # any point, since MIN_PROB can be considered as zero + probability *= null_generation_term() + if probability < MIN_PROB: + return MIN_PROB + + probability *= fertility_term() + if probability < MIN_PROB: + return MIN_PROB + + for j in range(1, len(alignment_info.trg_sentence)): + probability *= lexical_translation_term(j) + if probability < MIN_PROB: + return MIN_PROB + + probability *= distortion_term(j) + if probability < MIN_PROB: + return MIN_PROB + + return probability + + +class Model4Counts(Counts): + """ + Data object to store counts of various parameters during training. + Includes counts for distortion. 
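+    Head-word counts are keyed as
+    ``head_distortion[dj][src_class][trg_class]``; non-head counts as
+    ``non_head_distortion[dj][trg_class]``, matching the two distortion
+    tables they are used to estimate.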
+ """ + + def __init__(self): + super().__init__() + self.head_distortion = defaultdict( + lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) + ) + self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0)) + self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0)) + self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0) + + def update_distortion(self, count, alignment_info, j, src_classes, trg_classes): + i = alignment_info.alignment[j] + t = alignment_info.trg_sentence[j] + if i == 0: + # case 1: t is aligned to NULL + pass + elif alignment_info.is_head_word(j): + # case 2: t is the first word of a tablet + previous_cept = alignment_info.previous_cept(j) + if previous_cept is not None: + previous_src_word = alignment_info.src_sentence[previous_cept] + src_class = src_classes[previous_src_word] + else: + src_class = None + trg_class = trg_classes[t] + dj = j - alignment_info.center_of_cept(previous_cept) + self.head_distortion[dj][src_class][trg_class] += count + self.head_distortion_for_any_dj[src_class][trg_class] += count + else: + # case 3: t is a subsequent word of a tablet + previous_j = alignment_info.previous_in_tablet(j) + trg_class = trg_classes[t] + dj = j - previous_j + self.non_head_distortion[dj][trg_class] += count + self.non_head_distortion_for_any_dj[trg_class] += count diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/meteor_score.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/meteor_score.py new file mode 100644 index 0000000000000000000000000000000000000000..da8d6a5c2a2248c30cd4deb576c43555fa91362c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/meteor_score.py @@ -0,0 +1,409 @@ +# Natural Language Toolkit: Machine Translation +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Uday Krishna +# Contributor: Tom Aarsen +# URL: +# For license information, see LICENSE.TXT + + +from itertools import chain, product +from typing import Callable, Iterable, List, Tuple + +from nltk.corpus import WordNetCorpusReader, wordnet +from nltk.stem.api import StemmerI +from nltk.stem.porter import PorterStemmer + + +def _generate_enums( + hypothesis: Iterable[str], + reference: Iterable[str], + preprocess: Callable[[str], str] = str.lower, +) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Takes in pre-tokenized inputs for hypothesis and reference and returns + enumerated word lists for each of them + + :param hypothesis: pre-tokenized hypothesis + :param reference: pre-tokenized reference + :preprocess: preprocessing method (default str.lower) + :return: enumerated words list + """ + if isinstance(hypothesis, str): + raise TypeError( + f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}' + ) + + if isinstance(reference, str): + raise TypeError( + f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}' + ) + + enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis))) + enum_reference_list = list(enumerate(map(preprocess, reference))) + return enum_hypothesis_list, enum_reference_list + + +def exact_match( + hypothesis: Iterable[str], reference: Iterable[str] +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + matches exact words in hypothesis and reference + and returns a word mapping based on the enumerated + word id between hypothesis and reference + + :param hypothesis: pre-tokenized hypothesis + :param reference: pre-tokenized reference + :return: enumerated matched tuples, enumerated unmatched hypothesis 
tuples, + enumerated unmatched reference tuples + """ + enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) + return _match_enums(enum_hypothesis_list, enum_reference_list) + + +def _match_enums( + enum_hypothesis_list: List[Tuple[int, str]], + enum_reference_list: List[Tuple[int, str]], +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + matches exact words in hypothesis and reference and returns + a word mapping between enum_hypothesis_list and enum_reference_list + based on the enumerated word id. + + :param enum_hypothesis_list: enumerated hypothesis list + :param enum_reference_list: enumerated reference list + :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, + enumerated unmatched reference tuples + """ + word_match = [] + for i in range(len(enum_hypothesis_list))[::-1]: + for j in range(len(enum_reference_list))[::-1]: + if enum_hypothesis_list[i][1] == enum_reference_list[j][1]: + word_match.append( + (enum_hypothesis_list[i][0], enum_reference_list[j][0]) + ) + enum_hypothesis_list.pop(i) + enum_reference_list.pop(j) + break + return word_match, enum_hypothesis_list, enum_reference_list + + +def _enum_stem_match( + enum_hypothesis_list: List[Tuple[int, str]], + enum_reference_list: List[Tuple[int, str]], + stemmer: StemmerI = PorterStemmer(), +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Stems each word and matches them in hypothesis and reference + and returns a word mapping between enum_hypothesis_list and + enum_reference_list based on the enumerated word id. The function also + returns a enumerated list of unmatched words for hypothesis and reference. + + :param enum_hypothesis_list: enumerated hypothesis list + :param enum_reference_list: enumerated reference list + :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) + :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, + enumerated unmatched reference tuples + """ + stemmed_enum_hypothesis_list = [ + (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list + ] + + stemmed_enum_reference_list = [ + (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list + ] + + return _match_enums(stemmed_enum_hypothesis_list, stemmed_enum_reference_list) + + +def stem_match( + hypothesis: Iterable[str], + reference: Iterable[str], + stemmer: StemmerI = PorterStemmer(), +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Stems each word and matches them in hypothesis and reference + and returns a word mapping between hypothesis and reference + + :param hypothesis: pre-tokenized hypothesis + :param reference: pre-tokenized reference + :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) + :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, + enumerated unmatched reference tuples + """ + enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) + return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer) + + +def _enum_wordnetsyn_match( + enum_hypothesis_list: List[Tuple[int, str]], + enum_reference_list: List[Tuple[int, str]], + wordnet: WordNetCorpusReader = wordnet, +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Matches each word in reference to a word in hypothesis + if any synonym of a hypothesis word is the exact match + to the reference 
word. + + :param enum_hypothesis_list: enumerated hypothesis list + :param enum_reference_list: enumerated reference list + :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) + """ + word_match = [] + for i in range(len(enum_hypothesis_list))[::-1]: + hypothesis_syns = set( + chain.from_iterable( + ( + lemma.name() + for lemma in synset.lemmas() + if lemma.name().find("_") < 0 + ) + for synset in wordnet.synsets(enum_hypothesis_list[i][1]) + ) + ).union({enum_hypothesis_list[i][1]}) + for j in range(len(enum_reference_list))[::-1]: + if enum_reference_list[j][1] in hypothesis_syns: + word_match.append( + (enum_hypothesis_list[i][0], enum_reference_list[j][0]) + ) + enum_hypothesis_list.pop(i) + enum_reference_list.pop(j) + break + return word_match, enum_hypothesis_list, enum_reference_list + + +def wordnetsyn_match( + hypothesis: Iterable[str], + reference: Iterable[str], + wordnet: WordNetCorpusReader = wordnet, +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Matches each word in reference to a word in hypothesis if any synonym + of a hypothesis word is the exact match to the reference word. + + :param hypothesis: pre-tokenized hypothesis + :param reference: pre-tokenized reference + :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) + :return: list of mapped tuples + """ + enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) + return _enum_wordnetsyn_match( + enum_hypothesis_list, enum_reference_list, wordnet=wordnet + ) + + +def _enum_align_words( + enum_hypothesis_list: List[Tuple[int, str]], + enum_reference_list: List[Tuple[int, str]], + stemmer: StemmerI = PorterStemmer(), + wordnet: WordNetCorpusReader = wordnet, +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Aligns/matches words in the hypothesis to reference by sequentially + applying exact match, stemmed match and wordnet based synonym match. + in case there are multiple matches the match which has the least number + of crossing is chosen. Takes enumerated list as input instead of + string input + + :param enum_hypothesis_list: enumerated hypothesis list + :param enum_reference_list: enumerated reference list + :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) + :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) + :return: sorted list of matched tuples, unmatched hypothesis list, + unmatched reference list + """ + exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums( + enum_hypothesis_list, enum_reference_list + ) + + stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match( + enum_hypothesis_list, enum_reference_list, stemmer=stemmer + ) + + wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match( + enum_hypothesis_list, enum_reference_list, wordnet=wordnet + ) + + return ( + sorted( + exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0] + ), + enum_hypothesis_list, + enum_reference_list, + ) + + +def align_words( + hypothesis: Iterable[str], + reference: Iterable[str], + stemmer: StemmerI = PorterStemmer(), + wordnet: WordNetCorpusReader = wordnet, +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: + """ + Aligns/matches words in the hypothesis to reference by sequentially + applying exact match, stemmed match and wordnet based synonym match. 
+
+    In case there are multiple matches, the match which has the fewest
+    crossings is chosen.
+
+    :param hypothesis: pre-tokenized hypothesis
+    :param reference: pre-tokenized reference
+    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
+    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
+    :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
+    """
+    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
+    return _enum_align_words(
+        enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
+    )
+
+
+def _count_chunks(matches: List[Tuple[int, int]]) -> int:
+    """
+    Counts the fewest possible number of chunks such that matched unigrams
+    of each chunk are adjacent to each other. This is used to calculate the
+    fragmentation part of the metric.
+
+    :param matches: list containing a mapping of matched words (output of align_words)
+    :return: Number of chunks a sentence is divided into post alignment
+    """
+    i = 0
+    chunks = 1
+    while i < len(matches) - 1:
+        if (matches[i + 1][0] == matches[i][0] + 1) and (
+            matches[i + 1][1] == matches[i][1] + 1
+        ):
+            i += 1
+            continue
+        i += 1
+        chunks += 1
+    return chunks
+
+
+def single_meteor_score(
+    reference: Iterable[str],
+    hypothesis: Iterable[str],
+    preprocess: Callable[[str], str] = str.lower,
+    stemmer: StemmerI = PorterStemmer(),
+    wordnet: WordNetCorpusReader = wordnet,
+    alpha: float = 0.9,
+    beta: float = 3.0,
+    gamma: float = 0.5,
+) -> float:
+    """
+    Calculates METEOR score for a single hypothesis and reference as per
+    "Meteor: An Automatic Metric for MT Evaluation with High Levels of
+    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
+    in Proceedings of ACL.
+    https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
+
+
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
+
+
+    >>> round(single_meteor_score(reference1, hypothesis1),4)
+    0.6944
+
+    If no words match during the alignment, the method returns the
+    score as 0. We can safely return a zero instead of raising a
+    division by zero error as no match usually implies a bad translation.
+
+    >>> round(single_meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4)
+    0.0
+
+    :param reference: pre-tokenized reference
+    :param hypothesis: pre-tokenized hypothesis
+    :param preprocess: preprocessing function (default str.lower)
+    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
+    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
+    :param alpha: parameter for controlling relative weights of precision and recall.
+    :param beta: parameter for controlling shape of penalty as a
+                 function of fragmentation.
+    :param gamma: relative weight assigned to fragmentation penalty.
+    :return: The sentence-level METEOR score.
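+
+    A sketch of the arithmetic with the default alpha=0.9, beta=3,
+    gamma=0.5 and illustrative counts (10 matches over a 12-word
+    hypothesis and a 14-word reference, grouped into 4 chunks):
+    precision = 10/12, recall = 10/14,
+    fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall) ~ 0.725,
+    penalty = gamma * (4/10) ** beta = 0.032, and the final score is
+    (1 - penalty) * fmean ~ 0.701.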
+ """ + enum_hypothesis, enum_reference = _generate_enums( + hypothesis, reference, preprocess=preprocess + ) + translation_length = len(enum_hypothesis) + reference_length = len(enum_reference) + matches, _, _ = _enum_align_words( + enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet + ) + matches_count = len(matches) + try: + precision = float(matches_count) / translation_length + recall = float(matches_count) / reference_length + fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall) + chunk_count = float(_count_chunks(matches)) + frag_frac = chunk_count / matches_count + except ZeroDivisionError: + return 0.0 + penalty = gamma * frag_frac**beta + return (1 - penalty) * fmean + + +def meteor_score( + references: Iterable[Iterable[str]], + hypothesis: Iterable[str], + preprocess: Callable[[str], str] = str.lower, + stemmer: StemmerI = PorterStemmer(), + wordnet: WordNetCorpusReader = wordnet, + alpha: float = 0.9, + beta: float = 3.0, + gamma: float = 0.5, +) -> float: + """ + Calculates METEOR score for hypothesis with multiple references as + described in "Meteor: An Automatic Metric for MT Evaluation with + HighLevels of Correlation with Human Judgments" by Alon Lavie and + Abhaya Agarwal, in Proceedings of ACL. + https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf + + + In case of multiple references the best score is chosen. This method + iterates over single_meteor_score and picks the best pair among all + the references for a given hypothesis + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party'] + >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct'] + + >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'] + >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] + >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party'] + + >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4) + 0.6944 + + If there is no words match during the alignment the method returns the + score as 0. We can safely return a zero instead of raising a + division by zero error as no match usually implies a bad translation. + + >>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4) + 0.0 + + :param references: pre-tokenized reference sentences + :param hypothesis: a pre-tokenized hypothesis sentence + :param preprocess: preprocessing function (default str.lower) + :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) + :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) + :param alpha: parameter for controlling relative weights of precision and recall. + :param beta: parameter for controlling shape of penalty as a function + of as a function of fragmentation. + :param gamma: relative weight assigned to fragmentation penalty. + :return: The sentence-level METEOR score. 
+ """ + return max( + single_meteor_score( + reference, + hypothesis, + preprocess=preprocess, + stemmer=stemmer, + wordnet=wordnet, + alpha=alpha, + beta=beta, + gamma=gamma, + ) + for reference in references + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/metrics.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..4035e0578a7bc275cd066161a5a2d870500f38c1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/metrics.py @@ -0,0 +1,41 @@ +# Natural Language Toolkit: Translation metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Will Zhang +# Guan Gui +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + + +def alignment_error_rate(reference, hypothesis, possible=None): + """ + Return the Alignment Error Rate (AER) of an alignment + with respect to a "gold standard" reference alignment. + Return an error rate between 0.0 (perfect alignment) and 1.0 (no + alignment). + + >>> from nltk.translate import Alignment + >>> ref = Alignment([(0, 0), (1, 1), (2, 2)]) + >>> test = Alignment([(0, 0), (1, 2), (2, 1)]) + >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS + 0.6666666666666667 + + :type reference: Alignment + :param reference: A gold standard alignment (sure alignments) + :type hypothesis: Alignment + :param hypothesis: A hypothesis alignment (aka. candidate alignments) + :type possible: Alignment or None + :param possible: A gold standard reference of possible alignments + (defaults to *reference* if None) + :rtype: float or None + """ + + if possible is None: + possible = reference + else: + assert reference.issubset(possible) # sanity check + + return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float( + len(hypothesis) + len(reference) + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/nist_score.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/nist_score.py new file mode 100644 index 0000000000000000000000000000000000000000..ac3d2babd0dcd007d20946e824b600ff66b812c4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/nist_score.py @@ -0,0 +1,195 @@ +# Natural Language Toolkit: NIST Score +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: +# Contributors: +# URL: +# For license information, see LICENSE.TXT + +"""NIST score implementation.""" + +import fractions +import math +from collections import Counter + +from nltk.util import ngrams + + +def sentence_nist(references, hypothesis, n=5): + """ + Calculate NIST score from + George Doddington. 2002. "Automatic evaluation of machine translation quality + using n-gram co-occurrence statistics." Proceedings of HLT. + Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273 + + DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU + score. The official script used by NIST to compute BLEU and NIST score is + mteval-14.pl. The main differences are: + + - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean. + - NIST has a different brevity penalty + - NIST score from mteval-14.pl has a self-contained tokenizer + + Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT + used in the NIST score computation. + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + + >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', + ... 
'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...                'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+    3.3709...
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+    1.4619...
+
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param n: highest n-gram order
+    :type n: int
+    """
+    return corpus_nist([references], [hypothesis], n)
+
+
+def corpus_nist(list_of_references, hypotheses, n=5):
+    """
+    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
+    the hypotheses and their respective references.
+
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param n: highest n-gram order
+    :type n: int
+    """
+    # Before proceeding to compute NIST, perform sanity checks.
+    assert len(list_of_references) == len(
+        hypotheses
+    ), "The number of hypotheses and their reference(s) should be the same"
+
+    # Collect the ngram counts from the reference sentences.
+    ngram_freq = Counter()
+    total_reference_words = 0
+    for (
+        references
+    ) in list_of_references:  # For each source sent, there's a list of reference sents.
+        for reference in references:
+            # For each order of ngram, count the ngram occurrences.
+            for i in range(1, n + 1):
+                ngram_freq.update(ngrams(reference, i))
+            total_reference_words += len(reference)
+
+    # Compute the information weights based on the reference sentences.
+    # Eqn 2 in Doddington (2002):
+    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
+    information_weights = {}
+    for _ngram in ngram_freq:  # w_1 ... w_n
+        _mgram = _ngram[:-1]  # w_1 ... w_n-1
+        # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546
+        # it's computed as such:
+        #     denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else total_reference_words
+        #     information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2)
+        #
+        # Mathematically, it's equivalent to our implementation:
+        if _mgram and _mgram in ngram_freq:
+            numerator = ngram_freq[_mgram]
+        else:
+            numerator = total_reference_words
+        information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2)
+
+    # Micro-average.
+    nist_precision_numerator_per_ngram = Counter()
+    nist_precision_denominator_per_ngram = Counter()
+    l_ref, l_sys = 0, 0
+    # For each order of ngram.
+    for i in range(1, n + 1):
+        # Iterate through each hypothesis and their corresponding references.
+        for references, hypothesis in zip(list_of_references, hypotheses):
+            hyp_len = len(hypothesis)
+
+            # Find reference with the best NIST score.
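+            # Each tuple appended below sorts by precision first, so max()
+            # picks the reference with the highest information-weighted
+            # precision for this ngram order; the remaining tuple fields
+            # (numerator, denominator, reference length) only break ties.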
+            nist_score_per_ref = []
+            for reference in references:
+                _ref_len = len(reference)
+                # Counter of ngrams in hypothesis.
+                hyp_ngrams = (
+                    Counter(ngrams(hypothesis, i))
+                    if len(hypothesis) >= i
+                    else Counter()
+                )
+                ref_ngrams = (
+                    Counter(ngrams(reference, i)) if len(reference) >= i else Counter()
+                )
+                ngram_overlaps = hyp_ngrams & ref_ngrams
+                # Precision part of the score in Eqn 3
+                _numerator = sum(
+                    information_weights[_ngram] * count
+                    for _ngram, count in ngram_overlaps.items()
+                )
+                _denominator = sum(hyp_ngrams.values())
+                _precision = 0 if _denominator == 0 else _numerator / _denominator
+                nist_score_per_ref.append(
+                    (_precision, _numerator, _denominator, _ref_len)
+                )
+            # Best reference.
+            precision, numerator, denominator, ref_len = max(nist_score_per_ref)
+            nist_precision_numerator_per_ngram[i] += numerator
+            nist_precision_denominator_per_ngram[i] += denominator
+            l_ref += ref_len
+            l_sys += hyp_len
+
+    # Final NIST micro-average mean aggregation.
+    nist_precision = 0
+    for i in nist_precision_numerator_per_ngram:
+        precision = (
+            nist_precision_numerator_per_ngram[i]
+            / nist_precision_denominator_per_ngram[i]
+        )
+        nist_precision += precision
+    # Eqn 3 in Doddington (2002)
+    return nist_precision * nist_length_penalty(l_ref, l_sys)
+
+
+def nist_length_penalty(ref_len, hyp_len):
+    """
+    Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
+
+        penalty = exp( beta * log( min( len(hyp)/len(ref), 1.0 ) )**2 )
+
+    where,
+
+    `beta` is chosen to make the brevity penalty factor = 0.5 when the
+    no. of words in the system output (hyp) is 2/3 of the average
+    no. of words in the reference translation (ref)
+
+    The NIST penalty is different from BLEU's in that it minimizes the
+    impact of small variations in translation length on the score.
+    See Fig. 4 in Doddington (2002)
+    """
+    ratio = hyp_len / ref_len
+    if 0 < ratio < 1:
+        ratio_x, score_x = 1.5, 0.5
+        beta = math.log(score_x) / math.log(ratio_x) ** 2
+        return math.exp(beta * math.log(ratio) ** 2)
+    else:  # ratio <= 0 or ratio >= 1
+        return max(min(ratio, 1.0), 0.0)
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/translate/phrase_based.py b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/phrase_based.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bef4d1f2d4b383633c623bce34d8d1fa57875e1
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/translate/phrase_based.py
@@ -0,0 +1,193 @@
+# Natural Language Toolkit: Phrase Extraction Algorithm
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
+# URL:
+# For license information, see LICENSE.TXT
+
+
+def extract(
+    f_start,
+    f_end,
+    e_start,
+    e_end,
+    alignment,
+    f_aligned,
+    srctext,
+    trgtext,
+    srclen,
+    trglen,
+    max_phrase_length,
+):
+    """
+    This function checks for alignment point consistency and extracts
+    phrases using the chunk of consistent phrases.
+
+    A phrase pair (e, f) is consistent with an alignment A if and only if:
+
+    (i) No English words in the phrase pair are aligned to words outside it.
+
+        ∀ e_i ∈ ē : (e_i, f_j) ∈ A ⇒ f_j ∈ f̄
+
+    (ii) No Foreign words in the phrase pair are aligned to words outside it.
+
+        ∀ f_j ∈ f̄ : (e_i, f_j) ∈ A ⇒ e_i ∈ ē
+
+    (iii) The phrase pair contains at least one alignment point.
+
+        ∃ e_i ∈ ē, f_j ∈ f̄ s.t. (e_i, f_j) ∈ A
+
+    :type f_start: int
+    :param f_start: Starting index of the possible foreign language phrases
+    :type f_end: int
+    :param f_end: End index of the possible foreign language phrases
+    :type e_start: int
+    :param e_start: Starting index of the possible source language phrases
+    :type e_end: int
+    :param e_end: End index of the possible source language phrases
+    :type alignment: list(tuple)
+    :param alignment: The word alignment, as a list of (source index, target index) tuples.
+    :type f_aligned: list
+    :param f_aligned: Indices of target words that are aligned to some source word.
+    :type srctext: list
+    :param srctext: The source language tokens, a list of strings.
+    :type trgtext: list
+    :param trgtext: The target language tokens, a list of strings.
+    :type srclen: int
+    :param srclen: The number of tokens in the source language tokens.
+    :type trglen: int
+    :param trglen: The number of tokens in the target language tokens.
+    :type max_phrase_length: int
+    :param max_phrase_length: Maximum length of an extracted phrase.
+    """
+
+    if f_end < 0:  # 0-based indexing.
+        return {}
+    # Check if alignment points are consistent.
+    for e, f in alignment:
+        if (f_start <= f <= f_end) and (e < e_start or e > e_end):
+            return {}
+
+    # Add phrase pairs (incl. additional unaligned f)
+    phrases = set()
+    fs = f_start
+    while True:
+        fe = min(f_end, f_start + max_phrase_length - 1)
+        while True:
+            # add phrase pair ([e_start, e_end], [fs, fe]) to set E
+            # Need to +1 in range to include the end-point.
+            src_phrase = " ".join(srctext[e_start : e_end + 1])
+            trg_phrase = " ".join(trgtext[fs : fe + 1])
+            # Include more data for later ordering.
+            phrases.add(((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase))
+            fe += 1
+            if fe in f_aligned or fe >= trglen:
+                break
+        fs -= 1
+        if fs in f_aligned or fs < 0:
+            break
+    return phrases
+
+
+def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
+    """
+    Phrase extraction algorithm extracts all consistent phrase pairs from
+    a word-aligned sentence pair.
+
+    The idea is to loop over all possible source language (e) phrases and find
+    the minimal foreign phrase (f) that matches each of them. Matching is done
+    by identifying all alignment points for the source phrase and finding the
+    shortest foreign phrase that includes all the foreign counterparts for the
+    source words.
+
+    In short, a phrase alignment has to
+    (a) contain all alignment points for all covered words
+    (b) contain at least one alignment point
+
+    >>> srctext = "michael assumes that he will stay in the house"
+    >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
+    >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
+    ...              (5,9), (6,7), (7,7), (8,8)]
+    >>> phrases = phrase_extraction(srctext, trgtext, alignment)
+    >>> for i in sorted(phrases):
+    ...     print(i)
+    ...
+    ((0, 1), (0, 1), 'michael', 'michael')
+    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
+    ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
+    ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
+    ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
+    ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
+    ((1, 2), (1, 4), 'assumes', 'geht davon aus')
+    ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
+    ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
+    ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
+    ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
+    ((2, 3), (4, 6), 'that', ', dass')
+    ((2, 3), (5, 6), 'that', 'dass')
+    ((2, 4), (4, 7), 'that he', ', dass er')
+    ((2, 4), (5, 7), 'that he', 'dass er')
+    ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
+    ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
+    ((3, 4), (6, 7), 'he', 'er')
+    ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
+    ((4, 6), (9, 10), 'will stay', 'bleibt')
+    ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
+    ((6, 8), (7, 8), 'in the', 'im')
+    ((6, 9), (7, 9), 'in the house', 'im haus')
+    ((8, 9), (8, 9), 'house', 'haus')
+
+    :type srctext: str
+    :param srctext: The sentence string from the source language.
+    :type trgtext: str
+    :param trgtext: The sentence string from the target language.
+    :type alignment: list(tuple)
+    :param alignment: The word alignment outputs as list of tuples, where
+        the first elements of tuples are the source words' indices and
+        second elements are the target words' indices. This is also the output
+        format of nltk.translate.ibm1
+    :rtype: set(tuple)
+    :return: A set of tuples; each tuple is a phrase made up of (i) its source
+        location, (ii) its target location, (iii) the source phrase and (iv) the
+        target phrase. The set represents all the possible phrases extracted
+        from the word alignments.
+    :type max_phrase_length: int
+    :param max_phrase_length: maximal phrase length; if 0 or not specified,
+        it is set to the length of the longer sentence (srctext or trgtext).
+    """
+
+    srctext = srctext.split()  # e
+    trgtext = trgtext.split()  # f
+    srclen = len(srctext)  # len(e)
+    trglen = len(trgtext)  # len(f)
+    # Keep an index of which target (f) words are aligned.
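+    # Note: only target-side indices are recorded here; extract() uses this
+    # list to grow phrase pairs over unaligned target words on either side
+    # of the minimal foreign span.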
+    f_aligned = [j for _, j in alignment]
+    max_phrase_length = max_phrase_length or max(srclen, trglen)
+
+    # set of phrase pairs BP
+    bp = set()
+
+    for e_start in range(srclen):
+        max_idx = min(srclen, e_start + max_phrase_length)
+        for e_end in range(e_start, max_idx):
+            # // find the minimally matching foreign phrase
+            # (f_start, f_end) = (length(f), 0)
+            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
+            f_start, f_end = trglen - 1, -1  # 0-based indexing
+
+            for e, f in alignment:
+                if e_start <= e <= e_end:
+                    f_start = min(f, f_start)
+                    f_end = max(f, f_end)
+            # add extract(f_start, f_end, e_start, e_end) to set BP
+            phrases = extract(
+                f_start,
+                f_end,
+                e_start,
+                e_end,
+                alignment,
+                f_aligned,
+                srctext,
+                trgtext,
+                srclen,
+                trglen,
+                max_phrase_length,
+            )
+            if phrases:
+                bp.update(phrases)
+    return bp
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6756c26735a81fe2844b9164835821dbe67e9825
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/__init__.py
@@ -0,0 +1,52 @@
+# Natural Language Toolkit: Text Trees
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+#         Steven Bird
+#         Peter Ljunglöf
+#         Tom Aarsen <>
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Tree Package
+
+This package may be used for representing hierarchical language
+structures, such as syntax trees and morphological trees.
+"""
+
+# TODO: add LabelledTree (can be used for dependency trees)
+
+from nltk.tree.immutable import (
+    ImmutableMultiParentedTree,
+    ImmutableParentedTree,
+    ImmutableProbabilisticTree,
+    ImmutableTree,
+)
+from nltk.tree.parented import MultiParentedTree, ParentedTree
+from nltk.tree.parsing import bracket_parse, sinica_parse
+from nltk.tree.prettyprinter import TreePrettyPrinter
+from nltk.tree.probabilistic import ProbabilisticTree
+from nltk.tree.transforms import (
+    chomsky_normal_form,
+    collapse_unary,
+    un_chomsky_normal_form,
+)
+from nltk.tree.tree import Tree
+
+__all__ = [
+    "ImmutableMultiParentedTree",
+    "ImmutableParentedTree",
+    "ImmutableProbabilisticTree",
+    "ImmutableTree",
+    "MultiParentedTree",
+    "ParentedTree",
+    "bracket_parse",
+    "sinica_parse",
+    "TreePrettyPrinter",
+    "ProbabilisticTree",
+    "chomsky_normal_form",
+    "collapse_unary",
+    "un_chomsky_normal_form",
+    "Tree",
+]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/immutable.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/immutable.py
new file mode 100644
index 0000000000000000000000000000000000000000..417e4f40c70270fc9eb935a38716e07544b75487
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/immutable.py
@@ -0,0 +1,124 @@
+# Natural Language Toolkit: Text Trees
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+#         Steven Bird
+#         Peter Ljunglöf
+#         Tom Aarsen <>
+# URL:
+# For license information, see LICENSE.TXT
+
+from nltk.probability import ProbabilisticMixIn
+from nltk.tree.parented import MultiParentedTree, ParentedTree
+from nltk.tree.tree import Tree
+
+
+class ImmutableTree(Tree):
+    def __init__(self, node, children=None):
+        super().__init__(node, children)
+        # Precompute our hash value. This ensures that we're really
+        # immutable. It also means we only have to calculate it once.
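+        # hash() raises TypeError when the label or any child is unhashable
+        # (e.g. a list), which is surfaced as the ValueError below.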
+ try: + self._hash = hash((self._label, tuple(self))) + except (TypeError, ValueError) as e: + raise ValueError( + "%s: node value and children " "must be immutable" % type(self).__name__ + ) from e + + def __setitem__(self, index, value): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __setslice__(self, i, j, value): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __delitem__(self, index): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __delslice__(self, i, j): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __iadd__(self, other): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __imul__(self, other): + raise ValueError("%s may not be modified" % type(self).__name__) + + def append(self, v): + raise ValueError("%s may not be modified" % type(self).__name__) + + def extend(self, v): + raise ValueError("%s may not be modified" % type(self).__name__) + + def pop(self, v=None): + raise ValueError("%s may not be modified" % type(self).__name__) + + def remove(self, v): + raise ValueError("%s may not be modified" % type(self).__name__) + + def reverse(self): + raise ValueError("%s may not be modified" % type(self).__name__) + + def sort(self): + raise ValueError("%s may not be modified" % type(self).__name__) + + def __hash__(self): + return self._hash + + def set_label(self, value): + """ + Set the node label. This will only succeed the first time the + node label is set, which should occur in ImmutableTree.__init__(). + """ + if hasattr(self, "_label"): + raise ValueError("%s may not be modified" % type(self).__name__) + self._label = value + + +class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn): + def __init__(self, node, children=None, **prob_kwargs): + ImmutableTree.__init__(self, node, children) + ProbabilisticMixIn.__init__(self, **prob_kwargs) + self._hash = hash((self._label, tuple(self), self.prob())) + + # We have to patch up these methods to make them work right: + def _frozen_class(self): + return ImmutableProbabilisticTree + + def __repr__(self): + return f"{Tree.__repr__(self)} [{self.prob()}]" + + def __str__(self): + return f"{self.pformat(margin=60)} [{self.prob()}]" + + def copy(self, deep=False): + if not deep: + return type(self)(self._label, self, prob=self.prob()) + else: + return type(self).convert(self) + + @classmethod + def convert(cls, val): + if isinstance(val, Tree): + children = [cls.convert(child) for child in val] + if isinstance(val, ProbabilisticMixIn): + return cls(val._label, children, prob=val.prob()) + else: + return cls(val._label, children, prob=1.0) + else: + return val + + +class ImmutableParentedTree(ImmutableTree, ParentedTree): + pass + + +class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree): + pass + + +__all__ = [ + "ImmutableProbabilisticTree", + "ImmutableTree", + "ImmutableParentedTree", + "ImmutableMultiParentedTree", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parented.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parented.py new file mode 100644 index 0000000000000000000000000000000000000000..8f42a60e6b0c9e2e600f8d4d125453a2dcb744f0 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parented.py @@ -0,0 +1,590 @@ +# Natural Language Toolkit: Text Trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Peter Ljunglöf +# Tom Aarsen <> +# URL: +# For license information, see LICENSE.TXT + +import warnings +from abc import ABCMeta, 
abstractmethod
+
+from nltk.tree.tree import Tree
+from nltk.util import slice_bounds
+
+
+######################################################################
+## Parented trees
+######################################################################
+class AbstractParentedTree(Tree, metaclass=ABCMeta):
+    """
+    An abstract base class for a ``Tree`` that automatically maintains
+    pointers to parent nodes. These parent pointers are updated
+    whenever any change is made to a tree's structure. Two subclasses
+    are currently defined:
+
+    - ``ParentedTree`` is used for tree structures where each subtree
+      has at most one parent. This class should be used in cases
+      where there is no "sharing" of subtrees.
+
+    - ``MultiParentedTree`` is used for tree structures where a
+      subtree may have zero or more parents. This class should be
+      used in cases where subtrees may be shared.
+
+    Subclassing
+    ===========
+    The ``AbstractParentedTree`` class redefines all operations that
+    modify a tree's structure to call two methods, which are used by
+    subclasses to update parent information:
+
+    - ``_setparent()`` is called whenever a new child is added.
+    - ``_delparent()`` is called whenever a child is removed.
+    """
+
+    def __init__(self, node, children=None):
+        super().__init__(node, children)
+        # If children is None, the tree is read from node, and
+        # all parents will be set during parsing.
+        if children is not None:
+            # Otherwise we have to set the parent of the children.
+            # Iterate over self, and *not* children, because children
+            # might be an iterator.
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    self._setparent(child, i, dry_run=True)
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    self._setparent(child, i)
+
+    # ////////////////////////////////////////////////////////////
+    # Parent management
+    # ////////////////////////////////////////////////////////////
+    @abstractmethod
+    def _setparent(self, child, index, dry_run=False):
+        """
+        Update the parent pointer of ``child`` to point to ``self``. This
+        method is only called if the type of ``child`` is ``Tree``;
+        i.e., it is not called when adding a leaf to a tree. This method
+        is always called before the child is actually added to the
+        child list of ``self``.
+
+        :type child: Tree
+        :type index: int
+        :param index: The index of ``child`` in ``self``.
+        :raise TypeError: If ``child`` is a tree with an inappropriate
+            type. Typically, if ``child`` is a tree, then its type needs
+            to match the type of ``self``. This prevents mixing of
+            different tree types (single-parented, multi-parented, and
+            non-parented).
+        :param dry_run: If true, don't actually set the child's
+            parent pointer; just check for any error conditions, and
+            raise an exception if one is found.
+        """
+
+    @abstractmethod
+    def _delparent(self, child, index):
+        """
+        Update the parent pointer of ``child`` to not point to ``self``. This
+        method is only called if the type of ``child`` is ``Tree``; i.e., it
+        is not called when removing a leaf from a tree. This method
+        is always called before the child is actually removed from the
+        child list of ``self``.
+
+        :type child: Tree
+        :type index: int
+        :param index: The index of ``child`` in ``self``.
+        """
+
+    # ////////////////////////////////////////////////////////////
+    # Methods that add/remove children
+    # ////////////////////////////////////////////////////////////
+    # Every method that adds or removes a child must make
+    # appropriate calls to _setparent() and _delparent().
+ + def __delitem__(self, index): + # del ptree[start:stop] + if isinstance(index, slice): + start, stop, step = slice_bounds(self, index, allow_step=True) + # Clear all the children pointers. + for i in range(start, stop, step): + if isinstance(self[i], Tree): + self._delparent(self[i], i) + # Delete the children from our child list. + super().__delitem__(index) + + # del ptree[i] + elif isinstance(index, int): + if index < 0: + index += len(self) + if index < 0: + raise IndexError("index out of range") + # Clear the child's parent pointer. + if isinstance(self[index], Tree): + self._delparent(self[index], index) + # Remove the child from our child list. + super().__delitem__(index) + + elif isinstance(index, (list, tuple)): + # del ptree[()] + if len(index) == 0: + raise IndexError("The tree position () may not be deleted.") + # del ptree[(i,)] + elif len(index) == 1: + del self[index[0]] + # del ptree[i1, i2, i3] + else: + del self[index[0]][index[1:]] + + else: + raise TypeError( + "%s indices must be integers, not %s" + % (type(self).__name__, type(index).__name__) + ) + + def __setitem__(self, index, value): + # ptree[start:stop] = value + if isinstance(index, slice): + start, stop, step = slice_bounds(self, index, allow_step=True) + # make a copy of value, in case it's an iterator + if not isinstance(value, (list, tuple)): + value = list(value) + # Check for any error conditions, so we can avoid ending + # up in an inconsistent state if an error does occur. + for i, child in enumerate(value): + if isinstance(child, Tree): + self._setparent(child, start + i * step, dry_run=True) + # clear the child pointers of all parents we're removing + for i in range(start, stop, step): + if isinstance(self[i], Tree): + self._delparent(self[i], i) + # set the child pointers of the new children. We do this + # after clearing *all* child pointers, in case we're e.g. + # reversing the elements in a tree. + for i, child in enumerate(value): + if isinstance(child, Tree): + self._setparent(child, start + i * step) + # finally, update the content of the child list itself. + super().__setitem__(index, value) + + # ptree[i] = value + elif isinstance(index, int): + if index < 0: + index += len(self) + if index < 0: + raise IndexError("index out of range") + # if the value is not changing, do nothing. + if value is self[index]: + return + # Set the new child's parent pointer. + if isinstance(value, Tree): + self._setparent(value, index) + # Remove the old child's parent pointer + if isinstance(self[index], Tree): + self._delparent(self[index], index) + # Update our child list. + super().__setitem__(index, value) + + elif isinstance(index, (list, tuple)): + # ptree[()] = value + if len(index) == 0: + raise IndexError("The tree position () may not be assigned to.") + # ptree[(i,)] = value + elif len(index) == 1: + self[index[0]] = value + # ptree[i1, i2, i3] = value + else: + self[index[0]][index[1:]] = value + + else: + raise TypeError( + "%s indices must be integers, not %s" + % (type(self).__name__, type(index).__name__) + ) + + def append(self, child): + if isinstance(child, Tree): + self._setparent(child, len(self)) + super().append(child) + + def extend(self, children): + for child in children: + if isinstance(child, Tree): + self._setparent(child, len(self)) + super().append(child) + + def insert(self, index, child): + # Handle negative indexes. Note that if index < -len(self), + # we do *not* raise an IndexError, unlike __getitem__. This + # is done for consistency with list.__getitem__ and list.index. 
+        if index < 0:
+            index += len(self)
+        if index < 0:
+            index = 0
+        # Set the child's parent, and update our child list.
+        if isinstance(child, Tree):
+            self._setparent(child, index)
+        super().insert(index, child)
+
+    def pop(self, index=-1):
+        if index < 0:
+            index += len(self)
+        if index < 0:
+            raise IndexError("index out of range")
+        if isinstance(self[index], Tree):
+            self._delparent(self[index], index)
+        return super().pop(index)
+
+    # n.b.: like `list`, this is done by equality, not identity!
+    # To remove a specific child, use del ptree[i].
+    def remove(self, child):
+        index = self.index(child)
+        if isinstance(self[index], Tree):
+            self._delparent(self[index], index)
+        super().remove(child)
+
+    # We need to implement __getslice__ and friends, even though
+    # they're deprecated, because otherwise list.__getslice__ will get
+    # called (since we're subclassing from list). Just delegate to
+    # __getitem__ etc., but use max(0, start) and max(0, stop) because
+    # negative indices are already handled *before* __getslice__
+    # is called; and we don't want to double-count them.
+    if hasattr(list, "__getslice__"):
+
+        def __getslice__(self, start, stop):
+            return self.__getitem__(slice(max(0, start), max(0, stop)))
+
+        def __delslice__(self, start, stop):
+            return self.__delitem__(slice(max(0, start), max(0, stop)))
+
+        def __setslice__(self, start, stop, value):
+            return self.__setitem__(slice(max(0, start), max(0, stop)), value)
+
+    def __getnewargs__(self):
+        """Method used by the pickle module when un-pickling.
+        This method provides the arguments passed to ``__new__``
+        upon un-pickling. Without this method, ParentedTree instances
+        cannot be pickled and unpickled from Python 3.7 onwards.
+
+        :return: Tuple of arguments for ``__new__``, i.e. the label
+            and the children of this node.
+        :rtype: Tuple[Any, List[AbstractParentedTree]]
+        """
+        return (self._label, list(self))
+
+
+class ParentedTree(AbstractParentedTree):
+    """
+    A ``Tree`` that automatically maintains parent pointers for
+    single-parented trees. The following are methods for querying
+    the structure of a parented tree: ``parent``, ``parent_index``,
+    ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.
+
+    Each ``ParentedTree`` may have at most one parent. In
+    particular, subtrees may not be shared. Any attempt to reuse a
+    single ``ParentedTree`` as a child of more than one parent (or
+    as multiple children of the same parent) will cause a
+    ``ValueError`` exception to be raised.
+
+    ``ParentedTrees`` should never be used in the same tree as ``Trees``
+    or ``MultiParentedTrees``. Mixing tree implementations may result
+    in incorrect parent pointers and in ``TypeError`` exceptions.
+    """
+
+    def __init__(self, node, children=None):
+        self._parent = None
+        """The parent of this Tree, or None if it has no parent."""
+        super().__init__(node, children)
+        if children is None:
+            # If children is None, the tree is read from node.
+            # After parsing, the parent of the immediate children
+            # will point to an intermediate tree, not self.
+            # We fix this by brute force:
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    child._parent = None
+                    self._setparent(child, i)
+
+    def _frozen_class(self):
+        from nltk.tree.immutable import ImmutableParentedTree
+
+        return ImmutableParentedTree
+
+    def copy(self, deep=False):
+        if not deep:
+            warnings.warn(
+                f"{self.__class__.__name__} objects do not support shallow copies. Defaulting to a deep copy."
+ ) + return super().copy(deep=True) + + # ///////////////////////////////////////////////////////////////// + # Methods + # ///////////////////////////////////////////////////////////////// + + def parent(self): + """The parent of this tree, or None if it has no parent.""" + return self._parent + + def parent_index(self): + """ + The index of this tree in its parent. I.e., + ``ptree.parent()[ptree.parent_index()] is ptree``. Note that + ``ptree.parent_index()`` is not necessarily equal to + ``ptree.parent.index(ptree)``, since the ``index()`` method + returns the first child that is equal to its argument. + """ + if self._parent is None: + return None + for i, child in enumerate(self._parent): + if child is self: + return i + assert False, "expected to find self in self._parent!" + + def left_sibling(self): + """The left sibling of this tree, or None if it has none.""" + parent_index = self.parent_index() + if self._parent and parent_index > 0: + return self._parent[parent_index - 1] + return None # no left sibling + + def right_sibling(self): + """The right sibling of this tree, or None if it has none.""" + parent_index = self.parent_index() + if self._parent and parent_index < (len(self._parent) - 1): + return self._parent[parent_index + 1] + return None # no right sibling + + def root(self): + """ + The root of this tree. I.e., the unique ancestor of this tree + whose parent is None. If ``ptree.parent()`` is None, then + ``ptree`` is its own root. + """ + root = self + while root.parent() is not None: + root = root.parent() + return root + + def treeposition(self): + """ + The tree position of this tree, relative to the root of the + tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. + """ + if self.parent() is None: + return () + else: + return self.parent().treeposition() + (self.parent_index(),) + + # ///////////////////////////////////////////////////////////////// + # Parent Management + # ///////////////////////////////////////////////////////////////// + + def _delparent(self, child, index): + # Sanity checks + assert isinstance(child, ParentedTree) + assert self[index] is child + assert child._parent is self + + # Delete child's parent pointer. + child._parent = None + + def _setparent(self, child, index, dry_run=False): + # If the child's type is incorrect, then complain. + if not isinstance(child, ParentedTree): + raise TypeError("Can not insert a non-ParentedTree into a ParentedTree") + + # If child already has a parent, then complain. + if hasattr(child, "_parent") and child._parent is not None: + raise ValueError("Can not insert a subtree that already has a parent.") + + # Set child's parent pointer & index. + if not dry_run: + child._parent = self + + +class MultiParentedTree(AbstractParentedTree): + """ + A ``Tree`` that automatically maintains parent pointers for + multi-parented trees. The following are methods for querying the + structure of a multi-parented tree: ``parents()``, ``parent_indices()``, + ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``. + + Each ``MultiParentedTree`` may have zero or more parents. In + particular, subtrees may be shared. If a single + ``MultiParentedTree`` is used as multiple children of the same + parent, then that parent will appear multiple times in its + ``parents()`` method. + + ``MultiParentedTrees`` should never be used in the same tree as + ``Trees`` or ``ParentedTrees``. Mixing tree implementations may + result in incorrect parent pointers and in ``TypeError`` exceptions. 
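+
+    A small illustrative sketch of subtree sharing (not run as a doctest)::
+
+        shared = MultiParentedTree("NP", ["Mary"])
+        t1 = MultiParentedTree("S1", [shared])
+        t2 = MultiParentedTree("S2", [shared])
+        assert len(shared.parents()) == 2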
+ """ + + def __init__(self, node, children=None): + self._parents = [] + """A list of this tree's parents. This list should not + contain duplicates, even if a parent contains this tree + multiple times.""" + super().__init__(node, children) + if children is None: + # If children is None, the tree is read from node. + # After parsing, the parent(s) of the immediate children + # will point to an intermediate tree, not self. + # We fix this by brute force: + for i, child in enumerate(self): + if isinstance(child, Tree): + child._parents = [] + self._setparent(child, i) + + def _frozen_class(self): + from nltk.tree.immutable import ImmutableMultiParentedTree + + return ImmutableMultiParentedTree + + # ///////////////////////////////////////////////////////////////// + # Methods + # ///////////////////////////////////////////////////////////////// + + def parents(self): + """ + The set of parents of this tree. If this tree has no parents, + then ``parents`` is the empty set. To check if a tree is used + as multiple children of the same parent, use the + ``parent_indices()`` method. + + :type: list(MultiParentedTree) + """ + return list(self._parents) + + def left_siblings(self): + """ + A list of all left siblings of this tree, in any of its parent + trees. A tree may be its own left sibling if it is used as + multiple contiguous children of the same parent. A tree may + appear multiple times in this list if it is the left sibling + of this tree with respect to multiple parents. + + :type: list(MultiParentedTree) + """ + return [ + parent[index - 1] + for (parent, index) in self._get_parent_indices() + if index > 0 + ] + + def right_siblings(self): + """ + A list of all right siblings of this tree, in any of its parent + trees. A tree may be its own right sibling if it is used as + multiple contiguous children of the same parent. A tree may + appear multiple times in this list if it is the right sibling + of this tree with respect to multiple parents. + + :type: list(MultiParentedTree) + """ + return [ + parent[index + 1] + for (parent, index) in self._get_parent_indices() + if index < (len(parent) - 1) + ] + + def _get_parent_indices(self): + return [ + (parent, index) + for parent in self._parents + for index, child in enumerate(parent) + if child is self + ] + + def roots(self): + """ + The set of all roots of this tree. This set is formed by + tracing all possible parent paths until trees with no parents + are found. + + :type: list(MultiParentedTree) + """ + return list(self._get_roots_helper({}).values()) + + def _get_roots_helper(self, result): + if self._parents: + for parent in self._parents: + parent._get_roots_helper(result) + else: + result[id(self)] = self + return result + + def parent_indices(self, parent): + """ + Return a list of the indices where this tree occurs as a child + of ``parent``. If this child does not occur as a child of + ``parent``, then the empty list is returned. The following is + always true:: + + for parent_index in ptree.parent_indices(parent): + parent[parent_index] is ptree + """ + if parent not in self._parents: + return [] + else: + return [index for (index, child) in enumerate(parent) if child is self] + + def treepositions(self, root): + """ + Return a list of all tree positions that can be used to reach + this multi-parented tree starting from ``root``. 
I.e., the + following is always true:: + + for treepos in ptree.treepositions(root): + root[treepos] is ptree + """ + if self is root: + return [()] + else: + return [ + treepos + (index,) + for parent in self._parents + for treepos in parent.treepositions(root) + for (index, child) in enumerate(parent) + if child is self + ] + + # ///////////////////////////////////////////////////////////////// + # Parent Management + # ///////////////////////////////////////////////////////////////// + + def _delparent(self, child, index): + # Sanity checks + assert isinstance(child, MultiParentedTree) + assert self[index] is child + assert len([p for p in child._parents if p is self]) == 1 + + # If the only copy of child in self is at index, then delete + # self from child's parent list. + for i, c in enumerate(self): + if c is child and i != index: + break + else: + child._parents.remove(self) + + def _setparent(self, child, index, dry_run=False): + # If the child's type is incorrect, then complain. + if not isinstance(child, MultiParentedTree): + raise TypeError( + "Can not insert a non-MultiParentedTree into a MultiParentedTree" + ) + + # Add self as a parent pointer if it's not already listed. + if not dry_run: + for parent in child._parents: + if parent is self: + break + else: + child._parents.append(self) + + +__all__ = [ + "ParentedTree", + "MultiParentedTree", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parsing.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a6ba190fe720678142a977a6f8f05417a77f18 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/parsing.py @@ -0,0 +1,66 @@ +# Natural Language Toolkit: Text Trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Peter Ljunglöf +# Tom Aarsen <> +# URL: +# For license information, see LICENSE.TXT + +import re + +from nltk.tree.tree import Tree + +###################################################################### +## Parsing +###################################################################### + + +def bracket_parse(s): + """ + Use Tree.read(s, remove_empty_top_bracketing=True) instead. + """ + raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.") + + +def sinica_parse(s): + """ + Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, + as shown in the following example (X represents a Chinese character): + S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) + + :return: A tree corresponding to the string representation. 
+ :rtype: Tree + :param s: The string to be converted + :type s: str + """ + tokens = re.split(r"([()| ])", s) + for i in range(len(tokens)): + if tokens[i] == "(": + tokens[i - 1], tokens[i] = ( + tokens[i], + tokens[i - 1], + ) # pull nonterminal inside parens + elif ":" in tokens[i]: + fields = tokens[i].split(":") + if len(fields) == 2: # non-terminal + tokens[i] = fields[1] + else: + tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" + elif tokens[i] == "|": + tokens[i] = "" + + treebank_string = " ".join(tokens) + return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True) + + +# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier +# s = re.sub(r'\w+:', '', s) # remove role tags + +# return s + +__all__ = [ + "bracket_parse", + "sinica_parse", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/prettyprinter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/prettyprinter.py new file mode 100644 index 0000000000000000000000000000000000000000..7919ac19d55e15101e1d640732b24b92f4476bec --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/prettyprinter.py @@ -0,0 +1,627 @@ +# Natural Language Toolkit: ASCII visualization of NLTK trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Andreas van Cranenburgh +# Peter Ljunglöf +# URL: +# For license information, see LICENSE.TXT + +""" +Pretty-printing of discontinuous trees. +Adapted from the disco-dop project, by Andreas van Cranenburgh. +https://github.com/andreasvc/disco-dop + +Interesting reference (not used for this code): +T. Eschbach et al., Orth. Hypergraph Drawing, Journal of +Graph Algorithms and Applications, 10(2) 141--157 (2006)149. +https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf +""" + +import re + +try: + from html import escape +except ImportError: + from cgi import escape + +from collections import defaultdict +from operator import itemgetter + +from nltk.tree.tree import Tree +from nltk.util import OrderedDict + +ANSICOLOR = { + "black": 30, + "red": 31, + "green": 32, + "yellow": 33, + "blue": 34, + "magenta": 35, + "cyan": 36, + "white": 37, +} + + +class TreePrettyPrinter: + """ + Pretty-print a tree in text format, either as ASCII or Unicode. + The tree can be a normal tree, or discontinuous. + + ``TreePrettyPrinter(tree, sentence=None, highlight=())`` + creates an object from which different visualizations can be created. + + :param tree: a Tree object. + :param sentence: a list of words (strings). If `sentence` is given, + `tree` must contain integers as leaves, which are taken as indices + in `sentence`. Using this you can display a discontinuous tree. + :param highlight: Optionally, a sequence of Tree objects in `tree` which + should be highlighted. Has the effect of only applying colors to nodes + in this sequence (nodes should be given as Tree objects, terminals as + indices). + + >>> from nltk.tree import Tree + >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') + >>> print(TreePrettyPrinter(tree).text()) + ... # doctest: +NORMALIZE_WHITESPACE + S + ____|____ + NP VP + | | + Mary walks + """ + + def __init__(self, tree, sentence=None, highlight=()): + if sentence is None: + leaves = tree.leaves() + if ( + leaves + and all(len(a) > 0 for a in tree.subtrees()) + and all(isinstance(a, int) for a in leaves) + ): + sentence = [str(a) for a in leaves] + else: + # this deals with empty nodes (frontier non-terminals) + # and multiple/mixed terminals under non-terminals. 
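+                # Work on a deep copy so the caller's tree is not mutated
+                # when leaves are replaced by integer indices below.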
+                tree = tree.copy(True)
+                sentence = []
+                for a in tree.subtrees():
+                    if len(a) == 0:
+                        a.append(len(sentence))
+                        sentence.append(None)
+                    elif any(not isinstance(b, Tree) for b in a):
+                        for n, b in enumerate(a):
+                            if not isinstance(b, Tree):
+                                a[n] = len(sentence)
+                                if type(b) == tuple:
+                                    b = "/".join(b)
+                                sentence.append("%s" % b)
+        self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
+            tree, sentence, highlight
+        )
+
+    def __str__(self):
+        return self.text()
+
+    def __repr__(self):
+        return "<TreePrettyPrinter with %d nodes>" % len(self.nodes)
+
+    @staticmethod
+    def nodecoords(tree, sentence, highlight):
+        """
+        Produce coordinates of nodes on a grid.
+
+        Objective:
+
+        - Produce coordinates for a non-overlapping placement of nodes and
+          horizontal lines.
+        - Order edges so that crossing edges cross a minimal number of previous
+          horizontal lines (never vertical lines).
+
+        Approach:
+
+        - bottom up level order traversal (start at terminals)
+        - at each level, identify nodes which cannot be on the same row
+        - identify nodes which cannot be in the same column
+        - place nodes into a grid at (row, column)
+        - order child-parent edges with crossing edges last
+
+        Coordinates are (row, column); the origin (0, 0) is at the top left;
+        the root node is on row 0. Coordinates do not consider the size of a
+        node (which depends on font, &c), so the width of a column of the grid
+        should be automatically determined by the element with the greatest
+        width in that column. Alternatively, the integer coordinates could be
+        converted to coordinates in which the distances between adjacent nodes
+        are non-uniform.
+
+        Produces tuple (nodes, coords, edges, highlighted) where:
+
+        - nodes[id]: Tree object for the node with this integer id
+        - coords[id]: (n, m) coordinate where to draw node with id in the grid
+        - edges[id]: parent id of node with this id (ordered dictionary)
+        - highlighted: set of ids that should be highlighted
+        """
+
+        def findcell(m, matrix, startoflevel, children):
+            """
+            Find vacant row, column index for node ``m``.
+            Iterate over current rows for this level (try lowest first)
+            and look for cell between first and last child of this node,
+            add new row to level if no free row available.
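+            Raises ValueError if no free cell can be found.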
+ """ + candidates = [a for _, a in children[m]] + minidx, maxidx = min(candidates), max(candidates) + leaves = tree[m].leaves() + center = scale * sum(leaves) // len(leaves) # center of gravity + if minidx < maxidx and not minidx < center < maxidx: + center = sum(candidates) // len(candidates) + if max(candidates) - min(candidates) > 2 * scale: + center -= center % scale # round to unscaled coordinate + if minidx < maxidx and not minidx < center < maxidx: + center += scale + if ids[m] == 0: + startoflevel = len(matrix) + for rowidx in range(startoflevel, len(matrix) + 1): + if rowidx == len(matrix): # need to add a new row + matrix.append( + [ + vertline if a not in (corner, None) else None + for a in matrix[-1] + ] + ) + row = matrix[rowidx] + if len(children[m]) == 1: # place unaries directly above child + return rowidx, next(iter(children[m]))[1] + elif all( + a is None or a == vertline + for a in row[min(candidates) : max(candidates) + 1] + ): + # find free column + for n in range(scale): + i = j = center + n + while j > minidx or i < maxidx: + if i < maxidx and ( + matrix[rowidx][i] is None or i in candidates + ): + return rowidx, i + elif j > minidx and ( + matrix[rowidx][j] is None or j in candidates + ): + return rowidx, j + i += scale + j -= scale + raise ValueError( + "could not find a free cell for:\n%s\n%s" + "min=%d; max=%d" % (tree[m], minidx, maxidx, dumpmatrix()) + ) + + def dumpmatrix(): + """Dump matrix contents for debugging purposes.""" + return "\n".join( + "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row)) + for n, row in enumerate(matrix) + ) + + leaves = tree.leaves() + if not all(isinstance(n, int) for n in leaves): + raise ValueError("All leaves must be integer indices.") + if len(leaves) != len(set(leaves)): + raise ValueError("Indices must occur at most once.") + if not all(0 <= n < len(sentence) for n in leaves): + raise ValueError( + "All leaves must be in the interval 0..n " + "with n=len(sentence)\ntokens: %d indices: " + "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence) + ) + vertline, corner = -1, -2 # constants + tree = tree.copy(True) + for a in tree.subtrees(): + a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) + scale = 2 + crossed = set() + # internal nodes and lexical nodes (no frontiers) + positions = tree.treepositions() + maxdepth = max(map(len, positions)) + 1 + childcols = defaultdict(set) + matrix = [[None] * (len(sentence) * scale)] + nodes = {} + ids = {a: n for n, a in enumerate(positions)} + highlighted_nodes = { + n for a, n in ids.items() if not highlight or tree[a] in highlight + } + levels = {n: [] for n in range(maxdepth - 1)} + terminals = [] + for a in positions: + node = tree[a] + if isinstance(node, Tree): + levels[maxdepth - node.height()].append(a) + else: + terminals.append(a) + + for n in levels: + levels[n].sort(key=lambda n: max(tree[n].leaves()) - min(tree[n].leaves())) + terminals.sort() + positions = set(positions) + + for m in terminals: + i = int(tree[m]) * scale + assert matrix[0][i] is None, (matrix[0][i], m, i) + matrix[0][i] = ids[m] + nodes[ids[m]] = sentence[tree[m]] + if nodes[ids[m]] is None: + nodes[ids[m]] = "..." + highlighted_nodes.discard(ids[m]) + positions.remove(m) + childcols[m[:-1]].add((0, i)) + + # add other nodes centered on their children, + # if the center is already taken, back off + # to the left and right alternately, until an empty cell is found. 
+ for n in sorted(levels, reverse=True): + nodesatdepth = levels[n] + startoflevel = len(matrix) + matrix.append( + [vertline if a not in (corner, None) else None for a in matrix[-1]] + ) + for m in nodesatdepth: # [::-1]: + if n < maxdepth - 1 and childcols[m]: + _, pivot = min(childcols[m], key=itemgetter(1)) + if { + a[:-1] + for row in matrix[:-1] + for a in row[:pivot] + if isinstance(a, tuple) + } & { + a[:-1] + for row in matrix[:-1] + for a in row[pivot:] + if isinstance(a, tuple) + }: + crossed.add(m) + + rowidx, i = findcell(m, matrix, startoflevel, childcols) + positions.remove(m) + + # block positions where children of this node branch out + for _, x in childcols[m]: + matrix[rowidx][x] = corner + # assert m == () or matrix[rowidx][i] in (None, corner), ( + # matrix[rowidx][i], m, str(tree), ' '.join(sentence)) + # node itself + matrix[rowidx][i] = ids[m] + nodes[ids[m]] = tree[m] + # add column to the set of children for its parent + if len(m) > 0: + childcols[m[:-1]].add((rowidx, i)) + assert len(positions) == 0 + + # remove unused columns, right to left + for m in range(scale * len(sentence) - 1, -1, -1): + if not any(isinstance(row[m], (Tree, int)) for row in matrix): + for row in matrix: + del row[m] + + # remove unused rows, reverse + matrix = [ + row + for row in reversed(matrix) + if not all(a is None or a == vertline for a in row) + ] + + # collect coordinates of nodes + coords = {} + for n, _ in enumerate(matrix): + for m, i in enumerate(matrix[n]): + if isinstance(i, int) and i >= 0: + coords[i] = n, m + + # move crossed edges last + positions = sorted( + (a for level in levels.values() for a in level), + key=lambda a: a[:-1] in crossed, + ) + + # collect edges from node to node + edges = OrderedDict() + for i in reversed(positions): + for j, _ in enumerate(tree[i]): + edges[ids[i + (j,)]] = ids[i] + + return nodes, coords, edges, highlighted_nodes + + def text( + self, + nodedist=1, + unicodelines=False, + html=False, + ansi=False, + nodecolor="blue", + leafcolor="red", + funccolor="green", + abbreviate=None, + maxwidth=16, + ): + """ + :return: ASCII art for a discontinuous tree. + + :param unicodelines: whether to use Unicode line drawing characters + instead of plain (7-bit) ASCII. + :param html: whether to wrap output in html code (default plain text). + :param ansi: whether to produce colors with ANSI escape sequences + (only effective when html==False). + :param leafcolor, nodecolor: specify colors of leaves and phrasal + nodes; effective when either html or ansi is True. + :param abbreviate: if True, abbreviate labels longer than 5 characters. + If integer, abbreviate labels longer than `abbr` characters. + :param maxwidth: maximum number of characters before a label starts to + wrap; pass None to disable. + """ + if abbreviate == True: + abbreviate = 5 + if unicodelines: + horzline = "\u2500" + leftcorner = "\u250c" + rightcorner = "\u2510" + vertline = " \u2502 " + tee = horzline + "\u252C" + horzline + bottom = horzline + "\u2534" + horzline + cross = horzline + "\u253c" + horzline + ellipsis = "\u2026" + else: + horzline = "_" + leftcorner = rightcorner = " " + vertline = " | " + tee = 3 * horzline + cross = bottom = "_|_" + ellipsis = "." 
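+        # The plain-ASCII fallback above substitutes "_" and "|" for the
+        # Unicode box-drawing characters used when unicodelines=True.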
+
+        def crosscell(cur, x=vertline):
+            """Overwrite center of this cell with a vertical branch."""
+            splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
+            lst = list(cur)
+            lst[splitl : splitl + len(x)] = list(x)
+            return "".join(lst)
+
+        result = []
+        matrix = defaultdict(dict)
+        maxnodewith = defaultdict(lambda: 3)
+        maxnodeheight = defaultdict(lambda: 1)
+        maxcol = 0
+        minchildcol = {}
+        maxchildcol = {}
+        childcols = defaultdict(set)
+        labels = {}
+        wrapre = re.compile(
+            "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth)
+        )
+        # collect labels and coordinates
+        for a in self.nodes:
+            row, column = self.coords[a]
+            matrix[row][column] = a
+            maxcol = max(maxcol, column)
+            label = (
+                self.nodes[a].label()
+                if isinstance(self.nodes[a], Tree)
+                else self.nodes[a]
+            )
+            if abbreviate and len(label) > abbreviate:
+                label = label[:abbreviate] + ellipsis
+            if maxwidth and len(label) > maxwidth:
+                label = wrapre.sub(r"\1\n", label).strip()
+            label = label.split("\n")
+            maxnodeheight[row] = max(maxnodeheight[row], len(label))
+            maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
+            labels[a] = label
+            if a not in self.edges:
+                continue  # e.g., root
+            parent = self.edges[a]
+            childcols[parent].add((row, column))
+            minchildcol[parent] = min(minchildcol.get(parent, column), column)
+            maxchildcol[parent] = max(maxchildcol.get(parent, column), column)
+        # bottom up level order traversal
+        for row in sorted(matrix, reverse=True):
+            noderows = [
+                ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+                for _ in range(maxnodeheight[row])
+            ]
+            branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+            for col in matrix[row]:
+                n = matrix[row][col]
+                node = self.nodes[n]
+                text = labels[n]
+                if isinstance(node, Tree):
+                    # draw horizontal branch towards children for this node
+                    if n in minchildcol and minchildcol[n] < maxchildcol[n]:
+                        i, j = minchildcol[n], maxchildcol[n]
+                        a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
+                        branchrow[i] = ((" " * a) + leftcorner).ljust(
+                            maxnodewith[i], horzline
+                        )
+                        branchrow[j] = (rightcorner + (" " * b)).rjust(
+                            maxnodewith[j], horzline
+                        )
+                        for i in range(minchildcol[n] + 1, maxchildcol[n]):
+                            if i == col and any(a == i for _, a in childcols[n]):
+                                line = cross
+                            elif i == col:
+                                line = bottom
+                            elif any(a == i for _, a in childcols[n]):
+                                line = tee
+                            else:
+                                line = horzline
+                            branchrow[i] = line.center(maxnodewith[i], horzline)
+                    else:  # if n and n in minchildcol:
+                        branchrow[col] = crosscell(branchrow[col])
+                text = [a.center(maxnodewith[col]) for a in text]
+                color = nodecolor if isinstance(node, Tree) else leafcolor
+                if isinstance(node, Tree) and node.label().startswith("-"):
+                    color = funccolor
+                if html:
+                    text = [escape(a, quote=False) for a in text]
+                    if n in self.highlight:
+                        text = [f"<font color={color}>{a}</font>" for a in text]
+                elif ansi and n in self.highlight:
+                    text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text]
+                for x in range(maxnodeheight[row]):
+                    # draw vertical lines in partially filled multiline node
+                    # labels, but only if it's not a frontier node.
+                    noderows[x][col] = (
+                        text[x]
+                        if x < len(text)
+                        else (vertline if childcols[n] else " ").center(
+                            maxnodewith[col], " "
+                        )
+                    )
+            # for each column, if there is a node below us which has a parent
+            # above us, draw a vertical branch in that column.
+            if row != max(matrix):
+                for n, (childrow, col) in self.coords.items():
+                    if n > 0 and self.coords[self.edges[n]][0] < row < childrow:
+                        branchrow[col] = crosscell(branchrow[col])
+                        if col not in matrix[row]:
+                            for noderow in noderows:
+                                noderow[col] = crosscell(noderow[col])
+            branchrow = [
+                a + ((a[-1] if a[-1] != " " else b[0]) * nodedist)
+                for a, b in zip(branchrow, branchrow[1:] + [" "])
+            ]
+            result.append("".join(branchrow))
+            result.extend(
+                (" " * nodedist).join(noderow) for noderow in reversed(noderows)
+            )
+        return "\n".join(reversed(result)) + "\n"
+
+    def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"):
+        """
+        :return: SVG representation of a tree.
+        """
+        fontsize = 12
+        hscale = 40
+        vscale = 25
+        hstart = vstart = 20
+        width = max(col for _, col in self.coords.values())
+        height = max(row for row, _ in self.coords.values())
+        result = [
+            '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
+            'width="%dem" height="%dem" viewBox="%d %d %d %d">'
+            % (
+                width * 3,
+                height * 2.5,
+                -hstart,
+                -vstart,
+                width * hscale + 3 * hstart,
+                height * vscale + 3 * vstart,
+            )
+        ]
+
+        children = defaultdict(set)
+        for n in self.nodes:
+            if n:
+                children[self.edges[n]].add(n)
+
+        # horizontal branches from nodes to children
+        for node in self.nodes:
+            if not children[node]:
+                continue
+            y, x = self.coords[node]
+            x *= hscale
+            y *= vscale
+            x += hstart
+            y += vstart + fontsize // 2
+            childx = [self.coords[c][1] for c in children[node]]
+            xmin = hstart + hscale * min(childx)
+            xmax = hstart + hscale * max(childx)
+            result.append(
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
+                'points="%g,%g %g,%g" />' % (xmin, y, xmax, y)
+            )
+            result.append(
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
+                'points="%g,%g %g,%g" />' % (x, y, x, y - fontsize // 3)
+            )
+
+        # vertical branches from children to parents
+        for child, parent in self.edges.items():
+            y, _ = self.coords[parent]
+            y *= vscale
+            y += vstart + fontsize // 2
+            childy, childx = self.coords[child]
+            childx *= hscale
+            childy *= vscale
+            childx += hstart
+            childy += vstart - fontsize
+            result += [
+                '\t<polyline style="stroke:white; stroke-width:10; fill:none;" '
+                'points="%g,%g %g,%g" />' % (childx, childy, childx, y + 5),
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
+                'points="%g,%g %g,%g" />' % (childx, childy, childx, y),
+            ]
+
+        # write nodes with coordinates
+        for n, (row, column) in self.coords.items():
+            node = self.nodes[n]
+            x = column * hscale + hstart
+            y = row * vscale + vstart
+            if n in self.highlight:
+                color = nodecolor if isinstance(node, Tree) else leafcolor
+                if isinstance(node, Tree) and node.label().startswith("-"):
+                    color = funccolor
+            else:
+                color = "black"
+            result += [
+                '\t<text style="text-anchor: middle; fill: %s; '
+                'font-size: %dpx;" x="%g" y="%g">%s</text>'
+                % (
+                    color,
+                    fontsize,
+                    x,
+                    y,
+                    escape(
+                        node.label() if isinstance(node, Tree) else node, quote=False
+                    ),
+                )
+            ]
+
+        result += ["</svg>"]
+        return "\n".join(result)
+
+
+def test():
+    """Do some tree drawing tests."""
+
+    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
+        print()
+        print('{}: "{}"'.format(n, " ".join(sentence or tree.leaves())))
+        print(tree)
+        print()
+        drawtree = TreePrettyPrinter(tree, sentence)
+        try:
+            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            print(drawtree.text(unicodelines=False, ansi=False, **xargs))
+
+    from nltk.corpus import treebank
+
+    for n in [0, 1440, 1591, 2771, 2170]:
+        tree = treebank.parsed_sents()[n]
+        print_tree(n, tree, nodedist=2, maxwidth=8)
+    print()
+    print("ASCII version:")
+    print(TreePrettyPrinter(tree).text(nodedist=2))
+
+    tree = Tree.fromstring(
+        "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
+        "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
+        "(vg 10) (inf (verb 11)))))) (punct 12))",
+        read_leaf=int,
+    )
+    sentence = (
+        "Ze had met haar moeder kunnen gaan winkelen ,"
+        " zwemmen of terrassen .".split()
+    )
print_tree("Discontinuous tree", tree, sentence, nodedist=2) + + +__all__ = ["TreePrettyPrinter"] + +if __name__ == "__main__": + test() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/probabilistic.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/probabilistic.py new file mode 100644 index 0000000000000000000000000000000000000000..31526b6dc22b0594bc01c2d81fedbc394737ace1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/probabilistic.py @@ -0,0 +1,74 @@ +# Natural Language Toolkit: Text Trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Peter Ljunglöf +# Tom Aarsen <> +# URL: +# For license information, see LICENSE.TXT + + +from nltk.internals import raise_unorderable_types +from nltk.probability import ProbabilisticMixIn +from nltk.tree.immutable import ImmutableProbabilisticTree +from nltk.tree.tree import Tree + +###################################################################### +## Probabilistic trees +###################################################################### + + +class ProbabilisticTree(Tree, ProbabilisticMixIn): + def __init__(self, node, children=None, **prob_kwargs): + Tree.__init__(self, node, children) + ProbabilisticMixIn.__init__(self, **prob_kwargs) + + # We have to patch up these methods to make them work right: + def _frozen_class(self): + return ImmutableProbabilisticTree + + def __repr__(self): + return f"{Tree.__repr__(self)} (p={self.prob()!r})" + + def __str__(self): + return f"{self.pformat(margin=60)} (p={self.prob():.6g})" + + def copy(self, deep=False): + if not deep: + return type(self)(self._label, self, prob=self.prob()) + else: + return type(self).convert(self) + + @classmethod + def convert(cls, val): + if isinstance(val, Tree): + children = [cls.convert(child) for child in val] + if isinstance(val, ProbabilisticMixIn): + return cls(val._label, children, prob=val.prob()) + else: + return cls(val._label, children, prob=1.0) + else: + return val + + def __eq__(self, other): + return self.__class__ is other.__class__ and ( + self._label, + list(self), + self.prob(), + ) == (other._label, list(other), other.prob()) + + def __lt__(self, other): + if not isinstance(other, Tree): + raise_unorderable_types("<", self, other) + if self.__class__ is other.__class__: + return (self._label, list(self), self.prob()) < ( + other._label, + list(other), + other.prob(), + ) + else: + return self.__class__.__name__ < other.__class__.__name__ + + +__all__ = ["ProbabilisticTree"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/transforms.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..99cd6893ce9f168ffa024f2bb8c39177617dced2 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/transforms.py @@ -0,0 +1,338 @@ +# Natural Language Toolkit: Tree Transformations +# +# Copyright (C) 2005-2007 Oregon Graduate Institute +# Author: Nathan Bodenstab +# URL: +# For license information, see LICENSE.TXT + +r""" +A collection of methods for tree (grammar) transformations used +in parsing natural language. + +Although many of these methods are technically grammar transformations +(ie. Chomsky Norm Form), when working with treebanks it is much more +natural to visualize these modifications in a tree structure. Hence, +we will do all transformation directly to the tree itself. +Transforming the tree directly also allows us to do parent annotation. +A grammar can then be simply induced from the modified tree. 
+
+The following is a short tutorial on the available transformations.
+
+ 1. Chomsky Normal Form (binarization)
+
+    It is well known that any grammar has a Chomsky Normal Form (CNF)
+    equivalent grammar where CNF is defined by every production having
+    either two non-terminals or one terminal on its right hand side.
+    When we have hierarchically structured data (ie. a treebank), it is
+    natural to view this in terms of productions where the root of every
+    subtree is the head (left hand side) of the production and all of
+    its children are the right hand side constituents.  In order to
+    convert a tree into CNF, we simply need to ensure that every subtree
+    has either two subtrees as children (binarization), or one leaf node
+    (non-terminal).  In order to binarize a subtree with more than two
+    children, we must introduce artificial nodes.
+
+    There are two popular methods to convert a tree into CNF: left
+    factoring and right factoring.  The following example demonstrates
+    the difference between them.  Example::
+
+      Original        Right-Factored         Left-Factored
+
+          A                A                       A
+        / | \            /   \                   /   \
+       B  C  D   ==>    B   A|<C-D>   OR    A|<B-C>   D
+                             /   \           /   \
+                            C     D         B     C
+
+ 2. Parent Annotation
+
+    In addition to binarizing the tree, there are two standard
+    modifications to node labels we can do in the same traversal: parent
+    annotation and Markov order-N smoothing (or sibling smoothing).
+
+    The purpose of parent annotation is to refine the probabilities of
+    productions by adding a small amount of context.  With this simple
+    addition, a CYK (inside-outside, dynamic programming chart parse)
+    can improve from 74% to 79% accuracy.  A natural generalization from
+    parent annotation is to grandparent annotation and beyond.  The
+    tradeoff becomes accuracy gain vs. computational complexity.  We
+    must also keep in mind data sparsity issues.  Example::
+
+      Original       Parent Annotation
+
+          A                    A^<?>
+        / | \                 /    \
+       B  C  D   ==>     B^<A>    A|<C-D>^<?>     where ? is the
+                                   /    \         parent of A
+                              C^<A>      D^<A>
+
+
+ 3. Markov order-N smoothing
+
+    Markov smoothing combats data sparsity issues as well as decreasing
+    computational requirements by limiting the number of children
+    included in artificial nodes.  In practice, most people use an order
+    2 grammar.  Example::
+
+      Original    No Smoothing       Markov order 1    Markov order 2   etc.
+
+       __A__           A                   A                 A
+      / /|\ \        /   \               /   \             /   \
+     B C D E F  ==> B  A|<C-D-E-F> ==>  B   A|<C>  ==>    B  A|<C-D>
+                         /   \              /   \             /   \
+                        C    ...           C    ...          C    ...
+
+
+    Annotation decisions can be thought about in the vertical direction
+    (parent, grandparent, etc) and the horizontal direction (number of
+    siblings to keep).  Parameters to the following functions specify
+    these values.  For more information see:
+
+    Dan Klein and Chris Manning (2003) "Accurate Unlexicalized
+    Parsing", ACL-03.  https://www.aclweb.org/anthology/P03-1054
+
+ 4. Unary Collapsing
+
+    Collapse unary productions (ie. subtrees with a single child) into a
+    new non-terminal (Tree node).  This is useful when working with
+    algorithms that do not allow unary productions, yet you do not wish
+    to lose the parent information.  Example::
+
+       A
+       |
+       B   ==>   A+B
+      / \        / \
+     C   D      C   D
+
+"""
+
+from nltk.tree.tree import Tree
+
+
+def chomsky_normal_form(
+    tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
+):
+    # assume all subtrees have homogeneous children
+    # assume all terminals have no siblings
+
+    # A semi-hack to have elegant looking code below.  As a result,
+    # any subtree with a branching factor greater than 999 will be incorrectly truncated.
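+    # (A finite sentinel keeps the slice bounds below uniform instead of
+    # special-casing horzMarkov=None everywhere it is used.)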
+ if horzMarkov is None: + horzMarkov = 999 + + # Traverse the tree depth-first keeping a list of ancestor nodes to the root. + # I chose not to use the tree.treepositions() method since it requires + # two traversals of the tree (one to get the positions, one to iterate + # over them) and node access time is proportional to the height of the node. + # This method is 7x faster which helps when parsing 40,000 sentences. + + nodeList = [(tree, [tree.label()])] + while nodeList != []: + node, parent = nodeList.pop() + if isinstance(node, Tree): + + # parent annotation + parentString = "" + originalNode = node.label() + if vertMarkov != 0 and node != tree and isinstance(node[0], Tree): + parentString = "{}<{}>".format(parentChar, "-".join(parent)) + node.set_label(node.label() + parentString) + parent = [originalNode] + parent[: vertMarkov - 1] + + # add children to the agenda before we mess with them + for child in node: + nodeList.append((child, parent)) + + # chomsky normal form factorization + if len(node) > 2: + childNodes = [child.label() for child in node] + nodeCopy = node.copy() + node[0:] = [] # delete the children + + curNode = node + numChildren = len(nodeCopy) + for i in range(1, numChildren - 1): + if factor == "right": + newHead = "{}{}<{}>{}".format( + originalNode, + childChar, + "-".join( + childNodes[i : min([i + horzMarkov, numChildren])] + ), + parentString, + ) # create new head + newNode = Tree(newHead, []) + curNode[0:] = [nodeCopy.pop(0), newNode] + else: + newHead = "{}{}<{}>{}".format( + originalNode, + childChar, + "-".join( + childNodes[max([numChildren - i - horzMarkov, 0]) : -i] + ), + parentString, + ) + newNode = Tree(newHead, []) + curNode[0:] = [newNode, nodeCopy.pop()] + + curNode = newNode + + curNode[0:] = [child for child in nodeCopy] + + +def un_chomsky_normal_form( + tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" +): + # Traverse the tree-depth first keeping a pointer to the parent for modification purposes. + nodeList = [(tree, [])] + while nodeList != []: + node, parent = nodeList.pop() + if isinstance(node, Tree): + # if the node contains the 'childChar' character it means that + # it is an artificial node and can be removed, although we still need + # to move its children to its parent + childIndex = node.label().find(childChar) + if childIndex != -1: + nodeIndex = parent.index(node) + parent.remove(parent[nodeIndex]) + # Generated node was on the left if the nodeIndex is 0 which + # means the grammar was left factored. We must insert the children + # at the beginning of the parent's children + if nodeIndex == 0: + parent.insert(0, node[0]) + parent.insert(1, node[1]) + else: + parent.extend([node[0], node[1]]) + + # parent is now the current node so the children of parent will be added to the agenda + node = parent + else: + parentIndex = node.label().find(parentChar) + if parentIndex != -1: + # strip the node name of the parent annotation + node.set_label(node.label()[:parentIndex]) + + # expand collapsed unary productions + if expandUnary == True: + unaryIndex = node.label().find(unaryChar) + if unaryIndex != -1: + newNode = Tree( + node.label()[unaryIndex + 1 :], [i for i in node] + ) + node.set_label(node.label()[:unaryIndex]) + node[0:] = [newNode] + + for child in node: + nodeList.append((child, node)) + + +def collapse_unary(tree, collapsePOS=False, collapseRoot=False, joinChar="+"): + """ + Collapse subtrees with a single child (ie. unary productions) + into a new non-terminal (Tree node) joined by 'joinChar'. 
+ This is useful when working with algorithms that do not allow + unary productions, and completely removing the unary productions + would require loss of useful information. The Tree is modified + directly (since it is passed by reference) and no value is returned. + + :param tree: The Tree to be collapsed + :type tree: Tree + :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. + Part-of-Speech tags) since they are always unary productions + :type collapsePOS: bool + :param collapseRoot: 'False' (default) will not modify the root production + if it is unary. For the Penn WSJ treebank corpus, this corresponds + to the TOP -> productions. + :type collapseRoot: bool + :param joinChar: A string used to connect collapsed node values (default = "+") + :type joinChar: str + """ + + if collapseRoot == False and isinstance(tree, Tree) and len(tree) == 1: + nodeList = [tree[0]] + else: + nodeList = [tree] + + # depth-first traversal of tree + while nodeList != []: + node = nodeList.pop() + if isinstance(node, Tree): + if ( + len(node) == 1 + and isinstance(node[0], Tree) + and (collapsePOS == True or isinstance(node[0, 0], Tree)) + ): + node.set_label(node.label() + joinChar + node[0].label()) + node[0:] = [child for child in node[0]] + # since we assigned the child's children to the current node, + # evaluate the current node again + nodeList.append(node) + else: + for child in node: + nodeList.append(child) + + +################################################################# +# Demonstration +################################################################# + + +def demo(): + """ + A demonstration showing how each tree transform can be used. + """ + + from copy import deepcopy + + from nltk.draw.tree import draw_trees + from nltk.tree.tree import Tree + + # original tree from WSJ bracketed text + sentence = """(TOP + (S + (S + (VP + (VBN Turned) + (ADVP (RB loose)) + (PP + (IN in) + (NP + (NP (NNP Shane) (NNP Longman) (POS 's)) + (NN trading) + (NN room))))) + (, ,) + (NP (DT the) (NN yuppie) (NNS dealers)) + (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) + (. .)))""" + t = Tree.fromstring(sentence, remove_empty_top_bracketing=True) + + # collapse subtrees with only one child + collapsedTree = deepcopy(t) + collapse_unary(collapsedTree) + + # convert the tree to CNF + cnfTree = deepcopy(collapsedTree) + chomsky_normal_form(cnfTree) + + # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two + parentTree = deepcopy(collapsedTree) + chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1) + + # convert the tree back to its original form (used to make CYK results comparable) + original = deepcopy(parentTree) + un_chomsky_normal_form(original) + + # convert tree back to bracketed text + sentence2 = original.pprint() + print(sentence) + print(sentence2) + print("Sentences the same? 
", sentence == sentence2) + + draw_trees(t, collapsedTree, cnfTree, parentTree, original) + + +if __name__ == "__main__": + demo() + +__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tree/tree.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/tree.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb19f4535117ed6b3d086bf6afffa23ae24fbec --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tree/tree.py @@ -0,0 +1,982 @@ +# Natural Language Toolkit: Text Trees +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Peter Ljunglöf +# Nathan Bodenstab (tree transforms) +# Eric Kafe (Tree.fromlist()) +# Mohaned mashaly (Deprecating methods) +# URL: +# For license information, see LICENSE.TXT + +""" +Class for representing hierarchical language structures, such as +syntax trees and morphological trees. +""" + +import re + +from nltk.grammar import Nonterminal, Production +from nltk.internals import deprecated + +###################################################################### +## Trees +###################################################################### + + +class Tree(list): + r""" + A Tree represents a hierarchical grouping of leaves and subtrees. + For example, each constituent in a syntax tree is represented by a single Tree. + + A tree's children are encoded as a list of leaves and subtrees, + where a leaf is a basic (non-tree) value; and a subtree is a + nested Tree. + + >>> from nltk.tree import Tree + >>> print(Tree(1, [2, Tree(3, [4]), 5])) + (1 2 (3 4) 5) + >>> vp = Tree('VP', [Tree('V', ['saw']), + ... Tree('NP', ['him'])]) + >>> s = Tree('S', [Tree('NP', ['I']), vp]) + >>> print(s) + (S (NP I) (VP (V saw) (NP him))) + >>> print(s[1]) + (VP (V saw) (NP him)) + >>> print(s[1,1]) + (NP him) + >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))") + >>> s == t + True + >>> t[1][1].set_label('X') + >>> t[1][1].label() + 'X' + >>> print(t) + (S (NP I) (VP (V saw) (X him))) + >>> t[0], t[1,1] = t[1,1], t[0] + >>> print(t) + (S (X him) (VP (V saw) (NP I))) + + The length of a tree is the number of children it has. + + >>> len(t) + 2 + + The set_label() and label() methods allow individual constituents + to be labeled. For example, syntax trees use this label to specify + phrase tags, such as "NP" and "VP". + + Several Tree methods use "tree positions" to specify + children or descendants of a tree. Tree positions are defined as + follows: + + - The tree position *i* specifies a Tree's *i*\ th child. + - The tree position ``()`` specifies the Tree itself. + - If *p* is the tree position of descendant *d*, then + *p+i* specifies the *i*\ th child of *d*. + + I.e., every tree position is either a single index *i*, + specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, + specifying ``tree[i1][i2]...[iN]``. + + Construct a new tree. This constructor can be called in one + of two ways: + + - ``Tree(label, children)`` constructs a new tree with the + specified label and list of children. + + - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``. 
+ """ + + def __init__(self, node, children=None): + if children is None: + raise TypeError( + "%s: Expected a node value and child list " % type(self).__name__ + ) + elif isinstance(children, str): + raise TypeError( + "%s() argument 2 should be a list, not a " + "string" % type(self).__name__ + ) + else: + list.__init__(self, children) + self._label = node + + # //////////////////////////////////////////////////////////// + # Comparison operators + # //////////////////////////////////////////////////////////// + + def __eq__(self, other): + return self.__class__ is other.__class__ and (self._label, list(self)) == ( + other._label, + list(other), + ) + + def __lt__(self, other): + if not isinstance(other, Tree): + # raise_unorderable_types("<", self, other) + # Sometimes children can be pure strings, + # so we need to be able to compare with non-trees: + return self.__class__.__name__ < other.__class__.__name__ + elif self.__class__ is other.__class__: + return (self._label, list(self)) < (other._label, list(other)) + else: + return self.__class__.__name__ < other.__class__.__name__ + + # @total_ordering doesn't work here, since the class inherits from a builtin class + __ne__ = lambda self, other: not self == other + __gt__ = lambda self, other: not (self < other or self == other) + __le__ = lambda self, other: self < other or self == other + __ge__ = lambda self, other: not self < other + + # //////////////////////////////////////////////////////////// + # Disabled list operations + # //////////////////////////////////////////////////////////// + + def __mul__(self, v): + raise TypeError("Tree does not support multiplication") + + def __rmul__(self, v): + raise TypeError("Tree does not support multiplication") + + def __add__(self, v): + raise TypeError("Tree does not support addition") + + def __radd__(self, v): + raise TypeError("Tree does not support addition") + + # //////////////////////////////////////////////////////////// + # Indexing (with support for tree positions) + # //////////////////////////////////////////////////////////// + + def __getitem__(self, index): + if isinstance(index, (int, slice)): + return list.__getitem__(self, index) + elif isinstance(index, (list, tuple)): + if len(index) == 0: + return self + elif len(index) == 1: + return self[index[0]] + else: + return self[index[0]][index[1:]] + else: + raise TypeError( + "%s indices must be integers, not %s" + % (type(self).__name__, type(index).__name__) + ) + + def __setitem__(self, index, value): + if isinstance(index, (int, slice)): + return list.__setitem__(self, index, value) + elif isinstance(index, (list, tuple)): + if len(index) == 0: + raise IndexError("The tree position () may not be " "assigned to.") + elif len(index) == 1: + self[index[0]] = value + else: + self[index[0]][index[1:]] = value + else: + raise TypeError( + "%s indices must be integers, not %s" + % (type(self).__name__, type(index).__name__) + ) + + def __delitem__(self, index): + if isinstance(index, (int, slice)): + return list.__delitem__(self, index) + elif isinstance(index, (list, tuple)): + if len(index) == 0: + raise IndexError("The tree position () may not be deleted.") + elif len(index) == 1: + del self[index[0]] + else: + del self[index[0]][index[1:]] + else: + raise TypeError( + "%s indices must be integers, not %s" + % (type(self).__name__, type(index).__name__) + ) + + # //////////////////////////////////////////////////////////// + # Basic tree operations + # //////////////////////////////////////////////////////////// + 
@deprecated("Use label() instead") + def _get_node(self): + """Outdated method to access the node value; use the label() method instead.""" + + @deprecated("Use set_label() instead") + def _set_node(self, value): + """Outdated method to set the node value; use the set_label() method instead.""" + + node = property(_get_node, _set_node) + + def label(self): + """ + Return the node label of the tree. + + >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))') + >>> t.label() + 'S' + + :return: the node label (typically a string) + :rtype: any + """ + return self._label + + def set_label(self, label): + """ + Set the node label of the tree. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.set_label("T") + >>> print(t) + (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat)))) + + :param label: the node label (typically a string) + :type label: any + """ + self._label = label + + def leaves(self): + """ + Return the leaves of the tree. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.leaves() + ['the', 'dog', 'chased', 'the', 'cat'] + + :return: a list containing this tree's leaves. + The order reflects the order of the + leaves in the tree's hierarchical structure. + :rtype: list + """ + leaves = [] + for child in self: + if isinstance(child, Tree): + leaves.extend(child.leaves()) + else: + leaves.append(child) + return leaves + + def flatten(self): + """ + Return a flat version of the tree, with all non-root non-terminals removed. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> print(t.flatten()) + (S the dog chased the cat) + + :return: a tree consisting of this tree's root connected directly to + its leaves, omitting all intervening non-terminal nodes. + :rtype: Tree + """ + return Tree(self.label(), self.leaves()) + + def height(self): + """ + Return the height of the tree. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.height() + 5 + >>> print(t[0,0]) + (D the) + >>> t[0,0].height() + 2 + + :return: The height of this tree. The height of a tree + containing no children is 1; the height of a tree + containing only leaves is 2; and the height of any other + tree is one plus the maximum of its children's + heights. + :rtype: int + """ + max_child_height = 0 + for child in self: + if isinstance(child, Tree): + max_child_height = max(max_child_height, child.height()) + else: + max_child_height = max(max_child_height, 1) + return 1 + max_child_height + + def treepositions(self, order="preorder"): + """ + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.treepositions() # doctest: +ELLIPSIS + [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] + >>> for pos in t.treepositions('leaves'): + ... t[pos] = t[pos][::-1].upper() + >>> print(t) + (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC)))) + + :param order: One of: ``preorder``, ``postorder``, ``bothorder``, + ``leaves``. 
+ """ + positions = [] + if order in ("preorder", "bothorder"): + positions.append(()) + for i, child in enumerate(self): + if isinstance(child, Tree): + childpos = child.treepositions(order) + positions.extend((i,) + p for p in childpos) + else: + positions.append((i,)) + if order in ("postorder", "bothorder"): + positions.append(()) + return positions + + def subtrees(self, filter=None): + """ + Generate all the subtrees of this tree, optionally restricted + to trees matching the filter function. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> for s in t.subtrees(lambda t: t.height() == 2): + ... print(s) + (D the) + (N dog) + (V chased) + (D the) + (N cat) + + :type filter: function + :param filter: the function to filter all local trees + """ + if not filter or filter(self): + yield self + for child in self: + if isinstance(child, Tree): + yield from child.subtrees(filter) + + def productions(self): + """ + Generate the productions that correspond to the non-terminal nodes of the tree. + For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the + form P -> C1 C2 ... Cn. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.productions() # doctest: +NORMALIZE_WHITESPACE + [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', + NP -> D N, D -> 'the', N -> 'cat'] + + :rtype: list(Production) + """ + + if not isinstance(self._label, str): + raise TypeError( + "Productions can only be generated from trees having node labels that are strings" + ) + + prods = [Production(Nonterminal(self._label), _child_names(self))] + for child in self: + if isinstance(child, Tree): + prods += child.productions() + return prods + + def pos(self): + """ + Return a sequence of pos-tagged words extracted from the tree. + + >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") + >>> t.pos() + [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')] + + :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). + The order reflects the order of the leaves in the tree's hierarchical structure. + :rtype: list(tuple) + """ + pos = [] + for child in self: + if isinstance(child, Tree): + pos.extend(child.pos()) + else: + pos.append((child, self._label)) + return pos + + def leaf_treeposition(self, index): + """ + :return: The tree position of the ``index``-th leaf in this + tree. I.e., if ``tp=self.leaf_treeposition(i)``, then + ``self[tp]==self.leaves()[i]``. + + :raise IndexError: If this tree contains fewer than ``index+1`` + leaves, or if ``index<0``. + """ + if index < 0: + raise IndexError("index must be non-negative") + + stack = [(self, ())] + while stack: + value, treepos = stack.pop() + if not isinstance(value, Tree): + if index == 0: + return treepos + else: + index -= 1 + else: + for i in range(len(value) - 1, -1, -1): + stack.append((value[i], treepos + (i,))) + + raise IndexError("index must be less than or equal to len(self)") + + def treeposition_spanning_leaves(self, start, end): + """ + :return: The tree position of the lowest descendant of this + tree that dominates ``self.leaves()[start:end]``. + :raise ValueError: if ``end <= start`` + """ + if end <= start: + raise ValueError("end must be greater than start") + # Find the tree positions of the start & end leaves, and + # take the longest common subsequence. 
+        start_treepos = self.leaf_treeposition(start)
+        end_treepos = self.leaf_treeposition(end - 1)
+        # Find the first index where they mismatch:
+        for i in range(len(start_treepos)):
+            if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
+                return start_treepos[:i]
+        return start_treepos
+
+    # ////////////////////////////////////////////////////////////
+    # Transforms
+    # ////////////////////////////////////////////////////////////
+
+    def chomsky_normal_form(
+        self,
+        factor="right",
+        horzMarkov=None,
+        vertMarkov=0,
+        childChar="|",
+        parentChar="^",
+    ):
+        """
+        This method can modify a tree in three ways:
+
+        1. Convert a tree into its Chomsky Normal Form (CNF)
+           equivalent -- Every subtree has either two non-terminals
+           or one terminal as its children.  This process requires
+           the creation of more "artificial" non-terminal nodes.
+        2. Markov (horizontal) smoothing of children in new artificial
+           nodes
+        3. Vertical (parent) annotation of nodes
+
+        :param factor: Right or left factoring method (default = "right")
+        :type factor: str = [left|right]
+        :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
+        :type horzMarkov: int | None
+        :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
+        :type vertMarkov: int | None
+        :param childChar: A string used in construction of the artificial nodes, separating the head of the
+            original subtree from the child nodes that have yet to be expanded (default = "|")
+        :type childChar: str
+        :param parentChar: A string used to separate the node representation from its vertical annotation
+        :type parentChar: str
+        """
+        from nltk.tree.transforms import chomsky_normal_form
+
+        chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
+
+    def un_chomsky_normal_form(
+        self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
+    ):
+        """
+        This method modifies the tree in three ways:
+
+        1. Transforms a tree in Chomsky Normal Form back to its
+           original structure (branching greater than two)
+        2. Removes any parent annotation (if it exists)
+        3. (optional) expands unary subtrees (if previously
+           collapsed with collapseUnary(...) )
+
+        :param expandUnary: Flag to expand unary or not (default = True)
+        :type expandUnary: bool
+        :param childChar: A string separating the head node from its children in an artificial node (default = "|")
+        :type childChar: str
+        :param parentChar: A string separating the node label from its parent annotation (default = "^")
+        :type parentChar: str
+        :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
+        :type unaryChar: str
+        """
+        from nltk.tree.transforms import un_chomsky_normal_form
+
+        un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
+
+    def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"):
+        """
+        Collapse subtrees with a single child (ie. unary productions)
+        into a new non-terminal (Tree node) joined by 'joinChar'.
+        This is useful when working with algorithms that do not allow
+        unary productions, and completely removing the unary productions
+        would require loss of useful information.  The Tree is modified
+        directly (since it is passed by reference) and no value is returned.
+
+        :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
+ Part-of-Speech tags) since they are always unary productions + :type collapsePOS: bool + :param collapseRoot: 'False' (default) will not modify the root production + if it is unary. For the Penn WSJ treebank corpus, this corresponds + to the TOP -> productions. + :type collapseRoot: bool + :param joinChar: A string used to connect collapsed node values (default = "+") + :type joinChar: str + """ + from nltk.tree.transforms import collapse_unary + + collapse_unary(self, collapsePOS, collapseRoot, joinChar) + + # //////////////////////////////////////////////////////////// + # Convert, copy + # //////////////////////////////////////////////////////////// + + @classmethod + def convert(cls, tree): + """ + Convert a tree between different subtypes of Tree. ``cls`` determines + which class will be used to encode the new tree. + + :type tree: Tree + :param tree: The tree that should be converted. + :return: The new Tree. + """ + if isinstance(tree, Tree): + children = [cls.convert(child) for child in tree] + return cls(tree._label, children) + else: + return tree + + def __copy__(self): + return self.copy() + + def __deepcopy__(self, memo): + return self.copy(deep=True) + + def copy(self, deep=False): + if not deep: + return type(self)(self._label, self) + else: + return type(self).convert(self) + + def _frozen_class(self): + from nltk.tree.immutable import ImmutableTree + + return ImmutableTree + + def freeze(self, leaf_freezer=None): + frozen_class = self._frozen_class() + if leaf_freezer is None: + newcopy = frozen_class.convert(self) + else: + newcopy = self.copy(deep=True) + for pos in newcopy.treepositions("leaves"): + newcopy[pos] = leaf_freezer(newcopy[pos]) + newcopy = frozen_class.convert(newcopy) + hash(newcopy) # Make sure the leaves are hashable. + return newcopy + + # //////////////////////////////////////////////////////////// + # Parsing + # //////////////////////////////////////////////////////////// + + @classmethod + def fromstring( + cls, + s, + brackets="()", + read_node=None, + read_leaf=None, + node_pattern=None, + leaf_pattern=None, + remove_empty_top_bracketing=False, + ): + """ + Read a bracketed tree string and return the resulting tree. + Trees are represented as nested brackettings, such as:: + + (S (NP (NNP John)) (VP (V runs))) + + :type s: str + :param s: The string to read + + :type brackets: str (length=2) + :param brackets: The bracket characters used to mark the + beginning and end of trees and subtrees. + + :type read_node: function + :type read_leaf: function + :param read_node, read_leaf: If specified, these functions + are applied to the substrings of ``s`` corresponding to + nodes and leaves (respectively) to obtain the values for + those nodes and leaves. They should have the following + signature: + + read_node(str) -> value + + For example, these functions could be used to process nodes + and leaves whose values should be some type other than + string (such as ``FeatStruct``). + Note that by default, node strings and leaf strings are + delimited by whitespace and brackets; to override this + default, use the ``node_pattern`` and ``leaf_pattern`` + arguments. + + :type node_pattern: str + :type leaf_pattern: str + :param node_pattern, leaf_pattern: Regular expression patterns + used to find node and leaf substrings in ``s``. By + default, both nodes patterns are defined to match any + sequence of non-whitespace non-bracket characters. 
+ + :type remove_empty_top_bracketing: bool + :param remove_empty_top_bracketing: If the resulting tree has + an empty node label, and is length one, then return its + single child instead. This is useful for treebank trees, + which sometimes contain an extra level of bracketing. + + :return: A tree corresponding to the string representation ``s``. + If this class method is called using a subclass of Tree, + then it will return a tree of that type. + :rtype: Tree + """ + if not isinstance(brackets, str) or len(brackets) != 2: + raise TypeError("brackets must be a length-2 string") + if re.search(r"\s", brackets): + raise TypeError("whitespace brackets not allowed") + # Construct a regexp that will tokenize the string. + open_b, close_b = brackets + open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) + if node_pattern is None: + node_pattern = rf"[^\s{open_pattern}{close_pattern}]+" + if leaf_pattern is None: + leaf_pattern = rf"[^\s{open_pattern}{close_pattern}]+" + token_re = re.compile( + r"%s\s*(%s)?|%s|(%s)" + % (open_pattern, node_pattern, close_pattern, leaf_pattern) + ) + # Walk through each token, updating a stack of trees. + stack = [(None, [])] # list of (node, children) tuples + for match in token_re.finditer(s): + token = match.group() + # Beginning of a tree/subtree + if token[0] == open_b: + if len(stack) == 1 and len(stack[0][1]) > 0: + cls._parse_error(s, match, "end-of-string") + label = token[1:].lstrip() + if read_node is not None: + label = read_node(label) + stack.append((label, [])) + # End of a tree/subtree + elif token == close_b: + if len(stack) == 1: + if len(stack[0][1]) == 0: + cls._parse_error(s, match, open_b) + else: + cls._parse_error(s, match, "end-of-string") + label, children = stack.pop() + stack[-1][1].append(cls(label, children)) + # Leaf node + else: + if len(stack) == 1: + cls._parse_error(s, match, open_b) + if read_leaf is not None: + token = read_leaf(token) + stack[-1][1].append(token) + + # check that we got exactly one complete tree. + if len(stack) > 1: + cls._parse_error(s, "end-of-string", close_b) + elif len(stack[0][1]) == 0: + cls._parse_error(s, "end-of-string", open_b) + else: + assert stack[0][0] is None + assert len(stack[0][1]) == 1 + tree = stack[0][1][0] + + # If the tree has an extra level with node='', then get rid of + # it. E.g.: "((S (NP ...) (VP ...)))" + if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1: + tree = tree[0] + # return the tree. + return tree + + @classmethod + def _parse_error(cls, s, match, expecting): + """ + Display a friendly error message when parsing a tree string fails. + :param s: The string we're parsing. + :param match: regexp match of the problem token. + :param expecting: what we expected to see instead. + """ + # Construct a basic error message + if match == "end-of-string": + pos, token = len(s), "end-of-string" + else: + pos, token = match.start(), match.group() + msg = "%s.read(): expected %r but got %r\n%sat index %d." % ( + cls.__name__, + expecting, + token, + " " * 12, + pos, + ) + # Add a display showing the error token itsels: + s = s.replace("\n", " ").replace("\t", " ") + offset = pos + if len(s) > pos + 10: + s = s[: pos + 10] + "..." + if pos > 10: + s = "..." 
+ s[pos - 10 :] + offset = 13 + msg += '\n{}"{}"\n{}^'.format(" " * 16, s, " " * (17 + offset)) + raise ValueError(msg) + + @classmethod + def fromlist(cls, l): + """ + :type l: list + :param l: a tree represented as nested lists + + :return: A tree corresponding to the list representation ``l``. + :rtype: Tree + + Convert nested lists to a NLTK Tree + """ + if type(l) == list and len(l) > 0: + label = repr(l[0]) + if len(l) > 1: + return Tree(label, [cls.fromlist(child) for child in l[1:]]) + else: + return label + + # //////////////////////////////////////////////////////////// + # Visualization & String Representation + # //////////////////////////////////////////////////////////// + + def draw(self): + """ + Open a new window containing a graphical diagram of this tree. + """ + from nltk.draw.tree import draw_trees + + draw_trees(self) + + def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs): + """ + Pretty-print this tree as ASCII or Unicode art. + For explanation of the arguments, see the documentation for + `nltk.tree.prettyprinter.TreePrettyPrinter`. + """ + from nltk.tree.prettyprinter import TreePrettyPrinter + + print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream) + + def __repr__(self): + childstr = ", ".join(repr(c) for c in self) + return "{}({}, [{}])".format( + type(self).__name__, + repr(self._label), + childstr, + ) + + def _repr_svg_(self): + from svgling import draw_tree + + return draw_tree(self)._repr_svg_() + + def __str__(self): + return self.pformat() + + def pprint(self, **kwargs): + """ + Print a string representation of this Tree to 'stream' + """ + + if "stream" in kwargs: + stream = kwargs["stream"] + del kwargs["stream"] + else: + stream = None + print(self.pformat(**kwargs), file=stream) + + def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False): + """ + :return: A pretty-printed string representation of this tree. + :rtype: str + :param margin: The right margin at which to do line-wrapping. + :type margin: int + :param indent: The indentation level at which printing + begins. This number is used to decide how far to indent + subsequent lines. + :type indent: int + :param nodesep: A string that is used to separate the node + from the children. E.g., the default value ``':'`` gives + trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. + """ + + # Try writing it on one line. + s = self._pformat_flat(nodesep, parens, quotes) + if len(s) + indent < margin: + return s + + # If it doesn't fit on one line, then write it on multi-lines. + if isinstance(self._label, str): + s = f"{parens[0]}{self._label}{nodesep}" + else: + s = f"{parens[0]}{repr(self._label)}{nodesep}" + for child in self: + if isinstance(child, Tree): + s += ( + "\n" + + " " * (indent + 2) + + child.pformat(margin, indent + 2, nodesep, parens, quotes) + ) + elif isinstance(child, tuple): + s += "\n" + " " * (indent + 2) + "/".join(child) + elif isinstance(child, str) and not quotes: + s += "\n" + " " * (indent + 2) + "%s" % child + else: + s += "\n" + " " * (indent + 2) + repr(child) + return s + parens[1] + + def pformat_latex_qtree(self): + r""" + Returns a representation of the tree compatible with the + LaTeX qtree package. This consists of the string ``\Tree`` + followed by the tree represented in bracketed notation. 
+ + For example, the following result was generated from a parse tree of + the sentence ``The announcement astounded us``:: + + \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] + [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] + + See https://www.ling.upenn.edu/advice/latex.html for the LaTeX + style file for the qtree package. + + :return: A latex qtree representation of this tree. + :rtype: str + """ + reserved_chars = re.compile(r"([#\$%&~_\{\}])") + + pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]")) + return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat) + + def _pformat_flat(self, nodesep, parens, quotes): + childstrs = [] + for child in self: + if isinstance(child, Tree): + childstrs.append(child._pformat_flat(nodesep, parens, quotes)) + elif isinstance(child, tuple): + childstrs.append("/".join(child)) + elif isinstance(child, str) and not quotes: + childstrs.append("%s" % child) + else: + childstrs.append(repr(child)) + if isinstance(self._label, str): + return "{}{}{} {}{}".format( + parens[0], + self._label, + nodesep, + " ".join(childstrs), + parens[1], + ) + else: + return "{}{}{} {}{}".format( + parens[0], + repr(self._label), + nodesep, + " ".join(childstrs), + parens[1], + ) + + +def _child_names(tree): + names = [] + for child in tree: + if isinstance(child, Tree): + names.append(Nonterminal(child._label)) + else: + names.append(child) + return names + + +###################################################################### +## Demonstration +###################################################################### + + +def demo(): + """ + A demonstration showing how Trees and Trees can be + used. This demonstration creates a Tree, and loads a + Tree from the Treebank corpus, + and shows the results of calling several of their methods. + """ + + from nltk import ProbabilisticTree, Tree + + # Demonstrate tree parsing. + s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))" + t = Tree.fromstring(s) + print("Convert bracketed string into tree:") + print(t) + print(t.__repr__()) + + print("Display tree properties:") + print(t.label()) # tree's constituent type + print(t[0]) # tree's first child + print(t[1]) # tree's second child + print(t.height()) + print(t.leaves()) + print(t[1]) + print(t[1, 1]) + print(t[1, 1, 0]) + + # Demonstrate tree modification. + the_cat = t[0] + the_cat.insert(1, Tree.fromstring("(JJ big)")) + print("Tree modification:") + print(t) + t[1, 1, 1] = Tree.fromstring("(NN cake)") + print(t) + print() + + # Tree transforms + print("Collapse unary:") + t.collapse_unary() + print(t) + print("Chomsky normal form:") + t.chomsky_normal_form() + print(t) + print() + + # Demonstrate probabilistic trees. + pt = ProbabilisticTree("x", ["y", "z"], prob=0.5) + print("Probabilistic Tree:") + print(pt) + print() + + # Demonstrate parsing of treebank output format. 
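+    # (pformat() emits the same bracketed notation that fromstring() parses,
+    # so this round-trips the tree through its string representation.)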
+ t = Tree.fromstring(t.pformat()) + print("Convert tree to bracketed string and back again:") + print(t) + print() + + # Demonstrate LaTeX output + print("LaTeX output:") + print(t.pformat_latex_qtree()) + print() + + # Demonstrate Productions + print("Production output:") + print(t.productions()) + print() + + # Demonstrate tree nodes containing objects other than strings + t.set_label(("test", 3)) + print(t) + + +__all__ = [ + "Tree", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4879c1f66c62f1e74d91673869acea765e3d54fc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/__init__.py @@ -0,0 +1,35 @@ +# Natural Language Toolkit: Twitter +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +NLTK Twitter Package + +This package contains classes for retrieving Tweet documents using the +Twitter API. + +""" +try: + import twython +except ImportError: + import warnings + + warnings.warn( + "The twython library has not been installed. " + "Some functionality from the twitter package will not be available." + ) +else: + from nltk.twitter.util import Authenticate, credsfromfile + from nltk.twitter.twitterclient import ( + Streamer, + Query, + Twitter, + TweetViewer, + TweetWriter, + ) + + +from nltk.twitter.common import json2csv diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/api.py new file mode 100644 index 0000000000000000000000000000000000000000..e3dfbb0c4ddde5eb0de300587cada61b6714176b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/api.py @@ -0,0 +1,145 @@ +# Natural Language Toolkit: Twitter API +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + +""" +This module provides an interface for TweetHandlers, and support for timezone +handling. +""" + +import time as _time +from abc import ABCMeta, abstractmethod +from datetime import datetime, timedelta, timezone, tzinfo + + +class LocalTimezoneOffsetWithUTC(tzinfo): + """ + This is not intended to be a general purpose class for dealing with the + local timezone. In particular: + + * it assumes that the date passed has been created using + `datetime(..., tzinfo=Local)`, where `Local` is an instance of + the object `LocalTimezoneOffsetWithUTC`; + * for such an object, it returns the offset with UTC, used for date comparisons. + + Reference: https://docs.python.org/3/library/datetime.html + """ + + STDOFFSET = timedelta(seconds=-_time.timezone) + + if _time.daylight: + DSTOFFSET = timedelta(seconds=-_time.altzone) + else: + DSTOFFSET = STDOFFSET + + def utcoffset(self, dt): + """ + Access the relevant time offset. + """ + return self.DSTOFFSET + + +LOCAL = LocalTimezoneOffsetWithUTC() + + +class BasicTweetHandler(metaclass=ABCMeta): + """ + Minimal implementation of `TweetHandler`. + + Counts the number of Tweets and decides when the client should stop + fetching them. + """ + + def __init__(self, limit=20): + self.limit = limit + self.counter = 0 + + """ + A flag to indicate to the client whether to stop fetching data given + some condition (e.g., reaching a date limit). + """ + self.do_stop = False + + """ + Stores the id of the last fetched Tweet to handle pagination. 
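+        (Twitter's REST API pages through older results via a ``max_id``
+        parameter; clients read the value stored here to request the next,
+        older page.)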
+ """ + self.max_id = None + + def do_continue(self): + """ + Returns `False` if the client should stop fetching Tweets. + """ + return self.counter < self.limit and not self.do_stop + + +class TweetHandlerI(BasicTweetHandler): + """ + Interface class whose subclasses should implement a handle method that + Twitter clients can delegate to. + """ + + def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None): + """ + :param int limit: The number of data items to process in the current\ + round of processing. + + :param tuple upper_date_limit: The date at which to stop collecting\ + new data. This should be entered as a tuple which can serve as the\ + argument to `datetime.datetime`.\ + E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. + + :param tuple lower_date_limit: The date at which to stop collecting\ + new data. See `upper_data_limit` for formatting. + """ + BasicTweetHandler.__init__(self, limit) + + self.upper_date_limit = None + self.lower_date_limit = None + if upper_date_limit: + self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL) + if lower_date_limit: + self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL) + + self.startingup = True + + @abstractmethod + def handle(self, data): + """ + Deal appropriately with data returned by the Twitter API + """ + + @abstractmethod + def on_finish(self): + """ + Actions when the tweet limit has been reached + """ + + def check_date_limit(self, data, verbose=False): + """ + Validate date limits. + """ + if self.upper_date_limit or self.lower_date_limit: + date_fmt = "%a %b %d %H:%M:%S +0000 %Y" + tweet_date = datetime.strptime(data["created_at"], date_fmt).replace( + tzinfo=timezone.utc + ) + if (self.upper_date_limit and tweet_date > self.upper_date_limit) or ( + self.lower_date_limit and tweet_date < self.lower_date_limit + ): + if self.upper_date_limit: + message = "earlier" + date_limit = self.upper_date_limit + else: + message = "later" + date_limit = self.lower_date_limit + if verbose: + print( + "Date limit {} is {} than date of current tweet {}".format( + date_limit, message, tweet_date + ) + ) + self.do_stop = True diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/common.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/common.py new file mode 100644 index 0000000000000000000000000000000000000000..052f83eedbafdafeaa7d64860d32a97ff8b4e3b3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/common.py @@ -0,0 +1,270 @@ +# Natural Language Toolkit: Twitter client +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + +""" +Utility functions for the `twitterclient` module which do not require +the `twython` library to have been installed. +""" +import csv +import gzip +import json + +from nltk.internals import deprecated + +HIER_SEPARATOR = "." + + +def extract_fields(tweet, fields): + """ + Extract field values from a full tweet and return them as a list + + :param json tweet: The tweet in JSON format + :param list fields: The fields to be extracted from the tweet + :rtype: list(str) + """ + out = [] + for field in fields: + try: + _add_field_to_out(tweet, field, out) + except TypeError as e: + raise RuntimeError( + "Fatal error when extracting fields. 
Cannot find field ", field + ) from e + return out + + +def _add_field_to_out(json, field, out): + if _is_composed_key(field): + key, value = _get_key_value_composed(field) + _add_field_to_out(json[key], value, out) + else: + out += [json[field]] + + +def _is_composed_key(field): + return HIER_SEPARATOR in field + + +def _get_key_value_composed(field): + out = field.split(HIER_SEPARATOR) + # there could be up to 3 levels + key = out[0] + value = HIER_SEPARATOR.join(out[1:]) + return key, value + + +def _get_entity_recursive(json, entity): + if not json: + return None + elif isinstance(json, dict): + for key, value in json.items(): + if key == entity: + return value + # 'entities' and 'extended_entities' are wrappers in Twitter json + # structure that contain other Twitter objects. See: + # https://dev.twitter.com/overview/api/entities-in-twitter-objects + + if key == "entities" or key == "extended_entities": + candidate = _get_entity_recursive(value, entity) + if candidate is not None: + return candidate + return None + elif isinstance(json, list): + for item in json: + candidate = _get_entity_recursive(item, entity) + if candidate is not None: + return candidate + return None + else: + return None + + +def json2csv( + fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False +): + """ + Extract selected fields from a file of line-separated JSON tweets and + write to a file in CSV format. + + This utility function allows a file of full tweets to be easily converted + to a CSV file for easier processing. For example, just TweetIDs or + just the text content of the Tweets can be extracted. + + Additionally, the function allows combinations of fields of other Twitter + objects (mainly the users, see below). + + For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see + `json2csv_entities` + + :param str infile: The name of the file containing full tweets + + :param str outfile: The name of the text file where results should be\ + written + + :param list fields: The list of fields to be extracted. Useful examples\ + are 'id_str' for the tweetID and 'text' for the text of the tweet. See\ + for a full list of fields.\ + e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\ + Additionally, it allows IDs from other Twitter objects, e. 
g.,\ + ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count'] + + :param error: Behaviour for encoding errors, see\ + https://docs.python.org/3/library/codecs.html#codec-base-classes + + :param gzip_compress: if `True`, output files are compressed with gzip + """ + (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) + # write the list of fields as header + writer.writerow(fields) + # process the file + for line in fp: + tweet = json.loads(line) + row = extract_fields(tweet, fields) + writer.writerow(row) + outf.close() + + +@deprecated("Use open() and csv.writer() directly instead.") +def outf_writer_compat(outfile, encoding, errors, gzip_compress=False): + """Get a CSV writer with optional compression.""" + return _outf_writer(outfile, encoding, errors, gzip_compress) + + +def _outf_writer(outfile, encoding, errors, gzip_compress=False): + if gzip_compress: + outf = gzip.open(outfile, "wt", newline="", encoding=encoding, errors=errors) + else: + outf = open(outfile, "w", newline="", encoding=encoding, errors=errors) + writer = csv.writer(outf) + return (writer, outf) + + +def json2csv_entities( + tweets_file, + outfile, + main_fields, + entity_type, + entity_fields, + encoding="utf8", + errors="replace", + gzip_compress=False, +): + """ + Extract selected fields from a file of line-separated JSON tweets and + write to a file in CSV format. + + This utility function allows a file of full Tweets to be easily converted + to a CSV file for easier processing of Twitter entities. For example, the + hashtags or media elements of a tweet can be extracted. + + It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags + there will be two lines in the output file, one per hashtag + + :param tweets_file: the file-like object containing full Tweets + + :param str outfile: The path of the text file where results should be\ + written + + :param list main_fields: The list of fields to be extracted from the main\ + object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ + for a full list of fields. + e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] + If `entity_type` is expressed with hierarchy, then it is the list of\ + fields of the object that corresponds to the key of the entity_type,\ + (e.g., for entity_type='user.urls', the fields in the main_fields list\ + belong to the user object; for entity_type='place.bounding_box', the\ + files in the main_field list belong to the place object of the tweet). + + :param list entity_type: The name of the entity: 'hashtags', 'media',\ + 'urls' and 'user_mentions' for the tweet object. For a user object,\ + this needs to be expressed with a hierarchy: `'user.urls'`. For the\ + bounding box of the Tweet location, use `'place.bounding_box'`. + + :param list entity_fields: The list of fields to be extracted from the\ + entity. E.g. 
`['text']` (of the Tweet) + + :param error: Behaviour for encoding errors, see\ + https://docs.python.org/3/library/codecs.html#codec-base-classes + + :param gzip_compress: if `True`, output files are compressed with gzip + """ + + (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) + header = get_header_field_list(main_fields, entity_type, entity_fields) + writer.writerow(header) + for line in tweets_file: + tweet = json.loads(line) + if _is_composed_key(entity_type): + key, value = _get_key_value_composed(entity_type) + object_json = _get_entity_recursive(tweet, key) + if not object_json: + # this can happen in the case of "place" + continue + object_fields = extract_fields(object_json, main_fields) + items = _get_entity_recursive(object_json, value) + _write_to_file(object_fields, items, entity_fields, writer) + else: + tweet_fields = extract_fields(tweet, main_fields) + items = _get_entity_recursive(tweet, entity_type) + _write_to_file(tweet_fields, items, entity_fields, writer) + outf.close() + + +def get_header_field_list(main_fields, entity_type, entity_fields): + if _is_composed_key(entity_type): + key, value = _get_key_value_composed(entity_type) + main_entity = key + sub_entity = value + else: + main_entity = None + sub_entity = entity_type + + if main_entity: + output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields] + else: + output1 = main_fields + output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields] + return output1 + output2 + + +def _write_to_file(object_fields, items, entity_fields, writer): + if not items: + # it could be that the entity is just not present for the tweet + # e.g. tweet hashtag is always present, even as [], however + # tweet media may not be present + return + if isinstance(items, dict): + # this happens e.g. for "place" of a tweet + row = object_fields + # there might be composed keys in de list of required fields + entity_field_values = [x for x in entity_fields if not _is_composed_key(x)] + entity_field_composed = [x for x in entity_fields if _is_composed_key(x)] + for field in entity_field_values: + value = items[field] + if isinstance(value, list): + row += value + else: + row += [value] + # now check required dictionaries + for d in entity_field_composed: + kd, vd = _get_key_value_composed(d) + json_dict = items[kd] + if not isinstance(json_dict, dict): + raise RuntimeError( + """Key {} does not contain a dictionary + in the json file""".format( + kd + ) + ) + row += [json_dict[vd]] + writer.writerow(row) + return + # in general it is a list + for item in items: + row = object_fields + extract_fields(item, entity_fields) + writer.writerow(row) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitter_demo.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitter_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..372d98da339231560d9af4d8e5b256e147380d12 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitter_demo.py @@ -0,0 +1,306 @@ +# Natural Language Toolkit: Twitter client +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + +""" +Examples to demo the :py:mod:`twitterclient` code. + +These demo functions should all run, with the following caveats: + +* You must have obtained API keys from Twitter, and installed them according to + the instructions in the `twitter HOWTO `_. + +* If you are on a slow network, some of the calls to the Twitter API may + timeout. 
+ +* If you are being rate limited while searching, you will receive a 420 + error response. + +* Your terminal window / console must be able to display UTF-8 encoded characters. + +For documentation about the Twitter APIs, see `The Streaming APIs Overview +`_ and `The REST APIs Overview +`_. + +For error codes see Twitter's +`Error Codes and Responses ` +""" + +import datetime +import json +from functools import wraps +from io import StringIO + +from nltk.twitter import ( + Query, + Streamer, + TweetViewer, + TweetWriter, + Twitter, + credsfromfile, +) + +SPACER = "###################################" + + +def verbose(func): + """Decorator for demo functions""" + + @wraps(func) + def with_formatting(*args, **kwargs): + print() + print(SPACER) + print("Using %s" % (func.__name__)) + print(SPACER) + return func(*args, **kwargs) + + return with_formatting + + +def yesterday(): + """ + Get yesterday's datetime as a 5-tuple. + """ + date = datetime.datetime.now() + date -= datetime.timedelta(days=1) + date_tuple = date.timetuple()[:6] + return date_tuple + + +def setup(): + """ + Initialize global variables for the demos. + """ + global USERIDS, FIELDS + + USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"] + # UserIDs corresponding to\ + # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive + FIELDS = ["id_str"] + + +@verbose +def twitterclass_demo(): + """ + Use the simplified :class:`Twitter` class to write some tweets to a file. + """ + tw = Twitter() + print("Track from the public stream\n") + tw.tweets(keywords="love, hate", limit=10) # public stream + print(SPACER) + print("Search past Tweets\n") + tw = Twitter() + tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets + print(SPACER) + print( + "Follow two accounts in the public stream" + + " -- be prepared to wait a few minutes\n" + ) + tw = Twitter() + tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream + + +@verbose +def sampletoscreen_demo(limit=20): + """ + Sample from the Streaming API and send output to terminal. + """ + oauth = credsfromfile() + client = Streamer(**oauth) + client.register(TweetViewer(limit=limit)) + client.sample() + + +@verbose +def tracktoscreen_demo(track="taylor swift", limit=10): + """ + Track keywords from the public Streaming API and send output to terminal. + """ + oauth = credsfromfile() + client = Streamer(**oauth) + client.register(TweetViewer(limit=limit)) + client.filter(track=track) + + +@verbose +def search_demo(keywords="nltk"): + """ + Use the REST API to search for past tweets containing a given keyword. + """ + oauth = credsfromfile() + client = Query(**oauth) + for tweet in client.search_tweets(keywords=keywords, limit=10): + print(tweet["text"]) + + +@verbose +def tweets_by_user_demo(user="NLTK_org", count=200): + """ + Use the REST API to search for past tweets by a given user. + """ + oauth = credsfromfile() + client = Query(**oauth) + client.register(TweetWriter()) + client.user_tweets(user, count) + + +@verbose +def lookup_by_userid_demo(): + """ + Use the REST API to convert a userID to a screen name. 
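+
+    A minimal sketch of what this demo wraps (the userID below is
+    illustrative)::
+
+        client = Query(**credsfromfile())
+        info = client.user_info_from_id(["759251"])[0]
+        print(info["screen_name"])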
+ """ + oauth = credsfromfile() + client = Query(**oauth) + user_info = client.user_info_from_id(USERIDS) + for info in user_info: + name = info["screen_name"] + followers = info["followers_count"] + following = info["friends_count"] + print(f"{name}, followers: {followers}, following: {following}") + + +@verbose +def followtoscreen_demo(limit=10): + """ + Using the Streaming API, select just the tweets from a specified list of + userIDs. + + This is will only give results in a reasonable time if the users in + question produce a high volume of tweets, and may even so show some delay. + """ + oauth = credsfromfile() + client = Streamer(**oauth) + client.register(TweetViewer(limit=limit)) + client.statuses.filter(follow=USERIDS) + + +@verbose +def streamtofile_demo(limit=20): + """ + Write 20 tweets sampled from the public Streaming API to a file. + """ + oauth = credsfromfile() + client = Streamer(**oauth) + client.register(TweetWriter(limit=limit, repeat=False)) + client.statuses.sample() + + +@verbose +def limit_by_time_demo(keywords="nltk"): + """ + Query the REST API for Tweets about NLTK since yesterday and send + the output to terminal. + + This example makes the assumption that there are sufficient Tweets since + yesterday for the date to be an effective cut-off. + """ + date = yesterday() + dt_date = datetime.datetime(*date) + oauth = credsfromfile() + client = Query(**oauth) + client.register(TweetViewer(limit=100, lower_date_limit=date)) + + print(f"Cutoff date: {dt_date}\n") + + for tweet in client.search_tweets(keywords=keywords): + print("{} ".format(tweet["created_at"]), end="") + client.handler.handle(tweet) + + +@verbose +def corpusreader_demo(): + """ + Use `TwitterCorpusReader` tp read a file of tweets, and print out + + * some full tweets in JSON format; + * some raw strings from the tweets (i.e., the value of the `text` field); and + * the result of tokenising the raw strings. + + """ + from nltk.corpus import twitter_samples as tweets + + print() + print("Complete tweet documents") + print(SPACER) + for tweet in tweets.docs("tweets.20150430-223406.json")[:1]: + print(json.dumps(tweet, indent=1, sort_keys=True)) + + print() + print("Raw tweet strings:") + print(SPACER) + for text in tweets.strings("tweets.20150430-223406.json")[:15]: + print(text) + + print() + print("Tokenized tweet strings:") + print(SPACER) + for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]: + print(toks) + + +@verbose +def expand_tweetids_demo(): + """ + Given a file object containing a list of Tweet IDs, fetch the + corresponding full Tweets, if available. + + """ + ids_f = StringIO( + """\ + 588665495492124672 + 588665495487909888 + 588665495508766721 + 588665495513006080 + 588665495517200384 + 588665495487811584 + 588665495525588992 + 588665495487844352 + 588665495492014081 + 588665495512948737""" + ) + oauth = credsfromfile() + client = Query(**oauth) + hydrated = client.expand_tweetids(ids_f) + + for tweet in hydrated: + id_str = tweet["id_str"] + print(f"id: {id_str}") + text = tweet["text"] + if text.startswith("@null"): + text = "[Tweet not available]" + print(text + "\n") + + +ALL = [ + twitterclass_demo, + sampletoscreen_demo, + tracktoscreen_demo, + search_demo, + tweets_by_user_demo, + lookup_by_userid_demo, + followtoscreen_demo, + streamtofile_demo, + limit_by_time_demo, + corpusreader_demo, + expand_tweetids_demo, +] + +""" +Select demo functions to run. E.g. replace the following line with "DEMOS = +ALL[8:]" to execute only the final three demos. 
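+For instance, "DEMOS = [search_demo]" would run only the keyword search demo.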
+""" +DEMOS = ALL[:] + +if __name__ == "__main__": + setup() + + for demo in DEMOS: + demo() + + print("\n" + SPACER) + print("All demos completed") + print(SPACER) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitterclient.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitterclient.py new file mode 100644 index 0000000000000000000000000000000000000000..04538c9a43694c29cc1197f802fed75fab75f154 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/twitterclient.py @@ -0,0 +1,564 @@ +# Natural Language Toolkit: Twitter client +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + + +""" +NLTK Twitter client + +This module offers methods for collecting and processing Tweets. Most of the +functionality depends on access to the Twitter APIs, and this is handled via +the third party Twython library. + +If one of the methods below returns an integer, it is probably a `Twitter +error code `_. For +example, the response of '420' means that you have reached the limit of the +requests you can currently make to the Twitter API. Currently, `rate limits +for the search API `_ are +divided into 15 minute windows. +""" + +import datetime +import gzip +import itertools +import json +import os +import time + +import requests +from twython import Twython, TwythonStreamer +from twython.exceptions import TwythonError, TwythonRateLimitError + +from nltk.twitter.api import BasicTweetHandler, TweetHandlerI +from nltk.twitter.util import credsfromfile, guess_path + + +class Streamer(TwythonStreamer): + """ + Retrieve data from the Twitter Streaming API. + + The streaming API requires + `OAuth 1.0 `_ authentication. + """ + + def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): + + self.handler = None + self.do_continue = True + TwythonStreamer.__init__( + self, app_key, app_secret, oauth_token, oauth_token_secret + ) + + def register(self, handler): + """ + Register a method for handling Tweets. + + :param TweetHandlerI handler: method for viewing + """ + self.handler = handler + + def on_success(self, data): + """ + :param data: response from Twitter API + """ + if self.do_continue: + if self.handler is not None: + if "text" in data: + self.handler.counter += 1 + self.handler.handle(data) + self.do_continue = self.handler.do_continue() + else: + raise ValueError("No data handler has been registered.") + else: + self.disconnect() + self.handler.on_finish() + + def on_error(self, status_code, data): + """ + :param status_code: The status code returned by the Twitter API + :param data: The response from Twitter API + + """ + print(status_code) + + def sample(self): + """ + Wrapper for 'statuses / sample' API call + """ + while self.do_continue: + + # Stream in an endless loop until limit is reached. 
See twython + # issue 288: https://github.com/ryanmcgrath/twython/issues/288 + # colditzjb commented on 9 Dec 2014 + + try: + self.statuses.sample() + except requests.exceptions.ChunkedEncodingError as e: + if e is not None: + print(f"Error (stream will continue): {e}") + continue + + def filter(self, track="", follow="", lang="en"): + """ + Wrapper for 'statuses / filter' API call + """ + while self.do_continue: + # Stream in an endless loop until limit is reached + + try: + if track == "" and follow == "": + msg = "Please supply a value for 'track', 'follow'" + raise ValueError(msg) + self.statuses.filter(track=track, follow=follow, lang=lang) + except requests.exceptions.ChunkedEncodingError as e: + if e is not None: + print(f"Error (stream will continue): {e}") + continue + + +class Query(Twython): + """ + Retrieve data from the Twitter REST API. + """ + + def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): + """ + :param app_key: (optional) Your applications key + :param app_secret: (optional) Your applications secret key + :param oauth_token: (optional) When using **OAuth 1**, combined with + oauth_token_secret to make authenticated calls + :param oauth_token_secret: (optional) When using **OAuth 1** combined + with oauth_token to make authenticated calls + """ + self.handler = None + self.do_continue = True + Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) + + def register(self, handler): + """ + Register a method for handling Tweets. + + :param TweetHandlerI handler: method for viewing or writing Tweets to a file. + """ + self.handler = handler + + def expand_tweetids(self, ids_f, verbose=True): + """ + Given a file object containing a list of Tweet IDs, fetch the + corresponding full Tweets from the Twitter API. + + The API call `statuses/lookup` will fail to retrieve a Tweet if the + user has deleted it. + + This call to the Twitter API is rate-limited. See + for details. + + :param ids_f: input file object consisting of Tweet IDs, one to a line + :return: iterable of Tweet objects in JSON format + """ + ids = [line.strip() for line in ids_f if line] + + if verbose: + print(f"Counted {len(ids)} Tweet IDs in {ids_f}.") + + # The Twitter endpoint takes lists of up to 100 ids, so we chunk the + # ids. + id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)] + + chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks) + + return itertools.chain.from_iterable(chunked_tweets) + + def _search_tweets(self, keywords, limit=100, lang="en"): + """ + Assumes that the handler has been informed. Fetches Tweets from + search_tweets generator output and passses them to handler + + :param str keywords: A list of query terms to search for, written as\ + a comma-separated string. + :param int limit: Number of Tweets to process + :param str lang: language + """ + while True: + tweets = self.search_tweets( + keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id + ) + for tweet in tweets: + self.handler.handle(tweet) + if not (self.handler.do_continue() and self.handler.repeat): + break + self.handler.on_finish() + + def search_tweets( + self, + keywords, + limit=100, + lang="en", + max_id=None, + retries_after_twython_exception=0, + ): + """ + Call the REST API ``'search/tweets'`` endpoint with some plausible + defaults. See `the Twitter search documentation + `_ for more information + about admissible search parameters. 
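+
+        A minimal sketch of a direct call, mirroring ``search_demo`` in
+        ``twitter_demo.py`` (the keyword is illustrative)::
+
+            client = Query(**credsfromfile())
+            for tweet in client.search_tweets(keywords="nltk", limit=10):
+                print(tweet["text"])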
+ + :param str keywords: A list of query terms to search for, written as\ + a comma-separated string + :param int limit: Number of Tweets to process + :param str lang: language + :param int max_id: id of the last tweet fetched + :param int retries_after_twython_exception: number of retries when\ + searching Tweets before raising an exception + :rtype: python generator + """ + if not self.handler: + # if no handler is provided, `BasicTweetHandler` provides minimum + # functionality for limiting the number of Tweets retrieved + self.handler = BasicTweetHandler(limit=limit) + + count_from_query = 0 + if max_id: + self.handler.max_id = max_id + else: + results = self.search( + q=keywords, count=min(100, limit), lang=lang, result_type="recent" + ) + count = len(results["statuses"]) + if count == 0: + print("No Tweets available through REST API for those keywords") + return + count_from_query = count + self.handler.max_id = results["statuses"][count - 1]["id"] - 1 + + for result in results["statuses"]: + yield result + self.handler.counter += 1 + if self.handler.do_continue() == False: + return + + # Pagination loop: keep fetching Tweets until the desired count is + # reached while dealing with Twitter rate limits. + retries = 0 + while count_from_query < limit: + try: + mcount = min(100, limit - count_from_query) + results = self.search( + q=keywords, + count=mcount, + lang=lang, + max_id=self.handler.max_id, + result_type="recent", + ) + except TwythonRateLimitError as e: + print(f"Waiting for 15 minutes -{e}") + time.sleep(15 * 60) # wait 15 minutes + continue + except TwythonError as e: + print(f"Fatal error in Twython request -{e}") + if retries_after_twython_exception == retries: + raise e + retries += 1 + + count = len(results["statuses"]) + if count == 0: + print("No more Tweets available through rest api") + return + count_from_query += count + # the max_id is also present in the Tweet metadata + # results['search_metadata']['next_results'], but as part of a + # query and difficult to fetch. This is doing the equivalent + # (last tweet id minus one) + self.handler.max_id = results["statuses"][count - 1]["id"] - 1 + + for result in results["statuses"]: + yield result + self.handler.counter += 1 + if self.handler.do_continue() == False: + return + + def user_info_from_id(self, userids): + """ + Convert a list of userIDs into a variety of information about the users. + + See . + + :param list userids: A list of integer strings corresponding to Twitter userIDs + :rtype: list(json) + """ + return [self.show_user(user_id=userid) for userid in userids] + + def user_tweets(self, screen_name, limit, include_rts="false"): + """ + Return a collection of the most recent Tweets posted by the user + + :param str user: The user's screen name; the initial '@' symbol\ + should be omitted + :param int limit: The number of Tweets to recover; 200 is the maximum allowed + :param str include_rts: Whether to include statuses which have been\ + retweeted by the user; possible values are 'true' and 'false' + """ + data = self.get_user_timeline( + screen_name=screen_name, count=limit, include_rts=include_rts + ) + for item in data: + self.handler.handle(item) + + +class Twitter: + """ + Wrapper class with restricted functionality and fewer options. 
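+
+    A minimal usage sketch (assumes valid credentials are installed as
+    described in the twitter HOWTO)::
+
+        tw = Twitter()
+        # write 10 streamed Tweets about 'nltk' to a timestamped file
+        tw.tweets(keywords="nltk", to_screen=False, limit=10)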
+    """
+
+    def __init__(self):
+        self._oauth = credsfromfile()
+        self.streamer = Streamer(**self._oauth)
+        self.query = Query(**self._oauth)
+
+    def tweets(
+        self,
+        keywords="",
+        follow="",
+        to_screen=True,
+        stream=True,
+        limit=100,
+        date_limit=None,
+        lang="en",
+        repeat=False,
+        gzip_compress=False,
+    ):
+        """
+        Process some Tweets in a simple manner.
+
+        :param str keywords: Keywords to use for searching or filtering
+        :param list follow: UserIDs to use for filtering Tweets from the public stream
+        :param bool to_screen: If `True`, display the tweet texts on the screen,\
+            otherwise print to a file
+
+        :param bool stream: If `True`, use the live public stream,\
+            otherwise search past public Tweets
+
+        :param int limit: The number of data items to process in the current\
+            round of processing.
+
+        :param tuple date_limit: The date at which to stop collecting\
+            new data. This should be entered as a tuple which can serve as the\
+            argument to `datetime.datetime`.\
+            E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
+            Note that, in the case of streaming, this is the maximum date, i.e.\
+            a date in the future; if not, it is the minimum date, i.e. a date\
+            in the past.
+
+        :param str lang: language
+
+        :param bool repeat: A flag to determine whether multiple files should\
+            be written. If `True`, the length of each file will be set by the\
+            value of `limit`. Use only if `to_screen` is `False`. See also
+            :py:func:`handle`.
+
+        :param gzip_compress: if `True`, output files are compressed with gzip.
+        """
+        if stream:
+            upper_date_limit = date_limit
+            lower_date_limit = None
+        else:
+            upper_date_limit = None
+            lower_date_limit = date_limit
+
+        if to_screen:
+            handler = TweetViewer(
+                limit=limit,
+                upper_date_limit=upper_date_limit,
+                lower_date_limit=lower_date_limit,
+            )
+        else:
+            handler = TweetWriter(
+                limit=limit,
+                upper_date_limit=upper_date_limit,
+                lower_date_limit=lower_date_limit,
+                repeat=repeat,
+                gzip_compress=gzip_compress,
+            )
+
+        if stream:
+            self.streamer.register(handler)
+            if keywords == "" and follow == "":
+                self.streamer.sample()
+            else:
+                self.streamer.filter(track=keywords, follow=follow, lang=lang)
+        else:
+            self.query.register(handler)
+            if keywords == "":
+                raise ValueError("Please supply at least one keyword to search for.")
+            else:
+                self.query._search_tweets(keywords, limit=limit, lang=lang)
+
+
+class TweetViewer(TweetHandlerI):
+    """
+    Handle data by sending it to the terminal.
+    """
+
+    def handle(self, data):
+        """
+        Direct data to `sys.stdout`
+
+        :param data: Tweet object returned by Twitter API
+        """
+        text = data["text"]
+        print(text)
+
+        self.check_date_limit(data)
+        if self.do_stop:
+            return
+
+    def on_finish(self):
+        print(f"Written {self.counter} Tweets")
+
+
+class TweetWriter(TweetHandlerI):
+    """
+    Handle data by writing it to a file.
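+
+    A sketch of standalone use (parameter values are illustrative)::
+
+        writer = TweetWriter(limit=100, repeat=True, gzip_compress=True)
+        # each call to writer.handle(tweet) appends one JSON line; once
+        # 100 Tweets are written, repeat=True starts a fresh timestamped file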
+ """ + + def __init__( + self, + limit=2000, + upper_date_limit=None, + lower_date_limit=None, + fprefix="tweets", + subdir="twitter-files", + repeat=False, + gzip_compress=False, + ): + """ + The difference between the upper and lower date limits depends on + whether Tweets are coming in an ascending date order (i.e. when + streaming) or descending date order (i.e. when searching past Tweets). + + :param int limit: number of data items to process in the current\ + round of processing. + + :param tuple upper_date_limit: The date at which to stop collecting new\ + data. This should be entered as a tuple which can serve as the\ + argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\ + 40)` for 12:30 pm on April 1 2015. + + :param tuple lower_date_limit: The date at which to stop collecting new\ + data. See `upper_data_limit` for formatting. + + :param str fprefix: The prefix to use in creating file names for Tweet\ + collections. + + :param str subdir: The name of the directory where Tweet collection\ + files should be stored. + + :param bool repeat: flag to determine whether multiple files should be\ + written. If `True`, the length of each file will be set by the value\ + of `limit`. See also :py:func:`handle`. + + :param gzip_compress: if `True`, output files are compressed with gzip. + """ + self.fprefix = fprefix + self.subdir = guess_path(subdir) + self.gzip_compress = gzip_compress + self.fname = self.timestamped_file() + self.repeat = repeat + self.output = None + TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit) + + def timestamped_file(self): + """ + :return: timestamped file name + :rtype: str + """ + subdir = self.subdir + fprefix = self.fprefix + if subdir: + if not os.path.exists(subdir): + os.mkdir(subdir) + + fname = os.path.join(subdir, fprefix) + fmt = "%Y%m%d-%H%M%S" + timestamp = datetime.datetime.now().strftime(fmt) + if self.gzip_compress: + suffix = ".gz" + else: + suffix = "" + outfile = f"{fname}.{timestamp}.json{suffix}" + return outfile + + def handle(self, data): + """ + Write Twitter data as line-delimited JSON into one or more files. + + :return: return `False` if processing should cease, otherwise return `True`. + :param data: tweet object returned by Twitter API + """ + if self.startingup: + if self.gzip_compress: + self.output = gzip.open(self.fname, "w") + else: + self.output = open(self.fname, "w") + print(f"Writing to {self.fname}") + + json_data = json.dumps(data) + if self.gzip_compress: + self.output.write((json_data + "\n").encode("utf-8")) + else: + self.output.write(json_data + "\n") + + self.check_date_limit(data) + if self.do_stop: + return + + self.startingup = False + + def on_finish(self): + print(f"Written {self.counter} Tweets") + if self.output: + self.output.close() + + def do_continue(self): + if self.repeat == False: + return TweetHandlerI.do_continue(self) + + if self.do_stop: + # stop for a functional cause (e.g. 
date limit) + return False + + if self.counter == self.limit: + # repeat is True, thus close output file and + # create a new one + self._restart_file() + return True + + def _restart_file(self): + self.on_finish() + self.fname = self.timestamped_file() + self.startingup = True + self.counter = 0 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/util.py new file mode 100644 index 0000000000000000000000000000000000000000..8d24b9d9f7a2bab48f606136861306acab32bbe5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/twitter/util.py @@ -0,0 +1,147 @@ +# Natural Language Toolkit: Twitter client +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# Lorenzo Rubio +# URL: +# For license information, see LICENSE.TXT + +""" +Authentication utilities to accompany `twitterclient`. +""" + +import os +import pprint + +from twython import Twython + + +def credsfromfile(creds_file=None, subdir=None, verbose=False): + """ + Convenience function for authentication + """ + return Authenticate().load_creds( + creds_file=creds_file, subdir=subdir, verbose=verbose + ) + + +class Authenticate: + """ + Methods for authenticating with Twitter. + """ + + def __init__(self): + self.creds_file = "credentials.txt" + self.creds_fullpath = None + + self.oauth = {} + try: + self.twitter_dir = os.environ["TWITTER"] + self.creds_subdir = self.twitter_dir + except KeyError: + self.twitter_dir = None + self.creds_subdir = None + + def load_creds(self, creds_file=None, subdir=None, verbose=False): + """ + Read OAuth credentials from a text file. + + File format for OAuth 1:: + + app_key=YOUR_APP_KEY + app_secret=YOUR_APP_SECRET + oauth_token=OAUTH_TOKEN + oauth_token_secret=OAUTH_TOKEN_SECRET + + + File format for OAuth 2:: + + app_key=YOUR_APP_KEY + app_secret=YOUR_APP_SECRET + access_token=ACCESS_TOKEN + + :param str file_name: File containing credentials. ``None`` (default) reads + data from `TWITTER/'credentials.txt'` + """ + if creds_file is not None: + self.creds_file = creds_file + + if subdir is None: + if self.creds_subdir is None: + msg = ( + "Supply a value to the 'subdir' parameter or" + + " set the TWITTER environment variable." + ) + raise ValueError(msg) + else: + self.creds_subdir = subdir + + self.creds_fullpath = os.path.normpath( + os.path.join(self.creds_subdir, self.creds_file) + ) + + if not os.path.isfile(self.creds_fullpath): + raise OSError(f"Cannot find file {self.creds_fullpath}") + + with open(self.creds_fullpath) as infile: + if verbose: + print(f"Reading credentials file {self.creds_fullpath}") + + for line in infile: + if "=" in line: + name, value = line.split("=", 1) + self.oauth[name.strip()] = value.strip() + + self._validate_creds_file(verbose=verbose) + + return self.oauth + + def _validate_creds_file(self, verbose=False): + """Check validity of a credentials file.""" + oauth1 = False + oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"] + oauth2 = False + oauth2_keys = ["app_key", "app_secret", "access_token"] + if all(k in self.oauth for k in oauth1_keys): + oauth1 = True + elif all(k in self.oauth for k in oauth2_keys): + oauth2 = True + + if not (oauth1 or oauth2): + msg = f"Missing or incorrect entries in {self.creds_file}\n" + msg += pprint.pformat(self.oauth) + raise ValueError(msg) + elif verbose: + print(f'Credentials file "{self.creds_file}" looks good') + + +def add_access_token(creds_file=None): + """ + For OAuth 2, retrieve an access token for an app and append it to a + credentials file. 
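+
+    A hypothetical invocation (the path is illustrative)::
+
+        add_access_token("/path/to/credentials2.txt")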
+    """
+    if creds_file is None:
+        path = os.path.dirname(__file__)
+        creds_file = os.path.join(path, "credentials2.txt")
+    oauth2 = credsfromfile(creds_file=creds_file)
+    app_key = oauth2["app_key"]
+    app_secret = oauth2["app_secret"]
+
+    twitter = Twython(app_key, app_secret, oauth_version=2)
+    access_token = twitter.obtain_access_token()
+    tok = f"access_token={access_token}\n"
+    with open(creds_file, "a") as outfile:
+        print(tok, file=outfile)
+
+
+def guess_path(pth):
+    """
+    If the path is not absolute, guess that it is a subdirectory of the
+    user's home directory.
+
+    :param str pth: The pathname of the directory where files of tweets should be written
+    """
+    if os.path.isabs(pth):
+        return pth
+    else:
+        return os.path.expanduser(os.path.join("~", pth))
diff --git a/.github/ISSUE_TEMPLATE/1_bug-report.yml b/.github/ISSUE_TEMPLATE/1_bug-report.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d6161f0b2d1f777571e6fc1a17646739d0a27d4b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/1_bug-report.yml
@@ -0,0 +1,94 @@
+name: 🐞 Bug report
+description: Create a report to help us improve
+labels: ["bug"]
+title: "[Bug] "
+body:
+  - type: markdown
+    attributes:
+      value: |
+        For general questions or idea discussions, please post them to our [**Forum**](https://github.com/open-compass/opencompass/discussions).
+        If you have already identified the reason, we strongly appreciate you creating a new PR according to [the tutorial](https://opencompass.readthedocs.io/en/master/community/CONTRIBUTING.html)!
+        If you need our help, please fill in the following form to help us identify the bug.
+
+  - type: checkboxes
+    attributes:
+      label: Prerequisite
+      description: Please check the following items before creating a new issue.
+      options:
+        - label: I have searched [Issues](https://github.com/open-compass/opencompass/issues/) and [Discussions](https://github.com/open-compass/opencompass/discussions) but cannot get the expected help.
+          required: true
+        - label: The bug has not been fixed in the [latest version](https://github.com/open-compass/opencompass).
+          required: true
+
+  - type: dropdown
+    id: task
+    attributes:
+      label: Type
+      description: The problem arises when
+      options:
+        - I'm evaluating with the officially supported tasks/models/datasets.
+        - I have modified the code (config is not considered code), or I'm working on my own tasks/models/datasets.
+    validations:
+      required: true
+
+  - type: textarea
+    id: environment
+    validations:
+      required: true
+    attributes:
+      label: Environment
+      description: |
+        Please run `python -c "import opencompass.utils;import pprint;pprint.pprint(dict(opencompass.utils.collect_env()))"` to collect necessary environment information and paste it here.
+      placeholder: |
+        ```python
+        # The output of the above command
+        ```
+
+  - type: textarea
+    attributes:
+      label: Reproduces the problem - code/configuration sample
+      description: |
+        Please provide a code or configuration sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+      placeholder: |
+        ```python
+        # Sample code to reproduce the problem
+        ```
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Reproduces the problem - command or script
+      description: |
+        What command or script did you run?
+      placeholder: |
+        ```shell
+        The command or script you run.
+ ``` + validations: + required: true + + - type: textarea + attributes: + label: Reproduces the problem - error message + description: | + Please provide the error message or logs you got, with the full traceback. + + Tip: You can attach images or log files by dragging them into the text area.. + placeholder: | + ``` + The error message or logs you got, with the full traceback. + ``` + validations: + required: true + + - type: textarea + id: other + attributes: + label: Other information + description: | + Tell us anything else you think we should know. + + 1. What's your expected result? + 2. What dataset did you use? + 3. What do you think might be the reason? diff --git a/.github/ISSUE_TEMPLATE/2_feature-request.yml b/.github/ISSUE_TEMPLATE/2_feature-request.yml new file mode 100644 index 0000000000000000000000000000000000000000..bbaffefb02f5bc636324264d7e80f9a07b37153f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2_feature-request.yml @@ -0,0 +1,30 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +labels: ["enhancement"] +title: "[Feature] " +body: + - type: markdown + attributes: + value: | + For general questions or idea discussions, please post it to our [**Forum**](https://github.com/open-compass/opencompass/discussions). + If you have already implemented the feature, we strongly appreciate you creating a new PR according to [the tutorial](https://opencompass.readthedocs.io/en/master/community/CONTRIBUTING.html)! + + - type: textarea + id: describe + validations: + required: true + attributes: + label: Describe the feature + description: | + What kind of feature do you want OpenCompass to add. If there is an official code release or third-party implementation, please also provide the information here, which would be very helpful. + placeholder: | + A clear and concise description of the motivation of the feature. + Ex1. It is inconvenient when \[....\]. + Ex2. There is a recent paper \[....\], which is very helpful for \[....\]. + + - type: checkboxes + id: pr + attributes: + label: Will you implement it? + options: + - label: I would like to implement this feature and create a PR! 
diff --git a/.github/ISSUE_TEMPLATE/3_bug-report_zh.yml b/.github/ISSUE_TEMPLATE/3_bug-report_zh.yml new file mode 100644 index 0000000000000000000000000000000000000000..e7e367d90ab039516c673a8834b49ec334d03c00 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3_bug-report_zh.yml @@ -0,0 +1,94 @@ +name: 🐞 报告 Bug +description: 报告你在使用中遇到的不合预期的情况 +labels: ["bug"] +title: "[Bug] " +body: + - type: markdown + attributes: + value: | + 我们推荐使用英语模板 Bug report,以便你的问题帮助更多人。 + 如果需要询问一般性的问题或者想法,请在我们的[**论坛**](https://github.com/open-compass/opencompass/discussions)讨论。 + 如果你已经有了解决方案,我们非常欢迎你直接创建一个新的 PR 来解决这个问题。创建 PR 的流程可以参考[文档](https://opencompass.readthedocs.io/zh_CN/master/community/CONTRIBUTING.html)。 + 如果你需要我们的帮助,请填写以下内容帮助我们定位 Bug。 + + - type: checkboxes + attributes: + label: 先决条件 + description: 在创建新问题之前,请检查以下项目。 + options: + - label: 我已经搜索过 [问题](https://github.com/open-compass/opencompass/issues/) 和 [讨论](https://github.com/open-compass/opencompass/discussions) 但未得到预期的帮助。 + required: true + - label: 错误在 [最新版本](https://github.com/open-compass/opencompass) 中尚未被修复。 + required: true + + - type: dropdown + id: task + attributes: + label: 问题类型 + description: 问题出现时 + options: + - 我正在使用官方支持的任务/模型/数据集进行评估。 + - 我修改了代码(配置不视为代码),或者我正在处理我自己的任务/模型/数据集。 + validations: + required: true + + - type: textarea + id: environment + validations: + required: true + attributes: + label: 环境 + description: | + 请运行 `python -c "import opencompass.utils;import pprint;pprint.pprint(dict(opencompass.utils.collect_env()))"` 来收集必要的环境信息并粘贴在此处。 + placeholder: | + ```python + # 上述命令的输出 + ``` + + - type: textarea + attributes: + label: 重现问题 - 代码/配置示例 + description: | + 请提供重现您遇到的问题的代码或配置示例。它可以是一个Colab链接或仅仅是一个代码片段。 + placeholder: | + ```python + # 重现问题的示例代码 + ``` + validations: + required: true + + - type: textarea + attributes: + label: 重现问题 - 命令或脚本 + description: | + 您运行了什么命令或脚本? + placeholder: | + ```shell + 您运行的命令或脚本。 + ``` + validations: + required: true + + - type: textarea + attributes: + label: 重现问题 - 错误信息 + description: | + 请提供您收到的错误消息或日志,并提供完整的追溯。 + + 提示:您可以通过拖放图片或日志文件到文本区域来附加它们。 + placeholder: | + ``` + 您收到的错误消息或日志,带有完整的追溯。 + ``` + validations: + required: true + + - type: textarea + id: other + attributes: + label: 其他信息 + description: | + 告诉我们其他有价值的信息。 + + 1. 你是否对代码或配置文件做了任何改动? + 2. 你认为可能的原因是什么? diff --git a/.github/ISSUE_TEMPLATE/4_feature-request_zh.yml b/.github/ISSUE_TEMPLATE/4_feature-request_zh.yml new file mode 100644 index 0000000000000000000000000000000000000000..1fee37d50fc4a3d2a9c22444d559d727b75ace81 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/4_feature-request_zh.yml @@ -0,0 +1,31 @@ +name: 🚀 功能建议 +description: 建议一项新的功能 +labels: ["enhancement"] +title: "[Feature] " +body: + - type: markdown + attributes: + value: | + 推荐使用英语模板 Feature request,以便你的问题帮助更多人。 + 如果需要询问一般性的问题或者想法,请在我们的[**论坛**](https://github.com/open-compass/opencompass/discussions)讨论。 + 如果你已经实现了该功能,我们非常欢迎你直接创建一个新的 PR 来解决这个问题。创建 PR 的流程可以参考[文档](https://opencompass.readthedocs.io/zh_CN/master/community/CONTRIBUTING.html)。 + + - type: textarea + id: describe + validations: + required: true + attributes: + label: 描述该功能 + description: | + 你希望 OpenCompass 添加什么功能?如果存在相关的论文、官方实现或者第三方实现,请同时贴出链接,这将非常有帮助。 + placeholder: | + 简要说明该功能,及为什么需要该功能 + 例 1. 现在进行 xxx 的时候不方便 + 例 2. 最近的论文中提出了有一个很有帮助的 xx + + - type: checkboxes + id: pr + attributes: + label: 是否希望自己实现该功能? + options: + - label: 我希望自己来实现这一功能,并向 OpenCompass 贡献代码! 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f3811e303270a1e54d6322fba489ff9a018c3aa7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,12 @@
+blank_issues_enabled: false
+
+contact_links:
+  - name: 📚 OpenCompass Documentation (官方文档)
+    url: https://opencompass.readthedocs.io/en/latest/
+    about: Check if your question is answered in the docs
+  - name: 💬 General questions (寻求帮助)
+    url: https://github.com/open-compass/opencompass/discussions
+    about: Ask general usage questions and discuss with other OpenCompass community members
+  - name: 🌐 Explore OpenCompass (官网)
+    url: https://opencompass.org.cn/
+    about: Get to know more about OpenCompass
diff --git a/.github/scripts/compare_results.py b/.github/scripts/compare_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..cced71e8f94b8b58e0112920e6549b40fe88ed85
--- /dev/null
+++ b/.github/scripts/compare_results.py
@@ -0,0 +1,126 @@
+import filecmp
+import os
+
+import fire
+
+
+def compare_results(folder1, folder2, results_ignore_list=None):
+    # Initialize ignore_list if not provided
+    if results_ignore_list is None:
+        results_ignore_list = []
+
+    # Verify that both folders exist
+    assert os.path.isdir(folder1), f'Folder does not exist: {folder1}'
+    assert os.path.isdir(folder2), f'Folder does not exist: {folder2}'
+
+    sub_folder1 = get_all_subpaths(folder1)[0]
+    sub_folder2 = get_all_subpaths(folder2)[0]
+
+    print('compare predictions')
+    compare_folders(os.path.join(sub_folder1, 'predictions'),
+                    os.path.join(sub_folder2, 'predictions'),
+                    results_ignore_list=['srbench.json'])
+    print('compare results')
+    compare_folders(os.path.join(sub_folder1, 'results'),
+                    os.path.join(sub_folder2, 'results'),
+                    results_ignore_list=[
+                        'dingo_en_192.json', 'dingo_zh_170.json',
+                        'qa_dingo_cn.json', 'srbench.json'
+                    ])
+
+
+def compare_folders(folder1, folder2, results_ignore_list=None):
+    '''
+    Compare the contents of files with the same name in two folders and
+    their subfolders, ignoring any files named in the ignore list.
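+
+    A hypothetical call (the paths are illustrative)::
+
+        compare_folders('outputs/run_a/results', 'outputs/run_b/results',
+                        results_ignore_list=['srbench.json'])
+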
+ :param folder1: Path to the first folder + :param folder2: Path to the second folder + :param ignore_list: List of filenames to ignore + (e.g., ['temp.txt', '.DS_Store']) + :raises: AssertionError if any non-ignored files are missing or + have different content + ''' + # Initialize ignore_list if not provided + if results_ignore_list is None: + results_ignore_list = [] + + # Verify that both folders exist + assert os.path.isdir(folder1), f'Folder does not exist: {folder1}' + assert os.path.isdir(folder2), f'Folder does not exist: {folder2}' + + # List to store differences found + diff_files = [] + + # Walk through the first folder and compare files + for root, dirs, files in os.walk(folder1): + for file in files: + # Skip files in the ignore list + if os.path.basename(file) in results_ignore_list: + print('ignore case: ' + os.path.basename(file)) + continue + + # Get relative path to maintain folder structure + rel_path = os.path.relpath(os.path.join(root, file), folder1) + file2 = os.path.join(folder2, rel_path) + + # Check if file exists in the second folder + if not os.path.exists(file2): + diff_files.append((rel_path, 'File missing in second folder')) + continue + + # Compare file content (shallow=False for content comparison) + if not filecmp.cmp(os.path.join(root, file), file2, shallow=False): + diff_files.append((rel_path, 'Content differs')) + + # Check for files in folder2 that don't exist in folder1 + for root, dirs, files in os.walk(folder2): + for file in files: + # Skip files in the ignore list + if file in results_ignore_list: + continue + + rel_path = os.path.relpath(os.path.join(root, file), folder2) + file1 = os.path.join(folder1, rel_path) + if not os.path.exists(file1): + diff_files.append((rel_path, 'File missing in first folder')) + + # Raise AssertionError if any differences were found + if diff_files: + error_msg = 'Found differences in files:\n' + error_msg += '\n'.join(f'{path}: {reason}' + for path, reason in diff_files) + raise AssertionError(error_msg) + + +def get_all_subpaths(directory): + ''' + Get all subpaths (files and directories) within a given directory. 
+ Args: + directory (str): The root directory path to search + Returns: + list: A list of all complete subpaths + ''' + subpaths = [] + + # Verify the directory exists + if not os.path.isdir(directory): + raise ValueError(f'Directory does not exist: {directory}') + + # Walk through the directory tree + for root, dirs, files in os.walk(directory): + # Add directories first + for dir_name in dirs: + full_path = os.path.join(root, dir_name) + subpaths.append(full_path) + + # Then add files + for file_name in files: + full_path = os.path.join(root, file_name) + subpaths.append(full_path) + + return subpaths + + +if __name__ == '__main__': + fire.Fire() diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py new file mode 100644 index 0000000000000000000000000000000000000000..80c161ac5d413209b4783340029f8f2ccfe1b697 --- /dev/null +++ b/.github/scripts/eval_regression_api.py @@ -0,0 +1,234 @@ +from mmengine.config import read_base + +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.models.openai_api import OpenAISDK +from opencompass.models.openai_streaming import OpenAISDKStreaming +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.gsm8k.gsm8k_gen import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.HLE.hle_llmverify_academic import \ + hle_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_gen import \ + race_datasets # noqa: F401, E501 + +mmlu_pro_datasets = [ + x for x in mmlu_pro_datasets if 'math' in x['abbr'] or 'other' in x['abbr'] +] + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict(abbr='lmdeploy-api-test', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + retry=20, + pred_postprocessor=dict(type=extract_non_reasoning_content)), + dict(abbr='lmdeploy-api-streaming-test', + type=OpenAISDKStreaming, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + stream=True, + retry=20, + pred_postprocessor=dict(type=extract_non_reasoning_content)), + dict( + abbr='lmdeploy-api-streaming-test-chunk', + type=OpenAISDKStreaming, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + stream=True, + retry=20, + pred_postprocessor=dict(type=extract_non_reasoning_content), + stream_chunk_size=10, + verbose=True, + ), 
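+    # The remaining entries vary one request knob at a time: a larger
+    # max_out_len/max_seq_len budget, 'mid' truncation mode, thinking
+    # disabled via chat_template_kwargs, and a custom system-prompt
+    # meta_template.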
+ dict(abbr='lmdeploy-api-test-maxlen', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=8092, + max_seq_len=8092, + temperature=0.01, + batch_size=128, + retry=20, + pred_postprocessor=dict(type=extract_non_reasoning_content)), + dict(abbr='lmdeploy-api-test-maxlen-mid', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=8092, + max_seq_len=8092, + temperature=0.01, + batch_size=128, + retry=20, + mode='mid', + pred_postprocessor=dict(type=extract_non_reasoning_content)), + dict(abbr='lmdeploy-api-test-nothink', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=128, + max_out_len=8092, + max_seq_len=8092, + temperature=0.01, + batch_size=128, + retry=20, + openai_extra_kwargs={ + 'top_p': 0.95, + }, + extra_body={'chat_template_kwargs': { + 'enable_thinking': False + }}, + pred_postprocessor=dict(type=extract_non_reasoning_content)), + dict( + abbr='lmdeploy-api-test-chat-template', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='Qwen/Qwen3-8B', + tokenizer_path='Qwen/Qwen3-8B', + rpm_verbose=True, + meta_template=dict(begin=dict(role='SYSTEM', + api_role='SYSTEM', + prompt='you are a helpful AI.'), + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ]), + query_per_second=128, + max_out_len=1024, + max_seq_len=1024, + temperature=0.01, + batch_size=128, + retry=20, + openai_extra_kwargs={ + 'top_p': 0.95, + }, + extra_body={'chat_template_kwargs': { + 'enable_thinking': False + }}, + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), +] + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + if 'dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg' in d[ + 'eval_cfg']['evaluator']['dataset_cfg']: + d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][ + 'test_range'] = '[0:16]' + if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'dataset_cfg' in d[ + 'eval_cfg']['evaluator']['llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][ + 'reader_cfg']['test_range'] = '[0:16]' + +obj_judge_model = dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=46000, + max_batch_size=1, + tp=1), + gen_config=dict(do_sample=False, enable_thinking=False), + max_seq_len=46000, + max_out_len=46000, + batch_size=1, + run_cfg=dict(num_gpus=1)) + +for d in datasets: + if 'judge_cfg' in d['eval_cfg']['evaluator']: + d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model + if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[ + 'eval_cfg']['evaluator']['llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator'][ + 'judge_cfg'] = obj_judge_model + +core_summary_groups = [ + { + 'name': + 'core_average', + 'subsets': [ + ['IFEval', 'Prompt-level-strict-accuracy'], + ['hle_llmjudge', 'accuracy'], + ['mmlu_pro', 'naive_average'], + 'mmlu_pro_math', + 'mmlu_pro_other', + 'gsm8k', + 'race-middle', + 'race-high', + ], + }, +] + +summarizer = dict( + 
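+    # dataset_abbrs controls which rows appear in the summary table and
+    # their order; each entry is either a bare abbr (default metric) or an
+    # [abbr, metric] pair.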
dataset_abbrs=[ + ['core_average', 'naive_average'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['hle_llmjudge', 'accuracy'], + ['mmlu_pro', 'naive_average'], + 'mmlu_pro_math', + 'mmlu_pro_other', + 'gsm8k', + 'race-middle', + 'race-high', + ], + summary_groups=sum( + [v + for k, v in locals().items() if k.endswith('_summary_groups')], []) + + core_summary_groups, +) diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py new file mode 100644 index 0000000000000000000000000000000000000000..c11c6862cf3cb6a76f95bb722e690e5e7b707f00 --- /dev/null +++ b/.github/scripts/eval_regression_base_fullbench.py @@ -0,0 +1,221 @@ +from mmengine.config import read_base + +from opencompass.models import HuggingFaceBaseModel, TurboMindModel + +with read_base(): + from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.drop.drop_gen_a2697c import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \ + gpqa_datasets # noqa: F401, E501 + # Corebench v1.7 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \ + hellaswag_datasets # noqa: F401, E501 + # from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ # noqa: F401, E501 + # humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501 + # from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ # noqa: F401, E501 + # humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, 
E501 + # Summary Groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +race_datasets = [race_datasets[1]] # Only take RACE-High +bbh_datasets = [ + x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr'] + or 'multistep_arithmetic_two' in x['abbr'] +] +cmmlu_datasets = [ + x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [ + 'ancient_chinese', 'chinese_civil_service_exam', + 'chinese_driving_rule', 'chinese_food_culture', + 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', + 'chinese_teacher_qualification', 'construction_project_management', + 'elementary_chinese', 'elementary_commonsense', 'ethnology', + 'high_school_politics', 'modern_chinese', + 'traditional_chinese_medicine' + ] +] +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', + 'global_facts', 'human_aging', 'management', 'marketing', + 'medical_genetics', 'miscellaneous', 'nutrition', + 'professional_accounting', 'professional_medicine', 'virology' + ] +] +mmlu_pro_datasets = [mmlu_pro_datasets[0]] +mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] +GaokaoBench_datasets = [ + x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] + or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr'] +] +datasets = sum((v for k, v in locals().items() + if k.endswith('_datasets') and 'dingo' not in k.lower()), []) + +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +summary_groups.append( + { + 'name': 'Mathbench', + 'subsets': ['mathbench-a (average)', 'mathbench-t (average)'], + }, ) + +summarizer = dict( + dataset_abbrs=[ + 'Language', + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + '', + 'General Reasoning', + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + ['winogrande', 'accuracy'], + '', + 'Math Calculation', + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + 'GaokaoBench_2010-2022_Math_II_MCQs', + 'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank', + ['math', 'accuracy'], + ['Mathbench', 'naive_average'], + '', + 'Knowledge', + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['mmlu_pro', 'naive_average'], + '', + 'Code', + ['openai_humaneval', 'humaneval_pass@1'], + ['openai_humaneval_v2', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + '', + ['dingo_en_192', 'score'], + ['dingo_zh_170', 'score'], + '', + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + ['mmlu-other', 'accuracy'], + '', + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + ['cmmlu-china-specific', 'accuracy'], + '', + 'mmlu_pro', + 
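+        # per-discipline MMLU-Pro splits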
'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + ], + summary_groups=summary_groups, +) + +hf_model = dict( + type=HuggingFaceBaseModel, + abbr='qwen-3-8b-base-hf-fullbench', + path='Qwen/Qwen3-8B-Base', + max_out_len=8192, + batch_size=8, + run_cfg=dict(num_gpus=1), +) + +tm_model = dict( + type=TurboMindModel, + abbr='qwen-3-8b-base-fullbench', + path='Qwen/Qwen3-8B-Base', + engine_config=dict(session_len=32768, max_batch_size=1, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=32768, + max_out_len=1024, + batch_size=1, + run_cfg=dict(num_gpus=1), +) + +models = [hf_model, tm_model] + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' diff --git a/.github/scripts/eval_regression_base_longtext_fullbench.py b/.github/scripts/eval_regression_base_longtext_fullbench.py new file mode 100644 index 0000000000000000000000000000000000000000..2ae029299d596e726c308738f4dfbe3a621fcece --- /dev/null +++ b/.github/scripts/eval_regression_base_longtext_fullbench.py @@ -0,0 +1,53 @@ +from mmengine.config import read_base + +from opencompass.models import TurboMindModel + +with read_base(): + from opencompass.configs.datasets.longbench.longbench import \ + longbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.needlebench.needlebench_base.needlebench_base_gen import \ + needlebench_datasets # noqa: F401, E501 + # summarizer + from opencompass.configs.summarizers.groups.longbench import \ + longbench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.needlebench import \ + needlebench_internal_200k_summarizer # noqa: F401, E501 + from opencompass.configs.summarizers.needlebench import ( + needlebench_internal_32k_summarizer, + needlebench_internal_100k_summarizer) + + from ...rjob import eval, infer # noqa: F401, E501 + +needlebench_internal_32k_summary_groups = needlebench_internal_32k_summarizer[ + 'summary_groups'] +needlebench_internal_100k_summary_groups = ( + needlebench_internal_100k_summarizer['summary_groups']) +needlebench_internal_200k_summary_groups = ( + needlebench_internal_200k_summarizer['summary_groups']) + +models = [ + dict( + type=TurboMindModel, + abbr='qwen3-8b-base-turbomind', + path='Qwen/Qwen3-8B-Base', + engine_config=dict(session_len=264192, max_batch_size=8, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=2048, + min_out_len=2), + max_seq_len=264192, + max_out_len=500, + batch_size=1, + drop_middle=True, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = [ + v[0] for k, v in locals().items() + if k.endswith('_datasets') and isinstance(v, list) and len(v) > 0 +] + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' diff --git a/.github/scripts/eval_regression_base_models.py 
b/.github/scripts/eval_regression_base_models.py new file mode 100644 index 0000000000000000000000000000000000000000..a295621a20e2c9e78893ccbe0c24e05833eb7b98 --- /dev/null +++ b/.github/scripts/eval_regression_base_models.py @@ -0,0 +1,72 @@ +from mmengine.config import read_base + +from opencompass.models import VLLM, HuggingFaceBaseModel, TurboMindModel + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_ppl import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +race_datasets = [race_datasets[1]] +models = [ + dict( + type=TurboMindModel, + abbr='qwen3-8b-base-turbomind', + path='Qwen/Qwen3-0.6B-Base', + engine_config=dict(max_batch_size=1, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=2048), + max_seq_len=8192, + max_out_len=2048, + batch_size=1, + run_cfg=dict(num_gpus=1), + ), + dict( + type=VLLM, + abbr='qwen3-8b-base-vllm', + path='Qwen/Qwen3-0.6B-Base', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.6), + max_seq_len=8192, + max_out_len=2048, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ), + dict(type=HuggingFaceBaseModel, + abbr='qwen3-8b-base-hf', + path='Qwen/Qwen3-0.6B-Base', + max_out_len=1024, + batch_size=4, + run_cfg=dict(num_gpus=1)) +] +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:32]' + +for m in models: + if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']: + m['engine_config']['max_batch_size'] = 1 + m['batch_size'] = 1 +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +summarizer = dict( + dataset_abbrs=[ + ['gsm8k', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['race-high', 'accuracy'], + ['winogrande', 'accuracy'], + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_chat_longtext_fullbench.py b/.github/scripts/eval_regression_chat_longtext_fullbench.py new file mode 100644 index 0000000000000000000000000000000000000000..9296b83ed703f950dff986bb9e690baa68d3bf66 --- /dev/null +++ b/.github/scripts/eval_regression_chat_longtext_fullbench.py @@ -0,0 +1,53 @@ +from mmengine.config import read_base + +from opencompass.models import TurboMindModelwithChatTemplate + +with read_base(): + from opencompass.configs.datasets.babilong.babilong_256k_gen import \ + babiLong_256k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.longbench.longbench import \ + longbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \ + needlebench_datasets as needlebench_128k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ruler.ruler_128k_gen import \ + ruler_datasets as ruler_128k_datasets # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \ + models as lmdeploy_internlm2_5_7b_chat_1m_model # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.babilong import \ + 
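(The three model entries in eval_regression_base_models.py above run the same checkpoint through TurboMind, vLLM, and plain HF so a regression can be localized to a backend, and their decoding settings are chosen to be near-greedy in each API. A hypothetical helper capturing that equivalence; the function and its defaults are illustrative, not OpenCompass API:)

# Hypothetical sketch: matched near-greedy decoding settings per backend.
# TurboMind has no temperature=0 path in these configs, so top_k=1 with a
# tiny temperature approximates argmax; vLLM treats temperature=0 as greedy;
# HF generation reaches the same point via do_sample=False.
def greedy_gen_settings(backend: str) -> dict:
    if backend == 'turbomind':
        return dict(gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9))
    if backend == 'vllm':
        return dict(generation_kwargs=dict(temperature=0))
    if backend == 'hf':
        return dict(generation_kwargs=dict(do_sample=False))
    raise ValueError(f'unknown backend: {backend}')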
babilong_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.longbench import \ + longbench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.ruler import \ + ruler_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.needlebench import \ + needlebench_128k_summarizer # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(hf_override=dict( + rope_scaling=dict(rope_type='yarn', + factor=4.0, + original_max_position_embeddings=32768)), + session_len=264192, + max_batch_size=1), + gen_config=dict(do_sample=True, max_new_tokens=2048), + max_seq_len=264192, + max_out_len=2048, + batch_size=1, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = [ + v[0] for k, v in locals().items() + if k.endswith('_datasets') and isinstance(v, list) and len(v) > 0 +] + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py new file mode 100644 index 0000000000000000000000000000000000000000..163f66436d6bc18155f889d8a684cc4f092b99ac --- /dev/null +++ b/.github/scripts/eval_regression_chat_models.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base + +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate, + VLLMwithChatTemplate) +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.gsm8k.gsm8k_gen import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_gen import \ + race_datasets # noqa: F401, E501 + + # re-design .. 
including some models and modify all kinds of configs + from ...rjob import eval, infer # noqa: F401, E501 + +Qwen3_0_6B_FP8_hf = dict( + type=HuggingFacewithChatTemplate, + abbr='qwen3_0_6b_fp8-hf', + path='Qwen/Qwen3-0.6B-FP8', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +Qwen3_0_6B_FP8_turbomind = dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen3-0_6b-fp8-turbomind', + path='Qwen/Qwen3-0.6B-FP8', + engine_config=dict(session_len=32768, max_batch_size=1), + gen_config=dict(top_k=1, max_new_tokens=16384), + max_seq_len=32768, + max_out_len=16384, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +Qwen3_0_6B_FP8_vllm = dict( + type=VLLMwithChatTemplate, + abbr='qwen3-0_6b-fp8-vllm', + path='Qwen/Qwen3-0.6B-FP8', + model_kwargs=dict(tensor_parallel_size=1), + generation_kwargs=dict(do_sample=False), # greedy + max_seq_len=32768, + max_out_len=16384, + batch_size=16, + run_cfg=dict(num_gpus=1), +) + +race_datasets = [race_datasets[1]] +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:32]' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +models = [Qwen3_0_6B_FP8_hf, Qwen3_0_6B_FP8_turbomind, Qwen3_0_6B_FP8_vllm] + +summarizer = dict( + dataset_abbrs=[ + 'gsm8k', + 'race-middle', + 'race-high', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/scripts/eval_regression_chat_obj_fullbench_other.py b/.github/scripts/eval_regression_chat_obj_fullbench_other.py new file mode 100644 index 0000000000000000000000000000000000000000..4442ea45870543b0a91d9cdea7f578cd9478a0c8 --- /dev/null +++ b/.github/scripts/eval_regression_chat_obj_fullbench_other.py @@ -0,0 +1,97 @@ +from mmengine.config import read_base + +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + # Datasets + from opencompass.configs.chatml_datasets.C_MHChem.C_MHChem_gen import \ + datasets as C_MHChem_chatml_datasets # noqa: F401, E501 + from opencompass.configs.chatml_datasets.CPsyExam.CPsyExam_gen import \ + datasets as CPsyExam_chatml_datasets # noqa: F401, E501 + from opencompass.configs.chatml_datasets.MaScQA.MaScQA_gen import \ + datasets as MaScQA_chatml_datasets # noqa: F401, E501 + from opencompass.configs.chatml_datasets.UGPhysics.UGPhysics_gen import \ + datasets as UGPhysics_chatml_datasets # noqa: F401, E501 + from opencompass.configs.datasets.eese.eese_llm_judge_gen import \ + eese_datasets # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +chatml_datasets = [ + v[0] for k, v in locals().items() + if k.endswith('_chatml_datasets') and isinstance(v, list) and len(v) > 0 +] + +datasets = [eese_datasets[0]] + +for d in chatml_datasets: + d['test_range'] = '[0:16]' + +for d in datasets: + if 'reader_cfg' in d: + d['reader_cfg']['test_range'] = '[0:16]' + else: + d['test_range'] = '[0:16]' + if 'eval_cfg' in d and 'dataset_cfg' in d['eval_cfg'][ + 'evaluator'] and 'reader_cfg' in d['eval_cfg']['evaluator'][ + 'dataset_cfg']: + d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][ + 'test_range'] = '[0:16]' + if 'eval_cfg' in d and 'llm_evaluator' in d['eval_cfg'][ + 'evaluator'] and 'dataset_cfg' in 
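(The chat wrappers above attach `pred_postprocessor=dict(type=extract_non_reasoning_content)`, which strips the model's reasoning block before scoring so only the final answer is matched. A regex stand-in to illustrate the role; the real implementation lives in `opencompass.utils.text_postprocessors` and may differ in detail:)

import re

# Illustrative stand-in for a reasoning-stripping postprocessor: drop any
# <think>...</think> block, then any dangling unopened close tag, and keep
# the final answer text. Not the library's actual code.
def strip_reasoning(text: str) -> str:
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = re.sub(r'^.*?</think>', '', text, flags=re.DOTALL)
    return text.strip()

print(strip_reasoning('<think>chain of thought...</think>The answer is 42.'))
# -> 'The answer is 42.'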
d['eval_cfg']['evaluator'][ + 'llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][ + 'reader_cfg']['test_range'] = '[0:16]' + +hf_model = dict(type=HuggingFacewithChatTemplate, + abbr='qwen-3-8b-hf-fullbench', + path='Qwen/Qwen3-8B', + max_out_len=8192, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +tm_model = dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=32768, max_batch_size=1, tp=1), + gen_config=dict(do_sample=False, enable_thinking=True), + max_seq_len=32768, + max_out_len=32768, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +models = [hf_model, tm_model] + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +obj_judge_model = dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=46000, max_batch_size=1, tp=1), + gen_config=dict(do_sample=False, enable_thinking=True), + max_seq_len=46000, + max_out_len=46000, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +for d in datasets: + if 'eval_cfg' in d and 'evaluator' in d['eval_cfg']: + if 'judge_cfg' in d['eval_cfg']['evaluator']: + d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model + if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[ + 'eval_cfg']['evaluator']['llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator'][ + 'judge_cfg'] = obj_judge_model + +for d in chatml_datasets: + if 'judge_cfg' in d['evaluator']: + d['evaluator']['judge_cfg'] = obj_judge_model + if 'llm_evaluator' in d['evaluator'] and 'judge_cfg' in d['evaluator'][ + 'llm_evaluator']: + d['evaluator']['llm_evaluator']['judge_cfg'] = obj_judge_model diff --git a/.github/scripts/eval_regression_chat_obj_fullbench_v5.py b/.github/scripts/eval_regression_chat_obj_fullbench_v5.py new file mode 100644 index 0000000000000000000000000000000000000000..9cdf1e48b0d11af4fb225a31e43de2bbd16002ff --- /dev/null +++ b/.github/scripts/eval_regression_chat_obj_fullbench_v5.py @@ -0,0 +1,311 @@ +from mmengine.config import read_base + +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + # read hf models - chat models + # Dataset + from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \ + aime2024_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ + ARC_c_datasets # noqa: F401, E501 + # remove because of oom + # from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ + bbh_datasets # noqa: F401, E501 + # from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \ # noqa: F401, E501 + # bigcodebench_hard_complete_datasets # noqa: F401, E501 + # from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \ # noqa: F401, E501 + # bigcodebench_hard_instruct_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ + cmmlu_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
+        cmo_fib_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
+        drop_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
+        GaokaoBench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
+        gpqa_datasets  # noqa: F401, E501
+    # new datasets in Fullbench v1.1
+    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
+        gsm8k_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
+        hellaswag_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
+        humaneval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
+        ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
+        korbench_0shot_single_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
+        LCB_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
+        math_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
+        mathbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
+        sanitized_mbpp_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
+        mmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
+        mmlu_pro_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
+        mmmlu_lite_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.musr.musr_gen_3622bb import \
+        musr_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
+        nq_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
+        race_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
+        SciCode_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
+        BoolQ_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
+        teval_datasets as teval_en_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
+        teval_datasets as teval_zh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
+        TheoremQA_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
+        triviaqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
+        wikibench_datasets  # noqa: F401, E501
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.bbh import \
+        bbh_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.cmmlu import \
+        cmmlu_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.GaokaoBench import \
+        GaokaoBench_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.korbench import \
korbench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.musr_average import \ + summarizer as musr_summarizer # noqa: F401, E501 + from opencompass.configs.summarizers.groups.scicode import \ + scicode_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.teval import \ + teval_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.mmmlu_lite import \ + mmmlu_summary_groups # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +race_datasets = [race_datasets[1]] + +bbh_datasets = [ + x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr'] + or 'multistep_arithmetic_two' in x['abbr'] +] +cmmlu_datasets = [ + x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [ + 'ancient_chinese', 'chinese_civil_service_exam', + 'chinese_driving_rule', 'chinese_food_culture', + 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', + 'chinese_teacher_qualification', 'construction_project_management', + 'elementary_chinese', 'elementary_commonsense', 'ethnology', + 'high_school_politics', 'modern_chinese', + 'traditional_chinese_medicine' + ] +] +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', + 'global_facts', 'human_aging', 'management', 'marketing', + 'medical_genetics', 'miscellaneous', 'nutrition', + 'professional_accounting', 'professional_medicine', 'virology' + ] +] + +mmlu_pro_datasets = [mmlu_pro_datasets[0]] + +mmmlu_lite_datasets = [ + x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr'] +] +mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']] +GaokaoBench_datasets = [ + x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr'] + or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr'] +] + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets') + and 'scicode' not in k.lower() and 'teval' not in k and 'human' not in k), + [], +) +datasets += teval_en_datasets +datasets += teval_zh_datasets +datasets += humaneval_datasets +# datasets += SciCode_datasets + +musr_summary_groups = musr_summarizer['summary_groups'] +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +summary_groups.append( + { + 'name': 'Mathbench', + 'subsets': ['mathbench-a (average)', 'mathbench-t (average)'], + }, ) + +# Summarizer +summarizer = dict( + dataset_abbrs=[ + 'Language', + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['mmmlu_lite', 'naive_average'], + '', + 'Instruction Following', + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + 'General Reasoning', + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + ['musr_average', 'naive_average'], + ['korbench_single', 'naive_average'], + ['ARC_Prize_Public_Evaluation', 'accuracy'], + '', + 'Math Calculation', + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['math', 'accuracy'], + 
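(The dataset assembly above uses an exclude-then-append pattern: the namespace scan leaves out families that need special handling or are temporarily disabled, like SciCode here, and teval/humaneval are then appended explicitly so their membership and position stay under manual control. A toy reproduction:)

# Sketch of the exclude-then-append assembly; toy dataset dicts stand in
# for the real configs.
teval_en_datasets = [{'abbr': 'teval_en'}]
humaneval_datasets = [{'abbr': 'openai_humaneval'}]
bbh_datasets = [{'abbr': 'bbh-foo'}]

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')
     and 'teval' not in k and 'human' not in k),
    [],
)
datasets += teval_en_datasets   # re-added on purpose, at a fixed position
datasets += humaneval_datasets

assert [d['abbr'] for d in datasets] == [
    'bbh-foo', 'teval_en', 'openai_humaneval'
]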
['cmo_fib', 'accuracy'], + ['aime2024', 'accuracy'], + ['Mathbench', 'naive_average'], + '', + 'Knowledge', + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['mmlu_pro', 'naive_average'], + '', + 'Code', + ['openai_humaneval', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['humanevalx', 'naive_average'], + ['ds1000', 'naive_average'], + ['lcb_code_generation', 'pass@1'], + ['lcb_code_execution', 'pass@1'], + ['lcb_test_output', 'pass@1'], + ['bigcodebench_hard_instruct', 'pass@1'], + ['bigcodebench_hard_complete', 'pass@1'], + '', + 'Agent', + ['teval', 'naive_average'], + ['SciCode', 'accuracy'], + ['SciCode', 'sub_accuracy'], + '', + 'bbh-logical_deduction_seven_objects', + 'bbh-multistep_arithmetic_two', + '', + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + 'mmlu-other', + '', + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + 'cmmlu-china-specific', + '', + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + '', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib', + '', + 'mmmlu_lite', + 'openai_mmmlu_lite_AR-XY', + 'openai_mmmlu_lite_BN-BD', + 'openai_mmmlu_lite_DE-DE', + 'openai_mmmlu_lite_ES-LA', + 'openai_mmmlu_lite_FR-FR', + 'openai_mmmlu_lite_HI-IN', + 'openai_mmmlu_lite_ID-ID', + 'openai_mmmlu_lite_IT-IT', + 'openai_mmmlu_lite_JA-JP', + 'openai_mmmlu_lite_KO-KR', + 'openai_mmmlu_lite_PT-BR', + 'openai_mmmlu_lite_SW-KE', + 'openai_mmmlu_lite_YO-NG', + 'openai_mmmlu_lite_ZH-CN', + '', + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + ], + summary_groups=summary_groups, +) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + +hf_model = dict(type=HuggingFacewithChatTemplate, + abbr='qwen-3-8b-hf-fullbench', + path='Qwen/Qwen3-8B', + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +tm_model = dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=32768, max_batch_size=1, tp=1), + gen_config=dict(do_sample=False, enable_thinking=True), + max_seq_len=32768, + max_out_len=32768, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +models = [hf_model, tm_model] + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) diff --git a/.github/scripts/eval_regression_chat_obj_fullbench_v6.py b/.github/scripts/eval_regression_chat_obj_fullbench_v6.py new file mode 100644 index 0000000000000000000000000000000000000000..b39b458ec3a7ee2e949a46e4e57faa270714cce3 --- /dev/null +++ b/.github/scripts/eval_regression_chat_obj_fullbench_v6.py @@ -0,0 +1,142 @@ +from mmengine.config import read_base + +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from 
opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + from opencompass.configs.datasets.aime2024.aime2024_llmjudge_gen_5e9f4f import \ + aime2024_datasets # noqa: F401, E501 + from opencompass.configs.datasets.aime2025.aime2025_llmjudge_gen_5e9f4f import \ + aime2025_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_fedd04 import \ + arc_prize_public_evaluation_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_llmjudge_gen_b5bdf1 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_2783e5 import \ + cmo_fib_datasets # noqa: F401, E501 + # dingo + from opencompass.configs.datasets.dingo.dingo_gen import \ + datasets as dingo_datasets # noqa: F401, E501 + # General Reasoning + from opencompass.configs.datasets.drop.drop_llmjudge_gen_3857b0 import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d16acb import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_0shot_nocot_genericllmeval_gen_772ea0 import \ + gpqa_datasets # noqa: F401, E501 + # Math Calculation + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_17d799 import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_llmjudge_gen_809ef1 import \ + hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.korbench.korbench_llmjudge_gen_56cf43 import \ + korbench_0shot_single_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_500_llmjudge_gen_6ff468 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_4b8f28 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.musr.musr_llmjudge_gen_b47fd3 import \ + musr_datasets # noqa: F401, E501 + from opencompass.configs.datasets.supergpqa.supergpqa_llmjudge_gen_12b8bc import \ + supergpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_c87d61 import \ + triviaqa_datasets # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.bbeh import \ + bbeh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.korbench import \ + korbench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.musr_average import \ + summarizer as musr_summarizer + from opencompass.configs.summarizers.mmmlu_lite import \ + mmmlu_summary_groups # noqa: F401, E501 + + from ...rjob import eval, infer # noqa: F401, E501 + +datasets = [ + v[0] for k, v in locals().items() if k.endswith('_datasets') + and 'scicode' not in k.lower() and 'dingo' not in k.lower() + and 'arc_prize' not in k.lower() and 
isinstance(v, list) and len(v) > 0 +] + +datasets += arc_prize_public_evaluation_datasets +dingo_datasets[0]['abbr'] = 'qa_dingo_cn' +dingo_datasets[0]['path'] = 'data/qabench/history_prompt_case_cn.csv' +datasets.append(dingo_datasets[0]) + +musr_summary_groups = musr_summarizer['summary_groups'] +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) + +summary_groups.append( + { + 'name': 'Mathbench', + 'subsets': ['mathbench-a (average)', 'mathbench-t (average)'], + }, ) + +for d in datasets: + d['reader_cfg']['test_range'] = '[0:16]' + if 'dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg' in d[ + 'eval_cfg']['evaluator']['dataset_cfg']: + d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][ + 'test_range'] = '[0:16]' + if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'dataset_cfg' in d[ + 'eval_cfg']['evaluator']['llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][ + 'reader_cfg']['test_range'] = '[0:16]' + +hf_model = dict(type=HuggingFacewithChatTemplate, + abbr='qwen-3-8b-hf-fullbench', + path='Qwen/Qwen3-8B', + max_out_len=8192, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +tm_model = dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=32768, max_batch_size=1, tp=1), + gen_config=dict(do_sample=False, enable_thinking=True), + max_seq_len=32768, + max_out_len=32768, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) + +models = [hf_model, tm_model] + +models = sorted(models, key=lambda x: x['run_cfg']['num_gpus']) + +obj_judge_model = dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-fullbench', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=46000, + max_batch_size=1, + tp=1), + gen_config=dict(do_sample=False, enable_thinking=False), + max_seq_len=46000, + max_out_len=46000, + batch_size=1, + run_cfg=dict(num_gpus=1)) + +for d in datasets: + if 'judge_cfg' in d['eval_cfg']['evaluator']: + d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model + if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[ + 'eval_cfg']['evaluator']['llm_evaluator']: + d['eval_cfg']['evaluator']['llm_evaluator'][ + 'judge_cfg'] = obj_judge_model diff --git a/.github/scripts/eval_regression_chat_obj_fullbench_v7.py b/.github/scripts/eval_regression_chat_obj_fullbench_v7.py new file mode 100644 index 0000000000000000000000000000000000000000..a599cd058cd201bd91b59fe1eb75788bbd65659a --- /dev/null +++ b/.github/scripts/eval_regression_chat_obj_fullbench_v7.py @@ -0,0 +1,187 @@ +from mmengine.config import read_base + +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + # Datasets + # Instruct Following + # # # # Math Calculation + from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import \ + aime2024_datasets # noqa: F401, E501 + from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import \ + aime2025_datasets # noqa: F401, E501 + # # # General Reasoning + from opencompass.configs.datasets.bbeh.bbeh_llmjudge_gen_86c3a0 import \ + bbeh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_2888d3 import \ + bigcodebench_hard_complete_datasets # noqa: 
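(The judge-injection walk used at the end of the v6 script above, and again in v7 below, reduces to the following: wherever a dataset's evaluator, or its nested `llm_evaluator` in cascade setups, declares a `judge_cfg` slot, it is pointed at one local judge model. Self-contained sketch; `judge` is a placeholder for the `obj_judge_model` dict defined in the script:)

judge = {'abbr': 'local-judge'}

datasets = [
    {'eval_cfg': {'evaluator': {'judge_cfg': {}}}},                     # LLM-judged
    {'eval_cfg': {'evaluator': {'llm_evaluator': {'judge_cfg': {}}}}},  # cascade
    {'eval_cfg': {'evaluator': {}}},                                    # rule-based
]

for d in datasets:
    ev = d['eval_cfg']['evaluator']
    if 'judge_cfg' in ev:
        ev['judge_cfg'] = judge
    if 'llm_evaluator' in ev and 'judge_cfg' in ev['llm_evaluator']:
        ev['llm_evaluator']['judge_cfg'] = judge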
F401, E501 + from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import \ + bigcodebench_hard_instruct_datasets # noqa: F401, E501 + from opencompass.configs.datasets.chem_exam.competition_gen import \ + chem_competition_instruct_datasets # noqa: F401, E501 + from opencompass.configs.datasets.chem_exam.gaokao_gen import \ + chem_gaokao_instruct_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import \ + chembench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ClimaQA.ClimaQA_Gold_llm_judge_gen_f15343 import \ + climaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_llmjudge_gen_e1cd9a import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.Earth_Silver.Earth_Silver_llmjudge_gen import \ + earth_silver_mcq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import \ + gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.HLE.hle_llmverify_gen_6ff468 import \ + hle_datasets # noqa: F401, E501 + # # Coding + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.kcle.kcle_llm_judge_gen import \ + kcle_datasets # noqa: F401, E501 + from opencompass.configs.datasets.korbench.korbench_single_0shot_cascade_eval_gen_56cf43 import \ + korbench_0shot_single_datasets # noqa: F401, E501 + from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \ + LCBCodeGeneration_dataset # noqa: F401, E501 + from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_cascade_eval_gen_4bce59 import \ + livemathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import \ + matbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MedXpertQA.MedXpertQA_llmjudge_gen import \ + medxpertqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_llmjudge_gen_f4336b import \ + mmlu_datasets # noqa: F401, E501 + # # # Knowledge + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.OlymMATH.olymmath_llmverify_gen_97b203 import \ + olymmath_datasets # noqa: F401, E501 + from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import \ + olympiadbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.PHYBench.phybench_gen import \ + phybench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.PHYSICS.PHYSICS_llm_judge_gen_a133a2 import \ + physics_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import \ + proteinlmbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.R_Bench.rbench_llmjudge_gen_c89350 import \ + RBench_datasets # noqa: F401, E501 + # # Academic + from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import \ + smolinstruct_datasets_0shot_instruct as \ + smolinstruct_datasets # noqa: F401, E501 + from opencompass.configs.datasets.srbench.srbench_gen import \ + 
srbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.supergpqa.supergpqa_cascade_gen_1545c1 import \
+        supergpqa_datasets  # noqa: F401, E501
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.bbeh import \
+        bbeh_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.cmmlu import \
+        cmmlu_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.korbench import \
+        korbench_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.mmlu import \
+        mmlu_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.mmlu_pro import \
+        mmlu_pro_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.OlympiadBench import \
+        OlympiadBenchPhysics_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.OlympiadBench import (  # noqa: F401, E501
+        OlympiadBench_summary_groups, OlympiadBenchMath_summary_groups)
+    from opencompass.configs.summarizers.groups.PHYSICS import \
+        physics_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.supergpqa import \
+        supergpqa_summary_groups  # noqa: F401, E501
+
+    from ...rjob import eval, infer  # noqa: F401, E501
+
+# Add latest LCB version
+LCBCodeGeneration_v6_datasets = LCBCodeGeneration_dataset
+LCBCodeGeneration_v6_datasets['abbr'] = 'lcb_code_generation_v6'
+LCBCodeGeneration_v6_datasets['release_version'] = 'v6'
+LCBCodeGeneration_v6_datasets['eval_cfg']['evaluator'][
+    'release_version'] = 'v6'
+LCBCodeGeneration_v6_datasets = [LCBCodeGeneration_v6_datasets]
+
+repeated_info = [
+    (math_datasets, 1),
+    (gpqa_datasets, 1),
+    (aime2024_datasets, 1),
+    (aime2025_datasets, 1),
+    (olympiadbench_datasets, 1),
+    (livemathbench_datasets, 1),
+    (olymmath_datasets, 1),
+    (korbench_0shot_single_datasets, 1),
+]
+
+for datasets_, num in repeated_info:
+    for dataset_ in datasets_:
+        dataset_['n'] = num
+        dataset_['k'] = num
+
+datasets = [
+    v[0] for k, v in locals().items()
+    if k.endswith('_datasets') and 'bigcode' not in k.lower()
+    and 'humaneval' not in k.lower() and isinstance(v, list) and len(v) > 0
+]
+
+datasets += bigcodebench_hard_instruct_datasets
+datasets += bigcodebench_hard_complete_datasets
+
+for d in datasets:
+    d['reader_cfg']['test_range'] = '[0:16]'
+    if 'dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg' in d[
+            'eval_cfg']['evaluator']['dataset_cfg']:
+        d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
+            'test_range'] = '[0:16]'
+    if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'dataset_cfg' in d[
+            'eval_cfg']['evaluator']['llm_evaluator']:
+        d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
+            'reader_cfg']['test_range'] = '[0:16]'
+
+hf_model = dict(type=HuggingFacewithChatTemplate,
+                abbr='qwen-3-8b-hf-fullbench',
+                path='Qwen/Qwen3-8B',
+                max_out_len=8192,
+                batch_size=8,
+                run_cfg=dict(num_gpus=1),
+                pred_postprocessor=dict(type=extract_non_reasoning_content))
+
+tm_model = dict(type=TurboMindModelwithChatTemplate,
+                abbr='qwen-3-8b-fullbench',
+                path='Qwen/Qwen3-8B',
+                engine_config=dict(session_len=32768, max_batch_size=1, tp=1),
+                gen_config=dict(do_sample=False, enable_thinking=True),
+                max_seq_len=32768,
+                max_out_len=32768,
+                batch_size=1,
+                run_cfg=dict(num_gpus=1),
+                pred_postprocessor=dict(type=extract_non_reasoning_content))
+
+models = [hf_model, tm_model]
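(One detail in the v7 script above worth a sketch: `LCBCodeGeneration_v6_datasets = LCBCodeGeneration_dataset` binds a second name to the same dict, so the v6 field writes also mutate the imported config in place. That is harmless while only the v6 variant is evaluated; if both versions were ever needed side by side, a deep copy keeps them independent. Toy config below, not the real dataset dict:)

import copy

LCBCodeGeneration_dataset = {
    'abbr': 'lcb_code_generation',
    'release_version': 'v5',
    'eval_cfg': {'evaluator': {'release_version': 'v5'}},
}

# Deriving a v6 variant without mutating the imported config.
lcb_v6 = copy.deepcopy(LCBCodeGeneration_dataset)
lcb_v6['abbr'] = 'lcb_code_generation_v6'
lcb_v6['release_version'] = 'v6'
lcb_v6['eval_cfg']['evaluator']['release_version'] = 'v6'

assert LCBCodeGeneration_dataset['release_version'] == 'v5'  # original intact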
+obj_judge_model = dict(type=TurboMindModelwithChatTemplate,
+                       abbr='qwen-3-8b-fullbench',
+                       path='Qwen/Qwen3-8B',
+                       engine_config=dict(session_len=46000,
+                                          max_batch_size=1,
+                                          tp=1),
+                       gen_config=dict(do_sample=False, enable_thinking=False),
+                       max_seq_len=46000,
+                       max_out_len=46000,
+                       batch_size=1,
+                       run_cfg=dict(num_gpus=1))
+
+for d in datasets:
+    if 'judge_cfg' in d['eval_cfg']['evaluator']:
+        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
+    if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[
+            'eval_cfg']['evaluator']['llm_evaluator']:
+        d['eval_cfg']['evaluator']['llm_evaluator'][
+            'judge_cfg'] = obj_judge_model
diff --git a/.github/scripts/eval_regression_chat_sub_fullbench.py b/.github/scripts/eval_regression_chat_sub_fullbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bf33f1205cb32bae6b924001cb06ca6d608e70
--- /dev/null
+++ b/.github/scripts/eval_regression_chat_sub_fullbench.py
@@ -0,0 +1,189 @@
+from mmengine.config import read_base
+
+from opencompass.models import (HuggingFacewithChatTemplate,
+                                TurboMindModelwithChatTemplate)
+from opencompass.summarizers import DefaultSubjectiveSummarizer
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+with read_base():
+    # read hf models - chat models
+    # Dataset
+    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
+        csimpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
+        simpleqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
+        alignbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
+        alpacav2_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
+        arenahard_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
+        compassarena_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
+        followbench_llmeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
+        mtbench101_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
+        wildbench_datasets  # noqa: F401, E501
+
+    from ...rjob import infer, sub_eval  # noqa: F401, E501
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
+                and 'mtbench101' not in k and 'wildbench' not in k), [])
+datasets += mtbench101_datasets  # noqa: F401, E501
+datasets += wildbench_datasets  # noqa: F401, E501
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+hf_model = dict(type=HuggingFacewithChatTemplate,
+                abbr='qwen-3-8b-hf-fullbench',
+                path='Qwen/Qwen3-8B',
+                max_out_len=8192,
+                batch_size=8,
+                run_cfg=dict(num_gpus=1),
+                pred_postprocessor=dict(type=extract_non_reasoning_content))
+
+tm_model = dict(type=TurboMindModelwithChatTemplate,
+                abbr='qwen-3-8b-fullbench',
+                path='Qwen/Qwen3-8B',
+                engine_config=dict(session_len=32768, max_batch_size=1, tp=1),
+                gen_config=dict(do_sample=False, enable_thinking=True),
+                max_seq_len=32768,
+                max_out_len=32768,
+                batch_size=1,
+                run_cfg=dict(num_gpus=1),
+                pred_postprocessor=dict(type=extract_non_reasoning_content))
+
+models = [hf_model,
tm_model] + +judge_models = [ + dict(type=TurboMindModelwithChatTemplate, + abbr='qwen-3-8b-judger', + path='Qwen/Qwen3-8B', + engine_config=dict(session_len=46000, max_batch_size=1, tp=1), + gen_config=dict(do_sample=False, enable_thinking=False), + max_seq_len=46000, + max_out_len=46000, + batch_size=1, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content)) +] + +sub_eval['partitioner']['judge_models'] = judge_models +eval = sub_eval + +summary_groups = [] +summary_groups.append({ + 'name': 'compassarena_language', + 'subsets': [ + ['compassarena_language', '内容总结'], + ], +}) +summary_groups.append({ + 'name': 'compassarena_knowledge', + 'subsets': [ + ['compassarena_knowledge', '生活常识_ZH'], + ], +}) +summary_groups.append({ + 'name': 'compassarena_reason_v2', + 'subsets': [ + ['compassarena_reason_v2', 'reasoning'], + ], +}) +summary_groups.append({ + 'name': 'compassarena_math_v2', + 'subsets': [ + ['compassarena_math_v2', '高等数学_ZH'], + ], +}) +summary_groups.append({ + 'name': 'compassarena_creationv2_zh', + 'subsets': [ + ['compassarena_creationv2_zh', '内容扩写_ZH'], + ], +}) +summary_groups.append({ + 'name': + 'CompassArena', + 'subsets': [ + 'compassarena_language', + 'compassarena_knowledge', + 'compassarena_reason_v2', + 'compassarena_math_v2', + 'compassarena_creationv2_zh', + ], +}) +summary_groups.append({ + 'name': + 'FoFo', + 'subsets': [['fofo_test_prompts', 'overall'], + ['fofo_test_prompts_cn', 'overall']], +}) +summary_groups.append({ + 'name': + 'Followbench', + 'subsets': [ + ['followbench_llmeval_en', 'HSR_AVG'], + ['followbench_llmeval_en', 'SSR_AVG'], + ], +}) + +# Summarizer +summarizer = dict( + dataset_abbrs=[ + ['alignment_bench_v1_1', '总分'], + ['alpaca_eval', 'total'], + ['arenahard', 'score'], + ['Followbench', 'naive_average'], + ['CompassArena', 'naive_average'], + ['FoFo', 'naive_average'], + ['mtbench101', 'avg'], + ['wildbench', 'average'], + ['simpleqa', 'accuracy_given_attempted'], + ['chinese_simpleqa', 'given_attempted_accuracy'], + '', + ['alignment_bench_v1_1', '专业能力'], + ['alignment_bench_v1_1', '数学计算'], + ['alignment_bench_v1_1', '基本任务'], + ['alignment_bench_v1_1', '逻辑推理'], + ['alignment_bench_v1_1', '中文理解'], + ['alignment_bench_v1_1', '文本写作'], + ['alignment_bench_v1_1', '角色扮演'], + ['alignment_bench_v1_1', '综合问答'], + ['alpaca_eval', 'helpful_base'], + ['alpaca_eval', 'koala'], + ['alpaca_eval', 'oasst'], + ['alpaca_eval', 'selfinstruct'], + ['alpaca_eval', 'vicuna'], + ['compassarena_language', 'naive_average'], + ['compassarena_knowledge', 'naive_average'], + ['compassarena_reason_v2', 'naive_average'], + ['compassarena_math_v2', 'naive_average'], + ['compassarena_creationv2_zh', 'naive_average'], + ['fofo_test_prompts', 'overall'], + ['fofo_test_prompts_cn', 'overall'], + ['followbench_llmeval_en', 'HSR_AVG'], + ['followbench_llmeval_en', 'SSR_AVG'], + ['followbench_llmeval_en', 'HSR_L1'], + ['followbench_llmeval_en', 'HSR_L2'], + ['followbench_llmeval_en', 'HSR_L3'], + ['followbench_llmeval_en', 'HSR_L4'], + ['followbench_llmeval_en', 'HSR_L5'], + ['followbench_llmeval_en', 'SSR_L1'], + ['followbench_llmeval_en', 'SSR_L2'], + ['followbench_llmeval_en', 'SSR_L3'], + ['followbench_llmeval_en', 'SSR_L4'], + ['followbench_llmeval_en', 'SSR_L5'], + ['simpleqa', 'f1'], + ], + type=DefaultSubjectiveSummarizer, + summary_groups=summary_groups, +) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py new file mode 100644 index 
0000000000000000000000000000000000000000..313238658f18c829b59dd470a86cad15010b7624 --- /dev/null +++ b/.github/scripts/oc_score_assert.py @@ -0,0 +1,408 @@ +import csv +import os + +import pytest +import yaml + +output_path = 'regression_result_daily' + + +def model_list(type): + config_path = '.github/scripts/oc_score_baseline_testrange.yaml' + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config.get(type).keys() + + +def dataset_list(model, type): + config_path = '.github/scripts/oc_score_baseline_fullbench.yaml' + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config.get(model).get(type).keys() + + +@pytest.fixture() +def baseline_scores_testrange(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_testrange.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config + + +@pytest.fixture() +def baseline_scores(request): + config_path = os.path.join(request.config.rootdir, + '.github/scripts/oc_score_baseline.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config + + +@pytest.fixture() +def baseline_scores_fullbench(request): + config_path = os.path.join( + request.config.rootdir, + '.github/scripts/oc_score_baseline_fullbench.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + return config + + +@pytest.fixture() +def result_scores(): + file = find_csv_files(output_path) + if file is None: + return None + return read_csv_file(file) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') +@pytest.mark.chat_models +class TestChat: + """Test cases for chat model.""" + + @pytest.mark.parametrize( + 'model, dataset', [(p1, p2) for p1 in model_list('chat') + for p2 in ['gsm8k_accuracy', 'race-high_accuracy']]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + base_score = baseline_scores_testrange.get('chat').get(model).get( + dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_testrange') +@pytest.mark.base_models +class TestBase: + """Test cases for base model.""" + + @pytest.mark.parametrize('model, dataset', + [(p1, p2) for p1 in model_list('base') for p2 in [ + 'gsm8k_accuracy', 'GPQA_diamond_accuracy', + 'race-high_accuracy', 'winogrande_accuracy' + ]]) + def test_model_dataset_score(self, baseline_scores_testrange, + result_scores, model, dataset): + if model in ['gemma-2b-vllm', 'gemma-7b-vllm' + ] and dataset != 'gsm8k_accuracy': + return + base_score = baseline_scores_testrange.get('base').get(model).get( + dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +class TestChatFullbench: + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-hf-fullbench', 'qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-hf-fullbench', 'objective_v5')]) + @pytest.mark.chat_obj_fullbench_v5 + def test_chat_obj_v5(self, baseline_scores_fullbench, result_scores, model, + dataset): + base_score = baseline_scores_fullbench.get(model).get( + 
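(The `model_list` and `dataset_list` helpers above turn baseline YAML keys into pytest parameters, so adding a model or metric to the YAML creates test cases without touching this file. A self-contained sketch of that flow; the inline YAML stands in for the real baseline files:)

import yaml

baseline_text = """
chat:
  some-model-turbomind:
    gsm8k_accuracy: 71.25
    race-high_accuracy: 88.00
"""

config = yaml.safe_load(baseline_text)
# Every (model, metric) pair in the YAML becomes one parametrized case:
params = [(model, dataset)
          for model in config['chat']
          for dataset in config['chat'][model]]
# -> [('some-model-turbomind', 'gsm8k_accuracy'),
#     ('some-model-turbomind', 'race-high_accuracy')]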
'objective_v5').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.chat_obj_fullbench_v6 + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-hf-fullbench', 'qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-hf-fullbench', 'objective_v6')]) + def test_chat_obj_v6(self, baseline_scores_fullbench, result_scores, model, + dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'objective_v6').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.chat_obj_fullbench_v7 + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-hf-fullbench', 'qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-hf-fullbench', 'objective_v7')]) + def test_chat_obj_v7(self, baseline_scores_fullbench, result_scores, model, + dataset): + if 'srbench' in dataset: + return + base_score = baseline_scores_fullbench.get(model).get( + 'objective_v7').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.chat_obj_fullbench_other + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-hf-fullbench', 'qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-hf-fullbench', 'objective_other')]) + def test_chat_obj_other(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'objective_other').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.chat_sub_fullbench + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-hf-fullbench', 'qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-hf-fullbench', 'chat_subjective')]) + def test_chat_sub_fullbench(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'chat_subjective').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen-3-8b-fullbench'] + for p2 in dataset_list('qwen-3-8b-fullbench', 'chat_longtext')]) + @pytest.mark.chat_longtext_fullbench + def test_chat_longtext(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'chat_longtext').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores_fullbench') +class TestBaseFullbench: + + @pytest.mark.parametrize('model, dataset', [ + (p1, p2) + for p1 in ['qwen-3-8b-base-hf-fullbench', 'qwen-3-8b-base-fullbench'] + for p2 in dataset_list('qwen-3-8b-base-hf-fullbench', 'objective_base') + ]) + @pytest.mark.base_fullbench + def test_objective_base(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'objective_base').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.parametrize( + 'model, dataset', + [(p1, p2) for p1 in ['qwen3-8b-base-turbomind'] + for p2 in 
dataset_list('qwen3-8b-base-turbomind', 'base_longtext')]) + @pytest.mark.base_longtext_fullbench + def test_base_longtext(self, baseline_scores_fullbench, result_scores, + model, dataset): + base_score = baseline_scores_fullbench.get(model).get( + 'base_longtext').get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores') +@pytest.mark.api +class TestApibench: + """Test cases for chat model.""" + + @pytest.mark.parametrize('model, dataset', [ + ('lmdeploy-api-test', 'race-middle_accuracy'), + ('lmdeploy-api-test', 'race-high_accuracy'), + ('lmdeploy-api-test', 'gsm8k_accuracy'), + ('lmdeploy-api-test', 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-test', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-test', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-test', 'mmlu_pro_other_accuracy'), + ('lmdeploy-api-streaming-test', 'race-middle_accuracy'), + ('lmdeploy-api-streaming-test', 'race-high_accuracy'), + ('lmdeploy-api-streaming-test', 'gsm8k_accuracy'), + ('lmdeploy-api-streaming-test', 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-streaming-test', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-streaming-test', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-streaming-test', 'mmlu_pro_other_accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'race-middle_accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'race-high_accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'gsm8k_accuracy'), + ('lmdeploy-api-streaming-test-chunk', + 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-streaming-test-chunk', 'mmlu_pro_other_accuracy'), + ('lmdeploy-api-test-maxlen', 'race-middle_accuracy'), + ('lmdeploy-api-test-maxlen', 'race-high_accuracy'), + ('lmdeploy-api-test-maxlen', 'gsm8k_accuracy'), + ('lmdeploy-api-test-maxlen', 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-test-maxlen', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-test-maxlen', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-test-maxlen', 'mmlu_pro_other_accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'race-middle_accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'race-high_accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'gsm8k_accuracy'), + ('lmdeploy-api-test-maxlen-mid', + 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-test-maxlen-mid', 'mmlu_pro_other_accuracy'), + ('lmdeploy-api-test-chat-template', 'race-middle_accuracy'), + ('lmdeploy-api-test-chat-template', 'race-high_accuracy'), + ('lmdeploy-api-test-chat-template', + 'IFEval_Prompt-level-strict-accuracy'), + ('lmdeploy-api-test-chat-template', 'hle_llmjudge_accuracy'), + ('lmdeploy-api-test-chat-template', 'mmlu_pro_math_accuracy'), + ('lmdeploy-api-test-chat-template', 'mmlu_pro_other_accuracy') + ]) + def test_api(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + +@pytest.mark.usefixtures('result_scores') +@pytest.mark.usefixtures('baseline_scores') +class TestCmdCase: + + @pytest.mark.case1 + @pytest.mark.parametrize('model, dataset', + [('qwen2.5-7b-hf', 
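(The literal (model, dataset) matrix in TestApibench above could equivalently be generated; a sketch using itertools.product, with the chat-template endpoint's missing gsm8k case handled explicitly, matching the literal list:)

from itertools import product

API_MODELS = [
    'lmdeploy-api-test',
    'lmdeploy-api-streaming-test',
    'lmdeploy-api-streaming-test-chunk',
    'lmdeploy-api-test-maxlen',
    'lmdeploy-api-test-maxlen-mid',
]
API_DATASETS = [
    'race-middle_accuracy',
    'race-high_accuracy',
    'gsm8k_accuracy',
    'IFEval_Prompt-level-strict-accuracy',
    'hle_llmjudge_accuracy',
    'mmlu_pro_math_accuracy',
    'mmlu_pro_other_accuracy',
]

api_cases = list(product(API_MODELS, API_DATASETS))
# The chat-template endpoint skips gsm8k in the literal list above:
api_cases += [('lmdeploy-api-test-chat-template', d) for d in API_DATASETS
              if d != 'gsm8k_accuracy']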
'race-middle_accuracy'), + ('qwen2.5-7b-hf', 'race-high_accuracy'), + ('qwen2.5-7b-hf', 'demo_gsm8k_accuracy')]) + def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.case2 + @pytest.mark.parametrize( + 'model, dataset', + [('qwen2.5-7b-hf', 'race-middle_accuracy'), + ('qwen2.5-7b-hf', 'race-high_accuracy'), + ('qwen2.5-7b-hf', 'demo_gsm8k_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')]) + def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case3 + @pytest.mark.parametrize('model, dataset', + [('Qwen2.5-7B_hf', 'race-middle_accuracy'), + ('Qwen2.5-7B_hf', 'race-high_accuracy'), + ('Qwen2.5-7B_hf', 'demo_gsm8k_accuracy')]) + def test_cmd_case3(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model, result_score, base_score, dataset) + + @pytest.mark.case4 + @pytest.mark.parametrize( + 'model, dataset', + [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')]) + def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case5 + @pytest.mark.parametrize('model, dataset', + [('Qwen3-0.6B_hf-vllm', 'race-middle_accuracy'), + ('Qwen3-0.6B_hf-vllm', 'race-high_accuracy'), + ('Qwen3-0.6B_hf-vllm', 'demo_gsm8k_accuracy')]) + def test_cmd_case5(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + +def assert_score(model_type, score, baseline, dataset: str = ''): + if score is None or score == '-': + assert False, 'value is none' + + if 'batch' not in model_type: + if float(score) <= (float(baseline) + + 0.01) and float(score) >= (float(baseline) - 0.01): + print(' '.join([score, 'is equal', str(baseline)])) + assert True + else: + print(' '.join([score, 'is not equal', str(baseline)])) + assert False, ' '.join([score, 'is not equal', str(baseline)]) + else: + if dataset.startswith('dingo') or dataset.startswith( + 'GPQA') or dataset.startswith('high') or dataset.startswith( + 'mmlu_pro_') or dataset.startswith( + 'alpaca_eval') or dataset.startswith('compassarena_'): + threshold = 5 + elif dataset.startswith('humanevalx') or dataset == 'large_threshold': + threshold = 10 + else: + threshold = 3.2 + if float(score) <= (baseline + threshold) and float(score) >= ( + baseline - threshold): + print(' '.join([ + score, 'is between', + str(baseline - threshold), 'and', + str(baseline + threshold) + ])) + assert True + else: + print(' '.join([ + score, 'is not between', + str(baseline - threshold), 
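(assert_score above accepts exact reruns within ±0.01 of the baseline and batched reruns within a per-dataset band, wider for volatile judge- and arena-style sets. The same policy written compactly as a sketch:)

def within_tolerance(score: float, baseline: float, dataset: str,
                     batched: bool) -> bool:
    # Exact (non-batched) runs must reproduce the baseline almost exactly.
    if not batched:
        return abs(score - baseline) <= 0.01
    # Batched runs get a per-dataset tolerance band.
    if dataset.startswith(('dingo', 'GPQA', 'high', 'mmlu_pro_',
                           'alpaca_eval', 'compassarena_')):
        threshold = 5
    elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
        threshold = 10
    else:
        threshold = 3.2
    return abs(score - baseline) <= threshold

assert within_tolerance(74.9, 75.0, 'gsm8k_accuracy', batched=True)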
+
+
+def find_csv_files(directory):
+    # Collect every summary csv under the work dir and return the most
+    # recently created one.
+    csv_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv') and file.startswith('summary'):
+                csv_files.append(os.path.join(root, file))
+
+    csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
+    sorted_csv_files = sorted(csv_files_with_time.items(), key=lambda x: x[1])
+    latest_csv_file = sorted_csv_files[-1][0]
+    return latest_csv_file
+
+
+def read_csv_file(file_path):
+    with open(file_path, 'r') as csvfile:
+        reader = csv.DictReader(csvfile)
+        filtered_data = []
+        for row in reader:
+            # Skip bpb metrics and placeholder rows, then fold the metric
+            # name into the dataset key so every score is addressed as
+            # '<dataset>_<metric>'.
+            if row['metric'] is not None and 'bpb' not in row[
+                    'metric'] and row['metric'] != '_':
+                filtered_row = row
+                filtered_row['dataset'] = row['dataset'] + '_' + row['metric']
+                del filtered_row['version']
+                del filtered_row['metric']
+                del filtered_row['mode']
+                filtered_data.append(filtered_row)
+
+    result = {}
+    for data in filtered_data:
+        dataset = data.get('dataset')
+        for key in data.keys():
+            if key == 'dataset':
+                continue
+            if key in result.keys():
+                result.get(key)[dataset] = data.get(key)
+            else:
+                result[key] = {dataset: data.get(key)}
+    return result
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e55a992d5a30cfab44ebc147b126371b952faa5
--- /dev/null
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -0,0 +1,86 @@
+qwen2.5-7b-hf:
+  demo_gsm8k_accuracy: 78.12
+  race-middle_accuracy: 90.46
+  race-high_accuracy: 86.54
+
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 75.00
+  race-middle_accuracy: 93.31
+  race-high_accuracy: 90.28
+
+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+Qwen2.5-7B_hf:
+  demo_gsm8k_accuracy: 78.12
+  race-middle_accuracy: 90.46
+  race-high_accuracy: 86.54
+
+Qwen3-0.6B_hf-vllm:
+  demo_gsm8k_accuracy: 53.12
+  race-middle_accuracy: 38.23
+  race-high_accuracy: 28.07
+
+lmdeploy-api-test:
+  IFEval_Prompt-level-strict-accuracy: 81.25
+  hle_llmjudge_accuracy: 75.00
+  mmlu_pro_math_accuracy: 18.75
+  mmlu_pro_other_accuracy: 25.00
+  race-middle_accuracy: 87.50
+  race-high_accuracy: 81.25
+  gsm8k_accuracy: 18.75
+
+lmdeploy-api-streaming-test:
+  IFEval_Prompt-level-strict-accuracy: 68.75
+  hle_llmjudge_accuracy: 81.25
+  mmlu_pro_math_accuracy: 18.75
+  mmlu_pro_other_accuracy: 31.25
+  race-middle_accuracy: 81.25
+  race-high_accuracy: 75.00
+  gsm8k_accuracy: 25.00
+
+lmdeploy-api-streaming-test-chunk:
+  IFEval_Prompt-level-strict-accuracy: 81.25
+  hle_llmjudge_accuracy: 75
+  mmlu_pro_math_accuracy: 18.75
+  mmlu_pro_other_accuracy: 37.50
+  race-middle_accuracy: 75
+  race-high_accuracy: 81.25
+  gsm8k_accuracy: 37.5
+
+lmdeploy-api-test-maxlen:
+  IFEval_Prompt-level-strict-accuracy: 93.75
+  hle_llmjudge_accuracy: 56.25
+  mmlu_pro_math_accuracy: 62.50
+  mmlu_pro_other_accuracy: 43.75
+  race-middle_accuracy: 81.25
+  race-high_accuracy: 81.25
+  gsm8k_accuracy: 31.25
+
+lmdeploy-api-test-maxlen-mid:
+  IFEval_Prompt-level-strict-accuracy: 12.50
+  hle_llmjudge_accuracy: 81.25
+  mmlu_pro_math_accuracy: 0
+  mmlu_pro_other_accuracy: 0
+  race-middle_accuracy: 18.75
+  race-high_accuracy: 12.50
+  gsm8k_accuracy: 43.75
+
+lmdeploy-api-test-nothink:
+  IFEval_Prompt-level-strict-accuracy: 93.75
+  hle_llmjudge_accuracy: 62.50
+
mmlu_pro_math_accuracy: 62.50 + mmlu_pro_other_accuracy: 50.00 + race-middle_accuracy: 87.50 + race-high_accuracy: 87.50 + gsm8k_accuracy: 31.25 + +lmdeploy-api-test-chat-template: + IFEval_Prompt-level-strict-accuracy: 68.75 + hle_llmjudge_accuracy: 62.50 + mmlu_pro_math_accuracy: 18.75 + mmlu_pro_other_accuracy: 6.25 + race-middle_accuracy: 81.25 + race-high_accuracy: 68.75 \ No newline at end of file diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c20be4d594d1d5b95269794302f4e7c7a320c353 --- /dev/null +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -0,0 +1,761 @@ +qwen-3-8b-hf-fullbench: + objective_other: + C-MHChem_accuracy: 81.25 + C-MHChem_f1: 0.81 + C-MHChem_accuracy_given_attempted: 81.25 + C-MHChem_attempted_ratio: 100 + C-MHChem_correct_count: 13 + C-MHChem_incorrect_count: 3 + C-MHChem_not_attempted_count: 0 + CPsyExam_accuracy: 87.5 + CPsyExam_f1: 0.88 + CPsyExam_accuracy_given_attempted: 87.5 + CPsyExam_attempted_ratio: 100 + CPsyExam_correct_count: 14 + CPsyExam_incorrect_count: 2 + CPsyExam_not_attempted_count: 0 + MaScQA_accuracy: 75 + MaScQA_f1: 0.75 + MaScQA_accuracy_given_attempted: 75 + MaScQA_attempted_ratio: 100 + MaScQA_correct_count: 12 + MaScQA_incorrect_count: 4 + MaScQA_not_attempted_count: 0 + UGPhysics_AtomicPhysics_zh_accuracy: 56.25 + objective_v5: + race-high_accuracy: 87.5 + ARC-c_accuracy: 100 + BoolQ_accuracy: 93.75 + triviaqa_wiki_1shot_score: 0 + nq_open_1shot_score: 0 + IFEval_Prompt-level-strict-accuracy: 87.50 + drop_accuracy: 93.75 + GPQA_diamond_accuracy: 62.50 + hellaswag_accuracy: 75 + TheoremQA_score: 6.25 + musr_average_naive_average: 14.58 + korbench_single_naive_average: 75.00 + gsm8k_accuracy: 75 + math_accuracy: 93.75 + cmo_fib_accuracy: 0 + aime2024_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 25 + sanitized_mbpp_score: 6.25 + lcb_code_generation_pass@1: 6.25 + lcb_code_execution_pass@1: 50 + lcb_test_output_pass@1: 0 + teval_naive_average: 69.16 + bbh-logical_deduction_seven_objects_score: 12.5 + bbh-multistep_arithmetic_two_score: 0 + mmlu-other_accuracy: 77.88 + cmmlu-china-specific_accuracy: 74.17 + mmlu_pro_math_accuracy: 81.25 + openai_mmmlu_lite_AR-XY_accuracy: 50 + college_naive_average: 37.5 + college_knowledge_naive_average: 50 + objective_v6: + aime2024_accuracy: 87.5 + aime2024_f1: 0.88 + aime2024_accuracy_given_attempted: 87.5 + aime2024_attempted_ratio: 100 + aime2024_correct_count: 14 + aime2024_incorrect_count: 2 + aime2024_not_attempted_count: 0 + aime2025_accuracy: 56.25 + aime2025_f1: 0.56 + aime2025_accuracy_given_attempted: 56.25 + aime2025_attempted_ratio: 100 + aime2025_correct_count: 9 + aime2025_incorrect_count: 7 + aime2025_not_attempted_count: 0 + bbh-temporal_sequences_score: 75 + bbh-temporal_sequences_score_given_attempted: 75 + bbh-temporal_sequences_attempted_ratio: 100 + bbh-temporal_sequences_correct_count: 12 + bbh-temporal_sequences_incorrect_count: 4 + bbh-temporal_sequences_not_attempted_count: 0 + cmo_fib_accuracy: 31.25 + drop_accuracy: 56.25 + drop_f1: 0.56 + drop_accuracy_given_attempted: 56.25 + drop_attempted_ratio: 100 + drop_correct_count: 9 + drop_incorrect_count: 7 + drop_not_attempted_count: 0 + GaokaoBench_2010-2022_Math_II_MCQs_score: 100 + GPQA_diamond_accuracy: 56.25 + GPQA_diamond_f1: 0.56 + GPQA_diamond_accuracy_given_attempted: 56.25 + GPQA_diamond_attempted_ratio: 100 + GPQA_diamond_correct_count: 9 + 
GPQA_diamond_incorrect_count: 7 + GPQA_diamond_not_attempted_count: 0 + gsm8k_accuracy: 93.75 + hellaswag_accuracy: 100 + hellaswag_f1: 1 + hellaswag_accuracy_given_attempted: 100 + hellaswag_attempted_ratio: 100 + hellaswag_correct_count: 16 + hellaswag_incorrect_count: 0 + hellaswag_not_attempted_count: 0 + korbench_cipher_accuracy: 93.75 + korbench_cipher_f1: 0.94 + korbench_cipher_accuracy_given_attempted: 93.75 + korbench_cipher_attempted_ratio: 100 + korbench_cipher_correct_count: 15 + korbench_cipher_incorrect_count: 1 + korbench_cipher_not_attempted_count: 0 + math_prm800k_500-llmjudge_accuracy: 93.75 + math_prm800k_500-llmjudge_f1: 0.94 + math_prm800k_500-llmjudge_accuracy_given_attempted: 93.75 + math_prm800k_500-llmjudge_attempted_ratio: 100 + math_prm800k_500-llmjudge_correct_count: 15 + math_prm800k_500-llmjudge_incorrect_count: 1 + math_prm800k_500-llmjudge_not_attempted_count: 0 + mathbench-college-single_choice_cn_acc_4: 87.5 + mathbench-college-single_choice_cn_acc_1: 75 + mathbench-college-single_choice_cn_more_1_0: 100 + mathbench-college-single_choice_cn_more_1_1: 75 + mathbench-college-single_choice_cn_more_4_0: 100 + mathbench-college-single_choice_cn_more_4_1: 100 + mathbench-college-single_choice_cn_more_4_2: 100 + mathbench-college-single_choice_cn_more_4_3: 100 + mathbench-college-single_choice_cn_more_4_4: 50 + mathbench-college-single_choice_cn_perf_1: 75 + mathbench-college-single_choice_cn_perf_4: 50 + mathbench-college-single_choice_cn_vote_4: 100 + mathbench-college-single_choice_cn_vote_1: 75 + mathbench-college-single_choice_cn_prior_A: 31.25 + mathbench-college-single_choice_cn_prior_B: 12.5 + mathbench-college-single_choice_cn_prior_C: 25 + mathbench-college-single_choice_cn_prior_D: 25 + mathbench-college-single_choice_cn_prior_-: 6.25 + musr_murder_mysteries_accuracy: 68.75 + musr_murder_mysteries_f1: 0.69 + musr_murder_mysteries_accuracy_given_attempted: 68.75 + musr_murder_mysteries_attempted_ratio: 100 + musr_murder_mysteries_correct_count: 11 + musr_murder_mysteries_incorrect_count: 5 + musr_murder_mysteries_not_attempted_count: 0 + supergpqa_accuracy: 68.75 + supergpqa_total_correct: 11 + supergpqa_total_count: 16 + supergpqa_SuperGPQA-Engineering: 80 + supergpqa_SuperGPQA-Philosophy: 0 + supergpqa_SuperGPQA-Medicine: 100 + supergpqa_SuperGPQA-Economics: 50 + supergpqa_SuperGPQA-Science: 66.67 + supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 + supergpqa_SuperGPQA-Philosophy-Philosophy: 0 + supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 + supergpqa_SuperGPQA-Economics-Applied Economics: 0 + supergpqa_SuperGPQA-Science-Mathematics: 80 + supergpqa_SuperGPQA-Science-Physics: 0 + supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 + supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 + supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 + supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 + supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 + supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 + supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 + supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 + supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 0 + supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 + supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 + supergpqa_SuperGPQA-Medicine-Clinical 
Medicine-Internal Medicine: 100 + supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 + supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 + supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 + supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 + supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 + supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 + supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 + triviaqa_wiki_1shot_score: 50 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + objective_v7: + aime2024_accuracy: 50 + aime2025_accuracy: 25 + bbeh_boolean_expressions_accuracy: 75 + bbeh_boolean_expressions_f1: 0.75 + bbeh_boolean_expressions_accuracy_given_attempted: 75 + bbeh_boolean_expressions_attempted_ratio: 100 + bbeh_boolean_expressions_correct_count: 12 + bbeh_boolean_expressions_incorrect_count: 4 + bbeh_boolean_expressions_not_attempted_count: 0 + Chem_exam-competition_final_score: 40.39 + Chem_exam-gaokao_final_score: 78.12 + ChemBench_Name_Conversion_accuracy: 93.75 + ChemBench_Name_Conversion_f1: 0.94 + ChemBench_Name_Conversion_accuracy_given_attempted: 93.75 + ChemBench_Name_Conversion_attempted_ratio: 100 + ChemBench_Name_Conversion_correct_count: 15 + ChemBench_Name_Conversion_incorrect_count: 1 + ChemBench_Name_Conversion_not_attempted_count: 0 + ClimaQA_Gold_mcq_accuracy: 87.5 + ClimaQA_Gold_mcq_f1: 0.88 + ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 + ClimaQA_Gold_mcq_attempted_ratio: 100 + ClimaQA_Gold_mcq_correct_count: 14 + ClimaQA_Gold_mcq_incorrect_count: 2 + ClimaQA_Gold_mcq_not_attempted_count: 0 + cmmlu-agronomy_accuracy: 87.5 + cmmlu-agronomy_f1: 0.88 + cmmlu-agronomy_accuracy_given_attempted: 87.5 + cmmlu-agronomy_attempted_ratio: 100 + cmmlu-agronomy_correct_count: 14 + cmmlu-agronomy_incorrect_count: 2 + cmmlu-agronomy_not_attempted_count: 0 + earth_silver_mcq_accuracy: 75 + earth_silver_mcq_f1: 0.75 + earth_silver_mcq_accuracy_given_attempted: 75 + earth_silver_mcq_attempted_ratio: 100 + earth_silver_mcq_correct_count: 12 + earth_silver_mcq_incorrect_count: 4 + earth_silver_mcq_not_attempted_count: 0 + GPQA_diamond_accuracy: 56.25 + hle_llmjudge_accuracy: 56.25 + hle_llmjudge_f1: 0.56 + hle_llmjudge_accuracy_given_attempted: 56.25 + hle_llmjudge_attempted_ratio: 100 + hle_llmjudge_correct_count: 9 + hle_llmjudge_incorrect_count: 7 + hle_llmjudge_not_attempted_count: 0 + IFEval_Prompt-level-strict-accuracy: 87.5 + IFEval_Inst-level-strict-accuracy: 91.67 + IFEval_Prompt-level-loose-accuracy: 87.5 + IFEval_Inst-level-loose-accuracy: 91.67 + kcle_accuracy: 62.5 + kcle_f1: 0.62 + kcle_accuracy_given_attempted: 62.5 + kcle_attempted_ratio: 100 + kcle_correct_count: 10 + kcle_incorrect_count: 6 + kcle_not_attempted_count: 0 + korbench_cipher_accuracy: 87.5 + livemathbench_hard_custom_hard_cn_accuracy: 18.75 + matbench_steels_mae: 635.88 + math_prm800k_500_accuracy: 93.75 + sanitized_mbpp_score: 6.25 + sanitized_mbpp_pass: 1 + sanitized_mbpp_timeout: 0 + sanitized_mbpp_failed: 15 + sanitized_mbpp_wrong_answer: 0 + medxpertqa_accuracy: 50 + medxpertqa_total_correct: 8 + medxpertqa_total_count: 16 + medxpertqa_MedXpertQA-Basic Science: 50 + medxpertqa_MedXpertQA-Diagnosis: 37.5 + medxpertqa_MedXpertQA-Treatment: 75 + medxpertqa_MedXpertQA-Skeletal: 33.33 + medxpertqa_MedXpertQA-Muscular: 100 + medxpertqa_MedXpertQA-Respiratory: 0 + 
medxpertqa_MedXpertQA-Endocrine: 100 + medxpertqa_MedXpertQA-Cardiovascular: 50 + medxpertqa_MedXpertQA-Lymphatic: 0 + medxpertqa_MedXpertQA-Nervous: 100 + medxpertqa_MedXpertQA-Reproductive: 0 + medxpertqa_MedXpertQA-Reasoning: 50 + medxpertqa_MedXpertQA-Understanding: 50 + lukaemon_mmlu_college_biology_accuracy: 81.25 + lukaemon_mmlu_college_biology_f1: 0.81 + lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 + lukaemon_mmlu_college_biology_attempted_ratio: 100 + lukaemon_mmlu_college_biology_correct_count: 13 + lukaemon_mmlu_college_biology_incorrect_count: 3 + lukaemon_mmlu_college_biology_not_attempted_count: 0 + mmlu_pro_math_accuracy: 81.25 + mmlu_pro_math_f1: 0.81 + mmlu_pro_math_accuracy_given_attempted: 81.25 + mmlu_pro_math_attempted_ratio: 100 + mmlu_pro_math_correct_count: 13 + mmlu_pro_math_incorrect_count: 3 + mmlu_pro_math_not_attempted_count: 0 + olymmath_llmjudge_en-hard_accuracy: 68.75 + olymmath_llmjudge_en-hard_f1: 0.69 + olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 + olymmath_llmjudge_en-hard_attempted_ratio: 100 + olymmath_llmjudge_en-hard_correct_count: 11 + olymmath_llmjudge_en-hard_incorrect_count: 5 + olymmath_llmjudge_en-hard_not_attempted_count: 0 + OlympiadBench_OE_TO_maths_en_COMP_accuracy: 50 + OlympiadBench_OE_TO_maths_en_COMP_f1: 0.5 + OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 50 + OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100 + OlympiadBench_OE_TO_maths_en_COMP_correct_count: 8 + OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 8 + OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 + phybench-eed_accuracy: 6.25 + PHYSICS_atomic_dataset_textonly_accuracy: 81.25 + PHYSICS_atomic_dataset_textonly_f1: 0.81 + PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 81.25 + PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 + PHYSICS_atomic_dataset_textonly_correct_count: 13 + PHYSICS_atomic_dataset_textonly_incorrect_count: 3 + PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 + ProteinLMBench_accuracy: 62.5 + ProteinLMBench_f1: 0.62 + ProteinLMBench_accuracy_given_attempted: 62.5 + ProteinLMBench_attempted_ratio: 100 + ProteinLMBench_correct_count: 10 + ProteinLMBench_incorrect_count: 6 + ProteinLMBench_not_attempted_count: 0 + R-Bench_en_accuracy: 62.5 + R-Bench_en_f1: 0.62 + R-Bench_en_accuracy_given_attempted: 62.5 + R-Bench_en_attempted_ratio: 100 + R-Bench_en_correct_count: 10 + R-Bench_en_incorrect_count: 6 + R-Bench_en_not_attempted_count: 0 + NC-I2F-0shot-instruct_score: 0 + NC-I2F-0shot-instruct_valid_score: 68.75 + srbench_mean_RMSE: 30057.14 + srbench_mean_NMSE: 29.42 + srbench_mean_R2: -28.42 + srbench_SymbolicMatch: 0.18 + supergpqa_Electronic_Science_and_Technology_accuracy: 68.75 + lcb_code_generation_v6_pass@1: 37.5 + chat_subjective: + alignment_bench_v1_1_总分: 0.46 + arenahard_score: 100 + Followbench_naive_average: 1 + mtbench101_avg: 9 + wildbench_average: 66.72 + simpleqa_accuracy_given_attempted: 0.73 + chinese_simpleqa_given_attempted_accuracy: 0.5 + alignment_bench_v1_1_专业能力: 5.56 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + compassarena_language_naive_average: 52.63 + compassarena_knowledge_naive_average: 85 + compassarena_reason_v2_naive_average: 50 + compassarena_creationv2_zh_naive_average: 95 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + 
followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0.62 + +qwen-3-8b-fullbench: + objective_other: + C-MHChem_accuracy: 75 + C-MHChem_f1: 0.75 + C-MHChem_accuracy_given_attempted: 75 + C-MHChem_attempted_ratio: 100 + C-MHChem_correct_count: 12 + C-MHChem_incorrect_count: 4 + C-MHChem_not_attempted_count: 0 + CPsyExam_accuracy: 93.75 + CPsyExam_f1: 0.94 + CPsyExam_accuracy_given_attempted: 93.75 + CPsyExam_attempted_ratio: 100 + CPsyExam_correct_count: 15 + CPsyExam_incorrect_count: 1 + CPsyExam_not_attempted_count: 0 + MaScQA_accuracy: 81.25 + MaScQA_f1: 0.81 + MaScQA_accuracy_given_attempted: 81.25 + MaScQA_attempted_ratio: 100 + MaScQA_correct_count: 13 + MaScQA_incorrect_count: 3 + MaScQA_not_attempted_count: 0 + UGPhysics_AtomicPhysics_zh_accuracy: 68.75 + objective_v5: + race-high_accuracy: 81.25 + ARC-c_accuracy: 100 + BoolQ_accuracy: 87.5 + triviaqa_wiki_1shot_score: 0 + nq_open_1shot_score: 0 + IFEval_Prompt-level-strict-accuracy: 87.50 + drop_accuracy: 93.75 + GPQA_diamond_accuracy: 75 + hellaswag_accuracy: 87.5 + TheoremQA_score: 6.25 + musr_average_naive_average: 20.83 + korbench_single_naive_average: 72.5 + gsm8k_accuracy: 68.75 + math_accuracy: 93.75 + cmo_fib_accuracy: 0 + aime2024_accuracy: 0 + wikibench-wiki-single_choice_cncircular_perf_4: 50 + sanitized_mbpp_score: 6.25 + lcb_code_generation_pass@1: 6.25 + lcb_code_execution_pass@1: 68.75 + lcb_test_output_pass@1: 0 + teval_naive_average: 67.43 + bbh-logical_deduction_seven_objects_score: 6.25 + bbh-multistep_arithmetic_two_score: 0 + mmlu-other_accuracy: 81.25 + cmmlu-china-specific_accuracy: 75.83 + mmlu_pro_math_accuracy: 87.5 + openai_mmmlu_lite_AR-XY_accuracy: 43.75 + college_naive_average: 25 + college_knowledge_naive_average: 37.5 + objective_v6: + aime2024_accuracy: 68.75 + aime2024_f1: 0.69 + aime2024_accuracy_given_attempted: 68.75 + aime2024_attempted_ratio: 100 + aime2024_correct_count: 11 + aime2024_incorrect_count: 5 + aime2024_not_attempted_count: 0 + aime2025_accuracy: 56.25 + aime2025_f1: 0.56 + aime2025_accuracy_given_attempted: 56.25 + aime2025_attempted_ratio: 100 + aime2025_correct_count: 9 + aime2025_incorrect_count: 7 + aime2025_not_attempted_count: 0 + bbh-temporal_sequences_score: 75 + bbh-temporal_sequences_score_given_attempted: 75 + bbh-temporal_sequences_attempted_ratio: 100 + bbh-temporal_sequences_correct_count: 12 + bbh-temporal_sequences_incorrect_count: 4 + bbh-temporal_sequences_not_attempted_count: 0 + cmo_fib_accuracy: 75 + drop_accuracy: 56.25 + drop_f1: 0.56 + drop_accuracy_given_attempted: 56.25 + drop_attempted_ratio: 100 + drop_correct_count: 9 + drop_incorrect_count: 7 + drop_not_attempted_count: 0 + GaokaoBench_2010-2022_Math_II_MCQs_score: 100 + GPQA_diamond_accuracy: 62.5 + GPQA_diamond_f1: 0.62 + GPQA_diamond_accuracy_given_attempted: 62.5 + GPQA_diamond_attempted_ratio: 100 + GPQA_diamond_correct_count: 10 + GPQA_diamond_incorrect_count: 6 + GPQA_diamond_not_attempted_count: 0 + gsm8k_accuracy: 100 + hellaswag_accuracy: 100 + hellaswag_f1: 1 + hellaswag_accuracy_given_attempted: 100 + hellaswag_attempted_ratio: 100 + hellaswag_correct_count: 16 + hellaswag_incorrect_count: 0 + hellaswag_not_attempted_count: 0 + korbench_cipher_accuracy: 93.75 + korbench_cipher_f1: 0.94 
+ korbench_cipher_accuracy_given_attempted: 93.75 + korbench_cipher_attempted_ratio: 100 + korbench_cipher_correct_count: 15 + korbench_cipher_incorrect_count: 1 + korbench_cipher_not_attempted_count: 0 + math_prm800k_500-llmjudge_accuracy: 100 + math_prm800k_500-llmjudge_f1: 1 + math_prm800k_500-llmjudge_accuracy_given_attempted: 100 + math_prm800k_500-llmjudge_attempted_ratio: 100 + math_prm800k_500-llmjudge_correct_count: 16 + math_prm800k_500-llmjudge_incorrect_count: 0 + math_prm800k_500-llmjudge_not_attempted_count: 0 + mathbench-college-single_choice_cn_acc_4: 93.75 + mathbench-college-single_choice_cn_acc_1: 75 + mathbench-college-single_choice_cn_more_1_0: 100 + mathbench-college-single_choice_cn_more_1_1: 75 + mathbench-college-single_choice_cn_more_4_0: 100 + mathbench-college-single_choice_cn_more_4_1: 100 + mathbench-college-single_choice_cn_more_4_2: 100 + mathbench-college-single_choice_cn_more_4_3: 100 + mathbench-college-single_choice_cn_more_4_4: 75 + mathbench-college-single_choice_cn_perf_1: 75 + mathbench-college-single_choice_cn_perf_4: 75 + mathbench-college-single_choice_cn_vote_4: 100 + mathbench-college-single_choice_cn_vote_1: 75 + mathbench-college-single_choice_cn_prior_A: 31.25 + mathbench-college-single_choice_cn_prior_B: 18.75 + mathbench-college-single_choice_cn_prior_C: 25 + mathbench-college-single_choice_cn_prior_D: 25 + mathbench-college-single_choice_cn_prior_-: 0 + musr_murder_mysteries_accuracy: 75 + musr_murder_mysteries_f1: 0.75 + musr_murder_mysteries_accuracy_given_attempted: 75 + musr_murder_mysteries_attempted_ratio: 100 + musr_murder_mysteries_correct_count: 12 + musr_murder_mysteries_incorrect_count: 4 + musr_murder_mysteries_not_attempted_count: 0 + supergpqa_accuracy: 75 + supergpqa_total_correct: 12 + supergpqa_total_count: 16 + supergpqa_SuperGPQA-Engineering: 80 + supergpqa_SuperGPQA-Philosophy: 0 + supergpqa_SuperGPQA-Medicine: 100 + supergpqa_SuperGPQA-Economics: 100 + supergpqa_SuperGPQA-Science: 66.67 + supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 + supergpqa_SuperGPQA-Philosophy-Philosophy: 0 + supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 + supergpqa_SuperGPQA-Economics-Applied Economics: 100 + supergpqa_SuperGPQA-Science-Mathematics: 80 + supergpqa_SuperGPQA-Science-Physics: 0 + supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 + supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 + supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 + supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 + supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 + supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 + supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 + supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 + supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 100 + supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 + supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 + supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100 + supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 + supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 + supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 + supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 + 
supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 + supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 + supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 + triviaqa_wiki_1shot_score: 50 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + objective_v7: + aime2024_accuracy: 75 + aime2025_accuracy: 56.25 + bbeh_boolean_expressions_accuracy: 68.75 + bbeh_boolean_expressions_f1: 0.69 + bbeh_boolean_expressions_accuracy_given_attempted: 68.75 + bbeh_boolean_expressions_attempted_ratio: 100 + bbeh_boolean_expressions_correct_count: 11 + bbeh_boolean_expressions_incorrect_count: 5 + bbeh_boolean_expressions_not_attempted_count: 0 + Chem_exam-competition_final_score: 42.29 + Chem_exam-gaokao_final_score: 71.88 + ChemBench_Name_Conversion_accuracy: 100.00 + ChemBench_Name_Conversion_f1: 1.00 + ChemBench_Name_Conversion_accuracy_given_attempted: 100 + ChemBench_Name_Conversion_attempted_ratio: 100.00 + ChemBench_Name_Conversion_correct_count: 16.00 + ChemBench_Name_Conversion_incorrect_count: 0 + ChemBench_Name_Conversion_not_attempted_count: 0 + ClimaQA_Gold_mcq_accuracy: 87.5 + ClimaQA_Gold_mcq_f1: 0.88 + ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 + ClimaQA_Gold_mcq_attempted_ratio: 100 + ClimaQA_Gold_mcq_correct_count: 14 + ClimaQA_Gold_mcq_incorrect_count: 2 + ClimaQA_Gold_mcq_not_attempted_count: 0 + cmmlu-agronomy_accuracy: 87.50 + cmmlu-agronomy_f1: 0.88 + cmmlu-agronomy_accuracy_given_attempted: 87.50 + cmmlu-agronomy_attempted_ratio: 100.00 + cmmlu-agronomy_correct_count: 14.00 + cmmlu-agronomy_incorrect_count: 2 + cmmlu-agronomy_not_attempted_count: 0.00 + earth_silver_mcq_accuracy: 81.25 + earth_silver_mcq_f1: 0.81 + earth_silver_mcq_accuracy_given_attempted: 81.25 + earth_silver_mcq_attempted_ratio: 100 + earth_silver_mcq_correct_count: 13 + earth_silver_mcq_incorrect_count: 3 + earth_silver_mcq_not_attempted_count: 0 + GPQA_diamond_accuracy: 68.75 + hle_llmjudge_accuracy: 43.75 + hle_llmjudge_f1: 0.43 + hle_llmjudge_accuracy_given_attempted: 43.75 + hle_llmjudge_attempted_ratio: 100.00 + hle_llmjudge_correct_count: 7.00 + hle_llmjudge_incorrect_count: 9.00 + hle_llmjudge_not_attempted_count: 0.00 + IFEval_Prompt-level-strict-accuracy: 87.5 + IFEval_Inst-level-strict-accuracy: 91.67 + IFEval_Prompt-level-loose-accuracy: 87.5 + IFEval_Inst-level-loose-accuracy: 91.67 + kcle_accuracy: 75.00 + kcle_f1: 0.75 + kcle_accuracy_given_attempted: 75.00 + kcle_attempted_ratio: 100.00 + kcle_correct_count: 12 + kcle_incorrect_count: 4 + kcle_not_attempted_count: 0 + korbench_cipher_accuracy: 81.25 + livemathbench_hard_custom_hard_cn_accuracy: 25 + matbench_steels_mae: 816.50 + math_prm800k_500_accuracy: 100 + sanitized_mbpp_score: 6.25 + sanitized_mbpp_pass: 1 + sanitized_mbpp_timeout: 0 + sanitized_mbpp_failed: 15 + sanitized_mbpp_wrong_answer: 0 + medxpertqa_accuracy: 62.5 + medxpertqa_total_correct: 10 + medxpertqa_total_count: 16 + medxpertqa_MedXpertQA-Basic Science: 75 + medxpertqa_MedXpertQA-Diagnosis: 50 + medxpertqa_MedXpertQA-Treatment: 75 + medxpertqa_MedXpertQA-Skeletal: 66.67 + medxpertqa_MedXpertQA-Muscular: 100 + medxpertqa_MedXpertQA-Respiratory: 33.33 + medxpertqa_MedXpertQA-Endocrine: 66.67 + medxpertqa_MedXpertQA-Cardiovascular: 100 + medxpertqa_MedXpertQA-Lymphatic: 0 + medxpertqa_MedXpertQA-Nervous: 100 + medxpertqa_MedXpertQA-Reproductive: 0 + medxpertqa_MedXpertQA-Reasoning: 64.29 + medxpertqa_MedXpertQA-Understanding: 50 + lukaemon_mmlu_college_biology_accuracy: 81.25 + 
lukaemon_mmlu_college_biology_f1: 0.81 + lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 + lukaemon_mmlu_college_biology_attempted_ratio: 100 + lukaemon_mmlu_college_biology_correct_count: 13 + lukaemon_mmlu_college_biology_incorrect_count: 3 + lukaemon_mmlu_college_biology_not_attempted_count: 0 + mmlu_pro_math_accuracy: 68.75 + mmlu_pro_math_f1: 0.69 + mmlu_pro_math_accuracy_given_attempted: 68.75 + mmlu_pro_math_attempted_ratio: 100 + mmlu_pro_math_correct_count: 11 + mmlu_pro_math_incorrect_count: 5 + mmlu_pro_math_not_attempted_count: 0 + olymmath_llmjudge_en-hard_accuracy: 68.75 + olymmath_llmjudge_en-hard_f1: 0.69 + olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 + olymmath_llmjudge_en-hard_attempted_ratio: 100.00 + olymmath_llmjudge_en-hard_correct_count: 11 + olymmath_llmjudge_en-hard_incorrect_count: 5 + olymmath_llmjudge_en-hard_not_attempted_count: 0 + OlympiadBench_OE_TO_maths_en_COMP_accuracy: 43.75 + OlympiadBench_OE_TO_maths_en_COMP_f1: 0.44 + OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 43.75 + OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100.00 + OlympiadBench_OE_TO_maths_en_COMP_correct_count: 7 + OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 9 + OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 + phybench-eed_accuracy: 27.94 + PHYSICS_atomic_dataset_textonly_accuracy: 93.75 + PHYSICS_atomic_dataset_textonly_f1: 0.94 + PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 93.75 + PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 + PHYSICS_atomic_dataset_textonly_correct_count: 15 + PHYSICS_atomic_dataset_textonly_incorrect_count: 1 + PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 + ProteinLMBench_accuracy: 68.75 + ProteinLMBench_f1: 0.69 + ProteinLMBench_accuracy_given_attempted: 68.75 + ProteinLMBench_attempted_ratio: 100 + ProteinLMBench_correct_count: 11 + ProteinLMBench_incorrect_count: 5 + ProteinLMBench_not_attempted_count: 0 + R-Bench_en_accuracy: 50.00 + R-Bench_en_f1: 0.50 + R-Bench_en_accuracy_given_attempted: 50.00 + R-Bench_en_attempted_ratio: 100.00 + R-Bench_en_correct_count: 8 + R-Bench_en_incorrect_count: 8 + R-Bench_en_not_attempted_count: 0 + NC-I2F-0shot-instruct_score: 12.5 + NC-I2F-0shot-instruct_valid_score: 75 + srbench_mean_RMSE: 61339597.01 + srbench_mean_NMSE: 83.61 + srbench_mean_R2: -82.61 + srbench_SymbolicMatch: 0.14 + supergpqa_Electronic_Science_and_Technology_accuracy: 56.25 + lcb_code_generation_v6_pass@1: 50 + chat_longtext: + babilong_qa1_256k_score: 0.00 + LongBench_2wikimqa_score: 5.43 + Length16000Depth0_2needle_en_128k_score: 100.00 + ruler_cwe_128k_score: 0.00 + chat_subjective: + alignment_bench_v1_1_总分: 0.48 + arenahard_score: 92.07 + Followbench_naive_average: 1 + mtbench101_avg: 8.7 + wildbench_average: 75.17 + simpleqa_accuracy_given_attempted: 0.55 + chinese_simpleqa_given_attempted_accuracy: 0.64 + alignment_bench_v1_1_专业能力: 5.81 + alignment_bench_v1_1_数学计算: 0 + alignment_bench_v1_1_基本任务: 0 + alignment_bench_v1_1_逻辑推理: 0 + alignment_bench_v1_1_中文理解: 0 + alignment_bench_v1_1_文本写作: 0 + alignment_bench_v1_1_角色扮演: 0 + alignment_bench_v1_1_综合问答: 0 + compassarena_language_naive_average: 55 + compassarena_knowledge_naive_average: 100 + compassarena_reason_v2_naive_average: 78.95 + compassarena_creationv2_zh_naive_average: 100 + followbench_llmeval_en_HSR_AVG: 1 + followbench_llmeval_en_SSR_AVG: 1 + followbench_llmeval_en_HSR_L1: 1 + followbench_llmeval_en_HSR_L2: 1 + followbench_llmeval_en_HSR_L3: 1 + followbench_llmeval_en_HSR_L4: 1 + 
followbench_llmeval_en_HSR_L5: 1 + followbench_llmeval_en_SSR_L1: 1 + followbench_llmeval_en_SSR_L2: 1 + followbench_llmeval_en_SSR_L3: 1 + followbench_llmeval_en_SSR_L4: 1 + followbench_llmeval_en_SSR_L5: 1 + simpleqa_f1: 0.46 + +qwen3-8b-base-turbomind: + base_longtext: + LongBench_2wikimqa_score: 26.98 + Length32000Depth0_origin_en_32000_score: 100.00 + +qwen-3-8b-base-hf-fullbench: + objective_base: + race-high_accuracy: 87.5 + ARC-c_accuracy: 75 + BoolQ_accuracy: 75 + triviaqa_wiki_1shot_score: 37.5 + nq_open_1shot_score: 6.25 + drop_accuracy: 75 + GPQA_diamond_accuracy: 81.25 + hellaswag_accuracy: 81.25 + TheoremQA_score: 18.75 + winogrande_accuracy: 81.25 + gsm8k_accuracy: 81.25 + GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 87.5 + wikibench-wiki-single_choice_cncircular_perf_4: 50 + sanitized_mbpp_score: 93.75 + mmlu-other_accuracy: 76.44 + cmmlu-china-specific_accuracy: 82.5 + mmlu_pro_math_accuracy: 50 + bbh-logical_deduction_seven_objects_score: 37.5 + bbh-multistep_arithmetic_two_score: 0 + college_naive_average: 37.5 + college_knowledge_naive_average: 87.5 + +qwen-3-8b-base-fullbench: + objective_base: + race-high_accuracy: 87.5 + ARC-c_accuracy: 75 + BoolQ_accuracy: 81.25 + triviaqa_wiki_1shot_score: 37.5 + nq_open_1shot_score: 6.25 + drop_accuracy: 81.25 + GPQA_diamond_accuracy: 81.25 + hellaswag_accuracy: 75 + TheoremQA_score: 18.75 + winogrande_accuracy: 81.25 + gsm8k_accuracy: 81.25 + GaokaoBench_2010-2022_Math_II_MCQs_score: 100 + GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 + math_accuracy: 81.25 + wikibench-wiki-single_choice_cncircular_perf_4: 50 + sanitized_mbpp_score: 93.75 + mmlu-other_accuracy: 75.48 + cmmlu-china-specific_accuracy: 83.75 + mmlu_pro_math_accuracy: 50 + bbh-logical_deduction_seven_objects_score: 62.5 + bbh-multistep_arithmetic_two_score: 100 + college_naive_average: 37.5 + college_knowledge_naive_average: 87.5 \ No newline at end of file diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d20ecc65e944791ee69fdfc91ed760e09b7b1736 --- /dev/null +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -0,0 +1,23 @@ +chat: + qwen3_0_6b_fp8-hf: + gsm8k_accuracy: 3.12 + race-high_accuracy: 6.25 + qwen3-0_6b-fp8-turbomind: + gsm8k_accuracy: 40.62 + race-high_accuracy: 65.62 +base: + qwen3-8b-base-turbomind: + gsm8k_accuracy: 50.00 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 62.50 + winogrande_accuracy: 71.88 + qwen3-8b-base-vllm: + gsm8k_accuracy: 50.00 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 59.38 + winogrande_accuracy: 68.75 + qwen3-8b-base-hf: + gsm8k_accuracy: 50.00 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 59.38 + winogrande_accuracy: 53.12 diff --git a/.github/scripts/pr_oc_score_assert.py b/.github/scripts/pr_oc_score_assert.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac8750c3cdb32b69382b92bc549e9661ed32741 --- /dev/null +++ b/.github/scripts/pr_oc_score_assert.py @@ -0,0 +1,77 @@ +import csv +import os + +import pytest + +output_path = 'regression_result' +model = 'internlm2-chat-7b-hf' +dataset = 'siqa' + + +@pytest.fixture() +def result_scores(): + file = find_csv_files(output_path) + if file is None: + return None + return read_csv_file(file) + + +@pytest.mark.usefixtures('result_scores') +class TestChatScore: + """Test cases for chat model.""" + + def 
test_model_dataset_score(self, result_scores):
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(result_score, 79.53)
+
+
+def assert_score(score, baseline):
+    if score is None or score == '-':
+        assert False, 'value is none'
+    # PR-level smoke check: the score only needs to land within +/-3% of the
+    # pinned baseline.
+    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
+        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
+              str(baseline * 1.03))
+    else:
+        assert False, score + ' not between ' + str(
+            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+
+
+def find_csv_files(directory):
+    csv_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv'):
+                csv_files.append(os.path.join(root, file))
+    # ``raise`` with a bare string is invalid in Python 3, so raise a real
+    # exception when the result set is ambiguous.
+    if len(csv_files) > 1:
+        raise RuntimeError(
+            'have more than 1 result file, please check the result manually')
+    if len(csv_files) == 0:
+        return None
+    return csv_files[0]
+
+
+def read_csv_file(file_path):
+    with open(file_path, 'r') as csvfile:
+        reader = csv.DictReader(csvfile)
+        filtered_data = []
+        for row in reader:
+            filtered_row = {
+                k: v
+                for k, v in row.items()
+                if k not in ['version', 'metric', 'mode']
+            }
+            filtered_data.append(filtered_row)
+
+    result = {}
+    for data in filtered_data:
+        dataset = data.get('dataset')
+        for key in data.keys():
+            if key == 'dataset':
+                continue
+            if key in result.keys():
+                result.get(key)[dataset] = data.get(key)
+            else:
+                result[key] = {dataset: data.get(key)}
+    return result
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b3714f2b86155d47c8f0dfc513fc5e531a176208
--- /dev/null
+++ b/.github/workflows/daily-run-test.yml
@@ -0,0 +1,297 @@
+name: daily_run_test
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is open-compass/opencompass'
+        type: string
+        default: 'open-compass/opencompass'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. 
Default is "main"' + type: string + default: 'main' + regression_type: + required: true + description: 'regression types' + type: string + default: "['cmd', 'api', 'bench']" + regression_func: + required: true + description: 'regression functions' + type: string + default: "['chat_models','base_models','chat_obj_fullbench_v5', 'chat_obj_fullbench_v6', 'chat_obj_fullbench_v7', 'chat_obj_fullbench_other','chat_sub_fullbench','base_fullbench','base_longtext_fullbench','chat_longtext_fullbench']" + baseline_result: + required: true + description: 'baseline result' + type: string + default: "0.5.0-baseline" + schedule: + - cron: '15 14 * * 2' + +env: + HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false + HF_HUB_OFFLINE: 1 + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3 + REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/regression_test + COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache + HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache + HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub + HF_ENDPOINT: https://hf-mirror.com + TMPDIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/tmpdir + PIP_CACHE_DIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pip_cache + CONDA_ENV: regression_test + VLLM_WORKER_MULTIPROC_METHOD: spawn + KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn + KUBEBRAIN_NAMESPACE: ailab-opencompass + JOB_NAME: daily-test-${{ github.run_id }}-${{ github.run_attempt }} + BASELINE_DIR: ${{github.event.inputs.baseline_result || '0.5.0-baseline' }} + WORK_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_workdir/opencompass/opencompass + +jobs: + build-pypi: + runs-on: ubuntu-latest + env: + http_proxy: '' + https_proxy: '' + steps: + - uses: actions/checkout@v5 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Build lagent + run: | + pip install wheel setuptools + python setup.py sdist bdist_wheel + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: dist/* + retention-days: 1 + name: my-artifact-${{ github.run_id }} + + prepare_env: + if: ${{!cancelled()}} + needs: ['build-pypi'] + runs-on: yidian_cu12_daily + timeout-minutes: 180 #3hours + steps: + - name: Clone repository + uses: actions/checkout@v5 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }} + - name: Remove Conda Env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda env remove -y --name ${{env.CONDA_ENV}} + conda info --envs + - name: Prepare - create conda env and install torch - cu12 + uses: nick-fields/retry@v3 + with: + max_attempts: 3 + timeout_minutes: 120 + command: | + . 
${{env.CONDA_PATH}}/bin/activate + conda create -y --name ${{env.CONDA_ENV}} python=3.10 + conda activate ${{env.CONDA_ENV}} + export PIP_CACHE_DIR=${{env.PIP_CACHE_DIR}} + rm -rf ${{env.TMPDIR}} && mkdir -p ${{env.TMPDIR}} + export TMPDIR=${{env.TMPDIR}} + pip install -r /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/config/requirements.txt -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install opencompass*.whl -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install opencompass[lmdeploy] -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install opencompass[vllm] -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install opencompass[full] -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install opencompass[api] -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install xformers -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + pip install transformers==4.57.0 vllm==0.11.0 lmdeploy==0.10.1 -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + cp -r /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/nltk_data /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/envs/regression_test + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + + daily_run_test: + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'bench'))}} + needs: prepare_env + strategy: + fail-fast: false + matrix: + regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench_v5", "chat_obj_fullbench_v6", "chat_obj_fullbench_v7", "chat_obj_fullbench_other", "chat_sub_fullbench", "base_fullbench","base_longtext_fullbench","chat_longtext_fullbench"]')}} + runs-on: yidian_cu12_daily + timeout-minutes: 240 #4hours + steps: + - name: Clone repository + uses: actions/checkout@v5 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + - name: change rjob.py + run: | + cp /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/config/rjob.py . + sed -i "s/TASK_ID='none'/TASK_ID='${{ github.run_id }}'/g" rjob.py + - name: Run test + if: matrix.regression_func != 'chat_obj_fullbench_other' && matrix.regression_func != 'chat_sub_fullbench' + run: | + . 
${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse + - name: Run test - other + if: matrix.regression_func == 'chat_obj_fullbench_other' + env: + DATASET_SOURCE: HF + run: | + sed -i "s/DATASET_SOURCE=/DATASET_SOURCE=HF/g" rjob.py + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse + - name: Run test - other + if: matrix.regression_func == 'chat_sub_fullbench' + env: + COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache_sub + run: | + sed -i "s/compass_data_cache/compass_data_cache_sub/g" rjob.py + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse + - name: Assert result + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily + python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py || true + python .github/scripts/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/${{matrix.regression_func}} + - name: Change code permission + if: always() + run: | + sudo chmod -R 777 . + + + daily_run_cmd: + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'cmd'))}} + needs: prepare_env + runs-on: yidian_cu12_daily + timeout-minutes: 240 #4hours + steps: + - name: Clone repository + uses: actions/checkout@v5 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + - name: Run test + run: | + . 
${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rjob submit --name=cmd-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}' + + for i in {1..300}; do + current_status=$(rjob get cmd-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+') + if [[ $current_status == "Succeeded" || $current_status == "Failed" || $current_status == "Stopped" ]]; then + echo "Current status: $current_status, stop checking" + rjob logs job cmd-${{ env.JOB_NAME }} + break + fi + sleep 6 + done + - name: Assert result + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily + python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily + python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily + python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily + python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily + python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Change code permission + if: always() + run: | + sudo chmod -R 777 . + + daily_run_api: + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'api'))}} + needs: prepare_env + runs-on: yidian_cu12_daily + timeout-minutes: 240 #4hours + steps: + - name: Clone repository + uses: actions/checkout@v5 + with: + repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: conda env + run: | + . 
${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + - name: Run test + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rjob submit --name=api-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_api_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }} ${{env.WORK_PATH}}' + + for i in {1..300}; do + current_status=$(rjob get api-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+') + if [[ $current_status == "Succeeded" || $current_status == "Failed" || $current_status == "Stopped" ]]; then + echo "Current status: $current_status, stop checking" + rjob logs job api-${{ env.JOB_NAME }} + break + fi + sleep 6 + done + - name: Assert result + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily + python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py + - name: Change code permission + if: always() + run: | + sudo chmod -R 777 . diff --git a/.github/workflows/link-check.yml b/.github/workflows/link-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..c6f3bdd230b9f5406bdd3232b707aa5ac43d4c43 --- /dev/null +++ b/.github/workflows/link-check.yml @@ -0,0 +1,26 @@ +name: 'Link check' + +on: + schedule: + # check links at 01:30 a.m. 
every day + - cron: '30 1 * * *' + + workflow_dispatch: # allow manual trigger + +jobs: + link-check: + runs-on: ubuntu-latest + steps: + # - uses: actions/checkout@v3 + + - name: Install linkchecker + run: | + pip install linkchecker + + - name: Run linkchecker + run: | + linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \ + --ignore-url "https://opencompass.readthedocs.io/.*/static/images/opencompass_logo.svg" \ + --ignore-url "https://opencompass.readthedocs.io/.*/_static/images/icon-menu-dots.svg" \ + --ignore-url "https://opencompass.readthedocs.io/policy" \ + --ignore-url "https://opencompass.readthedocs.io/(en|zh_CN)/[0-9a-f]{40}/.*" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000000000000000000000000000000..106c2453c0c93d046f5dfa96f4a8eb306eecf599 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,23 @@ +name: lint + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Install pre-commit hook + run: | + pip install pre-commit==3.8.0 mmengine==0.10.5 + pre-commit install + - name: Linting + run: pre-commit run --all-files diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml new file mode 100644 index 0000000000000000000000000000000000000000..716b86a0d9f4c4dcdc9cd38d317383cbaf3b96d7 --- /dev/null +++ b/.github/workflows/pr-run-test.yml @@ -0,0 +1,111 @@ +name: pr_run_test + +on: + pull_request: + paths-ignore: + - 'README.md' + - 'README_zh-CN.md' + - 'docs/**' + - 'configs/**' + - 'tools/**' + workflow_dispatch: + schedule: + - cron: '56 22 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + CONDA_ENV: pr_test + HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false + HF_HUB_OFFLINE: 1 + CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3 + REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/prtest + COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache + HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache + HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub + KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn + KUBEBRAIN_NAMESPACE: ailab-opencompass + JOB_NAME: pr-test-${{ github.run_id }}-${{ github.run_attempt }} + +jobs: + pr_run_test: + runs-on: yidian_cu12 + timeout-minutes: 45 + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Prepare - Install opencompass + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + python3 -m pip uninstall opencompass -y + python3 -m pip install .[full] -i https://pkg.pjlab.org.cn/repository/pypi-proxy/simple/ --trusted-host pkg.pjlab.org.cn --no-cache-dir + conda info --envs + - name: conda env + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + pip list + lmdeploy check_env + - name: Run test + run: | + . 
${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + conda info --envs + rjob submit --name=${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pr_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}' + + for i in {1..300}; do + current_status=$(rjob get ${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+') + if [[ $current_status == "Succeeded" || $current_status == "Failed" || $current_status == "Stopped" ]]; then + echo "Current status: $current_status, stop checking" + break + fi + sleep 6 + done + - name: Get result + run: | + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 75 && ${score%.*} <= 80 )); then + echo "score is $score between 75 and 80" + else + echo "score is $score not between 75 and 80" + exit 1 + fi + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 75 && ${score%.*} <= 80 )); then + echo "score is $score between 75 and 80" + else + echo "score is $score not between 75 and 80" + exit 1 + fi + score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}') + if (( ${score%.*} >= 75 && ${score%.*} <= 80 )); then + echo "score is $score between 75 and 80" + else + echo "score is $score not between 75 and 80" + exit 1 + fi + - name: Uninstall opencompass + if: always() + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + python3 -m pip uninstall opencompass -y + conda info --envs + + notify_to_feishu: + if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} + needs: [pr_run_test] + timeout-minutes: 5 + runs-on: self-hosted + steps: + - name: notify + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. 
"},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }} diff --git a/.github/workflows/pr-stage-check.yml b/.github/workflows/pr-stage-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..a98718878cdacef988a45ee132c8c4a26aa2c4e7 --- /dev/null +++ b/.github/workflows/pr-stage-check.yml @@ -0,0 +1,121 @@ +name: pr_stage_test + +on: + pull_request: + paths-ignore: + - 'README.md' + - 'README_zh-CN.md' + - 'docs/**' + - 'configs/**' + - 'tools/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.10'] + include: + - torch: 2.5.1 + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: python -m pip install --upgrade pip + - name: Install PyTorch + run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html + - name: Install system dependencies + run: | + sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list + sudo apt-get update && sudo apt-get install -y libc6 libffi-dev libncursesw6 wget unzip + - name: Upgrade pip + run: python -m pip install pip --upgrade + - name: Install opencompass dependencies + run: | + python -m pip install -r requirements.txt + - name: Build and install + run: python -m pip install -e . + - name: Prepare dataset + run: | + wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip + unzip OpenCompassData-core-20240207.zip + - name: Dry run test + run: | + python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run + + build_cu117: + runs-on: ubuntu-22.04 + container: + image: nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04 + strategy: + matrix: + python-version: ['3.10'] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install Python-dev + run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev + if: ${{matrix.python-version != 3.10}} + - name: Install system dependencies + run: | + apt-get update + apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc6 libc6-dev + sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list + apt-get update && apt-get install -y libc6 libffi-dev libncursesw6 wget unzip + - name: Upgrade pip + run: python -m pip install pip --upgrade + - name: Install opencompass dependencies + run: | + python -m pip install -r requirements.txt + - name: Build and install + run: python -m pip install -e . 
+ - name: Prepare dataset + run: | + wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip + unzip OpenCompassData-core-20240207.zip + - name: Dry run test + run: | + python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run + + build_windows: + runs-on: windows-2022 + strategy: + matrix: + python-version: ['3.10'] + platform: [cpu] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: python -m pip install pip --upgrade + - name: Install PyTorch + run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html + - name: Install opencompass dependencies + run: | + pip install -r requirements.txt + - name: Build and install + run: pip install -e . + - name: Prepare dataset + run: | + Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip -OutFile OpenCompassData-core-20240207.zip + unzip OpenCompassData-core-20240207.zip + - name: Dry run test + run: | + python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml new file mode 100644 index 0000000000000000000000000000000000000000..e59abbcafb1bccd23b01ab8f7c041cf13890240f --- /dev/null +++ b/.github/workflows/publish-to-pypi.yml @@ -0,0 +1,31 @@ +name: deploy + +on: + push: + workflow_dispatch: + inputs: + confirm_publish: + description: 'Type YES to confirm publishing to PyPI' + required: true + type: string + +jobs: + build-n-publish: + runs-on: ubuntu-latest + if: | + github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') || + (github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Build opencompass + run: | + pip install wheel + python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/build/lib/opencompass/__init__.py b/build/lib/opencompass/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93b60a1dcce607677ea2feed7d4d751303032779 --- /dev/null +++ b/build/lib/opencompass/__init__.py @@ -0,0 +1 @@ +__version__ = '0.5.1' diff --git a/build/lib/opencompass/cli/__init__.py b/build/lib/opencompass/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build/lib/opencompass/cli/main.py b/build/lib/opencompass/cli/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c696f3765672be033092694d7e0dd04a03371d25 --- /dev/null +++ b/build/lib/opencompass/cli/main.py @@ -0,0 +1,451 @@ +# flake8: noqa +# yapf: disable +import argparse +import copy +import getpass +import os +import os.path as osp +from datetime import datetime + +from mmengine.config import Config, DictAction + +from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg +from opencompass.runners import SlurmRunner +from opencompass.summarizers import DefaultSummarizer +from opencompass.utils import (LarkReporter, get_logger, pretty_print_config, + read_from_station, save_to_station) +from opencompass.utils.run 
import (fill_eval_cfg, fill_infer_cfg, + get_config_from_arg) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Run an evaluation task') + parser.add_argument('config', nargs='?', help='Config file path') + + # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner + # if "infer" or "eval" not specified + launch_method = parser.add_mutually_exclusive_group() + launch_method.add_argument('--slurm', + action='store_true', + default=False, + help='Whether to force tasks to run with srun. ' + 'If True, `--partition(-p)` must be set. ' + 'Defaults to False') + launch_method.add_argument('--dlc', + action='store_true', + default=False, + help='Whether to force tasks to run on dlc. If ' + 'True, `--aliyun-cfg` must be set. Defaults' + ' to False') + # Add shortcut parameters (models, datasets and summarizer) + parser.add_argument('--models', nargs='+', help='', default=None) + parser.add_argument('--datasets', nargs='+', help='', default=None) + parser.add_argument('--summarizer', help='', default=None) + # add general args + parser.add_argument('--debug', + help='Debug mode, in which the scheduler runs tasks ' + 'in a single process and output is not ' + 'redirected to files', + action='store_true', + default=False) + parser.add_argument('--dry-run', + help='Dry run mode, in which the scheduler will not ' + 'actually run the tasks, but only print the commands ' + 'to run', + action='store_true', + default=False) + parser.add_argument( + '-a', '--accelerator', + help='Inference accelerator; vllm and lmdeploy are supported for now.', + choices=['vllm', 'lmdeploy', None], + default=None, + type=str) + parser.add_argument('-m', + '--mode', + help='Running mode. You can choose "infer" if you ' + 'only want the inference results, or "eval" if you ' + 'already have the results and want to evaluate them, ' + 'or "viz" if you want to visualize the results.', + choices=['all', 'infer', 'eval', 'viz'], + default='all', + type=str) + parser.add_argument('-r', + '--reuse', + nargs='?', + type=str, + const='latest', + help='Reuse previous outputs & results, and run any ' + 'missing jobs present in the config. If its ' + 'argument is not specified, the latest results in ' + 'the work_dir will be reused. The argument may ' + 'also be a specific timestamp, e.g. 20230516_144254') + parser.add_argument('-w', + '--work-dir', + help='Work path, all the outputs will be ' + 'saved in this path, including the slurm logs, ' + 'the evaluation results, the summary results, etc. ' + 'If not specified, the work_dir will be set to ' + 'outputs/default.', + default=None, + type=str) + parser.add_argument( + '--config-dir', + default='configs', + help='Use the custom config directory instead of config/ to ' + 'search the configs for datasets, models and summarizers', + type=str) + parser.add_argument( + '--config-verbose', + default=False, + action='store_true', + help='Whether to print the config in verbose mode.') + parser.add_argument('-l', + '--lark', + help='Report the running status to a lark bot', + action='store_true', + default=False) + parser.add_argument('--max-num-workers', + help='Max number of workers to run in parallel. ' + 'Will be overridden by the "max_num_workers" argument ' + 'in the config.', + type=int, + default=1) + parser.add_argument('--max-workers-per-gpu', + help='Max tasks to run in parallel on one GPU. 
' + 'It will only be used in the local runner.', + type=int, + default=1) + parser.add_argument( + '--retry', + help='Number of retries if the job fails when using slurm or dlc. ' + 'Will be overridden by the "retry" argument in the config.', + type=int, + default=2) + parser.add_argument( + '--dump-eval-details', + help='Whether to dump the evaluation details, including the ' + 'correctness of each sample, bpb, etc. Defaults to True.', + nargs='?', + const=True, + default=True, + type=lambda x: False if x and x.lower() == 'false' else True + ) + parser.add_argument('--dump-res-length', + help='Dump the length of model responses', + action='store_true', + default=False) + parser.add_argument( + '--dump-extract-rate', + help='Whether to dump the rate of successfully extracted ' + 'answers from the model predictions.', + action='store_true', + ) + # for the results persistence + parser.add_argument('-sp', + '--station-path', + help='Path to your results station.', + type=str, + default=None, + ) + + parser.add_argument('--station-overwrite', + help='Whether to overwrite the results at the station.', + action='store_true', + ) + + parser.add_argument( + '--read-from-station', + help='Whether to read existing evaluation results from the ' + 'data station.', + action='store_true', + ) + # for evaluation with multiple runs + parser.add_argument('--dataset-num-runs', + help='How many runs for one dataset', + type=int, + default=1, + ) + + # set srun args + slurm_parser = parser.add_argument_group('slurm_args') + parse_slurm_args(slurm_parser) + # set dlc args + dlc_parser = parser.add_argument_group('dlc_args') + parse_dlc_args(dlc_parser) + # set hf args + hf_parser = parser.add_argument_group('hf_args') + parse_hf_args(hf_parser) + # set custom dataset args + custom_dataset_parser = parser.add_argument_group('custom_dataset_args') + parse_custom_dataset_args(custom_dataset_parser) + args = parser.parse_args() + if args.slurm: + assert args.partition is not None, ( + '--partition(-p) must be set if you want to use slurm') + if args.dlc: + assert os.path.exists(args.aliyun_cfg), ( + 'When launching tasks using dlc, it needs to be configured ' + 'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"' + ' to specify a new path.') + return args + + +def parse_slurm_args(slurm_parser): + """These args are all for slurm launch.""" + slurm_parser.add_argument('-p', + '--partition', + help='Slurm partition name', + default=None, + type=str) + slurm_parser.add_argument('-q', + '--quotatype', + help='Slurm quota type', + default=None, + type=str) + slurm_parser.add_argument('--qos', + help='Slurm quality of service', + default=None, + type=str) + + +def parse_dlc_args(dlc_parser): + """These args are all for dlc launch.""" + dlc_parser.add_argument('--aliyun-cfg', + help='The config path for aliyun config', + default='~/.aliyun.cfg', + type=str) + + +def parse_hf_args(hf_parser): + """These args are all for the quick construction of HuggingFace models.""" + hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat') + hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. 
"facebook/opt-125m", required') + hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model') + hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified') + hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer') + hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model') + hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model') + hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation') + hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model') + hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model') + hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model') + hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model') + hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead') + hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli') + hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model') + hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model') + + +def parse_custom_dataset_args(custom_dataset_parser): + """These args are all for the quick construction of custom datasets.""" + custom_dataset_parser.add_argument('--custom-dataset-path', type=str) + custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str) + custom_dataset_parser.add_argument('--custom-dataset-data-type', + type=str, + choices=['mcq', 'qa']) + custom_dataset_parser.add_argument('--custom-dataset-infer-method', + type=str, + choices=['gen', 'ppl']) + + +def main(): + args = parse_args() + if args.num_gpus is not None: + raise ValueError('The `--num-gpus` argument is deprecated, please use ' + '`--hf-num-gpus` to describe number of gpus used for ' + 'the HuggingFace model instead.') + + if args.dry_run: + args.debug = True + # initialize logger + logger = get_logger(log_level='DEBUG' if args.debug else 'INFO') + + cfg = get_config_from_arg(args) + if args.work_dir is not None: + cfg['work_dir'] = args.work_dir + else: + cfg.setdefault('work_dir', os.path.join('outputs', 'default')) + + # cfg_time_str defaults to the current time + cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') + if args.reuse: + if args.reuse == 'latest': + if not os.path.exists(cfg.work_dir) or not os.listdir( + cfg.work_dir): + logger.warning('No previous results to reuse!') + else: + dirs = os.listdir(cfg.work_dir) + dir_time_str = sorted(dirs)[-1] + else: + dir_time_str = args.reuse + logger.info(f'Reusing experiements from {dir_time_str}') + elif args.mode in ['eval', 'viz'] and not args.read_from_station: + raise ValueError( + 'You must specify -r or --reuse, or you have to specify ' + '--read-from-station and --station-path when running in eval ' + 'or viz mode!') + + # update "actual" work_dir + cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str) + current_workdir = cfg['work_dir'] + 
logger.info(f'Current exp folder: {current_workdir}') + + os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) + + # dump config + output_config_path = osp.join(cfg.work_dir, 'configs', + f'{cfg_time_str}_{os.getpid()}.py') + cfg.dump(output_config_path) + # Config is intentionally reloaded here so that already-initialized + # types do not fail to serialize + cfg = Config.fromfile(output_config_path, format_python_code=False) + + # get existing results from the station + if args.read_from_station: + existing_results_list = read_from_station(cfg, args) + rs_exist_results = [comb['combination'] for comb in existing_results_list] + cfg['rs_exist_results'] = rs_exist_results + + # report to the lark bot if --lark is specified + if not args.lark: + cfg['lark_bot_url'] = None + elif cfg.get('lark_bot_url', None): + content = f'{getpass.getuser()}\'s task has been launched!' + LarkReporter(cfg['lark_bot_url']).post(content) + + + # print the config if --config-verbose is specified + if args.config_verbose: + pretty_print_config(cfg) + + # infer + if args.mode in ['all', 'infer']: + # When the user has specified --slurm or --dlc, or has not set + # "infer" in the config, we will provide a default configuration + # for infer + if (args.dlc or args.slurm) and cfg.get('infer', None): + logger.warning('You have set "infer" in the config, but ' + 'also specified --slurm or --dlc. ' + 'The "infer" configuration will be overridden by ' + 'your runtime arguments.') + + if args.dlc or args.slurm or cfg.get('infer', None) is None: + fill_infer_cfg(cfg, args) + + if args.partition is not None: + if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner: + cfg.infer.runner.partition = args.partition + cfg.infer.runner.quotatype = args.quotatype + else: + logger.warning('SlurmRunner is not used, so the partition ' + 'argument is ignored.') + if args.debug: + cfg.infer.runner.debug = True + if args.lark: + cfg.infer.runner.lark_bot_url = cfg['lark_bot_url'] + cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'], + 'predictions/') + partitioner = PARTITIONERS.build(cfg.infer.partitioner) + tasks = partitioner(cfg) + if args.dry_run: + return + runner = RUNNERS.build(cfg.infer.runner) + # Add the extra attack config if it exists + if hasattr(cfg, 'attack'): + for task in tasks: + cfg.attack.dataset = task.datasets[0][0].abbr + task.attack = cfg.attack + if args.dump_res_length: + for task in tasks: + task.dump_res_length = True + runner(tasks) + + # evaluate + if args.mode in ['all', 'eval']: + # When the user has specified --slurm or --dlc, or has not set + # "eval" in the config, we will provide a default configuration + # for eval + if (args.dlc or args.slurm) and cfg.get('eval', None): + logger.warning('You have set "eval" in the config, but ' + 'also specified --slurm or --dlc. ' + 'The "eval" configuration will be overridden by ' + 'your runtime arguments.') + + if args.dlc or args.slurm or cfg.get('eval', None) is None: + fill_eval_cfg(cfg, args) + if args.dump_eval_details: + logger.warning('Dumping eval details by default; this might take extra ' + 'space to save all the evaluation details. 
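The dump-then-reload step above is a deliberate serialization round-trip: writing the merged config to a plain .py file and reading it back leaves only data that downstream workers can safely re-import. A minimal sketch of the same round-trip with mmengine (the snapshot file name is chosen here for illustration):

from mmengine.config import Config

cfg = Config(dict(work_dir='outputs/default', max_num_workers=1))
cfg.dump('config_snapshot.py')               # plain-text snapshot on disk
cfg = Config.fromfile('config_snapshot.py')  # reload as pure config data
print(cfg.work_dir)                          # -> outputs/default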
' + 'Set --dump-eval-details False to skip the details dump') + cfg.eval.runner.task.dump_details = True + if args.dump_extract_rate: + cfg.eval.runner.task.cal_extract_rate = True + if args.partition is not None: + if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner: + cfg.eval.runner.partition = args.partition + cfg.eval.runner.quotatype = args.quotatype + else: + logger.warning('SlurmRunner is not used, so the partition ' + 'argument is ignored.') + if args.debug: + cfg.eval.runner.debug = True + if args.lark: + cfg.eval.runner.lark_bot_url = cfg['lark_bot_url'] + cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/') + partitioner = PARTITIONERS.build(cfg.eval.partitioner) + tasks = partitioner(cfg) + if args.dry_run: + return + runner = RUNNERS.build(cfg.eval.runner) + + # For meta-review-judge in subjective evaluation + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + for task_part in tasks: + runner(task_part) + else: + runner(tasks) + + # save to station + if args.station_path is not None or cfg.get('station_path') is not None: + save_to_station(cfg, args) + + # visualize + if args.mode in ['all', 'eval', 'viz']: + summarizer_cfg = cfg.get('summarizer', {}) + + # For subjective summarizer + if summarizer_cfg.get('function', None): + main_summarizer_cfg = copy.deepcopy(summarizer_cfg) + grouped_datasets = {} + for dataset in cfg.datasets: + prefix = dataset['abbr'].split('_')[0] + if prefix not in grouped_datasets: + grouped_datasets[prefix] = [] + grouped_datasets[prefix].append(dataset) + all_grouped_lists = [] + for prefix in grouped_datasets: + all_grouped_lists.append(grouped_datasets[prefix]) + dataset_score_container = [] + for dataset in all_grouped_lists: + temp_cfg = copy.deepcopy(cfg) + temp_cfg.datasets = dataset + summarizer_cfg = dict(type=dataset[0]['summarizer']['type'], config=temp_cfg) + summarizer = build_from_cfg(summarizer_cfg) + dataset_score = summarizer.summarize(time_str=cfg_time_str) + if dataset_score: + dataset_score_container.append(dataset_score) + main_summarizer_cfg['config'] = cfg + main_summarizer = build_from_cfg(main_summarizer_cfg) + main_summarizer.summarize(time_str=cfg_time_str, subjective_scores=dataset_score_container) + else: + if not summarizer_cfg or summarizer_cfg.get('type', None) is None: + summarizer_cfg['type'] = DefaultSummarizer + summarizer_cfg['config'] = cfg + summarizer = build_from_cfg(summarizer_cfg) + summarizer.summarize(time_str=cfg_time_str) + + + +if __name__ == '__main__': + main() diff --git a/build/lib/opencompass/datasets/CARDBiomedBench.py b/build/lib/opencompass/datasets/CARDBiomedBench.py new file mode 100644 index 0000000000000000000000000000000000000000..77ff9ee6d823f1593052e2b9361740bb50817899 --- /dev/null +++ b/build/lib/opencompass/datasets/CARDBiomedBench.py @@ -0,0 +1,30 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + item['expert'] = item['Bio_Category'] + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('choices', {'label': []})['label']) - + 1) + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class CARDBiomedBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + data_files = {'test': 'data/CARDBiomedBench.csv'} + dataset = load_dataset(path, data_files=data_files, split='test') + # dataset = dataset.select(range(200)) + if prompt_mode == 'zero-shot': 
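One detail worth isolating from the subjective-summarizer branch above: datasets are bucketed by the prefix of their `abbr` (the part before the first underscore) before each group is summarized on its own. The grouping is the standard dict-of-lists pattern; a compact equivalent with invented abbr values:

from collections import defaultdict

datasets = [{'abbr': 'alignbench_v1'}, {'abbr': 'alignbench_v2'},
            {'abbr': 'mtbench_101'}]
grouped = defaultdict(list)
for dataset in datasets:
    grouped[dataset['abbr'].split('_')[0]].append(dataset)
# grouped now maps 'alignbench' -> two entries, 'mtbench' -> one entry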
+ dataset = dataset.map(lambda item: _parse(item, prompt_mode), + load_from_cache_file=False) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + return dataset diff --git a/build/lib/opencompass/datasets/ClinicBench.py b/build/lib/opencompass/datasets/ClinicBench.py new file mode 100644 index 0000000000000000000000000000000000000000..86ef5082c48c077f1e182fdaff159e21792a7da6 --- /dev/null +++ b/build/lib/opencompass/datasets/ClinicBench.py @@ -0,0 +1,19 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ClinicBenchDataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = load_dataset(path)['train'] + return dataset + + @staticmethod + def load(path): + dataset = ClinicBenchDataset.load_single(path) + return dataset diff --git a/build/lib/opencompass/datasets/Earth_Silver.py b/build/lib/opencompass/datasets/Earth_Silver.py new file mode 100644 index 0000000000000000000000000000000000000000..606765d561b751dcfe2709122f8423bce57026dd --- /dev/null +++ b/build/lib/opencompass/datasets/Earth_Silver.py @@ -0,0 +1,26 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class Earth_Silver_MCQDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str = 'zero-shot', **kwargs): + + dataset = load_dataset(path=path, split='multiple_choice') + + dataset = dataset.map(lambda item: { + 'question': item['question'], + 'answer': item['answer'] + }) + + if prompt_mode == 'zero-shot': + return dataset + elif prompt_mode == 'few-shot': + raise NotImplementedError('few-shot prompt is not implemented yet') + else: + raise ValueError(f'Unsupported prompt_mode: {prompt_mode}') diff --git a/build/lib/opencompass/datasets/FinanceIQ.py b/build/lib/opencompass/datasets/FinanceIQ.py new file mode 100644 index 0000000000000000000000000000000000000000..ed87d30096d82897510a08e1e36d3456c482479b --- /dev/null +++ b/build/lib/opencompass/datasets/FinanceIQ.py @@ -0,0 +1,41 @@ +import csv +import os.path as osp + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class FinanceIQDataset(BaseDataset): + + # @staticmethod + # def load(path: str): + # from datasets import load_dataset + # return load_dataset('csv', data_files={'test': path}) + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path, local_mode=True) + dataset = DatasetDict() + for split in ['dev', 'test']: + raw_data = [] + filename = osp.join(path, split, f'{name}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + _ = next(reader) # skip the header + for row in reader: + assert len(row) == 7 + raw_data.append({ + 'question': row[1], + 'A': row[2], + 'B': row[3], + 'C': row[4], + 'D': row[5], + 'answer': row[6], + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/build/lib/opencompass/datasets/GaokaoBench.py b/build/lib/opencompass/datasets/GaokaoBench.py new file mode 100644 index 0000000000000000000000000000000000000000..c1ae6d10e9eed01b69684eb4446b79fc4c1268ae --- /dev/null +++ b/build/lib/opencompass/datasets/GaokaoBench.py @@ -0,0 +1,158 @@ +import json +import re +from os import environ + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import 
BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class GaokaoBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, filename: str, name: str): + path = get_data_path(path) + path = path + filename + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + return MsDataset.load(path, subset_name=name, split='test') + else: + with open(path, encoding='utf-8') as f: + data = json.load(f) + return Dataset.from_list(data['example']) + + +valid_gaokao_bench_question_types = [ + 'single_choice', 'multi_choice', 'multi_question_choice', + 'five_out_of_seven', 'cloze', 'subjective', 'correction' +] + + +class GaokaoBenchEvaluator(BaseEvaluator): + + def __init__(self, question_type) -> None: + super().__init__() + assert question_type in valid_gaokao_bench_question_types + self.question_type = question_type + + def do_predictions_postprocess(self, model_output, answer_lenth=None): + if self.question_type == 'single_choice': + model_answer = [] + temp = re.findall(r'[A-D]', model_output[::-1]) + if len(temp) != 0: + model_answer.append(temp[0]) + + elif self.question_type == 'multi_question_choice': + model_answer = [] + temp = re.findall(r'【答案】\s*[::]*\s*[A-Z]', model_output) + + if len(temp) == answer_lenth: + for t in temp: + model_answer.append(re.findall(r'[A-Z]', t)[0]) + else: + temp = re.findall(r'[A-Z]', model_output) + if len(temp) > 0: + for k in range(min(len(temp), answer_lenth)): + model_answer.append(temp[k]) + + elif self.question_type == 'multi_choice': + model_answer = [] + answer = '' + content = re.sub(r'\s+', '', model_output) + answer_index = content.find('【答案】') + if answer_index > 0: + temp = content[answer_index:] + if len(re.findall(r'[A-D]', temp)) > 0: + for t in re.findall(r'[A-D]', temp): + answer += t + else: + temp = content[-10:] + if len(re.findall(r'[A-D]', temp)) > 0: + for t in re.findall(r'[A-D]', temp): + answer += t + if len(answer) != 0: + model_answer.append(answer) + + elif self.question_type == 'five_out_of_seven': + model_answer = [] + temp = re.findall(r'[A-G]', model_output) + if len(temp) > 0: + for k in range(min(5, len(temp))): + model_answer.append(temp[k]) + + return model_answer + + def ensure_same_length(self, pred, refr): + if len(pred) == len(refr): + return pred + return ['Z'] * len(refr) + + def score(self, predictions, references): + if self.question_type not in [ + 'single_choice', 'multi_choice', 'multi_question_choice', + 'five_out_of_seven' + ]: + return {'score': 0} + elif self.question_type == 'multi_choice': + details = {} + correct_score, total_score = 0, 0 + for index, (pred, refr) in enumerate(zip(predictions, references)): + pred = self.do_predictions_postprocess(pred) + pred = self.ensure_same_length(pred, refr) + is_corrects = [] + for p, r in zip(pred, refr): + if p == r: + correct_score += 2 + is_corrects.append(True) + else: + for i in p: + if i not in r: + break + else: + correct_score += 1 + is_corrects.append(False) + total_score += 2 + details[str(index)] = { + 'pred': pred, + 'refr': refr, + 'is_correct': all(is_corrects), + } + + else: + details = {} + correct_score, total_score = 0, 0 + for index, (pred, refr) in enumerate(zip(predictions, references)): + if self.question_type == 'multi_question_choice': + pred = self.do_predictions_postprocess(pred, len(refr)) + else: + pred = self.do_predictions_postprocess(pred) + pred = 
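A quick illustration of the `single_choice` post-processing above: because the model output is reversed before `re.findall`, index 0 of the result is the last option letter in the original string, which is usually the model's final answer:

import re

output = 'I first considered A, but the correct answer is C.'
letters = re.findall(r'[A-D]', output[::-1])
assert letters[0] == 'C'  # the last A-D mentioned in the output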
self.ensure_same_length(pred, refr) + is_corrects = [] + for p, r in zip(pred, refr): + is_correct = p == r + correct_score += is_correct + total_score += 1 + is_corrects.append(is_correct) + details[str(index)] = { + 'pred': pred, + 'refr': refr, + 'is_correct': all(is_corrects), + } + return {'score': correct_score / total_score * 100, 'details': details} + + +for question_type in valid_gaokao_bench_question_types: + # fix the classic closure problem + def _gaokao_register(question_type): + ICL_EVALUATORS.register_module( + name='GaokaoBenchEvaluator' + '_' + question_type, + module=lambda *args, **kwargs: GaokaoBenchEvaluator( + question_type=question_type, *args, **kwargs)) + + _gaokao_register(question_type) diff --git a/build/lib/opencompass/datasets/LCBench.py b/build/lib/opencompass/datasets/LCBench.py new file mode 100644 index 0000000000000000000000000000000000000000..bbbc4f9efff0dc1c7971a8f9ae35a17c7a0b419a --- /dev/null +++ b/build/lib/opencompass/datasets/LCBench.py @@ -0,0 +1,500 @@ +# flake8: noqa +import contextlib +import io +import itertools +import multiprocessing +import re +import signal +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import List, Sequence, Union + +import numpy as np +from datasets import DatasetDict, concatenate_datasets, load_dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class LCDataset(BaseDataset): + + @staticmethod + def load(path: str, + num_repeats: int = 1, + difficulty='ALL', + local_mode=False): + """Load the LC dataset for pass@k mode. + + Note that you can use num_repeats > 1 when your model does not support + `num_return_sequence` in generation, otherwise use the raw + LC dataset and set `num_return_sequence` in model config to + generate multiple responses for testing pass@k>1. + + It is better to change your dataset abbr accordingly if you set + num_repeats>1, otherwise the number in + `.cache/dataset_size.json` might be inconsistent. + + Args: + num_repeats(int): Number of repetitions for this dataset to get + multiple responses in special cases. 
+ """ + path = get_data_path(path, local_mode=local_mode) + + def processing_test(example): + example['test_case'] = example['test_list'] + example['test_list'] = '\n'.join(example['test_list']) + example['test_column'] = dict(test_list_2=example['test_list'], + task_id=example['Contest id']) + return example + + train = load_dataset('json', data_files=path, + split='train[:5]').map(processing_test) + test = load_dataset('json', data_files=path, + split='train[5:]').map(processing_test) + if not difficulty == 'ALL': + train = train.filter( + lambda example: example['Difficulty'] == difficulty) + test = test.filter( + lambda example: example['Difficulty'] == difficulty) + test = concatenate_datasets([test] * num_repeats) + return DatasetDict({'train': train, 'test': test}) + + +class TimeOutException(Exception): + pass + + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with redirect_stdin(stream): + yield + + +@contextlib.contextmanager +def time_limit(seconds: float): + + def signal_handler(signum, frame): + raise TimeOutException('Time out!') + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +class WriteOnlyStringIO(io.StringIO): + """StringIO that throws an exception when it's read from.""" + + def read(self, *args, **kwargs): + raise IOError + + def readline(self, *args, **kwargs): + raise IOError + + def readlines(self, *args, **kwargs): + raise IOError + + def readable(self, *args, **kwargs): + """Returns True if the IO object can be read.""" + return False + + +class redirect_stdin(contextlib._RedirectStream): # type: ignore + _stream = 'stdin' + + +@ICL_EVALUATORS.register_module() +class LCEvaluator(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0} + details = {} + + with ProcessPoolExecutor() as executor: + futures = [] + for i, (refer, pred) in enumerate(zip(references, predictions)): + + code_blocks = self._process_answer(pred) + + # Try each code block until one passes + for code_idx, code_block in enumerate(code_blocks): + test_programs = self._process_test(refer, code_block) + + # Submit each test program variant for execution + for prog_idx, program in enumerate(test_programs): + future = executor.submit( + execution, + program, + ( + i, + code_idx, + prog_idx, + ), # Pass indices for tracking + 3, + ) + futures.append(future) + + from tqdm import tqdm + + # Track which examples passed + passed_examples = set() + all_results = {} + + for future in tqdm(as_completed(futures), total=len(futures)): + (example_idx, code_idx, prog_idx), ret, program = future.result() + + # Store result + if example_idx not in all_results: + all_results[example_idx] = [] + + all_results[example_idx].append({ + 'code_idx': code_idx, + 'prog_idx': prog_idx, + 'result': ret, + 'is_correct': ret == 'pass', + 'program': program, + }) + + # If this example passed with any code block or test variant + if ret == 'pass': + passed_examples.add(example_idx) + + # Process final results + for example_idx, results in all_results.items(): + # Did any variant pass? 
+ example_passed = example_idx in passed_examples + + # Get the first passing result if any, otherwise get the first result + result_to_use = next((r for r in results if r['is_correct']), + results[0]) + + # Update counters + if example_passed: + result['pass'] += 1 + else: + result[result_to_use['result']] += 1 + + # Store details + details[str(example_idx)] = { + 'result': + ('pass' if example_passed else result_to_use['result']), + 'is_correct': example_passed, + 'num_attempts': len(results), + 'code_blocks_tried': len(set(r['code_idx'] for r in results)), + 'program': result_to_use['program'], + } + + result['score'] = result['pass'] / len(predictions) * 100 + result['details'] = details + return result + + def _process_answer(self, text): + try: + # for chatGLM related text + eval_text = eval(text) + except Exception: + pass + else: + if isinstance(eval_text, str): + text = eval_text + + code_blocks = [] + # breakpoint() + # extract all code blocks with ```python or ``` markers + if '```' in text: + # Try to find ```python blocks first + python_blocks = re.findall(r'```python\s*(.*?)```', text, + re.DOTALL) + + # If no ```python blocks, look for generic ``` blocks + if not python_blocks: + blocks = re.findall(r'```(.*?)```', text, re.DOTALL) + if not blocks: + # Fall back: split by ``` and take the content between markers + parts = text.split('```') + if len(parts) > 1: + code_blocks.append(parts[1]) + else: + for block in blocks: + # Skip language identifier if present + if not block.startswith('\n') and '\n' in block: + block = block[block.find('\n') + 1:] + code_blocks.append(block.strip()) + else: + code_blocks.extend([block.strip() for block in python_blocks]) + + # If no code blocks found, use the entire text + if not code_blocks: + code_blocks = [text] + + # Process each code block + processed_blocks = [] + for code in code_blocks: + # Clean up the code block + code = code.strip() + # Remove [BEGIN]/[DONE] markers + match = re.search(r"('\s*|)(\[DONE\]|DONE)", code) + if match: + code = code[:match.start()] + match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", code) + if match: + code = code[match.end():] + code = code.strip() + if code.startswith("'"): + code = code[1:] + if code.endswith("'"): + code = code[:-1] + code = code.replace('\\', '') + + processed_blocks.append(code) + + return processed_blocks + + def _process_test(self, test_case, code): + """Process test with both direct function call and Solution class. + + Args: + test_case (str): Test case code + code (str): User submitted code + """ + + # Add wrapper to support Solution class if it exists in the code + if 'class Solution' in code: + # Extract the function name from assert statements + # Looking for patterns like: assert func_name(args) + func_calls = re.findall(r'assert\s+(\w+)\(', test_case) + if func_calls: + # Get unique function names from the test case + func_names = set(func_calls) + + modified_test = test_case + for func_name in func_names: + # Replace all occurrences of function calls with Solution().func_name + modified_test = re.sub( + r'(\bassert\s+)' + func_name + r'(\()', + r'\1Solution().' + func_name + r'\2', + modified_test, + ) + + # Use the modified test + test_case = modified_test + + formatted = code + '\n' + formatted += test_case + # breakpoint() + return formatted + + +def execution(programs, task_ids, timeout): + """Execution function for running generation code. + + Args: + programs(str): Python code to be executed. 
task_ids(tuple): Tuple containing (example_idx, code_block_idx, program_variant_idx). + timeout(int): Time limit for execution. + + Returns: + tuple: (task_ids, result_status, program_code) + """ + + def _execution(programs, timeout): + try: + # Pass a fresh globals dict to exec so that correct answers + # do not raise spurious NameErrors + exec_globals = {} + with swallow_io(): + with time_limit(timeout): + exec(programs, exec_globals) + key.append('pass') + except TimeOutException: + key.append('timeout') + except AssertionError: + key.append('wrong_answer') + except BaseException as e: + print(e) + key.append('failed') + + manager = multiprocessing.Manager() + key = manager.list() + # `signal` cannot be used in a child thread, so we + # need to create a process in the thread. + p = multiprocessing.Process(target=_execution, + args=(programs, timeout - 1)) + p.start() + p.join(timeout=timeout) + if p.is_alive(): + p.kill() + # key might not have a value if the process was killed + return task_ids, 'timeout', programs + return task_ids, key[0], programs + + +class LCPassKEvaluator(LCEvaluator): + """Better suited for pass@k evaluation. + + Args: + k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100) + """ + + def __init__(self, k=(1, 10, 100)) -> None: + if not isinstance(k, Sequence): + k = (k, ) + self.k = k + + @staticmethod + def estimate_pass_at_k( + num_samples: Union[int, List[int], np.ndarray], + num_correct: Union[List[int], np.ndarray], + k: int, + ) -> np.ndarray: + """Estimates pass@k of each problem and returns them in an array.""" + + def estimator(n: int, c: int, k: int) -> float: + """ + Calculates 1 - comb(n - c, k) / comb(n, k). + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + assert len(num_samples) == len(num_correct) + num_samples_it = iter(num_samples) + + return np.array([ + estimator(int(n), int(c), k) + for n, c in zip(num_samples_it, num_correct) + ]) + + def score(self, predictions, references): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + task_pass = defaultdict(int) + task_total = defaultdict(int) + + result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0} + details = {} + + with ProcessPoolExecutor() as executor: + futures = [] + task_info = [] # Store info for each task + + index = 0 + for refer, preds in zip(references, predictions): + # suits two cases + # 1. use repeated dataset + # 2. 
use `num_return_sequences` to generate multiple responses + if not isinstance(preds, list): + preds = [preds] + + test_case = refer['test_list_2'] + task_id = refer['task_id'] + + # create empty task_pass in case all example failed + if task_id not in task_pass: + task_pass[task_id] = 0 + + for pred in preds: + # Extract all code blocks from the prediction + code_blocks = self._process_answer(pred) + + # Try each code block with various test program formats + for code_idx, code_block in enumerate(code_blocks): + # Process test with the current code block + test_program = self._process_test( + test_case, code_block) + + # Submit this program for execution + future = executor.submit( + execution, + test_program, + ( + index, + task_id, + code_idx, + 0, + ), # prog_idx always 0 since we only have one program per code block + 30, + ) + futures.append(future) + task_info.append({ + 'index': index, + 'task_id': task_id, + 'code_block': code_block, + 'program': test_program, + }) + + index += 1 + + # Track which tasks have passed with any code block + passed_tasks = set() + task_results = defaultdict(list) + + from tqdm import tqdm + + for future in tqdm(as_completed(futures), total=len(futures)): + (index, task_id, code_idx, + prog_idx), ret, program = future.result() + + # Store result + task_results[(index, task_id)].append({ + 'result': ret, + 'is_correct': ret == 'pass', + 'program': program + }) + + # If this is a pass, mark the task + if ret == 'pass': + passed_tasks.add((index, task_id)) + + # Store detailed result + details[f'{index}_{code_idx}_{prog_idx}'] = { + 'program': program, + 'task_id': task_id, + 'result': ret, + 'is_correct': ret == 'pass', + } + + # Process all tasks + for (index, task_id), results in task_results.items(): + task_total[task_id] += 1 + # Task passes if any code block passes + if (index, task_id) in passed_tasks: + task_pass[task_id] += 1 + result['pass'] += 1 + else: + # Get the first result to classify the error + first_result = results[0]['result'] + result[first_result] += 1 + + result['details'] = details + + def get_number(tasks): + return np.array([ + task[1] for task in sorted(tasks.items(), key=lambda x: x[0]) + ]) + + task_pass = get_number(task_pass) + task_total = get_number(task_total) + pass_at_k = { + f'pass@{k}': + self.estimate_pass_at_k(task_total, task_pass, k).mean() * 100 + for k in self.k if (task_total >= k).all() + } + result.update(pass_at_k) + return result diff --git a/build/lib/opencompass/datasets/MMLUArabic.py b/build/lib/opencompass/datasets/MMLUArabic.py new file mode 100644 index 0000000000000000000000000000000000000000..7b7de0c95112f25bf8677d6e0274cc64f52e8897 --- /dev/null +++ b/build/lib/opencompass/datasets/MMLUArabic.py @@ -0,0 +1,35 @@ +import csv +import os.path as osp + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MMLUArabicDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path, local_mode=True) + dataset = DatasetDict() + for split in ['dev', 'test']: + raw_data = [] + filename = osp.join(path, split, f'{name}_{split}.csv') + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + for row in reader: + assert len(row) == 6 + raw_data.append({ + 'input': row[0], + 'A': row[1], + 'B': row[2], + 'C': row[3], + 'D': row[4], + 'target': row[5], + }) + dataset[split] = Dataset.from_list(raw_data) + 
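To make `estimate_pass_at_k` above concrete: with n samples per problem and c of them correct, pass@k is the probability that at least one of k drawn samples is correct, evaluated as a running product instead of binomial coefficients. A spot check for k=1, where pass@1 should reduce to c/n:

import numpy as np

n, c, k = 10, 3, 1
# 1 - C(n-c, k) / C(n, k), computed as a product for numerical stability
pass_at_1 = 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
print(round(float(pass_at_1), 3))  # 0.3, i.e. c/n when k == 1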
return dataset diff --git a/build/lib/opencompass/datasets/MedCalc_Bench.py b/build/lib/opencompass/datasets/MedCalc_Bench.py new file mode 100644 index 0000000000000000000000000000000000000000..66855d5cec773ecd1aa4d29d808fe1fa0a0af170 --- /dev/null +++ b/build/lib/opencompass/datasets/MedCalc_Bench.py @@ -0,0 +1,323 @@ +import math +import re +from datetime import datetime + +import numpy as np +from datasets import load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def check_correctness(answer: str, ground_truth, calid, upper_limit, + lower_limit): + """Check one model answer against the ground truth for a calculator ID.""" + calid = int(calid) + + if calid in [13, 68]: + # Output Type: date + + if datetime.strptime( + answer, + '%m/%d/%Y').strftime('%-m/%-d/%Y') == datetime.strptime( + ground_truth, '%m/%d/%Y').strftime('%-m/%-d/%Y'): + correctness = 1 + else: + correctness = 0 + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?" + r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", ground_truth) + ground_truth = f'({match.group(1)}, {match.group(3)})' + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?" + r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", answer) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f'({weeks}, {days})' + if eval(answer) == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + else: + correctness = 0 + elif calid in [ + 4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48, + 51, 69 + ]: + # Output Type: integer A + answer = round(eval(answer)) + if answer == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + elif calid in [ + 2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39, + 40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67 + ]: + # Output Type: decimal + answer = eval(answer) + if answer >= eval(lower_limit) and answer <= eval(upper_limit): + correctness = 1 + else: + correctness = 0 + else: + raise ValueError(f'Unknown calculator ID: {calid}') + return correctness + + +def extract_answer(answer, calid): + + calid = int(calid) + extracted_answer = re.findall(r'[Aa]nswer":\s*(.*?)\}', answer) + matches = re.findall( + r'"step_by_step_thinking":\s*"' + r'([^"]+)"\s*,\s*"[Aa]nswer"', answer) + + if matches: + # Select the last match + last_match = matches[-1] + explanation = last_match + else: + explanation = 'No Explanation' + + if len(extracted_answer) == 0: + extracted_answer = 'Not Found' + else: + extracted_answer = extracted_answer[-1].strip().strip('"') + if extracted_answer == 'str(short_and_direct_answer_of_the_question)': + extracted_answer = 'Not Found' + if extracted_answer == 'str(value which is the answer to the question)': + extracted_answer = 'Not Found' + if extracted_answer == 'X.XX': + extracted_answer = 'Not Found' + + if calid in [13, 68]: + # Output Type: date + match = re.search( + r'^(0?[1-9]|1[0-2])\/(0?[1-9]' + r'|[12][0-9]|3[01])\/(\d{4})', extracted_answer) + if match: + month = int(match.group(1)) + day = int(match.group(2)) + year = match.group(3) + answer = f'{month:02}/{day:02}/{year}' + else: + answer = 'N/A' + + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?" + r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer) + extracted_answer = extracted_answer.replace('[', '(').replace( + ']', ')').replace("'", '').replace('"', '') + match = re.search( + 
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?," + r"?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f'({weeks}, {days})' + else: + answer = 'N/A' + elif calid in [ + 4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48, + 51, 69 + ]: + # Output Type: integer A + match = re.search(r'(\d+) out of', extracted_answer) + if match: # cases like "3 out of 5" + answer = match.group(1) + else: + match = re.search(r'-?\d+(, ?-?\d+)+', extracted_answer) + if match: # cases like "3, 4, 5" + answer = str(len(match.group(0).split(','))) + else: + # match = re.findall(r"(? 0: # find the last integer + answer = match[-1][0] + # answer = match[-1].lstrip("0") + else: + answer = 'N/A' + elif calid in [ + 2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39, + 40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67 + ]: + # Output Type: decimal + match = re.search(r'str\((.*)\)', extracted_answer) + if match: + expression = match.group(1).replace('^', '**').replace( + 'is odd', '% 2 == 1').replace('is even', '% 2 == 0').replace( + 'sqrt', 'math.sqrt').replace('.math', '').replace( + 'weight', + '').replace('height', '').replace('mg/dl', '').replace( + 'g/dl', '').replace('mmol/L', '').replace( + 'kg', '').replace('g', + '').replace('mEq/L', '') + expression = expression.split('#')[0] + if expression.count('(') > expression.count(')'): # add missing ') + expression += ')' * (expression.count('(') - + expression.count(')')) + elif expression.count(')') > expression.count( + '('): # add missing ( + expression = '(' * (expression.count(')') - + expression.count('(')) + expression + try: + answer = eval(expression, {'__builtins__': None}, { + 'min': min, + 'pow': pow, + 'round': round, + 'abs': abs, + 'int': int, + 'float': float, + 'math': math, + 'np': np, + 'numpy': np + }) + except Exception: + print(f'Error in evaluating expression: {expression}') + answer = 'N/A' + else: + match = re.search(r'(-?\d+(\.\d+)?)\s*mL/min/1.73', + extracted_answer) + if match: # cases like "8.1 mL/min/1.73 m\u00b2" + answer = eval(match.group(1)) + else: + match = re.findall(r'(-?\d+(\.\d+)?)\%', extracted_answer) + if len(match) > 0: # cases like "53.1%" + answer = eval(match[-1][0]) / 100 + else: + match = re.findall(r'(-?\d+(\.\d+)?)', extracted_answer) + if len( + match + ) > 0: # cases like "8.1 mL/min/1.73 m\u00b2" or "11.1" + answer = eval(match[-1][0]) + else: + answer = 'N/A' + if answer != 'N/A': + answer = str(answer) + + return answer, explanation + + +def _parse(item, prompt_mode): + item['row_number'] = item['Row Number'] + item['calculator_id'] = item['Calculator ID'] + item['calculator_name'] = item['Calculator Name'] + item['category'] = item['Category'] + item['output_type'] = item['Output Type'] + item['note_id'] = item['Note ID'] + item['note_type'] = item['Note Type'] + item['patient_note'] = item['Patient Note'] + item['question'] = item['Question'] + item['relevant_entities'] = item['Relevant Entities'] + item['ground_truth_answer'] = item['Ground Truth Answer'] + item['lower_limit'] = item['Lower Limit'] + item['upper_limit'] = item['Upper Limit'] + item['ground_truth_explanation'] = item['Ground Truth Explanation'] + return item + + +@LOAD_DATASET.register_module() +class MedCalc_BenchDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + data_files = { + 'test': 'data/test-00000-of-00001.parquet', + 'train': 'data/train-00000-of-00001.parquet' + } + dataset = 
load_dataset(path, data_files=data_files, split='test') + # dataset = dataset.select(range(2)) + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode), + load_from_cache_file=False) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + return dataset + + +class MedCalcOfficial_Evaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + calculator_id = test_set['calculator_id'][idx] + lower_limit = test_set['lower_limit'][idx] + upper_limit = test_set['upper_limit'][idx] + row_number = test_set['row_number'][idx] + note_id = test_set['note_id'][idx] + category = test_set['category'][idx] + question = test_set['question'][idx] + calculator_name = test_set['calculator_name'][idx] + patient_note = test_set['patient_note'][idx] + ground_truth_explanation = test_set['ground_truth_explanation'][ + idx] + ground_truth_answer = test_set['ground_truth_answer'][idx] + try: + answer_value, explanation = extract_answer( + i, int(calculator_id)) + + print(answer_value) + print(explanation) + + correctness = check_correctness(answer_value, + ground_truth_answer, + calculator_id, upper_limit, + lower_limit) + + status = 'Correct' if correctness else 'Incorrect' + + outputs = { + 'Row Number': int(row_number), + 'Calculator Name': calculator_name, + 'Calculator ID': calculator_id, + 'Category': category, + 'Note ID': note_id, + 'Patient Note': patient_note, + 'Question': question, + 'LLM Answer': answer_value, + 'LLM Explanation': explanation, + 'Ground Truth Answer': ground_truth_answer, + 'Ground Truth Explanation': ground_truth_explanation, + 'Result': status + } + + except Exception as e: + outputs = { + 'Row Number': int(row_number), + 'Calculator Name': calculator_name, + 'Calculator ID': calculator_id, + 'Category': category, + 'Note ID': note_id, + 'Patient Note': patient_note, + 'Question': question, + 'LLM Answer': str(e), + 'LLM Explanation': str(e), + 'Ground Truth Answer': ground_truth_answer, + 'Ground Truth Explanation': ground_truth_explanation, + 'Result': 'Incorrect' + } + status = 'Incorrect' + count += 1 + if status == 'Correct': + correct += 1 + details.append(outputs) + + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/build/lib/opencompass/datasets/MedQA.py b/build/lib/opencompass/datasets/MedQA.py new file mode 100644 index 0000000000000000000000000000000000000000..256f991034a1f85d4d1e087360de1525a3d29de2 --- /dev/null +++ b/build/lib/opencompass/datasets/MedQA.py @@ -0,0 +1,29 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MedQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path) + for data in ds['train']: + data['label'] = data['answer_idx'] + choices = '' + for option in data['options']: + choices += option + '. 
' + data['options'][option] + '\n' + data['choices'] = choices + + dataset.append(data) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + dataset = MedQADataset.load_single(path) + return dataset diff --git a/build/lib/opencompass/datasets/MedXpertQA.py b/build/lib/opencompass/datasets/MedXpertQA.py new file mode 100644 index 0000000000000000000000000000000000000000..f016297ab6c35f6cdc42c0bd0136c7a3cedf9699 --- /dev/null +++ b/build/lib/opencompass/datasets/MedXpertQA.py @@ -0,0 +1,225 @@ +import re + +from datasets import Dataset, load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_logger + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('options', [])) - 1) + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class MedXpertQADataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + dataset = load_dataset(path, 'Text', split='test') + # dataset = load_dataset(path, 'Text', split='dev') + + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode)) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + + return dataset + + +class MedXpertQAEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + method = test_set['prompt_mode'][0] + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + i = answer_cleansing(method, i, test_set['options'][idx], + test_set['label'][idx]) + detail = {'pred': i, 'answer': j, 'correct': False} + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module() +def answer_cleansing( + method: str, + prediction: str, + options: list, + label: str, +) -> str: + + # Clean up unwanted phrases in the prediction + for unwanted_phrase in [ + 'I understand', + 'A through J', + 'A through E', + 'A through D', + ]: + prediction = prediction.replace(unwanted_phrase, '') + + options_num = len(options) + options = [chr(65 + i) for i in range(options_num)] + options_str = r'\b(' + '|'.join(options) + r')\b' + prediction = re.findall(options_str, prediction) + + if len(prediction) == 0: + prediction = [] + else: + # If there is a "label" and its length is 1, + # process prediction accordingly + if len(label) == 1: + if method == 'few-shot': + answer_flag = True if len(prediction) > 1 else False + # choose the first or last element based on the answer_flag + if answer_flag: + prediction = [prediction[0]] + else: + prediction = [prediction[-1]] + elif method == 'zero-shot': + # choose the first element in list + prediction = [prediction[0]] + else: + raise ValueError('Method is not properly defined ...') + + # Remove trailing period if it exists + if prediction[0] and prediction[0].endswith('.'): + prediction[0] = prediction[0][:-1] + + return prediction[0] + + +def _generic_llmjudge_postprocess(judgement: str): + match = re.search(r'(A|B)', judgement) + grade_letter = (match.group(0) if match else 'B' + ) # Default to "INCORRECT" if no match + return grade_letter + + +def MedXpertQA_llmjudge_postprocess( + 
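The `answer_cleansing` routine above reduces a free-form model reply to a single option letter: it builds a `\b(A|B|...)\b` pattern from the option count, collects every hit with `re.findall`, and then keeps the first hit (zero-shot) or applies the first/last heuristic (few-shot). In miniature:

import re

reply = 'The answer is (B). Option D is a distractor.'
letters = re.findall(r'\b(A|B|C|D)\b', reply)
prediction = letters[0]  # zero-shot rule: the first match wins
assert prediction == 'B'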
output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + # Get the original dataset + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = {'medical_task': {}, 'body_system': {}, 'question_type': {}} + + total_correct = 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + medical_task = sample.get('medical_task', 'unknown') + body_system = sample.get('body_system', 'unknown') + question_type = sample.get('question_type', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + ('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'question': sample['question'], + 'options': sample['options'], + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'medical_task': medical_task, + 'body_system': body_system, + 'question_type': question_type, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'MedXpertQA-{key}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/build/lib/opencompass/datasets/Medbullets.py b/build/lib/opencompass/datasets/Medbullets.py new file mode 100644 index 0000000000000000000000000000000000000000..1e7e9a63bbbd4c7babd97224f0b0302b819c9fbd --- /dev/null +++ b/build/lib/opencompass/datasets/Medbullets.py @@ -0,0 +1,243 @@ +import re + +import pandas as pd +from datasets import Dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path, get_logger + +from 
.base import BaseDataset
+
+
+def _parse(item: dict, prompt_mode: str) -> dict:
+    # Build the option list, skipping an empty 'ope' field
+    options_keys = ['opa', 'opb', 'opc', 'opd']
+    if item.get('ope', '') != '':
+        options_keys.append('ope')
+    options_list = [item.get(k, '') for k in options_keys]
+    item['options'] = options_list
+
+    # Build the lettered option string
+    options_str = '\n'.join(
+        [f'{chr(65 + i)}. {opt}' for i, opt in enumerate(options_list)])
+
+    # Append the options to the end of the question
+    item['question'] = f"{item.get('question', '')}\n{options_str}"
+
+    # Label and other fields
+    item['label'] = item.get('answer_idx')
+    item['prompt_mode'] = prompt_mode
+    item['start'] = chr(65)
+    item['end'] = chr(65 + len(options_list) - 1)
+    return item
+
+
+@LOAD_DATASET.register_module()
+class MedbulletsDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
+        # Read the CSV file into a DataFrame and convert NaN to ''
+        path = get_data_path(path)
+        df = pd.read_csv(path, encoding='utf-8')
+        df = df.fillna('')
+
+        # Convert to a list of dicts
+        data_list = df.to_dict(orient='records')
+
+        # Wrap the record list in a Dataset
+        dataset = Dataset.from_list(data_list)
+
+        # Parse according to the prompt mode
+        if prompt_mode == 'zero-shot':
+            dataset = dataset.map(lambda item: _parse(item, prompt_mode))
+        elif prompt_mode == 'few-shot':
+            pass  # TODO: Implement few-shot prompt handling
+        return dataset
+
+
+class MedbulletsEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, test_set):
+        method = test_set['prompt_mode'][0]
+
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for idx, (i, j) in enumerate(zip(predictions, references)):
+            i = answer_cleansing(method, i, test_set['options'][idx],
+                                 test_set['label'][idx])
+            detail = {
+                'pred': i,
+                'answer': j,
+                'correct': False,
+                'question_type': test_set['question_type'][idx]
+            }
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
+
+
+@TEXT_POSTPROCESSORS.register_module()
+def answer_cleansing(
+    method: str,
+    prediction: str,
+    options: list,
+    label: str,
+) -> str:
+
+    # Clean up unwanted phrases in the prediction
+    for unwanted_phrase in [
+            'I understand',
+            'A through J',
+            'A through E',
+            'A through D',
+    ]:
+        prediction = prediction.replace(unwanted_phrase, '')
+
+    options_num = len(options)
+    options = [chr(65 + i) for i in range(options_num)]
+    options_str = r'\b(' + '|'.join(options) + r')\b'
+    prediction = re.findall(options_str, prediction)
+
+    if len(prediction) == 0:
+        # No option letter found: return an empty string so the caller
+        # always receives a str, never an empty list.
+        return ''
+
+    # If there is a "label" and its length is 1,
+    # process prediction accordingly
+    if len(label) == 1:
+        if method == 'few-shot':
+            answer_flag = True if len(prediction) > 1 else False
+            # choose the first or last element based on the answer_flag
+            if answer_flag:
+                prediction = [prediction[0]]
+            else:
+                prediction = [prediction[-1]]
+        elif method == 'zero-shot':
+            # choose the first element in list
+            prediction = [prediction[0]]
+        else:
+            raise ValueError('Method is not properly defined ...')
+
+    # Remove trailing period if it exists
+    if prediction[0] and prediction[0].endswith('.'):
+        prediction[0] = prediction[0][:-1]
+
+    return prediction[0]
+
+
+def _generic_llmjudge_postprocess(judgement: str):
+    match = re.search(r'(A|B)', judgement)
+    grade_letter = (match.group(0) if match else 'B'
+                    )  # Default to "INCORRECT" if no match
+    return grade_letter
+
+
+def medbullets_llmjudge_postprocess(
output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = {'question_type': {}} + + total_correct = 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + question_type = sample.get('question_type', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('question_type', question_type), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('question_type', question_type), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + ('question_type', question_type), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'question_type': question_type, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'Medbullets-{key}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/build/lib/opencompass/datasets/OlympiadBench.py b/build/lib/opencompass/datasets/OlympiadBench.py new file mode 100644 index 0000000000000000000000000000000000000000..2b4cb9849a05286f0bb3c3b73e8dcca51bb33119 --- /dev/null +++ b/build/lib/opencompass/datasets/OlympiadBench.py @@ -0,0 +1,795 @@ +import json +import math +import os +import re +from os import environ +from typing import Dict + +import sympy as sp +from datasets import Dataset, DatasetDict +from sympy import Eq, Pow, simplify, sympify +from sympy.parsing.latex import parse_latex + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.registry import (ICL_PROMPT_TEMPLATES, LOAD_DATASET, + TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path + +from .base import BaseDataset + +# Load Dataset + + +@LOAD_DATASET.register_module() +class OlympiadBenchDataset(BaseDataset): + """Dataset for OlympiadBench. 
+ + Args: + path (str): Path to dataset directory + name (str): Name of specific json file to load + e.g. 'OE_TO_maths_en_COMP' + """ + + @staticmethod + def load(path: str, name: str = None, **kwargs): + """Load dataset. + + Args: + path (str): Path to dataset directory + name (str): Name of specific json file to load + + Returns: + DatasetDict: Dataset with test and train splits + """ + path = get_data_path(path) + dataset = DatasetDict() + raw_data = [] + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + + ms_dataset = MsDataset.load(path, split='train') + for item in ms_dataset: + raw_data.append({ + 'problem': + item['question'], + 'solution': + item['final_answer'][0], + 'language': + item['language'], + 'subject': + item['subject'], + 'question_type': + item['question_type'], + 'answer_type': + item['answer_type'], + 'is_multiple_answer': + item['is_multiple_answer'], + 'unit': + item['unit'], + 'error': + item['error'], + 'questions': + item, # may not be used + }) + else: + # Construct file path using name parameter + if name is None: + raise ValueError( + "Must specify 'name' parameter to load specific json file") + + # file_path = os.path.join(path, name, f'{name}.json') + file_path = os.path.join(path, f'{name}.json') + if not os.path.exists(file_path): + raise FileNotFoundError(f'File not found: {file_path}') + + # Load the specified json file + data = json.load(open(file_path, encoding='utf-8')) + for item in data: + raw_data.append({ + 'problem': + item['question'], + 'solution': + item['final_answer'][0], + 'language': + item['language'], + 'subject': + item['subject'], + 'question_type': + item['question_type'], + 'answer_type': + item['answer_type'], + 'is_multiple_answer': + item['is_multiple_answer'], + 'unit': + item['unit'], + 'error': + item['error'], + 'questions': + item, # may not be used + }) + + dataset['test'] = Dataset.from_list(raw_data) + dataset['train'] = Dataset.from_list(raw_data) + return dataset + + +# Construct Prompt + + +def get_single_answer_type_text(answer_type, is_chinese): + if '-' in answer_type: # No need now + answer_type = answer_type[:answer_type.find('-')] + chinese_answer_type_dict = { + 'Numerical': '数值', + 'Expression': '表达式', + 'Equation': '方程', + 'Interval': '区间', + } + english_answer_type_dict = { + 'Numerical': 'a numerical value', + 'Expression': 'an expression', + 'Equation': 'an equation', + 'Interval': 'an interval', + } + + for t in ['Numerical', 'Expression', 'Equation', 'Interval']: + if t in answer_type: + if is_chinese: + return chinese_answer_type_dict[t] + else: + return english_answer_type_dict[t] + raise ValueError(f'Error parsing answer type {answer_type}!') + + +def get_answer_type_text(answer_type, is_chinese, multiple_answer): + if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type): + return '' + if not multiple_answer: + answer_text = get_single_answer_type_text(answer_type, is_chinese) + if is_chinese: + return f',答案类型为{answer_text}' + else: + return (f'The answer of The problem should be ' + f'{answer_text}. ') + # Multiple answers case + if ',' not in answer_type: # Same answer type for all answers + answer_text = get_single_answer_type_text(answer_type, is_chinese) + if is_chinese: + return f',题目有多个答案,答案类型均为{answer_text}' + else: + return (f'The problem has multiple answers, each of them ' + f'should be {answer_text}. 
') + # Different answer types + answer_types = answer_type.split(',') + answer_types = [ + get_single_answer_type_text(t, is_chinese) for t in answer_types + ] + if len(set(answer_types)) == 1: + answer_text = answer_types[0] + if is_chinese: + return f',题目有多个答案,答案类型均为{answer_text}' + else: + return (f'The problem has multiple answers, each of them ' + f'should be {answer_text}. ') + else: + if is_chinese: + answer_text = '、'.join(answer_types) + return f',题目有多个答案,答案类型分别为{answer_text}' + else: + answer_text = ', '.join(answer_types) + return (f'The problem has multiple answers, ' + f'with the answers in order being {answer_text}. ') + + +class OlympiadBenchPrompter: + + def __init__(self): + pass + + def make_prompt( + self, + language, + subject, + question_type, + answer_type, + is_multiple_answer, + unit, + ): + self.is_chinese = language == 'Chinese' + self.is_math = subject == 'Math' + self.is_theorem_proving = question_type == 'Theorem proof' + """Generate prompt based on question properties.""" + if self.is_chinese: + subject_content = '数学' if self.is_math else '物理' + if self.is_theorem_proving: + prompt = (f'以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,' + f'运用逻辑推理及常用定理证明题目中的命题。证明过程中使用的变量和公式请使用LaTeX格式表示。') + else: + answer_type_text = get_answer_type_text( + answer_type, + is_chinese=True, + multiple_answer=is_multiple_answer, + ) + if is_multiple_answer: + multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}' + else: + multiple_answer_text = '\\boxed{答案}' + unit_text = '' + if unit: + multiple_answer_text += '(单位)' + unit_text = ',注意答案的单位不要放在\\boxed{}中' + prompt = (f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。' + f'请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的' + f'变量和公式请使用LaTeX格式表示。请在最后以"所以最终答案是' + f'{multiple_answer_text}。"显式给出结果{unit_text}。') + else: + subject_content = 'Math' if self.is_math else 'Physics' + if self.is_theorem_proving: + prompt = ( + f'The following is a theorem proving problem from an ' + f'International {subject_content} competition. Please use ' + f'logical reasoning and common theorems to prove the ' + f'proposition in the problem according to the given ' + f'requirements. Please use LaTeX format to represent the ' + f'variables and formulas used in the proof.') + else: + if is_multiple_answer: + multiple_answer_text = ( + '\\boxed{multiple answers connected with commas}') + else: + multiple_answer_text = '\\boxed{answer}' + unit_text = '' + if unit: + multiple_answer_text += '(unit)' + unit_text = (', note that the unit of the answer should ' + 'not be included in \\boxed{}') + answer_type_text = get_answer_type_text( + answer_type, + is_chinese=False, + multiple_answer=is_multiple_answer, + ) + prompt = ( + f'The following is an open-ended problem from an ' + f'International {subject_content} competition. ' + f'{answer_type_text}Please calculate the answer according ' + f'to the given requirements and the information provided. ' + f'Please use LaTeX format to represent the variables and ' + f'formulas used in the solution process and results. ' + f'Please end your solution with "So the final answer is ' + f'{multiple_answer_text}." 
and give the result explicitly'
+                    f'{unit_text}.')
+        # Add problem statement to the prompt
+        prompt = prompt + '\n' + '{problem}' + '\n'
+        # Add step-by-step reasoning instruction
+        if self.is_chinese:
+            prompt += '\n请通过逐步推理来解答问题,并把最终答案放置于\\boxed{}中。'
+        else:
+            prompt += ('\nPlease reason step by step, and put your final '
+                       'answer within \\boxed{}.')
+        return prompt
+
+
+# Evaluate
+
+
+class MathJudger:
+
+    def __init__(self):
+        self.special_signal_map = {
+            '\\left': '',
+            '\\right': '',
+            '∶': ':',
+            ',': ',',
+            '$': '',
+            '\\approx': '=',
+            '\\simeq': '=',
+            '\\sim': '=',
+            '^\\prime': "'",
+            '^{\\prime}': "'",
+            '^\\circ': '',
+            '%': '',
+        }
+        self.pi = parse_latex('\\pi')
+        self.precision = 1e-8
+
+    def split_by_comma(self, expr: str):
+        # Split a top-level comma-separated expression, ignoring commas
+        # that sit inside parentheses or brackets.
+        in_bracket_num = 0
+        splitted_expr = []
+        start_idx = 0
+        for i, char in enumerate(expr):
+            if char == '(' or char == '[':
+                in_bracket_num += 1
+            elif char == ')' or char == ']':
+                in_bracket_num -= 1
+            elif char == ',' and in_bracket_num == 0:
+                splitted_expr.append(expr[start_idx:i].strip())
+                start_idx = i + 1
+
+        if start_idx < len(expr):
+            splitted_expr.append(expr[start_idx:].strip())
+
+        return splitted_expr
+
+    def trans_plus_minus_sign(self, expr_list: list):
+        # Expand every '\pm' into its '+' and '-' variants.
+        new_expr_list = []
+        for expr in expr_list:
+            if '\\pm' in expr:
+                new_expr_list.append(expr.replace('\\pm', '+'))
+                new_expr_list.append(expr.replace('\\pm', '-'))
+            else:
+                new_expr_list.append(expr)
+
+        return new_expr_list
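+
+    # A quick, self-contained illustration of the two helpers above; the
+    # input strings are invented for demonstration purposes.
+    def _demo_helpers(self):
+        assert self.split_by_comma('(1, 2), 3') == ['(1, 2)', '3']
+        assert self.trans_plus_minus_sign(['1 \\pm 2']) == \
+            ['1 + 2', '1 - 2']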
+    def judge(self, expression1, expression2, precision=1e-8):
+        # (expression1 is assumed to be the ground truth)
+        precision = precision if isinstance(precision, list) else [precision]
+
+        try:
+            expression1, expression2 = self.preprocess(expression1,
+                                                       expression2)
+        except Exception:  # TODO: catch specific exceptions
+            return False
+        if expression1 == expression2:
+            return True
+
+        # Strip Chinese characters from both strings
+        expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
+        expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)
+
+        expression1 = self.split_by_comma(expression1)
+        expression2 = self.split_by_comma(expression2)
+
+        temp_list1 = self.trans_plus_minus_sign(expression1)
+        temp_list2 = self.trans_plus_minus_sign(expression2)
+
+        # Build the list of error tolerances
+        if len(precision) <= 1:
+            precision = precision * len(temp_list1)
+
+        if len(temp_list1) != len(temp_list2):
+            return False
+
+        # Check whether the elements of the two lists can be paired off
+        # one-to-one, with the members of each pair being equal
+        idx = -1
+        while len(temp_list1) != 0:
+            idx = (idx + 1) % len(temp_list1)
+
+            item1 = temp_list1[idx]
+            self.precision = precision[idx]
+
+            for item2 in temp_list2:
+                if self.is_equal(item1, item2):
+                    temp_list1.remove(item1)
+                    temp_list2.remove(item2)
+                    precision.remove(self.precision)
+                    break
+            else:
+                return False
+
+        # If every element was matched and removed, the lists pair up
+        return True
+
+    def is_interval(self, epr):
+        return epr.startswith(('(', '[')) and epr.endswith((')', ']'))
+
+    def sympy_sub_pi(self, expression_sympy):
+        return expression_sympy.subs(self.pi, math.pi)
+
+    def is_equal(self, expression1, expression2):
+        if (expression1 == expression2 and expression1 != ''
+                and expression2 != ''):
+            return True
+
+        # First check whether the two strings are intervals
+        if self.is_interval(expression1) and self.is_interval(expression2):
+            try:
+                if self.interval_equal(expression1, expression2):
+                    return True
+            except Exception:  # TODO: catch specific exceptions
+                return False
+
+        # Then check for numerical equality
+        try:
+            if self.numerical_equal(expression1, expression2):
+                return True
+        except Exception:  # TODO: catch specific exceptions
+            pass
+
+        # Then check for expression-level equality
+        try:
+            if self.expression_equal(
+                    expression1, expression2) and not ('=' in expression1
+                                                       and '=' in expression2):
+                return True
+        except Exception:  # TODO: catch specific exceptions
+            pass
+
+        # Finally check for equation equality
+        try:
+            if self.equation_equal(expression1, expression2):
+                return True
+        except Exception:  # TODO: catch specific exceptions
+            pass
+
+        return False
+
+    def numerical_equal(
+        self,
+        expression1: str,
+        expression2: str,
+        include_percentage: bool = True,
+    ):
+        """(expression1 is assumed to be the ground truth) Check whether
+        two numerical values are equal within the allowed error.
+
+        Step 1: account for variants that may carry a percent sign.
+        Step 2: compare against the tolerance, as math.isclose would.
+        """
+        reference = float(expression1)
+        prediction = float(expression2)
+
+        if include_percentage:
+            gt_result = [reference / 100, reference, reference * 100]
+        else:
+            gt_result = [reference]
+
+        for item in gt_result:
+            if abs(item - prediction) <= self.precision * 1.01:
+                return True
+        return False
+
+    def expression_equal(self, exp1, exp2):
+        """(expression1 is assumed to be the ground truth) Check whether
+        two expressions are mathematically equivalent.
+
+        Step 1: extract the expression proper, since some models answer
+        "x=1" instead of "1".
+        Step 2: use sympy to decide equivalence.
+        """
+
+        # Keep only the part on the right-hand side of an equals sign
+        def extract_expression(expression):
+            if '=' in expression:
+                expression = expression.split('=')[1]
+            return expression.strip()
+
+        exp1 = extract_expression(exp1)
+        exp2 = extract_expression(exp2)
+
+        # Convert the expressions into a form sympy can process
+        expr1_sym = sympify(parse_latex(exp1))
+        expr2_sym = sympify(parse_latex(exp2))
+
+        if expr1_sym == expr2_sym:
+            return True
+        else:
+            expr1_sym = self.sympy_sub_pi(expr1_sym)
+            expr2_sym = self.sympy_sub_pi(expr2_sym)
+
+            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (
+                    not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
+                return False
+            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
+                try:
+                    if not (self.can_compute_power(expr1_sym)
+                            and self.can_compute_power(expr2_sym)):
+                        print(f'These two numbers cannot be calculated by '
+                              f'the current computer for: '
+                              f'"{str(expr1_sym)}" and "{str(expr2_sym)}"')
+                        return False
+
+                    if (abs(expr1_sym.evalf() - expr2_sym.evalf()) <=
+                            self.precision * 1.01):
+                        return True
+                    else:
+                        return False
+                except Exception:  # TODO: catch specific exceptions
+                    return False
+            else:
+                try:
+                    simplified_expr = simplify(expr1_sym - expr2_sym)
+
+                    num_value = simplified_expr.evalf()
+
+                    return abs(num_value) < 1e-3
+                except Exception:  # TODO: catch specific exceptions
+                    return False
+
+    def equation_equal(self, expression1, expression2):
+        """
+        (expression1 is assumed to be Ground_Truth)
+        Function: Check if two equations are mathematically equivalent
+        Step 1: Simplify equations to standard form with right side equal to 0
+        Step 2: Use sympy library to calculate quotient of left sides,
+        if quotient or its reciprocal is integer, equations are equivalent
+        """
+
+        # Convert equations to sympy format with right side moved to left side
+        def simplify_equation(latex_eq):
+            # Split left and right sides of equation
+            lhs, rhs = latex_eq.split('=')
+
+            # Parse LaTeX expressions using parse_latex
+            lhs_expr = parse_latex(lhs)
+            rhs_expr = parse_latex(rhs)
+
+            # Create equation object
+            equation = Eq(lhs_expr, rhs_expr)
+
+            # Simplify equation by moving right side to left
+            simplified_eq = simplify(equation.lhs - equation.rhs)
+
+            return simplified_eq
+
+        expr1_sym = simplify_equation(expression1)
+        expr2_sym = simplify_equation(expression2)
+
+        division_result_1 = simplify(expr1_sym / expr2_sym)
+        division_result_2 = simplify(expr2_sym / expr1_sym)
+
+        # If division result or its reciprocal is
+        # non-zero integer, equations are equivalent
+        if (division_result_1.is_Integer
+                and division_result_1 != 0) or (division_result_2.is_Integer
+                                                and division_result_2 != 0):
+            return True
+        else:
+            return False
+
+    def interval_equal(self, expression1, expression2):
+        """
+        Function: Check if two intervals are 
mathematically equivalent + Step 1: Simplify interval expressions, + remove irrelevant symbols + like "\\left", "\\right", and "x \\in" + Step 2: Compare brackets and mathematical expressions in between + """ + + def compare_two_interval(inter1, inter2): + # First compare brackets on both sides + if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: + return False + + inter1 = inter1.strip('[]()') + inter2 = inter2.strip('[]()') + + # Split interval into left and right parts + items_1 = inter1.split(',') + items_2 = inter2.split(',') + + for item_1, item_2 in zip(items_1, items_2): + if not self.expression_equal(item_1, item_2): + return False + return True + + interval1 = expression1 + interval2 = expression2 + + if interval1 == interval2: + return True + else: + inter_list1 = interval1.split('\\cup') + inter_list2 = interval2.split('\\cup') + + if len(inter_list1) != len(inter_list2): + return False + else: + for inter1, inter2 in zip(inter_list1, inter_list2): + if not compare_two_interval(inter1, inter2): + return False + return True + + def preprocess(self, expression1, expression2): + """Extract and preprocess expressions from model output.""" + + def extract_boxed_content(latex_str): + # Find all \boxed{...} structures + boxed_matches = re.finditer(r'\\boxed{', latex_str) + results = '' + + for match in boxed_matches: + start_index = match.end() + end_index = start_index + stack = 1 + + # Search from after \boxed{ until + # finding matching closing brace + while stack > 0 and end_index < len(latex_str): + if latex_str[end_index] == '{': + stack += 1 + elif latex_str[end_index] == '}': + stack -= 1 + end_index += 1 + + if stack == 0: + # Extract content inside \boxed{} + content = latex_str[start_index:end_index - 1] + results += content + ',' + else: + raise ValueError('Mismatched braces in LaTeX string.') + + # If no \boxed{} found, extract formulas from last line + if results == '': + last_line_ans = latex_str.strip().split('\n')[-1] + dollar_pattern = r'\$(.*?)\$' + answers = re.findall(dollar_pattern, last_line_ans) + + if answers: + for ans in answers: + results += ans + ',' + else: + results = latex_str + + return results + + def special_symbol_replace(expression): + if '\\in ' in expression: + expression = expression.split('\\in ')[1] + + # Replace special characters that + # don't affect LaTeX parsing (decorative) + for signal in self.special_signal_map: + expression = expression.replace( + signal, self.special_signal_map[signal]) + + expression = expression.strip('\n$,.:;^_=+`!@#$%^&*~,。') + + pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' + expression = re.sub(pattern, r'\1', expression) + + return expression + + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content( + expression2) + exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2) + + return exp1, exp2 + + def can_compute_power(self, expr): + """Check if the power expression can be computed. + + Parameters: + expr (sympy expression): The expression to check. + + Returns: + bool: True if the expression can be computed, False otherwise. 
+ """ + # Check if the expression is a power expression + if isinstance(expr, Pow): + # Extract the base and the exponent + base, exp = expr.as_base_exp() + + # Check if the base and the exponent are numbers + if base.is_number and exp.is_number: + # Set a threshold for the maximum size of the exponent + # can be adjusted based on the computing environment + MAX_EXP = 1000 + + # Check if the exponent is greater than the threshold + if abs(exp.evalf()) > MAX_EXP: + return False + else: + return True + else: + # If the base or the exponent is not a number, + # we cannot compute the power + return False + else: + # If the expression is not a power expression, + # return True as it is not the case we are checking for + return True + + +@TEXT_POSTPROCESSORS.register_module('olympiadbench_postprocess_v2') +def olympiadbench_postprocess_v2(text: str, + is_chinese: bool = False, + is_deepseek: bool = False) -> str: + """Extract answer from model output.""" + # deepseekmath has special answering format + if is_deepseek: + if is_chinese: + matches = re.findall('## 解题答案(.*)', text) + else: + matches = re.findall('The answer is: (.*)', text) + else: + if is_chinese: + matches = re.findall('所以最终答案是(.*)', text) + else: + matches = re.findall('So the final answer is (.*)', text) + + # If found matches, take the last one, otherwise return the whole text + if matches: + return matches[-1].strip() + return text + + +class OlympiadBenchEvaluator(BaseEvaluator): + """Evaluator for OlympiadBench dataset.""" + + def __init__(self, version='v1'): + assert version in ['v1', 'v2'] + self.version = version + self.judger = MathJudger() + + def score(self, predictions, references): # Remove questions parameter + """Calculate accuracy score. + + Args: + predictions (list): List of model predictions + references (list): List of ground truth answers + """ + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different length' + } + + correct = 0 + count = 0 + details = [] + + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + count += 1 + + # Get precision/error threshold from reference if available + precision = 1e-8 + if isinstance(ref, dict) and 'error' in ref: + if ',' in ref['error']: + # Multiple precisions for multiple answers + precisions = ref['error'].split(',') + precisions = [float(p) if p else 1e-8 for p in precisions] + precision = precisions + else: + precision = float(ref['error']) + + # Check if answer is correct + try: + if (isinstance(ref, dict) and 'answer_type' in ref + and 'Tuple' in ref['answer_type']): + # Special handling for tuple type answers + is_correct = self.judger.judge(pred, + ref['final_answer'][0], + precision) + else: + is_correct = self.judger.judge(pred, ref, precision) + + if is_correct: + correct += 1 + detail['correct'] = True + except Exception as e: # 处理具体异常 + detail['error'] = str(e) + + details.append(detail) + + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@ICL_PROMPT_TEMPLATES.register_module() +class OlympiadBenchTemplate(PromptTemplate): + """Template for OlympiadBench dataset.""" + + def __init__(self): + # Define basic template structure + template = dict(round=[dict(role='HUMAN', prompt='{prompt}')]) + super().__init__(template=template) + self.prompter = OlympiadBenchPrompter() + + def generate_item(self, entry: Dict, *args, **kwargs) -> str: + """Generate prompt for a single item.""" + problem = entry.get('problem', '') + 
+
+
+class OlympiadBenchEvaluator(BaseEvaluator):
+    """Evaluator for OlympiadBench dataset."""
+
+    def __init__(self, version='v1'):
+        assert version in ['v1', 'v2']
+        self.version = version
+        self.judger = MathJudger()
+
+    def score(self, predictions, references):
+        """Calculate accuracy score.
+
+        Args:
+            predictions (list): List of model predictions
+            references (list): List of ground truth answers
+        """
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        correct = 0
+        count = 0
+        details = []
+
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            count += 1
+
+            # Get precision/error threshold from reference if available
+            precision = 1e-8
+            if isinstance(ref, dict) and 'error' in ref:
+                if ',' in ref['error']:
+                    # Multiple precisions for multiple answers
+                    precisions = ref['error'].split(',')
+                    precisions = [float(p) if p else 1e-8 for p in precisions]
+                    precision = precisions
+                else:
+                    precision = float(ref['error'])
+
+            # Check if answer is correct
+            try:
+                if (isinstance(ref, dict) and 'answer_type' in ref
+                        and 'Tuple' in ref['answer_type']):
+                    # Special handling for tuple type answers
+                    is_correct = self.judger.judge(pred,
+                                                   ref['final_answer'][0],
+                                                   precision)
+                else:
+                    is_correct = self.judger.judge(pred, ref, precision)
+
+                if is_correct:
+                    correct += 1
+                    detail['correct'] = True
+            except Exception as e:  # TODO: catch specific exceptions
+                detail['error'] = str(e)
+
+            details.append(detail)
+
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
+
+
+@ICL_PROMPT_TEMPLATES.register_module()
+class OlympiadBenchTemplate(PromptTemplate):
+    """Template for OlympiadBench dataset."""
+
+    def __init__(self):
+        # Define basic template structure
+        template = dict(round=[dict(role='HUMAN', prompt='{prompt}')])
+        super().__init__(template=template)
+        self.prompter = OlympiadBenchPrompter()
+
+    def generate_item(self, entry: Dict, *args, **kwargs) -> str:
+        """Generate prompt for a single item."""
+        problem = entry.get('problem', '')
+        language = entry.get('language', 'English')
+        subject = entry.get('subject', 'Math')
+        question_type = entry.get('question_type', '')
+        answer_type = entry.get('answer_type', '')
+        is_multiple_answer = entry.get('is_multiple_answer', False)
+        unit = entry.get('unit', '')
+
+        prompt = self.prompter.make_prompt(
+            language=language,
+            subject=subject,
+            question_type=question_type,
+            answer_type=answer_type,
+            is_multiple_answer=is_multiple_answer,
+            unit=unit,
+        )
+
+        new_entry = {'prompt': prompt, 'problem': problem}
+
+        return super().generate_item(new_entry, *args, **kwargs)
diff --git a/build/lib/opencompass/datasets/OpenFinData.py b/build/lib/opencompass/datasets/OpenFinData.py
new file mode 100644
index 0000000000000000000000000000000000000000..4777b17f09e07fb66c20d2971b9955a01022cb4a
--- /dev/null
+++ b/build/lib/opencompass/datasets/OpenFinData.py
@@ -0,0 +1,49 @@
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class OpenFinDataDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        path = get_data_path(path, local_mode=True)
+        with open(osp.join(path, f'{name}.json'), 'r') as f:
+            data = json.load(f)
+        return Dataset.from_list(data)
+
+
+@ICL_EVALUATORS.register_module()
+class OpenFinDataKWEvaluator(BaseEvaluator):
+
+    def __init__(self):
+        super().__init__()
+
+    def score(self, predictions, references):
+        assert len(predictions) == len(references)
+
+        scores = []
+        results = dict()
+
+        for i in range(len(references)):
+            # Every keyword (separated by '、') must appear in the
+            # prediction for the sample to count as a hit
+            all_hit = True
+            judgement = references[i].split('、')
+            for item in judgement:
+                if item not in predictions[i]:
+                    all_hit = False
+                    break
+            if all_hit:
+                scores.append(True)
+            else:
+                scores.append(False)
+
+        results['accuracy'] = round(sum(scores) / len(scores) * 100, 2)
+        return results
diff --git a/build/lib/opencompass/datasets/OpenSWI.py b/build/lib/opencompass/datasets/OpenSWI.py
new file mode 100644
index 0000000000000000000000000000000000000000..feddef23cc030cffbc8b6942a74741363f2d73cd
--- /dev/null
+++ b/build/lib/opencompass/datasets/OpenSWI.py
@@ -0,0 +1,96 @@
+import ast
+import json
+import math
+import os
+import re
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class OpenSWIDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        new_data = []
+        path = os.path.join(get_data_path(path), name)
+        for file in os.listdir(path):
+            if file.endswith('.jsonl'):
+                final_path = os.path.join(path, file)
+                with open(final_path, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        data = json.loads(line)
+                        new_data.append({
+                            'id': data['id_ddm'],
+                            'prompt': data['dialogs'][0]['content'],
+                            'ground_truth': data['ground_truth'],
+                            'subset': file.split('.')[0]
+                        })
+        dataset = Dataset.from_list(new_data)
+        return dataset
+
+
+def extract_list(text):
+    # Use a regex to pull the list out of a ```python\n[ ... ]``` block
+    matches = re.findall(r'(\[.*?\])', text, re.DOTALL)
+    if matches:
+        raw_list_str = matches[-1]
+        # Safely convert the string into a Python list object
+        try:
+            question_list = ast.literal_eval(raw_list_str)
+            return question_list
+        except Exception:
+            return None
+    else:
+        return None
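+
+
+# A small, self-contained check of extract_list; the completion text is
+# invented for illustration.
+def _demo_extract_list():
+    completion = 'Estimated scores:\n```python\n[0.5, 1.0, 2.5]\n```'
+    assert extract_list(completion) == [0.5, 1.0, 2.5]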
+
+
+@ICL_EVALUATORS.register_module()
+class OpenSWIMSEEvaluator(BaseEvaluator):
+    """RMSE evaluator for predicted score lists."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+
+        avg_score = 0
+        avg_valid = []
+        details = []
+        for prediction, reference in zip(predictions, references):
+            pred = extract_list(prediction)
+            ans = reference
+
+            if not pred or not all(
+                    isinstance(x, (int, float)) for x in pred):
+                detail = {'pred': None, 'answer': ans, 'valid': False}
+                pred = [0] * len(ans)
+            else:
+                detail = {'pred': pred, 'answer': ans, 'valid': True}
+                if len(pred) < len(ans):
+                    detail['valid'] = False
+                    pred = pred + [0] * (len(ans) - len(pred))
+                elif len(pred) > len(ans):
+                    detail['valid'] = False
+                    pred = pred[:len(ans)]
+            avg_valid.append(detail['valid'])
+            squared_errors = [(a - p)**2 for a, p in zip(ans, pred)]
+            rmse_score = math.sqrt(sum(squared_errors) / len(squared_errors))
+            detail['score'] = rmse_score
+            avg_score += rmse_score
+            details.append(detail)
+
+        score = avg_score / len(predictions)
+        valid = sum(avg_valid) / len(avg_valid)
+
+        return {'score': score, 'valid': valid * 100, 'details': details}
diff --git a/build/lib/opencompass/datasets/PI_LLM.py b/build/lib/opencompass/datasets/PI_LLM.py
new file mode 100644
index 0000000000000000000000000000000000000000..358e9b61febc09cc6f0dc38d81d05b518a7d074e
--- /dev/null
+++ b/build/lib/opencompass/datasets/PI_LLM.py
@@ -0,0 +1,106 @@
+import json
+
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class PILLMDataset(BaseDataset):
+    """
+    PI-LLM: Context (Proactive) Interference in Large Language Models
+
+    A benchmark dataset for evaluating context (proactive) interference
+    effects in LLMs through simple key-value tracking tasks. Tests and
+    shows LLM working memory limitations beyond context length.
+
+    ICML 2025 Long-Context Foundation Models Workshop - Accepted
+
+    Dataset Structure:
+    - exp_updates: Single mode (n_updates: 2-400)
+    - exp_keys: Two modes (easy: n_updates=125, hard: n_updates=350)
+    - exp_valuelength: Two modes (easy: n_updates=4, hard: n_updates=20)
+    - exp_sequential: Single mode (n_updates: 2-400, non-randomized)
+
+    Homepage: https://sites.google.com/view/cog4llm
+    Paper: https://arxiv.org/abs/2506.08184
+    HuggingFace: https://huggingface.co/datasets/giantfish-fly/pi-llm
+    """
+
+    @staticmethod
+    def load(**kwargs) -> Dataset:
+        """
+        Load PI-LLM dataset from HuggingFace Hub. 
+ + Args: + subset: Dataset subset ('core' or 'sequential_additional') + experiment_filter: Filter by experiment type + - 'exp_updates': Main updates experiment (2-400 updates) + - 'exp_keys': Keys experiment (varying n_keys, + fixed n_updates=125/350) + - 'exp_valuelength': Value length experiment + (varying lengths, fixed n_updates=4/20) + - 'exp_sequential': Sequential mode (randomize_mode=off) + max_samples: Maximum samples to load (for testing) + + Returns: + Dataset: HuggingFace dataset with samples + """ + subset = kwargs.get('subset', 'core') + experiment_filter = kwargs.get('experiment_filter', None) + max_samples = kwargs.get('max_samples', None) + + # Load from HuggingFace (parquet format supported natively) + dataset = load_dataset(kwargs.get('path', None), subset) + + # Extract test split + data = dataset['test'].to_list() + + # Filter by experiment type if specified + if experiment_filter: + if experiment_filter == 'exp_updates': + # Main updates experiment from core dataset + data = [ + item for item in data + if 'exp_updates' in item.get('experiment', '') + and 'randomoff' not in item.get('experiment', '') + ] + elif experiment_filter == 'exp_keys': + # Keys experiment from core dataset + data = [ + item for item in data + if 'exp_keys' in item.get('experiment', '') + ] + elif experiment_filter == 'exp_valuelength': + # Value length experiment from core dataset + data = [ + item for item in data + if 'exp_valuelength' in item.get('experiment', '') + ] + elif experiment_filter == 'exp_sequential': + # Sequential mode from sequential_additional dataset + data = [ + item for item in data + if 'extra_exp_updates_randomoff' in item.get( + 'experiment', '') + ] + + # Limit samples for testing if specified + if max_samples: + data = data[:max_samples] + + # Parse prompt strings to proper chat format + for item in data: + if isinstance(item.get('prompt'), str): + try: + item['prompt'] = json.loads(item['prompt']) + except json.JSONDecodeError: + # Fallback: wrap in basic chat format + item['prompt'] = [{ + 'role': 'user', + 'content': item['prompt'] + }] + + return Dataset.from_list(data) diff --git a/build/lib/opencompass/datasets/ProteinLMBench.py b/build/lib/opencompass/datasets/ProteinLMBench.py new file mode 100644 index 0000000000000000000000000000000000000000..bebaadfd676f4a350be64cc93511a0069af2db61 --- /dev/null +++ b/build/lib/opencompass/datasets/ProteinLMBench.py @@ -0,0 +1,58 @@ +from datasets import load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils.text_postprocessors import first_option_postprocess + +from .base import BaseDataset + + +def _parse(item): + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('options', [])) - 1) + new_options = [] + choices = '' + for i in range(len(item['options'])): + new_options.append(item['options'][i].split(': ')[-1]) + choices += chr(65 + + i) + '. 
' + item['options'][i].split(': ')[-1] + '\n' + item['question'] = (f'\nQuestion: {item["question"]}\n' + f'Answer Choices: \n{choices}') + item['options'] = new_options + item['label'] = chr(65 + int(item['answer'].split(' ')[-1]) - + 1) # Index from 1 in answer + return item + + +@LOAD_DATASET.register_module() +class ProteinLMBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, **kwargs): + dataset = load_dataset(path, 'evaluation', split='train') + dataset = dataset.map(lambda item: _parse(item)) + + return dataset + + +class ProteinLMBenchEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (prediction, + reference) in enumerate(zip(predictions, references)): + options = ''.join( + [chr(65 + i) for i in range(len(test_set['options'][idx]))]) + predict = first_option_postprocess(prediction, options) + detail = {'pred': predict, 'answer': reference, 'correct': False} + count += 1 + if predict == reference: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/build/lib/opencompass/datasets/PubMedQA.py b/build/lib/opencompass/datasets/PubMedQA.py new file mode 100644 index 0000000000000000000000000000000000000000..b0db32e336276bd1b25c38faf31152abd1b9f378 --- /dev/null +++ b/build/lib/opencompass/datasets/PubMedQA.py @@ -0,0 +1,34 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class PubMedQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path, 'pqa_labeled') + for data in ds['train']: + data['question'] = (f"CONTEXTS: {data['context']}\n" + f"QUESTION: {data['question']}") + choices = 'A. yes\nB. no\nC. maybe' + data['choices'] = choices + if data['final_decision'] == 'yes': + data['label'] = 'A. yes' + elif data['final_decision'] == 'no': + data['label'] = 'B. no' + else: + data['label'] = 'C. 
maybe'
+
+            dataset.append(data)
+
+        return Dataset.from_list(dataset)
+
+    @staticmethod
+    def load(path):
+        dataset = PubMedQADataset.load_single(path)
+        return dataset
diff --git a/build/lib/opencompass/datasets/QuALITY.py b/build/lib/opencompass/datasets/QuALITY.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9f5343528251570d6d7b8597336900403a64f57
--- /dev/null
+++ b/build/lib/opencompass/datasets/QuALITY.py
@@ -0,0 +1,61 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class QuALITYDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        path = get_data_path(path, local_mode=True)
+        dataset_list = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                for question in line['questions']:
+                    dataset_list.append({
+                        'article': line['article'],
+                        'question': question['question'],
+                        'A': question['options'][0],
+                        'B': question['options'][1],
+                        'C': question['options'][2],
+                        'D': question['options'][3],
+                        'gold_label': 'ABCD'[question['gold_label'] - 1],
+                        'difficult': question['difficult']
+                    })
+        return Dataset.from_list(dataset_list)
+
+
+class QuALITYEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, test_set):
+        assert len(predictions) == len(references)
+        easy, hard, overall = [], [], []
+        for pred, refer, test in zip(predictions, references, test_set):
+            answer = pred == refer
+            overall.append(answer)
+            if test['difficult'] == 0:
+                easy.append(answer)
+            else:
+                hard.append(answer)
+        return dict(easy_acc=sum(easy) / len(easy) * 100,
+                    hard_acc=sum(hard) / len(hard) * 100,
+                    all_acc=sum(overall) / len(overall) * 100)
diff --git a/build/lib/opencompass/datasets/SciEval.py b/build/lib/opencompass/datasets/SciEval.py
new file mode 100644
index 0000000000000000000000000000000000000000..593e3183c26566f83c16bce50053cc89b61ba899
--- /dev/null
+++ b/build/lib/opencompass/datasets/SciEval.py
@@ -0,0 +1,68 @@
+import re
+from typing import List
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+from opencompass.datasets.base import BaseDataset
+from opencompass.registry import LOAD_DATASET
+
+# Pre-compiled multiple-choice regex; each line kept under 79 chars (PEP 8)
+_PATTERN_MC = (
+    r'^(?P<stem>.*?)'  # question stem
+    r'(?:A\.)\s*(?P<A>.*?)\s*'  # option A
+    r'B\.\s*(?P<B>.*?)\s*'  # option B
+    r'C\.\s*(?P<C>.*?)\s*'  # option C
+    r'D\.\s*(?P<D>.*?)'  # option D
+    r'Answer:'  # answer separator
+)
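+
+
+# A hedged illustration of what the pattern captures, using an invented
+# serialized question in the SciEval format.
+def _demo_pattern_mc():
+    q = 'What is 2 + 2? A. 3 B. 4 C. 5 D. 6 Answer:'
+    m = re.search(_PATTERN_MC, q, re.S)
+    assert m is not None
+    assert m.group('stem').strip() == 'What is 2 + 2?'
+    assert m.group('B').strip() == '4'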
+
+
+@LOAD_DATASET.register_module()
+class SciEvalDataset(BaseDataset):
+    """Multiple-choice subset covering all categories (an optional
+    ``category`` argument can be passed to filter)."""
+
+    @staticmethod
+    def load(path: str, name: str, **kwargs) -> DatasetDict:
+        # If a category is given, keep only that category;
+        # otherwise include all categories
+        category = kwargs.get('category')
+        dataset: DatasetDict = DatasetDict()
+
+        for split in ('test', ):
+            raw_iter = load_dataset(
+                path,
+                name=name,
+                split=split,
+                streaming=True,
+            )
+            examples: List[dict] = []
+
+            for ex in raw_iter:
+                # Keep only multiple-choice questions
+                if ex.get('type') != 'multiple-choice':
+                    continue
+                # Filter by category when one is specified
+                if category is not None \
+                        and ex.get('category') != category:
+                    continue
+
+                ans_list = (ex.get('answer') or ex.get('answers') or [])
+                if not ans_list:
+                    continue
+                target = ans_list[0]
+
+                match = re.search(_PATTERN_MC, ex.get('question', ''), re.S)
+                if not match:
+                    continue
+
+                examples.append({
+                    'input': match.group('stem').strip(),
+                    'A': match.group('A').strip(),
+                    'B': match.group('B').strip(),
+                    'C': match.group('C').strip(),
+                    'D': match.group('D').strip(),
+                    'target': target,
+                })
+
+            dataset[split] = Dataset.from_list(examples)
+
+        return dataset
diff --git a/build/lib/opencompass/datasets/SciKnowEval.py b/build/lib/opencompass/datasets/SciKnowEval.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9635d966834f692cc24e27bcdd9fff91c1e5466
--- /dev/null
+++ b/build/lib/opencompass/datasets/SciKnowEval.py
@@ -0,0 +1,107 @@
+import re
+
+from datasets import load_dataset
+
+from opencompass.openicl import BaseEvaluator
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+from .base import BaseDataset
+
+
+def _parse(item, prompt_mode, discipline):
+    choices = item['choices']
+
+    item['q4'] = f'You are an expert in {discipline}.\n'
+    item['q4'] += item['prompt']['default'] + '\n' + item['question'] + '\n'
+    label_texts = []
+    for label_meta, text_meta in zip(choices['label'], choices['text']):
+        label_texts.append(f'{label_meta}. {text_meta}')
+    item['q4'] += '\n'.join(label_texts)
+    item['prompt_mode'] = prompt_mode
+    return item
+
+
+@LOAD_DATASET.register_module()
+class SciKnowEvalDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, prompt_mode: str, **kwargs):
+
+        def capitalize_first_letter(s):
+            if not s:  # return early when the string is empty
+                return s
+            return s[0].upper() + s[1:]
+
+        subset = kwargs['subset']
+        data_files = {}
+        test_file = f'data/{capitalize_first_letter(subset)}/'
+        test_file += f'sciknoweval_{subset}_test.jsonl'
+        data_files['test'] = test_file
+        dataset = load_dataset(path, data_files=data_files, split='test')
+        if prompt_mode == 'zero-shot':
+            dataset = dataset.map(
+                lambda item: _parse(item, prompt_mode, subset),
+                load_from_cache_file=False)
+        elif prompt_mode == 'few-shot':
+            pass  # TODO: Implement few-shot prompt
+        return dataset
+
+
+class SciKnowEvalEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, test_set):
+        method = test_set['prompt_mode'][0]
+
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for idx, (i, j) in enumerate(zip(predictions, references)):
+            i = answer_cleansing(method, i, test_set['choices'][idx]['label'],
+                                 test_set['answerKey'][idx])
+            detail = {'pred': i, 'answer': j, 'correct': False}
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
+
+
+@TEXT_POSTPROCESSORS.register_module()
+def answer_cleansing(
+    method: str,
+    prediction: str,
+    options: list,
+    label: str,
+) -> str:
+    options_str = r'\b(' + '|'.join(options) + r')\b'
+    prediction = re.findall(options_str, prediction)
+
+    if len(prediction) == 0:
+        # No option letter found: return an empty string rather than
+        # indexing into an empty list.
+        return ''
+
+    # If there is a "label" and its length is 1,
+    # process prediction accordingly
+    if len(label) == 1:
+        if method == 'few-shot':
+            answer_flag = True if len(prediction) > 1 else False
+            # choose the first or last element based on the answer_flag
+            if answer_flag:
+                prediction = [prediction[0]]
+            else:
+                prediction = [prediction[-1]]
+        elif method == 'zero-shot':
+            # choose the first element in list
+            prediction = [prediction[0]]
+        else:
+            raise ValueError('Method is not properly defined ...')
+
+    # Remove trailing period if it exists
+    if prediction[0] and prediction[0].endswith('.'):
+        prediction[0] = prediction[0][:-1]
+
+    return prediction[0]
diff --git 
a/build/lib/opencompass/datasets/ScienceQA.py b/build/lib/opencompass/datasets/ScienceQA.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc9c952423361660637ee9af2b13f4244e0413c --- /dev/null +++ b/build/lib/opencompass/datasets/ScienceQA.py @@ -0,0 +1,32 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ScienceQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path) + for data in ds['test']: + if data['image'] is None: + data['label'] = chr(65 + data['answer'] + ) + '. ' + data['choices'][data['answer']] + choices = '' + for i in range(len(data['choices'])): + choices += chr(65 + i) + '. ' + data['choices'][i] + '\n' + data['choices'] = choices + # print(data) + + dataset.append(data) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + dataset = ScienceQADataset.load_single(path) + return dataset diff --git a/build/lib/opencompass/datasets/SeedBench.py b/build/lib/opencompass/datasets/SeedBench.py new file mode 100644 index 0000000000000000000000000000000000000000..3ec1ec230ba49bc8f52b28bdb18cb1b6a0166959 --- /dev/null +++ b/build/lib/opencompass/datasets/SeedBench.py @@ -0,0 +1,309 @@ +import random +import re +from os import environ +from typing import List + +import datasets +import jieba +import numpy as np +from rouge_chinese import Rouge + +from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator +from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class SeedBenchDataset(BaseDataset): + + @staticmethod + def load(data_files: str, + path: str, + split: str = None, + **kwargs) -> datasets.Dataset: + + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, + subset_name='default', + split=split, + data_files=data_files, + **kwargs) + else: + dataset = datasets.load_dataset(path, + data_files=data_files, + **kwargs) + + if split is None: + split = list(dataset.keys())[0] + + if split not in dataset: + raise ValueError(f"Split '{split}' not found. \ + Available splits: {list(dataset.keys())}") + + return dataset[split] + + +class F1Evaluator(BaseEvaluator): + """F1 Score evaluator for multiple choice questions. + + Args: + seed (int): Seed for randomness, ensuring reproducibility. + Defaults to 0. + """ + + def __init__(self, seed: int = 0) -> None: + self.seed = seed + super().__init__() + + def _preprocess(self, predictions: List, references: List) -> dict: + return { + 'predictions': predictions, + 'references': references, + } + + def _postprocess(self, scores: dict) -> dict: + return scores + + def score(self, predictions: List, references: List) -> dict: + random_state = random.getstate() + np_random_state = np.random.get_state() + details = [] + + random.seed(self.seed) + np.random.seed(self.seed) + + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. 
len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        true_positives = 0
+        false_positives = 0
+        false_negatives = 0
+
+        for hyp, ref in zip(predictions, references):
+            hyp = re.sub(r'[^A-Da-d,]+', '', hyp.lower())
+            ref = re.sub(r'[^A-Da-d,]+', '', ref.lower())
+            ref_set = set(ref.split(','))
+            hyp_set = set(hyp.split(','))
+            ref_set = {r.strip() for r in ref_set}
+            hyp_set = {h.strip() for h in hyp_set}
+
+            sample_tp = len(hyp_set.intersection(ref_set))
+            sample_fp = len(hyp_set - ref_set)
+            sample_fn = len(ref_set - hyp_set)
+            true_positives += sample_tp
+            false_positives += sample_fp
+            false_negatives += sample_fn
+            sample_precision = sample_tp / (sample_tp + sample_fp) if (
+                sample_tp + sample_fp) > 0 else 0
+            sample_recall = sample_tp / (sample_tp + sample_fn) if (
+                sample_tp + sample_fn) > 0 else 0
+            sample_f1 = (2 * sample_precision * sample_recall) / (
+                sample_precision + sample_recall) if (sample_precision +
+                                                      sample_recall) > 0 else 0
+            details.append({
+                'pred': hyp,
+                'answer': ref,
+                'correct': sample_f1 * 100
+            })
+
+        precision = true_positives / (true_positives + false_positives) if (
+            true_positives + false_positives) > 0 else 0
+        recall = true_positives / (true_positives + false_negatives) if (
+            true_positives + false_negatives) > 0 else 0
+        f1 = (2 * precision *
+              recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+        result = {
+            'F1Score': f1 * 100,  # overall (micro-averaged) F1 score
+            'details': details
+        }
+        random.setstate(random_state)
+        np.random.set_state(np_random_state)
+        return self._postprocess(result)
+
+
+@ICL_EVALUATORS.register_module()
+class F1ScoreEvaluator(F1Evaluator):
+    """F1 Score evaluator for multiple choice questions."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
+# Custom multiple-selection postprocessing
+# (turns a raw answer such as 'ABC' into 'A, B, C')
+@TEXT_POSTPROCESSORS.register_module('my_multiple_select_postprocess')
+def my_multiple_select_postprocess(text: str) -> str:
+    selected_options = [t for t in text if t.isupper()]
+    selected_options = sorted(set(selected_options))
+    res = ', '.join(selected_options)
+    return res
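+
+
+# A minimal check of the postprocessor above; the input is invented.
+# Note that every uppercase character is kept, so free-form prose such
+# as 'The answer is A' would also contribute its capital 'T'.
+def _demo_multiple_select_postprocess():
+    assert my_multiple_select_postprocess('CAB') == 'A, B, C'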
+
+
+class AverageRougeEvaluator(BaseEvaluator):
+    """Average Rouge Score evaluator for fill-in-the-blank tasks.
+
+    Args:
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
+    """
+
+    def __init__(self, seed: int = 0) -> None:
+        self.seed = seed
+        super().__init__()
+
+    def _preprocess(self, predictions: List, references: List) -> dict:
+        # Strip any leading "correct answer:" marker (Chinese or English)
+        pattern = r'(正确答案[::]|correct answer[::])'
+        cleaned_predictions = [
+            re.sub(pattern, '', pred, flags=re.IGNORECASE).strip()
+            for pred in predictions
+        ]
+
+        return {
+            'predictions': cleaned_predictions,
+            'references': references,
+        }
+
+    def _postprocess(self, scores: dict) -> dict:
+        return scores
+
+    def score(self, predictions: List, references: List) -> dict:
+
+        def rouge_score(hyps, refs):
+            # Tokenize with jieba and average the ROUGE-L F-measure
+            assert (len(hyps) == len(refs))
+            hyps = [' '.join(jieba.cut(h)) for h in hyps]
+            hyps = [h if h.strip() != '' else '无内容' for h in hyps]
+            refs = [' '.join(jieba.cut(r)) for r in refs]
+            rouge_scores = Rouge().get_scores(hyps, refs)
+            rouge_ls = [score['rouge-l']['f'] for score in rouge_scores]
+            average_rouge_l = sum(rouge_ls) / len(rouge_ls)
+            return {'score': average_rouge_l * 100}
+
+        random_state = random.getstate()
+        np_random_state = np.random.get_state()
+        details = []
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        preprocessed_data = self._preprocess(predictions, references)
+        hyps, refs = preprocessed_data['predictions'], preprocessed_data[
+            'references']
+
+        scores = []
+        for i in range(len(hyps)):
+            # Normalize full-width commas before splitting the reference
+            refs[i] = refs[i].replace(',', ',')
+            word_level_refs = refs[i].split(',')
+            word_level_refs = [r.strip() for r in word_level_refs]
+            if len(word_level_refs) == 1:
+                word_level_hyps = [hyps[i]]
+            else:
+                word_level_hyps = hyps[i].split(',')
+                word_level_hyps = [h.strip() for h in word_level_hyps]
+
+            # Pad with a placeholder (or truncate) so the hypothesis list
+            # aligns with the reference list
+            if len(word_level_hyps) < len(word_level_refs):
+                word_level_hyps += ['无内容'] * (len(word_level_refs) -
+                                              len(word_level_hyps))
+            else:
+                word_level_hyps = word_level_hyps[:len(word_level_refs)]
+
+            sample_score = rouge_score(word_level_hyps,
+                                       word_level_refs)['score']
+            scores.append(sample_score)
+            details.append({
+                'pred': word_level_hyps,
+                'answer': word_level_refs,
+                'correct': sample_score
+            })
+
+        average_score = sum(scores) / len(scores)
+        result = {'AvgRougeScore': average_score, 'details': details}
+        random.setstate(random_state)
+        np.random.set_state(np_random_state)
+
+        return self._postprocess(result)
+
+
+@ICL_EVALUATORS.register_module()
+class AverageRougeScoreEvaluator(AverageRougeEvaluator):
+    """Average Rouge Score evaluator."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
+class AccScoreStrEvaluator(BaseEvaluator):
+    """Accuracy evaluator based on string matching.
+
+    Args:
+        seed (int): Seed for randomness, ensuring reproducibility.
+            Defaults to 0.
+    """
+
+    def __init__(self, seed: int = 0) -> None:
+        self.seed = seed
+        super().__init__()
+
+    def _preprocess(self, predictions: List, references: List) -> dict:
+        return {
+            'predictions': predictions,
+            'references': references,
+        }
+
+    def _postprocess(self, scores: dict) -> dict:
+        return scores
+
+    def score(self, predictions: List, references: List) -> dict:
+        random_state = random.getstate()
+        np_random_state = np.random.get_state()
+        details = []
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. 
len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + preprocessed_data = self._preprocess(predictions, references) + + correct = 0 + for hyp, ref in zip(preprocessed_data['predictions'], + preprocessed_data['references']): + is_correct = 1 if ref.strip().lower() in hyp.strip().lower() else 0 + correct += is_correct + details.append({'pred': hyp, 'answer': ref, 'correct': is_correct}) + + accuracy = correct / len(predictions) + result = {'ACCStrScore': accuracy * 100, 'details': details} + random.setstate(random_state) + np.random.set_state(np_random_state) + + return self._postprocess(result) + + +@ICL_EVALUATORS.register_module() +class AccScoreStr_Evaluator(AccScoreStrEvaluator): + """Accuracy evaluator wrapper for the AccScoreStrEvaluator.""" + + def __init__(self) -> None: + super().__init__() diff --git a/build/lib/opencompass/datasets/__init__.py b/build/lib/opencompass/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f30d9b885de515d1d8e0aabdb7aaa4fee3f465f8 --- /dev/null +++ b/build/lib/opencompass/datasets/__init__.py @@ -0,0 +1,187 @@ +from .advglue import * # noqa: F401, F403 +from .afqmcd import * # noqa: F401, F403 +from .agieval import * # noqa: F401, F403 +from .aime2024 import * # noqa: F401, F403 +from .anli import AnliDataset # noqa: F401, F403 +from .anthropics_evals import * # noqa: F401, F403 +from .apps import * # noqa: F401, F403 +from .arc import * # noqa: F401, F403 +from .arc_prize_public_evaluation import * # noqa: F401, F403 +from .ax import * # noqa: F401, F403 +from .babilong import * # noqa: F401, F403 +from .bbeh import * # noqa: F401, F403 +from .bbh import * # noqa: F401, F403 +from .beyondaime import * # noqa: F401, F403 +from .bigcodebench import * # noqa: F401, F403 +from .biodata import * # noqa: F401, F403 +from .boolq import * # noqa: F401, F403 +from .bustum import * # noqa: F401, F403 +from .c3 import * # noqa: F401, F403 +from .calm import * # noqa: F401, F403 +from .CARDBiomedBench import CARDBiomedBenchDataset # noqa: F401 +from .cb import * # noqa: F401, F403 +from .ceval import * # noqa: F401, F403 +from .charm import * # noqa: F401, F403 +from .chatml import * # noqa: F401, F403 +from .chem_exam import * # noqa: F401, F403 +from .chembench import * # noqa: F401, F403 +from .chid import * # noqa: F401, F403 +from .chinese_simpleqa import * # noqa: F401, F403 +from .cibench import * # noqa: F401, F403 +from .circular import * # noqa: F401, F403 +from .civilcomments import * # noqa: F401, F403 +from .climaqa import * # noqa: F401, F403 +from .clozeTest_maxmin import * # noqa: F401, F403 +from .cluewsc import * # noqa: F401, F403 +from .cmb import * # noqa: F401, F403 +from .cmmlu import * # noqa: F401, F403 +from .cmnli import * # noqa: F401, F403 +from .cmo_fib import * # noqa: F401, F403 +from .cmphysbench import * # noqa: F401, F403 +from .cmrc import * # noqa: F401, F403 +from .codecompass import * # noqa: F401, F403 +from .commonsenseqa import * # noqa: F401, F403 +from .commonsenseqa_cn import * # noqa: F401, F403 +from .copa import * # noqa: F401, F403 +from .crowspairs import * # noqa: F401, F403 +from .crowspairs_cn import * # noqa: F401, F403 +from .csl import * # noqa: F401, F403 +from .custom import * # noqa: F401, F403 +from .cvalues import * # noqa: F401, F403 +from .dingo import * # noqa: F401, F403 +from .drcd import * # noqa: F401, F403 +from .drop import * # noqa: F401, F403 +from .drop_simple_eval import * # noqa: F401, F403 +from .ds1000 import * # noqa: 
F401, F403 +from .ds1000_interpreter import * # noqa: F401, F403 +from .Earth_Silver import * # noqa: F401, F403 +from .eese.eese import * # noqa: F401, F403 +from .eprstmt import * # noqa: F401, F403 +from .FinanceIQ import * # noqa: F401, F403 +from .flores import * # noqa: F401, F403 +from .game24 import * # noqa: F401, F403 +from .gaokao_math import * # noqa: F401, F403 +from .GaokaoBench import * # noqa: F401, F403 +from .generic import * # noqa: F401, F403 +from .govrepcrs import * # noqa: F401, F403 +from .gpqa import * # noqa: F401, F403 +from .gsm8k import * # noqa: F401, F403 +from .gsm_hard import * # noqa: F401, F403 +from .healthbench.healthbench import * # noqa: F401, F403 +from .hellaswag import * # noqa: F401, F403 +from .hle import * # noqa: F401, F403 +from .huggingface import * # noqa: F401, F403 +from .humaneval import * # noqa: F401, F403 +from .humaneval_multi import * # noqa: F401, F403 +from .humaneval_pro import * # noqa: F401, F403 +from .humanevalx import * # noqa: F401, F403 +from .hungarian_math import * # noqa: F401, F403 +from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403 +from .inference_ppl import InferencePPLDataset # noqa: F401, F403 +from .infinitebench import * # noqa: F401, F403 +from .internsandbox import * # noqa: F401, F403 +from .iwslt2017 import * # noqa: F401, F403 +from .jigsawmultilingual import * # noqa: F401, F403 +from .jsonl import JsonlDataset # noqa: F401, F403 +from .judge import * # noqa: F401, F403 +from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403 +from .korbench import * # noqa: F401, F403 +from .lambada import * # noqa: F401, F403 +from .lawbench import * # noqa: F401, F403 +from .LCBench import * # noqa: F401, F403 +from .lcsts import * # noqa: F401, F403 +from .leval import * # noqa: F401, F403 +from .livecodebench import * # noqa: F401, F403 +from .livemathbench import * # noqa: F401, F403 +from .livereasonbench import * # noqa: F401, F403 +from .livestembench import * # noqa: F401, F403 +from .llm_compression import LLMCompressionDataset # noqa: F401, F403 +from .longbench import * # noqa: F401, F403 +from .longbenchv2 import * # noqa: F401, F403 +from .lveval import * # noqa: F401, F403 +from .mastermath2024v1 import * # noqa: F401, F403 +from .matbench import * # noqa: F401, F403 +from .math import * # noqa: F401, F403 +from .math401 import * # noqa: F401, F403 +from .math_intern import * # noqa: F401, F403 +from .mathbench import * # noqa: F401, F403 +from .mbpp import * # noqa: F401, F403 +from .mbpp_pro import * # noqa: F401, F403 +from .medbench import * # noqa: F401, F403 +from .Medbullets import * # noqa: F401, F403 +from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401 +from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401 +from .medmcqa import * # noqa: F401, F403 +from .MedQA import * # noqa: F401, F403 +from .MedXpertQA import * # noqa: F401, F403 +from .mgsm import * # noqa: F401, F403 +from .mmlu import * # noqa: F401, F403 +from .mmlu_cf import * # noqa: F401, F403 +from .mmlu_pro import * # noqa: F401, F403 +from .MMLUArabic import * # noqa: F401, F403 +from .mmmlu import * # noqa: F401, F403 +from .multipl_e import * # noqa: F401, F403 +from .multirc import * # noqa: F401, F403 +from .musr import * # noqa: F401, F403 +from .narrativeqa import * # noqa: F401, F403 +from .natural_question import * # noqa: F401, F403 +from .natural_question_cn import * # noqa: F401, F403 +from .nejmaibench import * # noqa: F401, F403 +from .NPHardEval import * # noqa: 
F401, F403 +from .obqa import * # noqa: F401, F403 +from .olymmath import * # noqa: F401, F403 +from .OlympiadBench import * # noqa: F401, F403 +from .OpenFinData import * # noqa: F401, F403 +from .OpenSWI import * # noqa: F401, F403 +from .phybench import * # noqa: F401, F403 +from .physics import * # noqa: F401, F403 +from .PI_LLM import PILLMDataset # noqa: F401 +from .piqa import * # noqa: F401, F403 +from .ProteinLMBench import * # noqa: F401, F403 +from .py150 import * # noqa: F401, F403 +from .qasper import * # noqa: F401, F403 +from .qaspercut import * # noqa: F401, F403 +from .QuALITY import * # noqa: F401, F403 +from .race import * # noqa: F401, F403 +from .rbench import * # noqa: F401, F403 +from .realtoxicprompts import * # noqa: F401, F403 +from .reasonbench import ReasonBenchDataset # noqa: F401, F403 +from .record import * # noqa: F401, F403 +from .ruler import * # noqa: F401, F403 +from .safety import * # noqa: F401, F403 +from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 +from .scicode import * # noqa: F401, F403 +from .SciEval import SciEvalDataset # noqa: F401 +from .SciKnowEval import * # noqa: F401, F403 +from .SeedBench import * # noqa: F401, F403 +from .simpleqa import * # noqa: F401, F403 +from .siqa import * # noqa: F401, F403 +from .smolinstruct import * # noqa: F401, F403 +from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403 +from .srbench import * # noqa: F401, F403 +from .storycloze import * # noqa: F401, F403 +from .strategyqa import * # noqa: F401, F403 +from .subjective import * # noqa: F401, F403 +from .summedits import * # noqa: F401, F403 +from .summscreen import * # noqa: F401, F403 +from .supergpqa import * # noqa: F401, F403 +from .svamp import * # noqa: F401, F403 +from .tabmwp import * # noqa: F401, F403 +from .taco import * # noqa: F401, F403 +from .teval import * # noqa: F401, F403 +from .TheoremQA import * # noqa: F401, F403 +from .tnews import * # noqa: F401, F403 +from .triviaqa import * # noqa: F401, F403 +from .triviaqarc import * # noqa: F401, F403 +from .truthfulqa import * # noqa: F401, F403 +from .tydiqa import * # noqa: F401, F403 +from .wic import * # noqa: F401, F403 +from .wikibench import * # noqa: F401, F403 +from .winograd import * # noqa: F401, F403 +from .winogrande import * # noqa: F401, F403 +from .wnli import wnliDataset # noqa: F401, F403 +from .wsc import * # noqa: F401, F403 +from .xcopa import * # noqa: F401, F403 +from .xiezhi import XiezhiDataset, XiezhiRetriever # noqa: F401, F403 +from .xlsum import * # noqa: F401, F403 +from .xsum import * # noqa: F401, F403 diff --git a/build/lib/opencompass/datasets/advglue.py b/build/lib/opencompass/datasets/advglue.py new file mode 100644 index 0000000000000000000000000000000000000000..db884d0b1ca7928836a598fce04f07aa9d39dbf8 --- /dev/null +++ b/build/lib/opencompass/datasets/advglue.py @@ -0,0 +1,176 @@ +import json +from typing import List, Union + +from datasets import Dataset, concatenate_datasets + +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +class AdvDataset(BaseDataset): + """Base adv GLUE dataset. Adv GLUE is built on GLUE dataset. The main + purpose is to eval the accuracy drop on original set and adv set. + + Args: + subset (str): The subset task of adv GLUE dataset. + filter_keys (str): The keys to be filtered to create the original + set for comparison. 
+ """ + + def __init__( + self, + subset: str, + filter_keys: Union[str, List[str]], + **kwargs, + ): + self.subset = subset + if isinstance(filter_keys, str): + filter_keys = [filter_keys] + self.filter_keys = filter_keys + super().__init__(**kwargs) + + def aug_with_original_data(self, dataset): + """Create original dataset and concat to the end.""" + # Remove data without original reference + dataset = dataset.filter( + lambda x: any([x[k] for k in self.filter_keys])) + + def ori_preprocess(example): + for k in self.filter_keys: + if example[k]: + new_k = k.split('original_')[-1] + example[new_k] = example[k] + example['type'] = 'original' + return example + + original_dataset = dataset.map(ori_preprocess) + + return concatenate_datasets([dataset, original_dataset]) + + def load(self, path): + """Load dataset and aug with original dataset.""" + + path = get_data_path(path) + with open(path, 'r') as f: + raw_data = json.load(f) + subset = raw_data[self.subset] + + # In case the missing keys in first example causes Dataset + # to ignore them in the following examples when building. + for k in self.filter_keys: + if k not in subset[0]: + subset[0][k] = None + + dataset = Dataset.from_list(raw_data[self.subset]) + + dataset = self.aug_with_original_data(dataset) + + def choices_process(example): + example['label_option'] = chr(ord('A') + example['label']) + return example + + dataset = dataset.map(choices_process) + return dataset + + +# label 0 for A. negative +# label 1 for B. positive +class AdvSst2Dataset(AdvDataset): + """Adv GLUE sst2 dataset.""" + + def __init__(self, **kwargs): + super().__init__(subset='sst2', + filter_keys='original_sentence', + **kwargs) + + +# label 0 for not_duplicate, A. no +# label 1 for duplicate, B. yes +class AdvQqpDataset(AdvDataset): + """Adv GLUE qqp dataset.""" + + def __init__(self, **kwargs): + super().__init__( + subset='qqp', + filter_keys=['original_question1', 'original_question2'], + **kwargs) + + +# # label 0 for entailment, A. yes +# # label 1 for neutral, B. maybe +# # label 2 for contradiction, C. no +class AdvMnliDataset(AdvDataset): + """Adv GLUE mnli dataset.""" + + def __init__(self, **kwargs): + super().__init__( + subset='mnli', + filter_keys=['original_premise', 'original_hypothesis'], + **kwargs) + + +# # label 0 for entailment, A. yes +# # label 1 for neutral, B. maybe +# # label 2 for contradiction, C. no +class AdvMnliMMDataset(AdvDataset): + """Adv GLUE mnli mm dataset.""" + + def __init__(self, **kwargs): + super().__init__( + subset='mnli-mm', + filter_keys=['original_premise', 'original_hypothesis'], + **kwargs) + + +# # label 0 for entailment, A. yes +# # label 1 for not entailment, B. no +class AdvQnliDataset(AdvDataset): + """Adv GLUE qnli dataset.""" + + def __init__(self, **kwargs): + super().__init__( + subset='qnli', + filter_keys=['original_question', 'original_sentence'], + **kwargs) + + +# # label 0 for entailment, A. yes +# # label 1 for not entailment, B. no +class AdvRteDataset(AdvDataset): + """Adv GLUE rte dataset.""" + + def __init__(self, **kwargs): + super().__init__( + subset='rte', + filter_keys=['original_sentence1', 'original_sentence2'], + **kwargs) + + +class AccDropEvaluator(AccEvaluator): + """Eval accuracy drop.""" + + def __init__(self) -> None: + super().__init__() + + def score(self, predictions: List, references: List) -> dict: + """Calculate scores and accuracy. + + Args: + predictions (List): List of probabilities for each class of each + sample. 
+ references (List): List of target labels for each sample. + + Returns: + dict: calculated scores. + """ + + n = len(predictions) + assert n % 2 == 0, 'Number of examples should be even.' + acc_after = super().score(predictions[:n // 2], references[:n // 2]) + acc_before = super().score(predictions[n // 2:], references[n // 2:]) + acc_drop = 1 - acc_after['accuracy'] / acc_before['accuracy'] + return dict(acc_drop=acc_drop, + acc_after=acc_after['accuracy'], + acc_before=acc_before['accuracy']) diff --git a/build/lib/opencompass/datasets/afqmcd.py b/build/lib/opencompass/datasets/afqmcd.py new file mode 100644 index 0000000000000000000000000000000000000000..bb50bede206be461297a622f55f055bb23958e3f --- /dev/null +++ b/build/lib/opencompass/datasets/afqmcd.py @@ -0,0 +1,33 @@ +import json +from os import environ + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils.datasets import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class AFQMCDatasetV2(BaseDataset): + + @staticmethod + def load(path, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load(path, split='dev') + data = [] + for line in ms_dataset: + row = line + row['label'] = 'AB'[int(line['label'])] + data.append(line) + else: + data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['label'] = 'AB'[int(line['label'])] + data.append(line) + return Dataset.from_list(data) diff --git a/build/lib/opencompass/datasets/aime2024.py b/build/lib/opencompass/datasets/aime2024.py new file mode 100644 index 0000000000000000000000000000000000000000..9de3fd029fd30846b7b98e1b349dfa8f587991b7 --- /dev/null +++ b/build/lib/opencompass/datasets/aime2024.py @@ -0,0 +1,25 @@ +import json + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class Aime2024Dataset(BaseDataset): + + @staticmethod + def load(path, **kwargs): + path = get_data_path(path) + dataset = [] + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + origin_prompt = line['origin_prompt'] + line['question'] = origin_prompt[:] + line['answer'] = line['gold_answer'] + dataset.append(line) + return Dataset.from_list(dataset) diff --git a/build/lib/opencompass/datasets/anli.py b/build/lib/opencompass/datasets/anli.py new file mode 100644 index 0000000000000000000000000000000000000000..73d114b69dadc476a4612896a77fdcd7d0af46ce --- /dev/null +++ b/build/lib/opencompass/datasets/anli.py @@ -0,0 +1,18 @@ +import json + +from datasets import Dataset + +from .base import BaseDataset + + +class AnliDataset(BaseDataset): + + @staticmethod + def load(path: str): + dataset = [] + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + line['label'] = {'c': 'A', 'e': 'B', 'n': 'C'}[line['label']] + dataset.append(line) + return Dataset.from_list(dataset) diff --git a/build/lib/opencompass/datasets/anthropics_evals.py b/build/lib/opencompass/datasets/anthropics_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..8fd28d7dd0d9ec03288339d494ff58e8a229f6a1 --- /dev/null +++ b/build/lib/opencompass/datasets/anthropics_evals.py @@ -0,0 +1,63 @@ +from datasets import load_dataset + +from .base import BaseDataset + + +class 
AiRiskDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` (A)`, etc. + for i in 'ABCDEFGH': + if i in example['answer_matching_behavior']: + example['answer_matching_behavior'] = i + break + return example + + dataset = dataset.map(choices_process) + return dataset + + +class PersonaDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` No` or ` Yes`. + if example['answer_matching_behavior'] == ' Yes': + example['answer_matching_behavior'] = 'A' + else: + example['answer_matching_behavior'] = 'B' + return example + + dataset = dataset.map(choices_process) + return dataset + + +class SycophancyDataset(BaseDataset): + + @staticmethod + def load(path: str): + """Load dataset.""" + + dataset = load_dataset('json', data_files=path) + + def choices_process(example): + # the original answer format is ` (A)`, etc. + for i in 'ABCDEFG': + if i in example['answer_matching_behavior']: + example['answer_matching_behavior'] = i + break + return example + + dataset = dataset.map(choices_process) + return dataset diff --git a/build/lib/opencompass/datasets/arc.py b/build/lib/opencompass/datasets/arc.py new file mode 100644 index 0000000000000000000000000000000000000000..97b4dafc21b8cd143e543ac8ed32f03e8c394ade --- /dev/null +++ b/build/lib/opencompass/datasets/arc.py @@ -0,0 +1,149 @@ +import json +import os.path as osp +from os import environ + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ARCDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, + split='validation', + subset_name=name) + rows = [] + for row in dataset: + answerKey = row['answerKey'] + question = row['question'] + choices = row['choices'] + if len(choices['text']) != 4: + continue + labels = row['choices']['label'] + answerKey = 'ABCD'[labels.index(answerKey)] + + rows.append({ + 'question': question, + 'answerKey': answerKey, + 'textA': choices['text'][0], + 'textB': choices['text'][1], + 'textC': choices['text'][2], + 'textD': choices['text'][3], + }) + else: + with open(path, 'r', errors='ignore') as in_f: + rows = [] + for line in in_f: + item = json.loads(line.strip()) + question = item['question'] + if len(question['choices']) != 4: + continue + labels = [c['label'] for c in question['choices']] + answerKey = 'ABCD'[labels.index(item['answerKey'])] + rows.append({ + 'question': question['stem'], + 'answerKey': answerKey, + 'textA': question['choices'][0]['text'], + 'textB': question['choices'][1]['text'], + 'textC': question['choices'][2]['text'], + 'textD': question['choices'][3]['text'], + }) + dataset = Dataset.from_list(rows) + return dataset + + +class ARCDatasetClean(BaseDataset): + + # load the contamination annotations of CEval from + # https://github.com/liyucheng09/Contamination_Detector + @staticmethod + def load_contamination_annotations(path, split='val'): + import requests + + assert split == 'test', 'We only have test set annotation for ARC' + if environ.get('DATASET_SOURCE') == 'ModelScope': + 
from modelscope.utils.config_ds import MS_DATASETS_CACHE + annotation_cache_path = osp.join( + MS_DATASETS_CACHE, + f'ARC_c_{split}_contamination_annotations.json') + link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/ARC_annotations.json' # noqa + else: + annotation_cache_path = osp.join( + path, f'ARC_c_{split}_contamination_annotations.json') + link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json' # noqa + + if osp.exists(annotation_cache_path): + with open(annotation_cache_path, 'r') as f: + annotations = json.load(f) + return annotations + + annotations = json.loads(requests.get(link_of_annotations).text) + with open(annotation_cache_path, 'w') as f: + json.dump(annotations, f) + return annotations + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path) + annotations = ARCDatasetClean.load_contamination_annotations( + osp.dirname(path), 'test') + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, split='test', subset_name=name) + rows = [] + for row in dataset: + answerKey = row['answerKey'] + question = row['question'] + choices = row['choices'] + if len(choices['text']) != 4: + continue + labels = row['choices']['label'] + answerKey = 'ABCD'[labels.index(answerKey)] + id_ = row['id'] + if id_ in annotations: + is_clean = annotations[id_][0] + else: + is_clean = 'not labeled' + rows.append({ + 'question': question, + 'answerKey': answerKey, + 'textA': choices['text'][0], + 'textB': choices['text'][1], + 'textC': choices['text'][2], + 'textD': choices['text'][3], + 'is_clean': is_clean, + }) + else: + with open(path, 'r', errors='ignore') as in_f: + rows = [] + for line in in_f: + item = json.loads(line.strip()) + id_ = item['id'] + question = item['question'] + if id_ in annotations: + is_clean = annotations[id_][0] + else: + is_clean = 'not labeled' + if len(question['choices']) != 4: + continue + labels = [c['label'] for c in question['choices']] + answerKey = 'ABCD'[labels.index(item['answerKey'])] + rows.append({ + 'question': question['stem'], + 'answerKey': answerKey, + 'textA': question['choices'][0]['text'], + 'textB': question['choices'][1]['text'], + 'textC': question['choices'][2]['text'], + 'textD': question['choices'][3]['text'], + 'is_clean': is_clean, + }) + return Dataset.from_list(rows) diff --git a/build/lib/opencompass/datasets/arc_prize_public_evaluation.py b/build/lib/opencompass/datasets/arc_prize_public_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..6c1769267654f98a88d0cc709f587fbc22a2a87c --- /dev/null +++ b/build/lib/opencompass/datasets/arc_prize_public_evaluation.py @@ -0,0 +1,213 @@ +import ast +import json +import os +from typing import Dict, List + +import numpy as np +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ARCPrizeDataset(BaseDataset): + task_file_names = [ + '2072aba6.json', 'bb52a14b.json', '136b0064.json', 'ea9794b1.json', + '40f6cd08.json', 'f5aa3634.json', '7039b2d7.json', '712bf12e.json', + '9b365c51.json', 'ccd554ac.json', 'f9d67f8b.json', '03560426.json', + 'e2092e0c.json', '8fbca751.json', '42918530.json', 'c64f1187.json', + '00576224.json', '705a3229.json', 'af24b4cc.json', 
'81c0276b.json', + 'f21745ec.json', '8dae5dfc.json', '4e469f39.json', '695367ec.json', + 'dc2aa30b.json', 'b9630600.json', '770cc55f.json', '3391f8c0.json', + 'c1990cce.json', '1da012fc.json', '50a16a69.json', '212895b5.json', + 'e69241bd.json', '692cd3b6.json', '0bb8deee.json', '9772c176.json', + '22a4bbc2.json', 'ca8de6ea.json', 'dc2e9a9d.json', '4aab4007.json', + 'cfb2ce5a.json', '9f27f097.json', '2c737e39.json', '84db8fc4.json', + 'e1baa8a4.json', 'ea959feb.json', '4f537728.json', '47996f11.json', + 'bf32578f.json', 'aee291af.json', '5d2a5c43.json', '2546ccf6.json', + 'e57337a4.json', 'd4b1c2b1.json', '20981f0e.json', '05a7bcf2.json', + 'fc754716.json', '6ad5bdfd.json', 'e88171ec.json', '1acc24af.json', + '34b99a2b.json', 'e78887d1.json', '4acc7107.json', '137f0df0.json', + '62b74c02.json', '50aad11f.json', '642d658d.json', '64a7c07e.json', + 'bd14c3bf.json', '73c3b0d8.json', 'e0fb7511.json', 'c7d4e6ad.json', + '85b81ff1.json', 'e760a62e.json', 'ca8f78db.json', 'd931c21c.json', + 'aab50785.json', 'ac605cbb.json', '3194b014.json', '68b67ca3.json', + 'e7b06bea.json', 'e5790162.json', 'da2b0fe3.json', '0becf7df.json', + 'fe9372f3.json', 'd56f2372.json', 'e66aafb8.json', 'b7999b51.json', + '2697da3f.json', '516b51b7.json', '9a4bb226.json', '195ba7dc.json', + '310f3251.json', '639f5a19.json', '0d87d2a6.json', 'c663677b.json', + 'e74e1818.json', '69889d6e.json', 'f45f5ca7.json', '8597cfd7.json', + '0c9aba6e.json', 'e9b4f6fc.json', 'e7639916.json', '5207a7b5.json', + 'e4075551.json', '90347967.json', '9ddd00f0.json', '4b6b68e5.json', + 'e9c9d9a1.json', '2f0c5170.json', '58e15b12.json', 'd37a1ef5.json', + '62ab2642.json', 'b457fec5.json', 'c97c0139.json', 'ac0c5833.json', + '7d419a02.json', '4ff4c9da.json', '4cd1b7b2.json', '27a77e38.json', + '66f2d22f.json', '2a5f8217.json', 'c074846d.json', 'c6e1b8da.json', + '319f2597.json', '94be5b80.json', '55783887.json', '60c09cac.json', + 'f823c43c.json', 'd492a647.json', 'e681b708.json', '15663ba9.json', + 'a3f84088.json', '103eff5b.json', '5a5a2103.json', '1e97544e.json', + '009d5c81.json', 'ed74f2f2.json', 'ce039d91.json', 'baf41dbf.json', + '3490cc26.json', 'ce8d95cc.json', '3f23242b.json', '1d0a4b61.json', + '8719f442.json', 'd94c3b52.json', '4c177718.json', '59341089.json', + '3ee1011a.json', 'f5c89df1.json', '5833af48.json', 'd4c90558.json', + '88207623.json', '833dafe3.json', '070dd51e.json', '3ed85e70.json', + '21f83797.json', '7c8af763.json', '5783df64.json', 'a57f2f04.json', + 'e9ac8c9e.json', 'aa18de87.json', '505fff84.json', '5ffb2104.json', + '42a15761.json', '1a2e2828.json', '0607ce86.json', '84f2aca1.json', + '456873bc.json', '903d1b4a.json', '0f63c0b9.json', '54db823b.json', + 'ad7e01d0.json', '8e2edd66.json', '79fb03f4.json', '4364c1c4.json', + 'e7a25a18.json', 'e133d23d.json', 'e21a174a.json', '55059096.json', + 'e95e3d8e.json', '94414823.json', '9356391f.json', '15113be4.json', + 'ba9d41b8.json', '52fd389e.json', 'de493100.json', '9c56f360.json', + 'c92b942c.json', '97239e3d.json', 'b0f4d537.json', '19bb5feb.json', + '506d28a5.json', '5b692c0f.json', 'ef26cbf6.json', 'e345f17b.json', + '7d1f7ee8.json', 'ac3e2b04.json', '551d5bf1.json', 'fb791726.json', + '2037f2c7.json', 'e6de6e8f.json', '3d31c5b3.json', 'd19f7514.json', + '1d398264.json', '358ba94e.json', '696d4842.json', '08573cc6.json', + '7e02026e.json', '7953d61e.json', 'c3202e5a.json', '351d6448.json', + 'fea12743.json', '12422b43.json', 'b942fd60.json', 'bcb3040b.json', + 'e41c6fd3.json', 'a59b95c0.json', '3a301edc.json', '0b17323b.json', + 'da515329.json', '96a8c0cd.json', 
'6f473927.json', '9def23fe.json', + 'c35c1b4c.json', 'be03b35f.json', '604001fa.json', 'd304284e.json', + 'cb227835.json', 'e9bb6954.json', 'ac2e8ecf.json', '1e81d6f9.json', + '72207abc.json', '37d3e8b2.json', 'c8b7cc0f.json', 'a096bf4d.json', + '1c02dbbe.json', 'fd096ab6.json', '9bebae7a.json', '25094a63.json', + 'b7fb29bc.json', 'aa4ec2a5.json', '50f325b5.json', '423a55dc.json', + 'b0722778.json', 'e7dd8335.json', 'f3cdc58f.json', 'cad67732.json', + '256b0a75.json', 'd282b262.json', '58743b76.json', '6df30ad6.json', + '9110e3c5.json', '48f8583b.json', 'a680ac02.json', '642248e4.json', + '2685904e.json', '48131b3c.json', 'b7cb93ac.json', '73182012.json', + 'df8cc377.json', '3b4c2228.json', '93c31fbe.json', '8ee62060.json', + '9b2a60aa.json', 'f0df5ff0.json', '917bccba.json', 'ed98d772.json', + 'bf89d739.json', 'f3e62deb.json', '11e1fe23.json', 'bbb1b8b6.json', + 'f4081712.json', '817e6c09.json', '45bbe264.json', 'f3b10344.json', + 'fafd9572.json', 'b7f8a4d8.json', '2c0b0aff.json', '8cb8642d.json', + '67c52801.json', 'd47aa2ff.json', '0934a4d8.json', '60a26a3e.json', + 'cf133acc.json', '5289ad53.json', '16b78196.json', '09c534e7.json', + 'f83cb3f6.json', 'd017b73f.json', 'b20f7c8b.json', '5af49b42.json', + '18419cfa.json', '929ab4e9.json', '6a11f6da.json', '17cae0c1.json', + 'e99362f0.json', '1c56ad9f.json', '8a371977.json', 'e633a9e5.json', + 'c658a4bd.json', 'bc4146bd.json', '67636eac.json', '4e45f183.json', + '17b80ad2.json', '94133066.json', 'e1d2900e.json', 'a934301b.json', + '0a2355a6.json', '45737921.json', '332efdb3.json', '7bb29440.json', + 'f9a67cb5.json', 'a8610ef7.json', '32e9702f.json', '0c786b71.json', + '626c0bcc.json', 'aa300dc3.json', 'c62e2108.json', '0692e18c.json', + 'af22c60d.json', '992798f6.json', 'c48954c1.json', '5b526a93.json', + 'ae58858e.json', 'ff72ca3e.json', '2b01abd0.json', '7d18a6fb.json', + '963f59bc.json', '759f3fd3.json', '7c9b52a0.json', '4852f2fa.json', + '14754a24.json', 'c87289bb.json', '845d6e51.json', '281123b4.json', + '79369cc6.json', '0a1d4ef5.json', '477d2879.json', '72a961c9.json', + '67b4a34d.json', 'e5c44e8f.json', 'bf699163.json', '13713586.json', + '27f8ce4f.json', '95a58926.json', '15696249.json', 'd2acf2cb.json', + '140c817e.json', '1990f7a8.json', '782b5218.json', '8b28cd80.json', + '92e50de0.json', 'e619ca6e.json', '5b6cbef5.json', '575b1a71.json', + '66e6c45b.json', '31adaf00.json', '6ea4a07e.json', 'f0afb749.json', + '00dbd492.json', 'b1fc8b8e.json', 'fd4b2b02.json', 'b15fca0b.json', + 'a04b2602.json', '20818e16.json', '762cd429.json', '29700607.json', + 'd5c634a2.json', 'a406ac07.json', '8ba14f53.json', '184a9768.json', + '12997ef3.json', 'dd2401ed.json', 'f8be4b64.json', '12eac192.json', + '31d5ba1a.json', 'b4a43f3b.json', '7ee1c6ea.json', '9b4c17c4.json', + '981571dc.json', '93b4f4b3.json', '9caba7c3.json', '891232d6.json', + '85fa5666.json', '0e671a1a.json', '73ccf9c2.json', '414297c0.json', + 'e872b94a.json', '99306f82.json', '3979b1a8.json', '2753e76c.json', + '1c0d0a4b.json', '292dd178.json', 'cd3c21df.json', '33b52de3.json', + 'ecaa0ec1.json', '896d5239.json', '1a6449f1.json', '9c1e755f.json' + ] + + @staticmethod + def load(path: str): + task_file_dir = get_data_path(path) + + dataset = [] + + task_file_name_list = os.listdir(task_file_dir) + for task_file_name in task_file_name_list: + if task_file_name not in ARCPrizeDataset.task_file_names: + continue + with open(os.path.join(task_file_dir, task_file_name), + 'r') as file: + task = json.load(file) + task = { + 'training_data': task['train'], + 'input_test_data': 
task['test'][0]['input'], + 'output_test_data': task['test'][0]['output'] + } + dataset.append(task) + + return Dataset.from_list(dataset) + + +class ARCPrizeEvaluator(BaseEvaluator): + + def score(self, predictions: List[str], + references: List[List[int]]) -> Dict: + accuracy = [] + details = [] + for pred, refer in zip(map(extract_solution, predictions), references): + is_correct, correct_percentage = compare_solutions_with_padding( + pred, refer, pad_value=-1) + details.append({ + 'solved': True if is_correct else False, + 'correct_percentage': correct_percentage, + 'generated_solution': pred + }) + accuracy.append(1 if is_correct else 0) + + return {'accuracy': np.mean(accuracy), 'details': details} + + +def extract_solution(text): + try: + # Find the part of the text that looks like a nested list + start = text.index('[[') + end = text.index(']]', start) + 2 + array_str = text[start:end] + + # Use ast.literal_eval to safely evaluate the + # string as a Python expression + array = ast.literal_eval(array_str) + # Check if the result is a list of lists + if all(isinstance(i, list) for i in array): + if all(all(isinstance(i, int) for i in j) for j in array): + return array + else: + return [[0]] + else: + return [[0]] + except (ValueError, SyntaxError): + return [[0]] + + +def pad_array_with_value(array, target_shape, pad_value): + padded_array = np.full(target_shape, pad_value, dtype=int) + for i in range(len(array)): + padded_array[i, :len(array[i])] = array[i] + return padded_array + + +def compare_solutions_with_padding(generated_output: List[int], + correct_output: List[int], + pad_value=-1): + max_rows = max(len(generated_output), len(correct_output)) + max_cols = max(max(map(len, generated_output)), + max(map(len, correct_output))) + target_shape = (max_rows, max_cols) + + padded_generated = pad_array_with_value(generated_output, target_shape, + pad_value) + padded_correct = pad_array_with_value(correct_output, target_shape, + pad_value) + + total_pixels = max_rows * max_cols + correct_pixels = np.sum((padded_generated == padded_correct) + & (padded_generated != pad_value) + & (padded_correct != pad_value)) + correct_percentage = (correct_pixels / total_pixels) * 100 + + is_correct = (correct_pixels == total_pixels) + + return is_correct, correct_percentage diff --git a/build/lib/opencompass/datasets/ax.py b/build/lib/opencompass/datasets/ax.py new file mode 100644 index 0000000000000000000000000000000000000000..0dc542ed5b1b82bfe426731fa54c0ff9b97e5848 --- /dev/null +++ b/build/lib/opencompass/datasets/ax.py @@ -0,0 +1,37 @@ +import json +from os import environ + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class AXDatasetV2(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path) + dataset = [] + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + ms_dataset = MsDataset.load('opencompass/super_glue', + subset_name='axb')['test'] + for data in ms_dataset: + row = data + row['label'] = {0: 'A', 1: 'B'}[data['label']] + dataset.append(row) + else: + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + line['label'] = { + 'entailment': 'A', + 'not_entailment': 'B' + }[line['label']] + dataset.append(line) + dataset = Dataset.from_list(dataset) + return dataset diff --git a/build/lib/opencompass/registry.py b/build/lib/opencompass/registry.py new file mode 
100644 index 0000000000000000000000000000000000000000..561732fa527dbcf228933bd0d3ea13cf8dbee451 --- /dev/null +++ b/build/lib/opencompass/registry.py @@ -0,0 +1,55 @@ +from typing import Callable, List, Optional, Type, Union + +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import Registry as OriginalRegistry + + +class Registry(OriginalRegistry): + + # override the default force behavior + def register_module( + self, + name: Optional[Union[str, List[str]]] = None, + force: bool = True, + module: Optional[Type] = None) -> Union[type, Callable]: + return super().register_module(name, force, module) + + +PARTITIONERS = Registry('partitioner', locations=['opencompass.partitioners']) +RUNNERS = Registry('runner', locations=['opencompass.runners']) +TASKS = Registry('task', locations=['opencompass.tasks']) +MODELS = Registry('model', locations=['opencompass.models']) +# TODO: LOAD_DATASET -> DATASETS +LOAD_DATASET = Registry('load_dataset', locations=['opencompass.datasets']) +TEXT_POSTPROCESSORS = Registry( + 'text_postprocessors', locations=['opencompass.utils.text_postprocessors']) +DICT_POSTPROCESSORS = Registry('dict_postprocessors', + locations=[ + 'opencompass.utils.dict_postprocessors', + 'opencompass.datasets.generic' + ]) + +EVALUATORS = Registry('evaluators', locations=['opencompass.evaluators']) + +ICL_INFERENCERS = Registry('icl_inferencers', + locations=['opencompass.openicl.icl_inferencer']) +ICL_RETRIEVERS = Registry('icl_retrievers', + locations=['opencompass.openicl.icl_retriever']) +ICL_DATASET_READERS = Registry( + 'icl_dataset_readers', + locations=['opencompass.openicl.icl_dataset_reader']) +ICL_PROMPT_TEMPLATES = Registry( + 'icl_prompt_templates', + locations=['opencompass.openicl.icl_prompt_template']) +ICL_EVALUATORS = Registry( + 'icl_evaluators', + locations=['opencompass.openicl.icl_evaluator', 'opencompass.evaluator']) +METRICS = Registry('metric', + parent=MMENGINE_METRICS, + locations=['opencompass.metrics']) +TOT_WRAPPER = Registry('tot_wrapper', locations=['opencompass.datasets']) + + +def build_from_cfg(cfg): + """A helper function that builds object with MMEngine's new config.""" + return PARTITIONERS.build(cfg)
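As a minimal standalone sketch of the multiple-select scoring path above (toy inputs; the two helpers mirror my_multiple_select_postprocess and the per-sample set arithmetic in F1Evaluator.score):

import re


def multiple_select_postprocess(text: str) -> str:
    # Mirrors my_multiple_select_postprocess above: keep the unique
    # uppercase option letters, sorted, joined as 'A, B, C'.
    return ', '.join(sorted({t for t in text if t.isupper()}))


def sample_f1(pred: str, ref: str) -> float:
    # Mirrors the per-sample set arithmetic in F1Evaluator.score above:
    # strip everything but option letters and commas, then compare sets.
    pred = re.sub(r'[^A-Da-d,]+', '', pred.lower())
    ref = re.sub(r'[^A-Da-d,]+', '', ref.lower())
    pred_set = {p.strip() for p in pred.split(',')}
    ref_set = {r.strip() for r in ref.split(',')}
    tp = len(pred_set & ref_set)
    fp = len(pred_set - ref_set)
    fn = len(ref_set - pred_set)
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


print(multiple_select_postprocess('BAD'))  # A, B, D
print(sample_f1('A, B, D', 'A, B'))  # ~0.8 (tp=2, fp=1, fn=0)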
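AccDropEvaluator.score in advglue.py depends on the ordering produced by aug_with_original_data: adversarial rows fill the first half of the concatenated dataset and their originals the second half, so the first half yields acc_after and the second acc_before. A toy walk-through of the arithmetic, with made-up accuracies:

# Toy arithmetic for AccDropEvaluator.score (accuracies are made up).
acc_after = 0.60   # accuracy on the adversarial half (first n // 2 items)
acc_before = 0.80  # accuracy on the original half (last n // 2 items)
acc_drop = 1 - acc_after / acc_before
print(round(acc_drop, 3))  # 0.25: the attack erases a quarter of the clean accuracy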
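The ARC-Prize helpers in arc_prize_public_evaluation.py can also be exercised directly. A usage sketch, assuming the module is importable under the path added by this diff; the completion string and the grids are made up:

from opencompass.datasets.arc_prize_public_evaluation import (
    compare_solutions_with_padding, extract_solution)

completion = 'Final grid:\n[[1, 2], [3, 4]]'  # made-up model output
pred = extract_solution(completion)  # -> [[1, 2], [3, 4]]

# Same-shaped, identical grids: exact match, 100% of pixels correct.
ok, pct = compare_solutions_with_padding(pred, [[1, 2], [3, 4]])
print(ok, pct)  # True 100.0

# Shape mismatches are padded with -1; only overlapping, non-padded
# pixels can count as correct, so this scores partial credit.
ok, pct = compare_solutions_with_padding(pred, [[1, 2, 9], [3, 4, 9]])
print(ok, pct)  # False ~66.7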
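The registries in registry.py follow the usual mmengine pattern: decorate a class with register_module() (note the override above defaults force to True), then rebuild an instance from a config dict keyed by 'type'. A minimal sketch, assuming a working opencompass and mmengine install; MyAccEvaluator is a hypothetical class used only for illustration:

from opencompass.registry import ICL_EVALUATORS


@ICL_EVALUATORS.register_module()
class MyAccEvaluator:  # hypothetical evaluator, for illustration only
    def __init__(self, scale: float = 100.0):
        self.scale = scale

    def score(self, predictions, references):
        correct = sum(p == r for p, r in zip(predictions, references))
        return {'accuracy': self.scale * correct / len(references)}


# The registry rebuilds an instance from a config dict keyed by 'type'.
evaluator = ICL_EVALUATORS.build(dict(type='MyAccEvaluator', scale=100.0))
print(evaluator.score(['A', 'B'], ['A', 'C']))  # {'accuracy': 50.0}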