Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py +186 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py +154 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py +516 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py +218 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py +265 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py +237 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py +168 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py +158 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py +174 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py +95 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py +375 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py +227 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py +95 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py +520 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py +133 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py +331 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py +146 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py +296 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py +196 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py +136 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py +75 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py +56 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py +125 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py +354 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py +510 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py +76 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py +136 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py +75 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py +867 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py +629 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py +166 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py +2489 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py +397 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py +256 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py +393 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py +772 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py +684 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py +479 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py +470 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py +794 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py +234 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py +453 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py +1605 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py +553 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py +835 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py +395 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py +34 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py +27 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py +343 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py +137 -0
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
NLTK corpus readers. The modules in this package provide functions
|
| 11 |
+
that can be used to read corpus fileids in a variety of formats. These
|
| 12 |
+
functions can be used to read both the corpus fileids that are
|
| 13 |
+
distributed in the NLTK corpus package, and corpus fileids that are part
|
| 14 |
+
of external corpora.
|
| 15 |
+
|
| 16 |
+
Corpus Reader Functions
|
| 17 |
+
=======================
|
| 18 |
+
Each corpus module defines one or more "corpus reader functions",
|
| 19 |
+
which can be used to read documents from that corpus. These functions
|
| 20 |
+
take an argument, ``item``, which is used to indicate which document
|
| 21 |
+
should be read from the corpus:
|
| 22 |
+
|
| 23 |
+
- If ``item`` is one of the unique identifiers listed in the corpus
|
| 24 |
+
module's ``items`` variable, then the corresponding document will
|
| 25 |
+
be loaded from the NLTK corpus package.
|
| 26 |
+
- If ``item`` is a fileid, then that file will be read.
|
| 27 |
+
|
| 28 |
+
Additionally, corpus reader functions can be given lists of item
|
| 29 |
+
names; in which case, they will return a concatenation of the
|
| 30 |
+
corresponding documents.
|
| 31 |
+
|
| 32 |
+
Corpus reader functions are named based on the type of information
|
| 33 |
+
they return. Some common examples, and their return types, are:
|
| 34 |
+
|
| 35 |
+
- words(): list of str
|
| 36 |
+
- sents(): list of (list of str)
|
| 37 |
+
- paras(): list of (list of (list of str))
|
| 38 |
+
- tagged_words(): list of (str,str) tuple
|
| 39 |
+
- tagged_sents(): list of (list of (str,str))
|
| 40 |
+
- tagged_paras(): list of (list of (list of (str,str)))
|
| 41 |
+
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
| 42 |
+
- parsed_sents(): list of (Tree with str leaves)
|
| 43 |
+
- parsed_paras(): list of (list of (Tree with str leaves))
|
| 44 |
+
- xml(): A single xml ElementTree
|
| 45 |
+
- raw(): unprocessed corpus contents
|
| 46 |
+
|
| 47 |
+
For example, to read a list of the words in the Brown Corpus, use
|
| 48 |
+
``nltk.corpus.brown.words()``:
|
| 49 |
+
|
| 50 |
+
>>> from nltk.corpus import brown
|
| 51 |
+
>>> print(", ".join(brown.words()[:6])) # only first 6 words
|
| 52 |
+
The, Fulton, County, Grand, Jury, said
|
| 53 |
+
|
| 54 |
+
isort:skip_file
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
from nltk.corpus.reader.plaintext import *
|
| 58 |
+
from nltk.corpus.reader.util import *
|
| 59 |
+
from nltk.corpus.reader.api import *
|
| 60 |
+
from nltk.corpus.reader.tagged import *
|
| 61 |
+
from nltk.corpus.reader.cmudict import *
|
| 62 |
+
from nltk.corpus.reader.conll import *
|
| 63 |
+
from nltk.corpus.reader.chunked import *
|
| 64 |
+
from nltk.corpus.reader.wordlist import *
|
| 65 |
+
from nltk.corpus.reader.xmldocs import *
|
| 66 |
+
from nltk.corpus.reader.ppattach import *
|
| 67 |
+
from nltk.corpus.reader.senseval import *
|
| 68 |
+
from nltk.corpus.reader.ieer import *
|
| 69 |
+
from nltk.corpus.reader.sinica_treebank import *
|
| 70 |
+
from nltk.corpus.reader.bracket_parse import *
|
| 71 |
+
from nltk.corpus.reader.indian import *
|
| 72 |
+
from nltk.corpus.reader.toolbox import *
|
| 73 |
+
from nltk.corpus.reader.timit import *
|
| 74 |
+
from nltk.corpus.reader.ycoe import *
|
| 75 |
+
from nltk.corpus.reader.rte import *
|
| 76 |
+
from nltk.corpus.reader.string_category import *
|
| 77 |
+
from nltk.corpus.reader.propbank import *
|
| 78 |
+
from nltk.corpus.reader.verbnet import *
|
| 79 |
+
from nltk.corpus.reader.bnc import *
|
| 80 |
+
from nltk.corpus.reader.nps_chat import *
|
| 81 |
+
from nltk.corpus.reader.wordnet import *
|
| 82 |
+
from nltk.corpus.reader.switchboard import *
|
| 83 |
+
from nltk.corpus.reader.dependency import *
|
| 84 |
+
from nltk.corpus.reader.nombank import *
|
| 85 |
+
from nltk.corpus.reader.ipipan import *
|
| 86 |
+
from nltk.corpus.reader.pl196x import *
|
| 87 |
+
from nltk.corpus.reader.knbc import *
|
| 88 |
+
from nltk.corpus.reader.chasen import *
|
| 89 |
+
from nltk.corpus.reader.childes import *
|
| 90 |
+
from nltk.corpus.reader.aligned import *
|
| 91 |
+
from nltk.corpus.reader.lin import *
|
| 92 |
+
from nltk.corpus.reader.semcor import *
|
| 93 |
+
from nltk.corpus.reader.framenet import *
|
| 94 |
+
from nltk.corpus.reader.udhr import *
|
| 95 |
+
from nltk.corpus.reader.bnc import *
|
| 96 |
+
from nltk.corpus.reader.sentiwordnet import *
|
| 97 |
+
from nltk.corpus.reader.twitter import *
|
| 98 |
+
from nltk.corpus.reader.nkjp import *
|
| 99 |
+
from nltk.corpus.reader.crubadan import *
|
| 100 |
+
from nltk.corpus.reader.mte import *
|
| 101 |
+
from nltk.corpus.reader.reviews import *
|
| 102 |
+
from nltk.corpus.reader.opinion_lexicon import *
|
| 103 |
+
from nltk.corpus.reader.pros_cons import *
|
| 104 |
+
from nltk.corpus.reader.categorized_sents import *
|
| 105 |
+
from nltk.corpus.reader.comparative_sents import *
|
| 106 |
+
from nltk.corpus.reader.panlex_lite import *
|
| 107 |
+
from nltk.corpus.reader.panlex_swadesh import *
|
| 108 |
+
from nltk.corpus.reader.bcp47 import *
|
| 109 |
+
|
| 110 |
+
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
|
| 111 |
+
# the function bracket_parse() defined in nltk.tree:
|
| 112 |
+
from nltk.corpus.reader import bracket_parse
|
| 113 |
+
|
| 114 |
+
__all__ = [
|
| 115 |
+
"CorpusReader",
|
| 116 |
+
"CategorizedCorpusReader",
|
| 117 |
+
"PlaintextCorpusReader",
|
| 118 |
+
"find_corpus_fileids",
|
| 119 |
+
"TaggedCorpusReader",
|
| 120 |
+
"CMUDictCorpusReader",
|
| 121 |
+
"ConllChunkCorpusReader",
|
| 122 |
+
"WordListCorpusReader",
|
| 123 |
+
"PPAttachmentCorpusReader",
|
| 124 |
+
"SensevalCorpusReader",
|
| 125 |
+
"IEERCorpusReader",
|
| 126 |
+
"ChunkedCorpusReader",
|
| 127 |
+
"SinicaTreebankCorpusReader",
|
| 128 |
+
"BracketParseCorpusReader",
|
| 129 |
+
"IndianCorpusReader",
|
| 130 |
+
"ToolboxCorpusReader",
|
| 131 |
+
"TimitCorpusReader",
|
| 132 |
+
"YCOECorpusReader",
|
| 133 |
+
"MacMorphoCorpusReader",
|
| 134 |
+
"SyntaxCorpusReader",
|
| 135 |
+
"AlpinoCorpusReader",
|
| 136 |
+
"RTECorpusReader",
|
| 137 |
+
"StringCategoryCorpusReader",
|
| 138 |
+
"EuroparlCorpusReader",
|
| 139 |
+
"CategorizedBracketParseCorpusReader",
|
| 140 |
+
"CategorizedTaggedCorpusReader",
|
| 141 |
+
"CategorizedPlaintextCorpusReader",
|
| 142 |
+
"PortugueseCategorizedPlaintextCorpusReader",
|
| 143 |
+
"tagged_treebank_para_block_reader",
|
| 144 |
+
"PropbankCorpusReader",
|
| 145 |
+
"VerbnetCorpusReader",
|
| 146 |
+
"BNCCorpusReader",
|
| 147 |
+
"ConllCorpusReader",
|
| 148 |
+
"XMLCorpusReader",
|
| 149 |
+
"NPSChatCorpusReader",
|
| 150 |
+
"SwadeshCorpusReader",
|
| 151 |
+
"WordNetCorpusReader",
|
| 152 |
+
"WordNetICCorpusReader",
|
| 153 |
+
"SwitchboardCorpusReader",
|
| 154 |
+
"DependencyCorpusReader",
|
| 155 |
+
"NombankCorpusReader",
|
| 156 |
+
"IPIPANCorpusReader",
|
| 157 |
+
"Pl196xCorpusReader",
|
| 158 |
+
"TEICorpusView",
|
| 159 |
+
"KNBCorpusReader",
|
| 160 |
+
"ChasenCorpusReader",
|
| 161 |
+
"CHILDESCorpusReader",
|
| 162 |
+
"AlignedCorpusReader",
|
| 163 |
+
"TimitTaggedCorpusReader",
|
| 164 |
+
"LinThesaurusCorpusReader",
|
| 165 |
+
"SemcorCorpusReader",
|
| 166 |
+
"FramenetCorpusReader",
|
| 167 |
+
"UdhrCorpusReader",
|
| 168 |
+
"BNCCorpusReader",
|
| 169 |
+
"SentiWordNetCorpusReader",
|
| 170 |
+
"SentiSynset",
|
| 171 |
+
"TwitterCorpusReader",
|
| 172 |
+
"NKJPCorpusReader",
|
| 173 |
+
"CrubadanCorpusReader",
|
| 174 |
+
"MTECorpusReader",
|
| 175 |
+
"ReviewsCorpusReader",
|
| 176 |
+
"OpinionLexiconCorpusReader",
|
| 177 |
+
"ProsConsCorpusReader",
|
| 178 |
+
"CategorizedSentencesCorpusReader",
|
| 179 |
+
"ComparativeSentencesCorpusReader",
|
| 180 |
+
"PanLexLiteCorpusReader",
|
| 181 |
+
"NonbreakingPrefixesCorpusReader",
|
| 182 |
+
"UnicharsCorpusReader",
|
| 183 |
+
"MWAPPDBCorpusReader",
|
| 184 |
+
"PanlexSwadeshCorpusReader",
|
| 185 |
+
"BCP47CorpusReader",
|
| 186 |
+
]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Aligned Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# URL: <https://www.nltk.org/>
|
| 5 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 9 |
+
from nltk.corpus.reader.util import (
|
| 10 |
+
StreamBackedCorpusView,
|
| 11 |
+
concat,
|
| 12 |
+
read_alignedsent_block,
|
| 13 |
+
)
|
| 14 |
+
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
|
| 15 |
+
from nltk.translate import AlignedSent, Alignment
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AlignedCorpusReader(CorpusReader):
|
| 19 |
+
"""
|
| 20 |
+
Reader for corpora of word-aligned sentences. Tokens are assumed
|
| 21 |
+
to be separated by whitespace. Sentences begin on separate lines.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
root,
|
| 27 |
+
fileids,
|
| 28 |
+
sep="/",
|
| 29 |
+
word_tokenizer=WhitespaceTokenizer(),
|
| 30 |
+
sent_tokenizer=RegexpTokenizer("\n", gaps=True),
|
| 31 |
+
alignedsent_block_reader=read_alignedsent_block,
|
| 32 |
+
encoding="latin1",
|
| 33 |
+
):
|
| 34 |
+
"""
|
| 35 |
+
Construct a new Aligned Corpus reader for a set of documents
|
| 36 |
+
located at the given root directory. Example usage:
|
| 37 |
+
|
| 38 |
+
>>> root = '/...path to corpus.../'
|
| 39 |
+
>>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP
|
| 40 |
+
|
| 41 |
+
:param root: The root directory for this corpus.
|
| 42 |
+
:param fileids: A list or regexp specifying the fileids in this corpus.
|
| 43 |
+
"""
|
| 44 |
+
CorpusReader.__init__(self, root, fileids, encoding)
|
| 45 |
+
self._sep = sep
|
| 46 |
+
self._word_tokenizer = word_tokenizer
|
| 47 |
+
self._sent_tokenizer = sent_tokenizer
|
| 48 |
+
self._alignedsent_block_reader = alignedsent_block_reader
|
| 49 |
+
|
| 50 |
+
def words(self, fileids=None):
|
| 51 |
+
"""
|
| 52 |
+
:return: the given file(s) as a list of words
|
| 53 |
+
and punctuation symbols.
|
| 54 |
+
:rtype: list(str)
|
| 55 |
+
"""
|
| 56 |
+
return concat(
|
| 57 |
+
[
|
| 58 |
+
AlignedSentCorpusView(
|
| 59 |
+
fileid,
|
| 60 |
+
enc,
|
| 61 |
+
False,
|
| 62 |
+
False,
|
| 63 |
+
self._word_tokenizer,
|
| 64 |
+
self._sent_tokenizer,
|
| 65 |
+
self._alignedsent_block_reader,
|
| 66 |
+
)
|
| 67 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 68 |
+
]
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
def sents(self, fileids=None):
|
| 72 |
+
"""
|
| 73 |
+
:return: the given file(s) as a list of
|
| 74 |
+
sentences or utterances, each encoded as a list of word
|
| 75 |
+
strings.
|
| 76 |
+
:rtype: list(list(str))
|
| 77 |
+
"""
|
| 78 |
+
return concat(
|
| 79 |
+
[
|
| 80 |
+
AlignedSentCorpusView(
|
| 81 |
+
fileid,
|
| 82 |
+
enc,
|
| 83 |
+
False,
|
| 84 |
+
True,
|
| 85 |
+
self._word_tokenizer,
|
| 86 |
+
self._sent_tokenizer,
|
| 87 |
+
self._alignedsent_block_reader,
|
| 88 |
+
)
|
| 89 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
def aligned_sents(self, fileids=None):
|
| 94 |
+
"""
|
| 95 |
+
:return: the given file(s) as a list of AlignedSent objects.
|
| 96 |
+
:rtype: list(AlignedSent)
|
| 97 |
+
"""
|
| 98 |
+
return concat(
|
| 99 |
+
[
|
| 100 |
+
AlignedSentCorpusView(
|
| 101 |
+
fileid,
|
| 102 |
+
enc,
|
| 103 |
+
True,
|
| 104 |
+
True,
|
| 105 |
+
self._word_tokenizer,
|
| 106 |
+
self._sent_tokenizer,
|
| 107 |
+
self._alignedsent_block_reader,
|
| 108 |
+
)
|
| 109 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 110 |
+
]
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class AlignedSentCorpusView(StreamBackedCorpusView):
|
| 115 |
+
"""
|
| 116 |
+
A specialized corpus view for aligned sentences.
|
| 117 |
+
``AlignedSentCorpusView`` objects are typically created by
|
| 118 |
+
``AlignedCorpusReader`` (not directly by nltk users).
|
| 119 |
+
"""
|
| 120 |
+
|
| 121 |
+
def __init__(
|
| 122 |
+
self,
|
| 123 |
+
corpus_file,
|
| 124 |
+
encoding,
|
| 125 |
+
aligned,
|
| 126 |
+
group_by_sent,
|
| 127 |
+
word_tokenizer,
|
| 128 |
+
sent_tokenizer,
|
| 129 |
+
alignedsent_block_reader,
|
| 130 |
+
):
|
| 131 |
+
self._aligned = aligned
|
| 132 |
+
self._group_by_sent = group_by_sent
|
| 133 |
+
self._word_tokenizer = word_tokenizer
|
| 134 |
+
self._sent_tokenizer = sent_tokenizer
|
| 135 |
+
self._alignedsent_block_reader = alignedsent_block_reader
|
| 136 |
+
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
|
| 137 |
+
|
| 138 |
+
def read_block(self, stream):
|
| 139 |
+
block = [
|
| 140 |
+
self._word_tokenizer.tokenize(sent_str)
|
| 141 |
+
for alignedsent_str in self._alignedsent_block_reader(stream)
|
| 142 |
+
for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
|
| 143 |
+
]
|
| 144 |
+
if self._aligned:
|
| 145 |
+
block[2] = Alignment.fromstring(
|
| 146 |
+
" ".join(block[2])
|
| 147 |
+
) # kludge; we shouldn't have tokenized the alignment string
|
| 148 |
+
block = [AlignedSent(*block)]
|
| 149 |
+
elif self._group_by_sent:
|
| 150 |
+
block = [block[0]]
|
| 151 |
+
else:
|
| 152 |
+
block = block[0]
|
| 153 |
+
|
| 154 |
+
return block
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py
ADDED
|
@@ -0,0 +1,516 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: API for Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
API for corpus readers.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import re
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from itertools import chain
|
| 17 |
+
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class CorpusReader:
|
| 23 |
+
"""
|
| 24 |
+
A base class for "corpus reader" classes, each of which can be
|
| 25 |
+
used to read a specific corpus format. Each individual corpus
|
| 26 |
+
reader instance is used to read a specific corpus, consisting of
|
| 27 |
+
one or more files under a common root directory. Each file is
|
| 28 |
+
identified by its ``file identifier``, which is the relative path
|
| 29 |
+
to the file from the root directory.
|
| 30 |
+
|
| 31 |
+
A separate subclass is defined for each corpus format. These
|
| 32 |
+
subclasses define one or more methods that provide 'views' on the
|
| 33 |
+
corpus contents, such as ``words()`` (for a list of words) and
|
| 34 |
+
``parsed_sents()`` (for a list of parsed sentences). Called with
|
| 35 |
+
no arguments, these methods will return the contents of the entire
|
| 36 |
+
corpus. For most corpora, these methods define one or more
|
| 37 |
+
selection arguments, such as ``fileids`` or ``categories``, which can
|
| 38 |
+
be used to select which portion of the corpus should be returned.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
def __init__(self, root, fileids, encoding="utf8", tagset=None):
|
| 42 |
+
"""
|
| 43 |
+
:type root: PathPointer or str
|
| 44 |
+
:param root: A path pointer identifying the root directory for
|
| 45 |
+
this corpus. If a string is specified, then it will be
|
| 46 |
+
converted to a ``PathPointer`` automatically.
|
| 47 |
+
:param fileids: A list of the files that make up this corpus.
|
| 48 |
+
This list can either be specified explicitly, as a list of
|
| 49 |
+
strings; or implicitly, as a regular expression over file
|
| 50 |
+
paths. The absolute path for each file will be constructed
|
| 51 |
+
by joining the reader's root to each file name.
|
| 52 |
+
:param encoding: The default unicode encoding for the files
|
| 53 |
+
that make up the corpus. The value of ``encoding`` can be any
|
| 54 |
+
of the following:
|
| 55 |
+
|
| 56 |
+
- A string: ``encoding`` is the encoding name for all files.
|
| 57 |
+
- A dictionary: ``encoding[file_id]`` is the encoding
|
| 58 |
+
name for the file whose identifier is ``file_id``. If
|
| 59 |
+
``file_id`` is not in ``encoding``, then the file
|
| 60 |
+
contents will be processed using non-unicode byte strings.
|
| 61 |
+
- A list: ``encoding`` should be a list of ``(regexp, encoding)``
|
| 62 |
+
tuples. The encoding for a file whose identifier is ``file_id``
|
| 63 |
+
will be the ``encoding`` value for the first tuple whose
|
| 64 |
+
``regexp`` matches the ``file_id``. If no tuple's ``regexp``
|
| 65 |
+
matches the ``file_id``, the file contents will be processed
|
| 66 |
+
using non-unicode byte strings.
|
| 67 |
+
- None: the file contents of all files will be
|
| 68 |
+
processed using non-unicode byte strings.
|
| 69 |
+
:param tagset: The name of the tagset used by this corpus, to be used
|
| 70 |
+
for normalizing or converting the POS tags returned by the
|
| 71 |
+
``tagged_...()`` methods.
|
| 72 |
+
"""
|
| 73 |
+
# Convert the root to a path pointer, if necessary.
|
| 74 |
+
if isinstance(root, str) and not isinstance(root, PathPointer):
|
| 75 |
+
m = re.match(r"(.*\.zip)/?(.*)$|", root)
|
| 76 |
+
zipfile, zipentry = m.groups()
|
| 77 |
+
if zipfile:
|
| 78 |
+
root = ZipFilePathPointer(zipfile, zipentry)
|
| 79 |
+
else:
|
| 80 |
+
root = FileSystemPathPointer(root)
|
| 81 |
+
elif not isinstance(root, PathPointer):
|
| 82 |
+
raise TypeError("CorpusReader: expected a string or a PathPointer")
|
| 83 |
+
|
| 84 |
+
# If `fileids` is a regexp, then expand it.
|
| 85 |
+
if isinstance(fileids, str):
|
| 86 |
+
fileids = find_corpus_fileids(root, fileids)
|
| 87 |
+
|
| 88 |
+
self._fileids = fileids
|
| 89 |
+
"""A list of the relative paths for the fileids that make up
|
| 90 |
+
this corpus."""
|
| 91 |
+
|
| 92 |
+
self._root = root
|
| 93 |
+
"""The root directory for this corpus."""
|
| 94 |
+
|
| 95 |
+
self._readme = "README"
|
| 96 |
+
self._license = "LICENSE"
|
| 97 |
+
self._citation = "citation.bib"
|
| 98 |
+
|
| 99 |
+
# If encoding was specified as a list of regexps, then convert
|
| 100 |
+
# it to a dictionary.
|
| 101 |
+
if isinstance(encoding, list):
|
| 102 |
+
encoding_dict = {}
|
| 103 |
+
for fileid in self._fileids:
|
| 104 |
+
for x in encoding:
|
| 105 |
+
(regexp, enc) = x
|
| 106 |
+
if re.match(regexp, fileid):
|
| 107 |
+
encoding_dict[fileid] = enc
|
| 108 |
+
break
|
| 109 |
+
encoding = encoding_dict
|
| 110 |
+
|
| 111 |
+
self._encoding = encoding
|
| 112 |
+
"""The default unicode encoding for the fileids that make up
|
| 113 |
+
this corpus. If ``encoding`` is None, then the file
|
| 114 |
+
contents are processed using byte strings."""
|
| 115 |
+
self._tagset = tagset
|
| 116 |
+
|
| 117 |
+
def __repr__(self):
|
| 118 |
+
if isinstance(self._root, ZipFilePathPointer):
|
| 119 |
+
path = f"{self._root.zipfile.filename}/{self._root.entry}"
|
| 120 |
+
else:
|
| 121 |
+
path = "%s" % self._root.path
|
| 122 |
+
return f"<{self.__class__.__name__} in {path!r}>"
|
| 123 |
+
|
| 124 |
+
def ensure_loaded(self):
|
| 125 |
+
"""
|
| 126 |
+
Load this corpus (if it has not already been loaded). This is
|
| 127 |
+
used by LazyCorpusLoader as a simple method that can be used to
|
| 128 |
+
make sure a corpus is loaded -- e.g., in case a user wants to
|
| 129 |
+
do help(some_corpus).
|
| 130 |
+
"""
|
| 131 |
+
pass # no need to actually do anything.
|
| 132 |
+
|
| 133 |
+
def readme(self):
|
| 134 |
+
"""
|
| 135 |
+
Return the contents of the corpus README file, if it exists.
|
| 136 |
+
"""
|
| 137 |
+
with self.open(self._readme) as f:
|
| 138 |
+
return f.read()
|
| 139 |
+
|
| 140 |
+
def license(self):
|
| 141 |
+
"""
|
| 142 |
+
Return the contents of the corpus LICENSE file, if it exists.
|
| 143 |
+
"""
|
| 144 |
+
with self.open(self._license) as f:
|
| 145 |
+
return f.read()
|
| 146 |
+
|
| 147 |
+
def citation(self):
|
| 148 |
+
"""
|
| 149 |
+
Return the contents of the corpus citation.bib file, if it exists.
|
| 150 |
+
"""
|
| 151 |
+
with self.open(self._citation) as f:
|
| 152 |
+
return f.read()
|
| 153 |
+
|
| 154 |
+
def fileids(self):
|
| 155 |
+
"""
|
| 156 |
+
Return a list of file identifiers for the fileids that make up
|
| 157 |
+
this corpus.
|
| 158 |
+
"""
|
| 159 |
+
return self._fileids
|
| 160 |
+
|
| 161 |
+
def abspath(self, fileid):
|
| 162 |
+
"""
|
| 163 |
+
Return the absolute path for the given file.
|
| 164 |
+
|
| 165 |
+
:type fileid: str
|
| 166 |
+
:param fileid: The file identifier for the file whose path
|
| 167 |
+
should be returned.
|
| 168 |
+
:rtype: PathPointer
|
| 169 |
+
"""
|
| 170 |
+
return self._root.join(fileid)
|
| 171 |
+
|
| 172 |
+
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
|
| 173 |
+
"""
|
| 174 |
+
Return a list of the absolute paths for all fileids in this corpus;
|
| 175 |
+
or for the given list of fileids, if specified.
|
| 176 |
+
|
| 177 |
+
:type fileids: None or str or list
|
| 178 |
+
:param fileids: Specifies the set of fileids for which paths should
|
| 179 |
+
be returned. Can be None, for all fileids; a list of
|
| 180 |
+
file identifiers, for a specified set of fileids; or a single
|
| 181 |
+
file identifier, for a single file. Note that the return
|
| 182 |
+
value is always a list of paths, even if ``fileids`` is a
|
| 183 |
+
single file identifier.
|
| 184 |
+
|
| 185 |
+
:param include_encoding: If true, then return a list of
|
| 186 |
+
``(path_pointer, encoding)`` tuples.
|
| 187 |
+
|
| 188 |
+
:rtype: list(PathPointer)
|
| 189 |
+
"""
|
| 190 |
+
if fileids is None:
|
| 191 |
+
fileids = self._fileids
|
| 192 |
+
elif isinstance(fileids, str):
|
| 193 |
+
fileids = [fileids]
|
| 194 |
+
|
| 195 |
+
paths = [self._root.join(f) for f in fileids]
|
| 196 |
+
|
| 197 |
+
if include_encoding and include_fileid:
|
| 198 |
+
return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
|
| 199 |
+
elif include_fileid:
|
| 200 |
+
return list(zip(paths, fileids))
|
| 201 |
+
elif include_encoding:
|
| 202 |
+
return list(zip(paths, [self.encoding(f) for f in fileids]))
|
| 203 |
+
else:
|
| 204 |
+
return paths
|
| 205 |
+
|
| 206 |
+
def raw(self, fileids=None):
|
| 207 |
+
"""
|
| 208 |
+
:param fileids: A list specifying the fileids that should be used.
|
| 209 |
+
:return: the given file(s) as a single string.
|
| 210 |
+
:rtype: str
|
| 211 |
+
"""
|
| 212 |
+
if fileids is None:
|
| 213 |
+
fileids = self._fileids
|
| 214 |
+
elif isinstance(fileids, str):
|
| 215 |
+
fileids = [fileids]
|
| 216 |
+
contents = []
|
| 217 |
+
for f in fileids:
|
| 218 |
+
with self.open(f) as fp:
|
| 219 |
+
contents.append(fp.read())
|
| 220 |
+
return concat(contents)
|
| 221 |
+
|
| 222 |
+
def open(self, file):
|
| 223 |
+
"""
|
| 224 |
+
Return an open stream that can be used to read the given file.
|
| 225 |
+
If the file's encoding is not None, then the stream will
|
| 226 |
+
automatically decode the file's contents into unicode.
|
| 227 |
+
|
| 228 |
+
:param file: The file identifier of the file to read.
|
| 229 |
+
"""
|
| 230 |
+
encoding = self.encoding(file)
|
| 231 |
+
stream = self._root.join(file).open(encoding)
|
| 232 |
+
return stream
|
| 233 |
+
|
| 234 |
+
def encoding(self, file):
|
| 235 |
+
"""
|
| 236 |
+
Return the unicode encoding for the given corpus file, if known.
|
| 237 |
+
If the encoding is unknown, or if the given file should be
|
| 238 |
+
processed using byte strings (str), then return None.
|
| 239 |
+
"""
|
| 240 |
+
if isinstance(self._encoding, dict):
|
| 241 |
+
return self._encoding.get(file)
|
| 242 |
+
else:
|
| 243 |
+
return self._encoding
|
| 244 |
+
|
| 245 |
+
def _get_root(self):
|
| 246 |
+
return self._root
|
| 247 |
+
|
| 248 |
+
root = property(
|
| 249 |
+
_get_root,
|
| 250 |
+
doc="""
|
| 251 |
+
The directory where this corpus is stored.
|
| 252 |
+
|
| 253 |
+
:type: PathPointer""",
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
######################################################################
|
| 258 |
+
# { Corpora containing categorized items
|
| 259 |
+
######################################################################
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class CategorizedCorpusReader:
|
| 263 |
+
"""
|
| 264 |
+
A mixin class used to aid in the implementation of corpus readers
|
| 265 |
+
for categorized corpora. This class defines the method
|
| 266 |
+
``categories()``, which returns a list of the categories for the
|
| 267 |
+
corpus or for a specified set of fileids; and overrides ``fileids()``
|
| 268 |
+
to take a ``categories`` argument, restricting the set of fileids to
|
| 269 |
+
be returned.
|
| 270 |
+
|
| 271 |
+
Subclasses are expected to:
|
| 272 |
+
|
| 273 |
+
- Call ``__init__()`` to set up the mapping.
|
| 274 |
+
|
| 275 |
+
- Override all view methods to accept a ``categories`` parameter,
|
| 276 |
+
which can be used *instead* of the ``fileids`` parameter, to
|
| 277 |
+
select which fileids should be included in the returned view.
|
| 278 |
+
"""
|
| 279 |
+
|
| 280 |
+
def __init__(self, kwargs):
|
| 281 |
+
"""
|
| 282 |
+
Initialize this mapping based on keyword arguments, as
|
| 283 |
+
follows:
|
| 284 |
+
|
| 285 |
+
- cat_pattern: A regular expression pattern used to find the
|
| 286 |
+
category for each file identifier. The pattern will be
|
| 287 |
+
applied to each file identifier, and the first matching
|
| 288 |
+
group will be used as the category label for that file.
|
| 289 |
+
|
| 290 |
+
- cat_map: A dictionary, mapping from file identifiers to
|
| 291 |
+
category labels.
|
| 292 |
+
|
| 293 |
+
- cat_file: The name of a file that contains the mapping
|
| 294 |
+
from file identifiers to categories. The argument
|
| 295 |
+
``cat_delimiter`` can be used to specify a delimiter.
|
| 296 |
+
|
| 297 |
+
The corresponding argument will be deleted from ``kwargs``. If
|
| 298 |
+
more than one argument is specified, an exception will be
|
| 299 |
+
raised.
|
| 300 |
+
"""
|
| 301 |
+
self._f2c = None #: file-to-category mapping
|
| 302 |
+
self._c2f = None #: category-to-file mapping
|
| 303 |
+
|
| 304 |
+
self._pattern = None #: regexp specifying the mapping
|
| 305 |
+
self._map = None #: dict specifying the mapping
|
| 306 |
+
self._file = None #: fileid of file containing the mapping
|
| 307 |
+
self._delimiter = None #: delimiter for ``self._file``
|
| 308 |
+
|
| 309 |
+
if "cat_pattern" in kwargs:
|
| 310 |
+
self._pattern = kwargs["cat_pattern"]
|
| 311 |
+
del kwargs["cat_pattern"]
|
| 312 |
+
elif "cat_map" in kwargs:
|
| 313 |
+
self._map = kwargs["cat_map"]
|
| 314 |
+
del kwargs["cat_map"]
|
| 315 |
+
elif "cat_file" in kwargs:
|
| 316 |
+
self._file = kwargs["cat_file"]
|
| 317 |
+
del kwargs["cat_file"]
|
| 318 |
+
if "cat_delimiter" in kwargs:
|
| 319 |
+
self._delimiter = kwargs["cat_delimiter"]
|
| 320 |
+
del kwargs["cat_delimiter"]
|
| 321 |
+
else:
|
| 322 |
+
raise ValueError(
|
| 323 |
+
"Expected keyword argument cat_pattern or " "cat_map or cat_file."
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
|
| 327 |
+
raise ValueError(
|
| 328 |
+
"Specify exactly one of: cat_pattern, " "cat_map, cat_file."
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
def _init(self):
|
| 332 |
+
self._f2c = defaultdict(set)
|
| 333 |
+
self._c2f = defaultdict(set)
|
| 334 |
+
|
| 335 |
+
if self._pattern is not None:
|
| 336 |
+
for file_id in self._fileids:
|
| 337 |
+
category = re.match(self._pattern, file_id).group(1)
|
| 338 |
+
self._add(file_id, category)
|
| 339 |
+
|
| 340 |
+
elif self._map is not None:
|
| 341 |
+
for (file_id, categories) in self._map.items():
|
| 342 |
+
for category in categories:
|
| 343 |
+
self._add(file_id, category)
|
| 344 |
+
|
| 345 |
+
elif self._file is not None:
|
| 346 |
+
with self.open(self._file) as f:
|
| 347 |
+
for line in f.readlines():
|
| 348 |
+
line = line.strip()
|
| 349 |
+
file_id, categories = line.split(self._delimiter, 1)
|
| 350 |
+
if file_id not in self.fileids():
|
| 351 |
+
raise ValueError(
|
| 352 |
+
"In category mapping file %s: %s "
|
| 353 |
+
"not found" % (self._file, file_id)
|
| 354 |
+
)
|
| 355 |
+
for category in categories.split(self._delimiter):
|
| 356 |
+
self._add(file_id, category)
|
| 357 |
+
|
| 358 |
+
def _add(self, file_id, category):
|
| 359 |
+
self._f2c[file_id].add(category)
|
| 360 |
+
self._c2f[category].add(file_id)
|
| 361 |
+
|
| 362 |
+
def categories(self, fileids=None):
|
| 363 |
+
"""
|
| 364 |
+
Return a list of the categories that are defined for this corpus,
|
| 365 |
+
or for the file(s) if it is given.
|
| 366 |
+
"""
|
| 367 |
+
if self._f2c is None:
|
| 368 |
+
self._init()
|
| 369 |
+
if fileids is None:
|
| 370 |
+
return sorted(self._c2f)
|
| 371 |
+
if isinstance(fileids, str):
|
| 372 |
+
fileids = [fileids]
|
| 373 |
+
return sorted(set.union(*(self._f2c[d] for d in fileids)))
|
| 374 |
+
|
| 375 |
+
def fileids(self, categories=None):
|
| 376 |
+
"""
|
| 377 |
+
Return a list of file identifiers for the files that make up
|
| 378 |
+
this corpus, or that make up the given category(s) if specified.
|
| 379 |
+
"""
|
| 380 |
+
if categories is None:
|
| 381 |
+
return super().fileids()
|
| 382 |
+
elif isinstance(categories, str):
|
| 383 |
+
if self._f2c is None:
|
| 384 |
+
self._init()
|
| 385 |
+
if categories in self._c2f:
|
| 386 |
+
return sorted(self._c2f[categories])
|
| 387 |
+
else:
|
| 388 |
+
raise ValueError("Category %s not found" % categories)
|
| 389 |
+
else:
|
| 390 |
+
if self._f2c is None:
|
| 391 |
+
self._init()
|
| 392 |
+
return sorted(set.union(*(self._c2f[c] for c in categories)))
|
| 393 |
+
|
| 394 |
+
def _resolve(self, fileids, categories):
|
| 395 |
+
if fileids is not None and categories is not None:
|
| 396 |
+
raise ValueError("Specify fileids or categories, not both")
|
| 397 |
+
if categories is not None:
|
| 398 |
+
return self.fileids(categories)
|
| 399 |
+
else:
|
| 400 |
+
return fileids
|
| 401 |
+
|
| 402 |
+
def raw(self, fileids=None, categories=None):
|
| 403 |
+
return super().raw(self._resolve(fileids, categories))
|
| 404 |
+
|
| 405 |
+
def words(self, fileids=None, categories=None):
|
| 406 |
+
return super().words(self._resolve(fileids, categories))
|
| 407 |
+
|
| 408 |
+
def sents(self, fileids=None, categories=None):
|
| 409 |
+
return super().sents(self._resolve(fileids, categories))
|
| 410 |
+
|
| 411 |
+
def paras(self, fileids=None, categories=None):
|
| 412 |
+
return super().paras(self._resolve(fileids, categories))
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
######################################################################
|
| 416 |
+
# { Treebank readers
|
| 417 |
+
######################################################################
|
| 418 |
+
|
| 419 |
+
# [xx] is it worth it to factor this out?
|
| 420 |
+
class SyntaxCorpusReader(CorpusReader):
|
| 421 |
+
"""
|
| 422 |
+
An abstract base class for reading corpora consisting of
|
| 423 |
+
syntactically parsed text. Subclasses should define:
|
| 424 |
+
|
| 425 |
+
- ``__init__``, which specifies the location of the corpus
|
| 426 |
+
and a method for detecting the sentence blocks in corpus files.
|
| 427 |
+
- ``_read_block``, which reads a block from the input stream.
|
| 428 |
+
- ``_word``, which takes a block and returns a list of list of words.
|
| 429 |
+
- ``_tag``, which takes a block and returns a list of list of tagged
|
| 430 |
+
words.
|
| 431 |
+
- ``_parse``, which takes a block and returns a list of parsed
|
| 432 |
+
sentences.
|
| 433 |
+
"""
|
| 434 |
+
|
| 435 |
+
def _parse(self, s):
|
| 436 |
+
raise NotImplementedError()
|
| 437 |
+
|
| 438 |
+
def _word(self, s):
|
| 439 |
+
raise NotImplementedError()
|
| 440 |
+
|
| 441 |
+
def _tag(self, s):
|
| 442 |
+
raise NotImplementedError()
|
| 443 |
+
|
| 444 |
+
def _read_block(self, stream):
|
| 445 |
+
raise NotImplementedError()
|
| 446 |
+
|
| 447 |
+
def parsed_sents(self, fileids=None):
|
| 448 |
+
reader = self._read_parsed_sent_block
|
| 449 |
+
return concat(
|
| 450 |
+
[
|
| 451 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 452 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 453 |
+
]
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
def tagged_sents(self, fileids=None, tagset=None):
|
| 457 |
+
def reader(stream):
|
| 458 |
+
return self._read_tagged_sent_block(stream, tagset)
|
| 459 |
+
|
| 460 |
+
return concat(
|
| 461 |
+
[
|
| 462 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 463 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 464 |
+
]
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
def sents(self, fileids=None):
|
| 468 |
+
reader = self._read_sent_block
|
| 469 |
+
return concat(
|
| 470 |
+
[
|
| 471 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 472 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 473 |
+
]
|
| 474 |
+
)
|
| 475 |
+
|
| 476 |
+
def tagged_words(self, fileids=None, tagset=None):
|
| 477 |
+
def reader(stream):
|
| 478 |
+
return self._read_tagged_word_block(stream, tagset)
|
| 479 |
+
|
| 480 |
+
return concat(
|
| 481 |
+
[
|
| 482 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 483 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 484 |
+
]
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
def words(self, fileids=None):
|
| 488 |
+
return concat(
|
| 489 |
+
[
|
| 490 |
+
StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
|
| 491 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 492 |
+
]
|
| 493 |
+
)
|
| 494 |
+
|
| 495 |
+
# ------------------------------------------------------------
|
| 496 |
+
# { Block Readers
|
| 497 |
+
|
| 498 |
+
def _read_word_block(self, stream):
|
| 499 |
+
return list(chain.from_iterable(self._read_sent_block(stream)))
|
| 500 |
+
|
| 501 |
+
def _read_tagged_word_block(self, stream, tagset=None):
|
| 502 |
+
return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))
|
| 503 |
+
|
| 504 |
+
def _read_sent_block(self, stream):
|
| 505 |
+
return list(filter(None, [self._word(t) for t in self._read_block(stream)]))
|
| 506 |
+
|
| 507 |
+
def _read_tagged_sent_block(self, stream, tagset=None):
|
| 508 |
+
return list(
|
| 509 |
+
filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
def _read_parsed_sent_block(self, stream):
|
| 513 |
+
return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))
|
| 514 |
+
|
| 515 |
+
# } End of Block Readers
|
| 516 |
+
# ------------------------------------------------------------
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: BCP-47 language tags
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2022 NLTK Project
|
| 4 |
+
# Author: Eric Kafe <kafe.eric@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from warnings import warn
|
| 10 |
+
from xml.etree import ElementTree as et
|
| 11 |
+
|
| 12 |
+
from nltk.corpus.reader import CorpusReader
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BCP47CorpusReader(CorpusReader):
|
| 16 |
+
"""
|
| 17 |
+
Parse BCP-47 composite language tags
|
| 18 |
+
|
| 19 |
+
Supports all the main subtags, and the 'u-sd' extension:
|
| 20 |
+
|
| 21 |
+
>>> from nltk.corpus import bcp47
|
| 22 |
+
>>> bcp47.name('oc-gascon-u-sd-fr64')
|
| 23 |
+
'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'
|
| 24 |
+
|
| 25 |
+
Can load a conversion table to Wikidata Q-codes:
|
| 26 |
+
>>> bcp47.load_wiki_q()
|
| 27 |
+
>>> bcp47.wiki_q['en-GI-spanglis']
|
| 28 |
+
'Q79388'
|
| 29 |
+
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(self, root, fileids):
|
| 33 |
+
"""Read the BCP-47 database"""
|
| 34 |
+
super().__init__(root, fileids)
|
| 35 |
+
self.langcode = {}
|
| 36 |
+
with self.open("iana/language-subtag-registry.txt") as fp:
|
| 37 |
+
self.db = self.data_dict(fp.read().split("%%\n"))
|
| 38 |
+
with self.open("cldr/common-subdivisions-en.xml") as fp:
|
| 39 |
+
self.subdiv = self.subdiv_dict(
|
| 40 |
+
et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
|
| 41 |
+
)
|
| 42 |
+
self.morphology()
|
| 43 |
+
|
| 44 |
+
def load_wiki_q(self):
|
| 45 |
+
"""Load conversion table to Wikidata Q-codes (only if needed)"""
|
| 46 |
+
with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
|
| 47 |
+
self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
|
| 48 |
+
|
| 49 |
+
def wiki_dict(self, lines):
|
| 50 |
+
"""Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
|
| 51 |
+
return {
|
| 52 |
+
pair[1]: pair[0].split("/")[-1]
|
| 53 |
+
for pair in [line.strip().split("\t") for line in lines]
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
def subdiv_dict(self, subdivs):
|
| 57 |
+
"""Convert the CLDR subdivisions list to a dictionary"""
|
| 58 |
+
return {sub.attrib["type"]: sub.text for sub in subdivs}
|
| 59 |
+
|
| 60 |
+
def morphology(self):
|
| 61 |
+
self.casing = {
|
| 62 |
+
"language": str.lower,
|
| 63 |
+
"extlang": str.lower,
|
| 64 |
+
"script": str.title,
|
| 65 |
+
"region": str.upper,
|
| 66 |
+
"variant": str.lower,
|
| 67 |
+
}
|
| 68 |
+
dig = "[0-9]"
|
| 69 |
+
low = "[a-z]"
|
| 70 |
+
up = "[A-Z]"
|
| 71 |
+
alnum = "[a-zA-Z0-9]"
|
| 72 |
+
self.format = {
|
| 73 |
+
"language": re.compile(f"{low*3}?"),
|
| 74 |
+
"extlang": re.compile(f"{low*3}"),
|
| 75 |
+
"script": re.compile(f"{up}{low*3}"),
|
| 76 |
+
"region": re.compile(f"({up*2})|({dig*3})"),
|
| 77 |
+
"variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
|
| 78 |
+
"singleton": re.compile(f"{low}"),
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
def data_dict(self, records):
|
| 82 |
+
"""Convert the BCP-47 language subtag registry to a dictionary"""
|
| 83 |
+
self.version = records[0].replace("File-Date:", "").strip()
|
| 84 |
+
dic = {}
|
| 85 |
+
dic["deprecated"] = {}
|
| 86 |
+
for label in [
|
| 87 |
+
"language",
|
| 88 |
+
"extlang",
|
| 89 |
+
"script",
|
| 90 |
+
"region",
|
| 91 |
+
"variant",
|
| 92 |
+
"redundant",
|
| 93 |
+
"grandfathered",
|
| 94 |
+
]:
|
| 95 |
+
dic["deprecated"][label] = {}
|
| 96 |
+
for record in records[1:]:
|
| 97 |
+
fields = [field.split(": ") for field in record.strip().split("\n")]
|
| 98 |
+
typ = fields[0][1]
|
| 99 |
+
tag = fields[1][1]
|
| 100 |
+
if typ not in dic:
|
| 101 |
+
dic[typ] = {}
|
| 102 |
+
subfields = {}
|
| 103 |
+
for field in fields[2:]:
|
| 104 |
+
if len(field) == 2:
|
| 105 |
+
[key, val] = field
|
| 106 |
+
if key not in subfields:
|
| 107 |
+
subfields[key] = [val]
|
| 108 |
+
else: # multiple value
|
| 109 |
+
subfields[key].append(val)
|
| 110 |
+
else: # multiline field
|
| 111 |
+
subfields[key][-1] += " " + field[0].strip()
|
| 112 |
+
if (
|
| 113 |
+
"Deprecated" not in record
|
| 114 |
+
and typ == "language"
|
| 115 |
+
and key == "Description"
|
| 116 |
+
):
|
| 117 |
+
self.langcode[subfields[key][-1]] = tag
|
| 118 |
+
for key in subfields:
|
| 119 |
+
if len(subfields[key]) == 1: # single value
|
| 120 |
+
subfields[key] = subfields[key][0]
|
| 121 |
+
if "Deprecated" in record:
|
| 122 |
+
dic["deprecated"][typ][tag] = subfields
|
| 123 |
+
else:
|
| 124 |
+
dic[typ][tag] = subfields
|
| 125 |
+
return dic
|
| 126 |
+
|
| 127 |
+
def val2str(self, val):
|
| 128 |
+
"""Return only first value"""
|
| 129 |
+
if type(val) == list:
|
| 130 |
+
# val = "/".join(val) # Concatenate all values
|
| 131 |
+
val = val[0]
|
| 132 |
+
return val
|
| 133 |
+
|
| 134 |
+
def lang2str(self, lg_record):
|
| 135 |
+
"""Concatenate subtag values"""
|
| 136 |
+
name = f"{lg_record['language']}"
|
| 137 |
+
for label in ["extlang", "script", "region", "variant", "extension"]:
|
| 138 |
+
if label in lg_record:
|
| 139 |
+
name += f": {lg_record[label]}"
|
| 140 |
+
return name
|
| 141 |
+
|
| 142 |
+
def parse_tag(self, tag):
|
| 143 |
+
"""Convert a BCP-47 tag to a dictionary of labelled subtags"""
|
| 144 |
+
subtags = tag.split("-")
|
| 145 |
+
lang = {}
|
| 146 |
+
labels = ["language", "extlang", "script", "region", "variant", "variant"]
|
| 147 |
+
while subtags and labels:
|
| 148 |
+
subtag = subtags.pop(0)
|
| 149 |
+
found = False
|
| 150 |
+
while labels:
|
| 151 |
+
label = labels.pop(0)
|
| 152 |
+
subtag = self.casing[label](subtag)
|
| 153 |
+
if self.format[label].fullmatch(subtag):
|
| 154 |
+
if subtag in self.db[label]:
|
| 155 |
+
found = True
|
| 156 |
+
valstr = self.val2str(self.db[label][subtag]["Description"])
|
| 157 |
+
if label == "variant" and label in lang:
|
| 158 |
+
lang[label] += ": " + valstr
|
| 159 |
+
else:
|
| 160 |
+
lang[label] = valstr
|
| 161 |
+
break
|
| 162 |
+
elif subtag in self.db["deprecated"][label]:
|
| 163 |
+
found = True
|
| 164 |
+
note = f"The {subtag!r} {label} code is deprecated"
|
| 165 |
+
if "Preferred-Value" in self.db["deprecated"][label][subtag]:
|
| 166 |
+
prefer = self.db["deprecated"][label][subtag][
|
| 167 |
+
"Preferred-Value"
|
| 168 |
+
]
|
| 169 |
+
note += f"', prefer '{self.val2str(prefer)}'"
|
| 170 |
+
lang[label] = self.val2str(
|
| 171 |
+
self.db["deprecated"][label][subtag]["Description"]
|
| 172 |
+
)
|
| 173 |
+
warn(note)
|
| 174 |
+
break
|
| 175 |
+
if not found:
|
| 176 |
+
if subtag == "u" and subtags[0] == "sd": # CLDR regional subdivisions
|
| 177 |
+
sd = subtags[1]
|
| 178 |
+
if sd in self.subdiv:
|
| 179 |
+
ext = self.subdiv[sd]
|
| 180 |
+
else:
|
| 181 |
+
ext = f"<Unknown subdivision: {ext}>"
|
| 182 |
+
else: # other extension subtags are not supported yet
|
| 183 |
+
ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
|
| 184 |
+
if not self.format["singleton"].fullmatch(subtag):
|
| 185 |
+
ext = f"<Invalid extension: {ext}>"
|
| 186 |
+
warn(ext)
|
| 187 |
+
lang["extension"] = ext
|
| 188 |
+
subtags = []
|
| 189 |
+
return lang
|
| 190 |
+
|
| 191 |
+
def name(self, tag):
|
| 192 |
+
"""
|
| 193 |
+
Convert a BCP-47 tag to a colon-separated string of subtag names
|
| 194 |
+
|
| 195 |
+
>>> from nltk.corpus import bcp47
|
| 196 |
+
>>> bcp47.name('ca-Latn-ES-valencia')
|
| 197 |
+
'Catalan: Latin: Spain: Valencian'
|
| 198 |
+
|
| 199 |
+
"""
|
| 200 |
+
for label in ["redundant", "grandfathered"]:
|
| 201 |
+
val = None
|
| 202 |
+
if tag in self.db[label]:
|
| 203 |
+
val = f"{self.db[label][tag]['Description']}"
|
| 204 |
+
note = f"The {tag!r} code is {label}"
|
| 205 |
+
elif tag in self.db["deprecated"][label]:
|
| 206 |
+
val = f"{self.db['deprecated'][label][tag]['Description']}"
|
| 207 |
+
note = f"The {tag!r} code is {label} and deprecated"
|
| 208 |
+
if "Preferred-Value" in self.db["deprecated"][label][tag]:
|
| 209 |
+
prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
|
| 210 |
+
note += f", prefer {self.val2str(prefer)!r}"
|
| 211 |
+
if val:
|
| 212 |
+
warn(note)
|
| 213 |
+
return val
|
| 214 |
+
try:
|
| 215 |
+
return self.lang2str(self.parse_tag(tag))
|
| 216 |
+
except:
|
| 217 |
+
warn(f"Tag {tag!r} was not recognized")
|
| 218 |
+
return None
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Plaintext Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""Corpus reader for the XML version of the British National Corpus."""
|
| 9 |
+
|
| 10 |
+
from nltk.corpus.reader.util import concat
|
| 11 |
+
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BNCCorpusReader(XMLCorpusReader):
|
| 15 |
+
r"""Corpus reader for the XML version of the British National Corpus.
|
| 16 |
+
|
| 17 |
+
For access to the complete XML data structure, use the ``xml()``
|
| 18 |
+
method. For access to simple word lists and tagged word lists, use
|
| 19 |
+
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
|
| 20 |
+
|
| 21 |
+
You can obtain the full version of the BNC corpus at
|
| 22 |
+
https://www.ota.ox.ac.uk/desc/2554
|
| 23 |
+
|
| 24 |
+
If you extracted the archive to a directory called `BNC`, then you can
|
| 25 |
+
instantiate the reader as::
|
| 26 |
+
|
| 27 |
+
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
|
| 28 |
+
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
def __init__(self, root, fileids, lazy=True):
|
| 32 |
+
XMLCorpusReader.__init__(self, root, fileids)
|
| 33 |
+
self._lazy = lazy
|
| 34 |
+
|
| 35 |
+
def words(self, fileids=None, strip_space=True, stem=False):
|
| 36 |
+
"""
|
| 37 |
+
:return: the given file(s) as a list of words
|
| 38 |
+
and punctuation symbols.
|
| 39 |
+
:rtype: list(str)
|
| 40 |
+
|
| 41 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 42 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 43 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 44 |
+
"""
|
| 45 |
+
return self._views(fileids, False, None, strip_space, stem)
|
| 46 |
+
|
| 47 |
+
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
|
| 48 |
+
"""
|
| 49 |
+
:return: the given file(s) as a list of tagged
|
| 50 |
+
words and punctuation symbols, encoded as tuples
|
| 51 |
+
``(word,tag)``.
|
| 52 |
+
:rtype: list(tuple(str,str))
|
| 53 |
+
|
| 54 |
+
:param c5: If true, then the tags used will be the more detailed
|
| 55 |
+
c5 tags. Otherwise, the simplified tags will be used.
|
| 56 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 57 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 58 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 59 |
+
"""
|
| 60 |
+
tag = "c5" if c5 else "pos"
|
| 61 |
+
return self._views(fileids, False, tag, strip_space, stem)
|
| 62 |
+
|
| 63 |
+
def sents(self, fileids=None, strip_space=True, stem=False):
|
| 64 |
+
"""
|
| 65 |
+
:return: the given file(s) as a list of
|
| 66 |
+
sentences or utterances, each encoded as a list of word
|
| 67 |
+
strings.
|
| 68 |
+
:rtype: list(list(str))
|
| 69 |
+
|
| 70 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 71 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 72 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 73 |
+
"""
|
| 74 |
+
return self._views(fileids, True, None, strip_space, stem)
|
| 75 |
+
|
| 76 |
+
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
|
| 77 |
+
"""
|
| 78 |
+
:return: the given file(s) as a list of
|
| 79 |
+
sentences, each encoded as a list of ``(word,tag)`` tuples.
|
| 80 |
+
:rtype: list(list(tuple(str,str)))
|
| 81 |
+
|
| 82 |
+
:param c5: If true, then the tags used will be the more detailed
|
| 83 |
+
c5 tags. Otherwise, the simplified tags will be used.
|
| 84 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 85 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 86 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 87 |
+
"""
|
| 88 |
+
tag = "c5" if c5 else "pos"
|
| 89 |
+
return self._views(
|
| 90 |
+
fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
|
| 94 |
+
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
|
| 95 |
+
f = BNCWordView if self._lazy else self._words
|
| 96 |
+
return concat(
|
| 97 |
+
[
|
| 98 |
+
f(fileid, sent, tag, strip_space, stem)
|
| 99 |
+
for fileid in self.abspaths(fileids)
|
| 100 |
+
]
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
|
| 104 |
+
"""
|
| 105 |
+
Helper used to implement the view methods -- returns a list of
|
| 106 |
+
words or a list of sentences, optionally tagged.
|
| 107 |
+
|
| 108 |
+
:param fileid: The name of the underlying file.
|
| 109 |
+
:param bracket_sent: If true, include sentence bracketing.
|
| 110 |
+
:param tag: The name of the tagset to use, or None for no tags.
|
| 111 |
+
:param strip_space: If true, strip spaces from word tokens.
|
| 112 |
+
:param stem: If true, then substitute stems for words.
|
| 113 |
+
"""
|
| 114 |
+
result = []
|
| 115 |
+
|
| 116 |
+
xmldoc = ElementTree.parse(fileid).getroot()
|
| 117 |
+
for xmlsent in xmldoc.findall(".//s"):
|
| 118 |
+
sent = []
|
| 119 |
+
for xmlword in _all_xmlwords_in(xmlsent):
|
| 120 |
+
word = xmlword.text
|
| 121 |
+
if not word:
|
| 122 |
+
word = "" # fixes issue 337?
|
| 123 |
+
if strip_space or stem:
|
| 124 |
+
word = word.strip()
|
| 125 |
+
if stem:
|
| 126 |
+
word = xmlword.get("hw", word)
|
| 127 |
+
if tag == "c5":
|
| 128 |
+
word = (word, xmlword.get("c5"))
|
| 129 |
+
elif tag == "pos":
|
| 130 |
+
word = (word, xmlword.get("pos", xmlword.get("c5")))
|
| 131 |
+
sent.append(word)
|
| 132 |
+
if bracket_sent:
|
| 133 |
+
result.append(BNCSentence(xmlsent.attrib["n"], sent))
|
| 134 |
+
else:
|
| 135 |
+
result.extend(sent)
|
| 136 |
+
|
| 137 |
+
assert None not in result
|
| 138 |
+
return result
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _all_xmlwords_in(elt, result=None):
|
| 142 |
+
if result is None:
|
| 143 |
+
result = []
|
| 144 |
+
for child in elt:
|
| 145 |
+
if child.tag in ("c", "w"):
|
| 146 |
+
result.append(child)
|
| 147 |
+
else:
|
| 148 |
+
_all_xmlwords_in(child, result)
|
| 149 |
+
return result
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class BNCSentence(list):
|
| 153 |
+
"""
|
| 154 |
+
A list of words, augmented by an attribute ``num`` used to record
|
| 155 |
+
the sentence identifier (the ``n`` attribute from the XML).
|
| 156 |
+
"""
|
| 157 |
+
|
| 158 |
+
def __init__(self, num, items):
|
| 159 |
+
self.num = num
|
| 160 |
+
list.__init__(self, items)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Besides streaming words/sentences, the constructor eagerly reads the
    TEI header of the file and exposes document metadata via the
    ``title``, ``author``, ``editor`` and ``resps`` attributes.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Select whole <s> elements when sentence bracketing is requested;
        # otherwise select individual word (<w>) / punctuation (<c>)
        # elements, which may be nested one level inside other elements.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  This consumes the stream up to the end
        # of the <teiHeader> element and fills in the metadata attributes.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context, since the header read above advanced it.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Set up some metadata!  Multiple elements of the same kind are
        # joined with newlines.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch on the mode chosen at construction time: whole
        # sentences, or individual word/punctuation tokens.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # elt.text may be None for empty elements; normalize to "".
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # The "hw" attribute carries the headword (stem); fall back to
            # the surface form when it is absent.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Simplified POS tag; fall back to the C5 tag when missing.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            # These wrapper elements (multiword, highlight, correction,
            # truncation) contain word elements one level down.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        # "n" is the BNC sentence number.
        return BNCSentence(elt.attrib["n"], sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Penn Treebank Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
from nltk.corpus.reader.api import *
|
| 15 |
+
from nltk.corpus.reader.util import *
|
| 16 |
+
from nltk.tag import map_tag
|
| 17 |
+
from nltk.tree import Tree
|
| 18 |
+
|
| 19 |
+
# we use [^\s()]+ instead of \S+? to avoid matching ()
|
| 20 |
+
# (position tag word) leaves — used by AlpinoCorpusReader to restore order.
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag word) leaves.
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# word part of a (tag word) leaf.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# a tree whose top-level node label is empty: "( (".
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block of parse trees, using the configured strategy."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # Raise instead of `assert 0` so the check survives `python -O`.
            raise ValueError("bad block type %r" % (self._detect_blocks,))

    def _normalize(self, t):
        """Canonicalize a raw tree string before parsing."""
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one tree string, attempting recovery on malformed input."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: the original called ``Tree(...)`` with a
                        # single string argument, which raises TypeError
                        # (uncaught here), so this recovery path could never
                        # succeed; ``Tree.fromstring`` is the parser.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the (word, tag) pairs of one tree string, optionally
        mapped into another tagset."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the words of one tree string."""
        return WORD.findall(self._normalize(t))
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    # Each accessor below first maps the (fileids, categories) pair onto a
    # plain fileid list, then delegates to the BracketParseCorpusReader
    # implementation of the same name.

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_words(selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_sents(selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_paras(selected, tagset)

    def parsed_words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_words(selected)

    def parsed_sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_sents(selected)

    def parsed_paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_paras(selected)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding of the corpus file (Latin-1 by default).
        :param tagset: Tagset name used for normalizing POS tags; see
            ``BracketParseCorpusReader``.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUGFIX: the original pattern read ``<node. *?begin`` (dot
            # followed by a space), which only matched when ``begin`` was
            # effectively the first attribute of the node; this mirrors
            # the ``<node .*?`` anchoring of the unordered branch below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs restored to true sentence order."""
        # Collect (position, word, tag) triples, then sort on position so
        # punctuation displaced in the xml tree comes back in order.
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Categorized Sentences Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader structured for corpora that contain one instance on each row.
|
| 10 |
+
This CorpusReader is specifically used for the Subjectivity Dataset and the
|
| 11 |
+
Sentence Polarity Dataset.
|
| 12 |
+
|
| 13 |
+
- Subjectivity Dataset information -
|
| 14 |
+
|
| 15 |
+
Authors: Bo Pang and Lillian Lee.
|
| 16 |
+
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
| 17 |
+
|
| 18 |
+
Distributed with permission.
|
| 19 |
+
|
| 20 |
+
Related papers:
|
| 21 |
+
|
| 22 |
+
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
|
| 23 |
+
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
|
| 24 |
+
2004.
|
| 25 |
+
|
| 26 |
+
- Sentence Polarity Dataset information -
|
| 27 |
+
|
| 28 |
+
Authors: Bo Pang and Lillian Lee.
|
| 29 |
+
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
| 30 |
+
|
| 31 |
+
Related papers:
|
| 32 |
+
|
| 33 |
+
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
|
| 34 |
+
sentiment categorization with respect to rating scales". Proceedings of the
|
| 35 |
+
ACL, 2005.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
from nltk.corpus.reader.api import *
|
| 39 |
+
from nltk.tokenize import *
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _selected_fileids(self, fileids, categories):
        """Map a (fileids, categories) pair onto a plain list of fileids.

        Shared by ``sents()`` and ``words()`` (the logic was duplicated)."""
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._selected_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._selected_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # BUGFIX(perf): the original `continue`d here; after EOF,
                # readline() keeps returning "", so the loop just spun
                # through its remaining iterations.  Stop the block instead
                # (output is identical).
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 3 |
+
# Author: Masato Hagiwara <hagisan@gmail.com>
|
| 4 |
+
# URL: <https://www.nltk.org/>
|
| 5 |
+
# For license information, see LICENSE.TXT
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
from nltk.corpus.reader import util
|
| 10 |
+
from nltk.corpus.reader.api import *
|
| 11 |
+
from nltk.corpus.reader.util import *
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ChasenCorpusReader(CorpusReader):
    """Reader for ChaSen-format morphologically annotated corpora
    (e.g. the JEITA corpus)."""

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: The root directory of the corpus.
        :param fileids: A list or regexp specifying the fileids.
        :param encoding: The file encoding.
        :param sent_splitter: optional predicate on a (word, annotation)
            pair marking an extra sentence boundary; passed through to
            ``ChasenCorpusView``.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per file and concatenate them lazily.

        The six public accessors below differ only in these three flags, so
        the construction is factored out here."""
        return concat(
            [
                ChasenCorpusView(
                    fileid, enc, tagged, group_by_sent, group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Flat list of words."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Flat list of (word, annotation) pairs."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """List of sentences, each a list of words."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """List of sentences, each a list of (word, annotation) pairs."""
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """List of paragraphs, each a list of sentences of words."""
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """List of paragraphs, each a list of tagged sentences."""
        return self._views(fileids, True, True, True)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # tagged: keep (word, annotation) pairs; otherwise yield bare words.
        self._tagged = tagged
        # group_by_sent / group_by_para: control nesting of the result.
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate on a (word, annotation) pair that marks a
        # sentence boundary in addition to the "EOS" lines.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # Paragraphs are delimited by lines consisting of "EOS".
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):

            para = []

            sent = []
            for line in para_str.splitlines():

                _eos = line.strip() == "EOS"
                _cells = line.split("\t")
                # First cell is the surface form; the remaining tab-separated
                # ChaSen fields are kept joined as a single annotation string.
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)

                # Close the current sentence on an EOS line, or when the
                # user-supplied splitter fires on this token.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []

            # Flush a trailing sentence that was not terminated by EOS.
            if len(sent) > 0:
                if not self._tagged:
                    sent = [w for (w, t) in sent]

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def demo():
    """Print a small sample of words and tagged sentences from the JEITA corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus = LazyCorpusLoader(
        "jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8"
    )

    # A slash-separated run of surface forms.
    print("/".join(corpus.words()[22100:22140]))

    # A few sentences rendered as word/POS lines, separated by EOS markers.
    rendered = []
    for sent in corpus.tagged_sents()[2170:2173]:
        lines = ["{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent]
        rendered.append("\n".join(lines))
    print("\nEOS\n".join(rendered))
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test():
    """Sanity check: annotations read from the JEITA corpus decode to ``str``."""
    from nltk.corpus.util import LazyCorpusLoader

    corpus = LazyCorpusLoader(
        "jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8"
    )

    first_word, first_annotation = corpus.tagged_words()[0]
    assert isinstance(first_annotation, str)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# Run the demo and the sanity test when executed as a script.
if __name__ == "__main__":
    demo()
    test()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PanLex Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: David Kamholz <kamholz@panlex.org>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
|
| 10 |
+
as an SQLite database. See the README.txt in the panlex_lite corpus directory
|
| 11 |
+
for more information on PanLex Lite.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sqlite3
|
| 16 |
+
|
| 17 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PanLexLiteCorpusReader(CorpusReader):
|
| 21 |
+
MEANING_Q = """
|
| 22 |
+
SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
|
| 23 |
+
FROM dnx
|
| 24 |
+
JOIN ex ON (ex.ex = dnx.ex)
|
| 25 |
+
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
|
| 26 |
+
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
|
| 27 |
+
WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
|
| 28 |
+
ORDER BY dnx2.uq DESC
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
TRANSLATION_Q = """
|
| 32 |
+
SELECT s.tt, sum(s.uq) AS trq FROM (
|
| 33 |
+
SELECT ex2.tt, max(dnx.uq) AS uq
|
| 34 |
+
FROM dnx
|
| 35 |
+
JOIN ex ON (ex.ex = dnx.ex)
|
| 36 |
+
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
|
| 37 |
+
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
|
| 38 |
+
WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
|
| 39 |
+
GROUP BY ex2.tt, dnx.ui
|
| 40 |
+
) s
|
| 41 |
+
GROUP BY s.tt
|
| 42 |
+
ORDER BY trq DESC, s.tt
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(self, root):
|
| 46 |
+
self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
|
| 47 |
+
|
| 48 |
+
self._uid_lv = {}
|
| 49 |
+
self._lv_uid = {}
|
| 50 |
+
|
| 51 |
+
for row in self._c.execute("SELECT uid, lv FROM lv"):
|
| 52 |
+
self._uid_lv[row[0]] = row[1]
|
| 53 |
+
self._lv_uid[row[1]] = row[0]
|
| 54 |
+
|
| 55 |
+
def language_varieties(self, lc=None):
|
| 56 |
+
"""
|
| 57 |
+
Return a list of PanLex language varieties.
|
| 58 |
+
|
| 59 |
+
:param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
|
| 60 |
+
by this code. If unspecified, all varieties are returned.
|
| 61 |
+
:return: the specified language varieties as a list of tuples. The first
|
| 62 |
+
element is the language variety's seven-character uniform identifier,
|
| 63 |
+
and the second element is its default name.
|
| 64 |
+
:rtype: list(tuple)
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
if lc is None:
|
| 68 |
+
return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
|
| 69 |
+
else:
|
| 70 |
+
return self._c.execute(
|
| 71 |
+
"SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
|
| 72 |
+
).fetchall()
|
| 73 |
+
|
| 74 |
+
def meanings(self, expr_uid, expr_tt):
|
| 75 |
+
"""
|
| 76 |
+
Return a list of meanings for an expression.
|
| 77 |
+
|
| 78 |
+
:param expr_uid: the expression's language variety, as a seven-character
|
| 79 |
+
uniform identifier.
|
| 80 |
+
:param expr_tt: the expression's text.
|
| 81 |
+
:return: a list of Meaning objects.
|
| 82 |
+
:rtype: list(Meaning)
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
expr_lv = self._uid_lv[expr_uid]
|
| 86 |
+
|
| 87 |
+
mn_info = {}
|
| 88 |
+
|
| 89 |
+
for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
|
| 90 |
+
mn = i[0]
|
| 91 |
+
uid = self._lv_uid[i[5]]
|
| 92 |
+
|
| 93 |
+
if not mn in mn_info:
|
| 94 |
+
mn_info[mn] = {
|
| 95 |
+
"uq": i[1],
|
| 96 |
+
"ap": i[2],
|
| 97 |
+
"ui": i[3],
|
| 98 |
+
"ex": {expr_uid: [expr_tt]},
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
if not uid in mn_info[mn]["ex"]:
|
| 102 |
+
mn_info[mn]["ex"][uid] = []
|
| 103 |
+
|
| 104 |
+
mn_info[mn]["ex"][uid].append(i[4])
|
| 105 |
+
|
| 106 |
+
return [Meaning(mn, mn_info[mn]) for mn in mn_info]
|
| 107 |
+
|
| 108 |
+
def translations(self, from_uid, from_tt, to_uid):
|
| 109 |
+
"""
|
| 110 |
+
Return a list of translations for an expression into a single language
|
| 111 |
+
variety.
|
| 112 |
+
|
| 113 |
+
:param from_uid: the source expression's language variety, as a
|
| 114 |
+
seven-character uniform identifier.
|
| 115 |
+
:param from_tt: the source expression's text.
|
| 116 |
+
:param to_uid: the target language variety, as a seven-character
|
| 117 |
+
uniform identifier.
|
| 118 |
+
:return: a list of translation tuples. The first element is the expression
|
| 119 |
+
text and the second element is the translation quality.
|
| 120 |
+
:rtype: list(tuple)
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
from_lv = self._uid_lv[from_uid]
|
| 124 |
+
to_lv = self._uid_lv[to_uid]
|
| 125 |
+
|
| 126 |
+
return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class Meaning(dict):
|
| 130 |
+
"""
|
| 131 |
+
Represents a single PanLex meaning. A meaning is a translation set derived
|
| 132 |
+
from a single source.
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
+
def __init__(self, mn, attr):
|
| 136 |
+
super().__init__(**attr)
|
| 137 |
+
self["mn"] = mn
|
| 138 |
+
|
| 139 |
+
def id(self):
|
| 140 |
+
"""
|
| 141 |
+
:return: the meaning's id.
|
| 142 |
+
:rtype: int
|
| 143 |
+
"""
|
| 144 |
+
return self["mn"]
|
| 145 |
+
|
| 146 |
+
def quality(self):
|
| 147 |
+
"""
|
| 148 |
+
:return: the meaning's source's quality (0=worst, 9=best).
|
| 149 |
+
:rtype: int
|
| 150 |
+
"""
|
| 151 |
+
return self["uq"]
|
| 152 |
+
|
| 153 |
+
def source(self):
|
| 154 |
+
"""
|
| 155 |
+
:return: the meaning's source id.
|
| 156 |
+
:rtype: int
|
| 157 |
+
"""
|
| 158 |
+
return self["ap"]
|
| 159 |
+
|
| 160 |
+
def source_group(self):
|
| 161 |
+
"""
|
| 162 |
+
:return: the meaning's source group id.
|
| 163 |
+
:rtype: int
|
| 164 |
+
"""
|
| 165 |
+
return self["ui"]
|
| 166 |
+
|
| 167 |
+
def expressions(self):
|
| 168 |
+
"""
|
| 169 |
+
:return: the meaning's expressions as a dictionary whose keys are language
|
| 170 |
+
variety uniform identifiers and whose values are lists of expression
|
| 171 |
+
texts.
|
| 172 |
+
:rtype: dict
|
| 173 |
+
"""
|
| 174 |
+
return self["ex"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Word List Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from collections import defaultdict, namedtuple
|
| 12 |
+
|
| 13 |
+
from nltk.corpus.reader.api import *
|
| 14 |
+
from nltk.corpus.reader.util import *
|
| 15 |
+
from nltk.corpus.reader.wordlist import WordListCorpusReader
|
| 16 |
+
from nltk.tokenize import line_tokenize
|
| 17 |
+
|
| 18 |
+
PanlexLanguage = namedtuple(
|
| 19 |
+
"PanlexLanguage",
|
| 20 |
+
[
|
| 21 |
+
"panlex_uid", # (1) PanLex UID
|
| 22 |
+
"iso639", # (2) ISO 639 language code
|
| 23 |
+
"iso639_type", # (3) ISO 639 language type, see README
|
| 24 |
+
"script", # (4) normal scripts of expressions
|
| 25 |
+
"name", # (5) PanLex default name
|
| 26 |
+
"langvar_uid", # (6) UID of the language variety in which the default name is an expression
|
| 27 |
+
],
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class PanlexSwadeshCorpusReader(WordListCorpusReader):
|
| 32 |
+
"""
|
| 33 |
+
This is a class to read the PanLex Swadesh list from
|
| 34 |
+
|
| 35 |
+
David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
|
| 36 |
+
PanLex: Building a Resource for Panlingual Lexical Translation.
|
| 37 |
+
In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
|
| 38 |
+
|
| 39 |
+
License: CC0 1.0 Universal
|
| 40 |
+
https://creativecommons.org/publicdomain/zero/1.0/legalcode
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, *args, **kwargs):
|
| 44 |
+
super().__init__(*args, **kwargs)
|
| 45 |
+
# Find the swadesh size using the fileids' path.
|
| 46 |
+
self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
|
| 47 |
+
self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
|
| 48 |
+
self._macro_langauges = self.get_macrolanguages()
|
| 49 |
+
|
| 50 |
+
def license(self):
|
| 51 |
+
return "CC0 1.0 Universal"
|
| 52 |
+
|
| 53 |
+
def language_codes(self):
|
| 54 |
+
return self._languages.keys()
|
| 55 |
+
|
| 56 |
+
def get_languages(self):
|
| 57 |
+
for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
|
| 58 |
+
if not line.strip(): # Skip empty lines.
|
| 59 |
+
continue
|
| 60 |
+
yield PanlexLanguage(*line.strip().split("\t"))
|
| 61 |
+
|
| 62 |
+
def get_macrolanguages(self):
|
| 63 |
+
macro_langauges = defaultdict(list)
|
| 64 |
+
for lang in self._languages.values():
|
| 65 |
+
macro_langauges[lang.iso639].append(lang.panlex_uid)
|
| 66 |
+
return macro_langauges
|
| 67 |
+
|
| 68 |
+
def words_by_lang(self, lang_code):
|
| 69 |
+
"""
|
| 70 |
+
:return: a list of list(str)
|
| 71 |
+
"""
|
| 72 |
+
fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
|
| 73 |
+
return [concept.split("\t") for concept in self.words(fileid)]
|
| 74 |
+
|
| 75 |
+
def words_by_iso639(self, iso63_code):
|
| 76 |
+
"""
|
| 77 |
+
:return: a list of list(str)
|
| 78 |
+
"""
|
| 79 |
+
fileids = [
|
| 80 |
+
f"swadesh{self.swadesh_size}/{lang_code}.txt"
|
| 81 |
+
for lang_code in self._macro_langauges[iso63_code]
|
| 82 |
+
]
|
| 83 |
+
return [
|
| 84 |
+
concept.split("\t") for fileid in fileids for concept in self.words(fileid)
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
def entries(self, fileids=None):
|
| 88 |
+
"""
|
| 89 |
+
:return: a tuple of words for the specified fileids.
|
| 90 |
+
"""
|
| 91 |
+
if not fileids:
|
| 92 |
+
fileids = self.fileids()
|
| 93 |
+
|
| 94 |
+
wordlists = [self.words(f) for f in fileids]
|
| 95 |
+
return list(zip(*wordlists))
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit:
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from nltk.corpus.reader.api import *
|
| 9 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
| 10 |
+
|
| 11 |
+
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
|
| 12 |
+
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
|
| 13 |
+
|
| 14 |
+
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
|
| 15 |
+
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
|
| 16 |
+
|
| 17 |
+
TYPE = re.compile(r'type="(.*?)"')
|
| 18 |
+
ANA = re.compile(r'ana="(.*?)"')
|
| 19 |
+
|
| 20 |
+
TEXTID = re.compile(r'text id="(.*?)"')
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TEICorpusView(StreamBackedCorpusView):
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
corpus_file,
|
| 27 |
+
tagged,
|
| 28 |
+
group_by_sent,
|
| 29 |
+
group_by_para,
|
| 30 |
+
tagset=None,
|
| 31 |
+
head_len=0,
|
| 32 |
+
textids=None,
|
| 33 |
+
):
|
| 34 |
+
|
| 35 |
+
self._tagged = tagged
|
| 36 |
+
self._textids = textids
|
| 37 |
+
|
| 38 |
+
self._group_by_sent = group_by_sent
|
| 39 |
+
self._group_by_para = group_by_para
|
| 40 |
+
# WARNING -- skip header
|
| 41 |
+
StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)
|
| 42 |
+
|
| 43 |
+
_pagesize = 4096
|
| 44 |
+
|
| 45 |
+
def read_block(self, stream):
|
| 46 |
+
block = stream.readlines(self._pagesize)
|
| 47 |
+
block = concat(block)
|
| 48 |
+
while (block.count("<text id") > block.count("</text>")) or block.count(
|
| 49 |
+
"<text id"
|
| 50 |
+
) == 0:
|
| 51 |
+
tmp = stream.readline()
|
| 52 |
+
if len(tmp) <= 0:
|
| 53 |
+
break
|
| 54 |
+
block += tmp
|
| 55 |
+
|
| 56 |
+
block = block.replace("\n", "")
|
| 57 |
+
|
| 58 |
+
textids = TEXTID.findall(block)
|
| 59 |
+
if self._textids:
|
| 60 |
+
for tid in textids:
|
| 61 |
+
if tid not in self._textids:
|
| 62 |
+
beg = block.find(tid) - 1
|
| 63 |
+
end = block[beg:].find("</text>") + len("</text>")
|
| 64 |
+
block = block[:beg] + block[beg + end :]
|
| 65 |
+
|
| 66 |
+
output = []
|
| 67 |
+
for para_str in PARA.findall(block):
|
| 68 |
+
para = []
|
| 69 |
+
for sent_str in SENT.findall(para_str):
|
| 70 |
+
if not self._tagged:
|
| 71 |
+
sent = WORD.findall(sent_str)
|
| 72 |
+
else:
|
| 73 |
+
sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
|
| 74 |
+
if self._group_by_sent:
|
| 75 |
+
para.append(sent)
|
| 76 |
+
else:
|
| 77 |
+
para.extend(sent)
|
| 78 |
+
if self._group_by_para:
|
| 79 |
+
output.append(para)
|
| 80 |
+
else:
|
| 81 |
+
output.extend(para)
|
| 82 |
+
return output
|
| 83 |
+
|
| 84 |
+
def _parse_tag(self, tag_word_tuple):
|
| 85 |
+
(tag, word) = tag_word_tuple
|
| 86 |
+
if tag.startswith("w"):
|
| 87 |
+
tag = ANA.search(tag).group(1)
|
| 88 |
+
else: # tag.startswith('c')
|
| 89 |
+
tag = TYPE.search(tag).group(1)
|
| 90 |
+
return word, tag
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
|
| 94 |
+
head_len = 2770
|
| 95 |
+
|
| 96 |
+
def __init__(self, *args, **kwargs):
|
| 97 |
+
if "textid_file" in kwargs:
|
| 98 |
+
self._textids = kwargs["textid_file"]
|
| 99 |
+
else:
|
| 100 |
+
self._textids = None
|
| 101 |
+
|
| 102 |
+
XMLCorpusReader.__init__(self, *args)
|
| 103 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 104 |
+
|
| 105 |
+
self._init_textids()
|
| 106 |
+
|
| 107 |
+
def _init_textids(self):
|
| 108 |
+
self._f2t = defaultdict(list)
|
| 109 |
+
self._t2f = defaultdict(list)
|
| 110 |
+
if self._textids is not None:
|
| 111 |
+
with open(self._textids) as fp:
|
| 112 |
+
for line in fp:
|
| 113 |
+
line = line.strip()
|
| 114 |
+
file_id, text_ids = line.split(" ", 1)
|
| 115 |
+
if file_id not in self.fileids():
|
| 116 |
+
raise ValueError(
|
| 117 |
+
"In text_id mapping file %s: %s not found"
|
| 118 |
+
% (self._textids, file_id)
|
| 119 |
+
)
|
| 120 |
+
for text_id in text_ids.split(self._delimiter):
|
| 121 |
+
self._add_textids(file_id, text_id)
|
| 122 |
+
|
| 123 |
+
def _add_textids(self, file_id, text_id):
|
| 124 |
+
self._f2t[file_id].append(text_id)
|
| 125 |
+
self._t2f[text_id].append(file_id)
|
| 126 |
+
|
| 127 |
+
def _resolve(self, fileids, categories, textids=None):
|
| 128 |
+
tmp = None
|
| 129 |
+
if (
|
| 130 |
+
len(
|
| 131 |
+
list(
|
| 132 |
+
filter(
|
| 133 |
+
lambda accessor: accessor is None,
|
| 134 |
+
(fileids, categories, textids),
|
| 135 |
+
)
|
| 136 |
+
)
|
| 137 |
+
)
|
| 138 |
+
!= 1
|
| 139 |
+
):
|
| 140 |
+
|
| 141 |
+
raise ValueError(
|
| 142 |
+
"Specify exactly one of: fileids, " "categories or textids"
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
if fileids is not None:
|
| 146 |
+
return fileids, None
|
| 147 |
+
|
| 148 |
+
if categories is not None:
|
| 149 |
+
return self.fileids(categories), None
|
| 150 |
+
|
| 151 |
+
if textids is not None:
|
| 152 |
+
if isinstance(textids, str):
|
| 153 |
+
textids = [textids]
|
| 154 |
+
files = sum((self._t2f[t] for t in textids), [])
|
| 155 |
+
tdict = dict()
|
| 156 |
+
for f in files:
|
| 157 |
+
tdict[f] = set(self._f2t[f]) & set(textids)
|
| 158 |
+
return files, tdict
|
| 159 |
+
|
| 160 |
+
def decode_tag(self, tag):
|
| 161 |
+
# to be implemented
|
| 162 |
+
return tag
|
| 163 |
+
|
| 164 |
+
def textids(self, fileids=None, categories=None):
|
| 165 |
+
"""
|
| 166 |
+
In the pl196x corpus each category is stored in single
|
| 167 |
+
file and thus both methods provide identical functionality. In order
|
| 168 |
+
to accommodate finer granularity, a non-standard textids() method was
|
| 169 |
+
implemented. All the main functions can be supplied with a list
|
| 170 |
+
of required chunks---giving much more control to the user.
|
| 171 |
+
"""
|
| 172 |
+
fileids, _ = self._resolve(fileids, categories)
|
| 173 |
+
if fileids is None:
|
| 174 |
+
return sorted(self._t2f)
|
| 175 |
+
|
| 176 |
+
if isinstance(fileids, str):
|
| 177 |
+
fileids = [fileids]
|
| 178 |
+
return sorted(sum((self._f2t[d] for d in fileids), []))
|
| 179 |
+
|
| 180 |
+
def words(self, fileids=None, categories=None, textids=None):
|
| 181 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 182 |
+
if fileids is None:
|
| 183 |
+
fileids = self._fileids
|
| 184 |
+
elif isinstance(fileids, str):
|
| 185 |
+
fileids = [fileids]
|
| 186 |
+
|
| 187 |
+
if textids:
|
| 188 |
+
return concat(
|
| 189 |
+
[
|
| 190 |
+
TEICorpusView(
|
| 191 |
+
self.abspath(fileid),
|
| 192 |
+
False,
|
| 193 |
+
False,
|
| 194 |
+
False,
|
| 195 |
+
head_len=self.head_len,
|
| 196 |
+
textids=textids[fileid],
|
| 197 |
+
)
|
| 198 |
+
for fileid in fileids
|
| 199 |
+
]
|
| 200 |
+
)
|
| 201 |
+
else:
|
| 202 |
+
return concat(
|
| 203 |
+
[
|
| 204 |
+
TEICorpusView(
|
| 205 |
+
self.abspath(fileid),
|
| 206 |
+
False,
|
| 207 |
+
False,
|
| 208 |
+
False,
|
| 209 |
+
head_len=self.head_len,
|
| 210 |
+
)
|
| 211 |
+
for fileid in fileids
|
| 212 |
+
]
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
def sents(self, fileids=None, categories=None, textids=None):
|
| 216 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 217 |
+
if fileids is None:
|
| 218 |
+
fileids = self._fileids
|
| 219 |
+
elif isinstance(fileids, str):
|
| 220 |
+
fileids = [fileids]
|
| 221 |
+
|
| 222 |
+
if textids:
|
| 223 |
+
return concat(
|
| 224 |
+
[
|
| 225 |
+
TEICorpusView(
|
| 226 |
+
self.abspath(fileid),
|
| 227 |
+
False,
|
| 228 |
+
True,
|
| 229 |
+
False,
|
| 230 |
+
head_len=self.head_len,
|
| 231 |
+
textids=textids[fileid],
|
| 232 |
+
)
|
| 233 |
+
for fileid in fileids
|
| 234 |
+
]
|
| 235 |
+
)
|
| 236 |
+
else:
|
| 237 |
+
return concat(
|
| 238 |
+
[
|
| 239 |
+
TEICorpusView(
|
| 240 |
+
self.abspath(fileid), False, True, False, head_len=self.head_len
|
| 241 |
+
)
|
| 242 |
+
for fileid in fileids
|
| 243 |
+
]
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
def paras(self, fileids=None, categories=None, textids=None):
|
| 247 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 248 |
+
if fileids is None:
|
| 249 |
+
fileids = self._fileids
|
| 250 |
+
elif isinstance(fileids, str):
|
| 251 |
+
fileids = [fileids]
|
| 252 |
+
|
| 253 |
+
if textids:
|
| 254 |
+
return concat(
|
| 255 |
+
[
|
| 256 |
+
TEICorpusView(
|
| 257 |
+
self.abspath(fileid),
|
| 258 |
+
False,
|
| 259 |
+
True,
|
| 260 |
+
True,
|
| 261 |
+
head_len=self.head_len,
|
| 262 |
+
textids=textids[fileid],
|
| 263 |
+
)
|
| 264 |
+
for fileid in fileids
|
| 265 |
+
]
|
| 266 |
+
)
|
| 267 |
+
else:
|
| 268 |
+
return concat(
|
| 269 |
+
[
|
| 270 |
+
TEICorpusView(
|
| 271 |
+
self.abspath(fileid), False, True, True, head_len=self.head_len
|
| 272 |
+
)
|
| 273 |
+
for fileid in fileids
|
| 274 |
+
]
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
def tagged_words(self, fileids=None, categories=None, textids=None):
|
| 278 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 279 |
+
if fileids is None:
|
| 280 |
+
fileids = self._fileids
|
| 281 |
+
elif isinstance(fileids, str):
|
| 282 |
+
fileids = [fileids]
|
| 283 |
+
|
| 284 |
+
if textids:
|
| 285 |
+
return concat(
|
| 286 |
+
[
|
| 287 |
+
TEICorpusView(
|
| 288 |
+
self.abspath(fileid),
|
| 289 |
+
True,
|
| 290 |
+
False,
|
| 291 |
+
False,
|
| 292 |
+
head_len=self.head_len,
|
| 293 |
+
textids=textids[fileid],
|
| 294 |
+
)
|
| 295 |
+
for fileid in fileids
|
| 296 |
+
]
|
| 297 |
+
)
|
| 298 |
+
else:
|
| 299 |
+
return concat(
|
| 300 |
+
[
|
| 301 |
+
TEICorpusView(
|
| 302 |
+
self.abspath(fileid), True, False, False, head_len=self.head_len
|
| 303 |
+
)
|
| 304 |
+
for fileid in fileids
|
| 305 |
+
]
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
def tagged_sents(self, fileids=None, categories=None, textids=None):
|
| 309 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 310 |
+
if fileids is None:
|
| 311 |
+
fileids = self._fileids
|
| 312 |
+
elif isinstance(fileids, str):
|
| 313 |
+
fileids = [fileids]
|
| 314 |
+
|
| 315 |
+
if textids:
|
| 316 |
+
return concat(
|
| 317 |
+
[
|
| 318 |
+
TEICorpusView(
|
| 319 |
+
self.abspath(fileid),
|
| 320 |
+
True,
|
| 321 |
+
True,
|
| 322 |
+
False,
|
| 323 |
+
head_len=self.head_len,
|
| 324 |
+
textids=textids[fileid],
|
| 325 |
+
)
|
| 326 |
+
for fileid in fileids
|
| 327 |
+
]
|
| 328 |
+
)
|
| 329 |
+
else:
|
| 330 |
+
return concat(
|
| 331 |
+
[
|
| 332 |
+
TEICorpusView(
|
| 333 |
+
self.abspath(fileid), True, True, False, head_len=self.head_len
|
| 334 |
+
)
|
| 335 |
+
for fileid in fileids
|
| 336 |
+
]
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
def tagged_paras(self, fileids=None, categories=None, textids=None):
|
| 340 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 341 |
+
if fileids is None:
|
| 342 |
+
fileids = self._fileids
|
| 343 |
+
elif isinstance(fileids, str):
|
| 344 |
+
fileids = [fileids]
|
| 345 |
+
|
| 346 |
+
if textids:
|
| 347 |
+
return concat(
|
| 348 |
+
[
|
| 349 |
+
TEICorpusView(
|
| 350 |
+
self.abspath(fileid),
|
| 351 |
+
True,
|
| 352 |
+
True,
|
| 353 |
+
True,
|
| 354 |
+
head_len=self.head_len,
|
| 355 |
+
textids=textids[fileid],
|
| 356 |
+
)
|
| 357 |
+
for fileid in fileids
|
| 358 |
+
]
|
| 359 |
+
)
|
| 360 |
+
else:
|
| 361 |
+
return concat(
|
| 362 |
+
[
|
| 363 |
+
TEICorpusView(
|
| 364 |
+
self.abspath(fileid), True, True, True, head_len=self.head_len
|
| 365 |
+
)
|
| 366 |
+
for fileid in fileids
|
| 367 |
+
]
|
| 368 |
+
)
|
| 369 |
+
|
| 370 |
+
def xml(self, fileids=None, categories=None):
|
| 371 |
+
fileids, _ = self._resolve(fileids, categories)
|
| 372 |
+
if len(fileids) == 1:
|
| 373 |
+
return XMLCorpusReader.xml(self, fileids[0])
|
| 374 |
+
else:
|
| 375 |
+
raise TypeError("Expected a single file")
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Plaintext Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Nitin Madnani <nmadnani@umiacs.umd.edu>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
A reader for corpora that consist of plaintext documents.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import nltk.data
|
| 15 |
+
from nltk.corpus.reader.api import *
|
| 16 |
+
from nltk.corpus.reader.util import *
|
| 17 |
+
from nltk.tokenize import *
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PlaintextCorpusReader(CorpusReader):
|
| 21 |
+
"""
|
| 22 |
+
Reader for corpora that consist of plaintext documents. Paragraphs
|
| 23 |
+
are assumed to be split using blank lines. Sentences and words can
|
| 24 |
+
be tokenized using the default tokenizers, or by custom tokenizers
|
| 25 |
+
specified as parameters to the constructor.
|
| 26 |
+
|
| 27 |
+
This corpus reader can be customized (e.g., to skip preface
|
| 28 |
+
sections of specific document formats) by creating a subclass and
|
| 29 |
+
overriding the ``CorpusView`` class variable.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
CorpusView = StreamBackedCorpusView
|
| 33 |
+
"""The corpus view class used by this reader. Subclasses of
|
| 34 |
+
``PlaintextCorpusReader`` may specify alternative corpus view
|
| 35 |
+
classes (e.g., to skip the preface sections of documents.)"""
|
| 36 |
+
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
root,
|
| 40 |
+
fileids,
|
| 41 |
+
word_tokenizer=WordPunctTokenizer(),
|
| 42 |
+
sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
|
| 43 |
+
para_block_reader=read_blankline_block,
|
| 44 |
+
encoding="utf8",
|
| 45 |
+
):
|
| 46 |
+
r"""
|
| 47 |
+
Construct a new plaintext corpus reader for a set of documents
|
| 48 |
+
located at the given root directory. Example usage:
|
| 49 |
+
|
| 50 |
+
>>> root = '/usr/local/share/nltk_data/corpora/webtext/'
|
| 51 |
+
>>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
|
| 52 |
+
|
| 53 |
+
:param root: The root directory for this corpus.
|
| 54 |
+
:param fileids: A list or regexp specifying the fileids in this corpus.
|
| 55 |
+
:param word_tokenizer: Tokenizer for breaking sentences or
|
| 56 |
+
paragraphs into words.
|
| 57 |
+
:param sent_tokenizer: Tokenizer for breaking paragraphs
|
| 58 |
+
into words.
|
| 59 |
+
:param para_block_reader: The block reader used to divide the
|
| 60 |
+
corpus into paragraph blocks.
|
| 61 |
+
"""
|
| 62 |
+
CorpusReader.__init__(self, root, fileids, encoding)
|
| 63 |
+
self._word_tokenizer = word_tokenizer
|
| 64 |
+
self._sent_tokenizer = sent_tokenizer
|
| 65 |
+
self._para_block_reader = para_block_reader
|
| 66 |
+
|
| 67 |
+
def words(self, fileids=None):
|
| 68 |
+
"""
|
| 69 |
+
:return: the given file(s) as a list of words
|
| 70 |
+
and punctuation symbols.
|
| 71 |
+
:rtype: list(str)
|
| 72 |
+
"""
|
| 73 |
+
return concat(
|
| 74 |
+
[
|
| 75 |
+
self.CorpusView(path, self._read_word_block, encoding=enc)
|
| 76 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 77 |
+
]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
def sents(self, fileids=None):
|
| 81 |
+
"""
|
| 82 |
+
:return: the given file(s) as a list of
|
| 83 |
+
sentences or utterances, each encoded as a list of word
|
| 84 |
+
strings.
|
| 85 |
+
:rtype: list(list(str))
|
| 86 |
+
"""
|
| 87 |
+
if self._sent_tokenizer is None:
|
| 88 |
+
raise ValueError("No sentence tokenizer for this corpus")
|
| 89 |
+
|
| 90 |
+
return concat(
|
| 91 |
+
[
|
| 92 |
+
self.CorpusView(path, self._read_sent_block, encoding=enc)
|
| 93 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 94 |
+
]
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
def paras(self, fileids=None):
|
| 98 |
+
"""
|
| 99 |
+
:return: the given file(s) as a list of
|
| 100 |
+
paragraphs, each encoded as a list of sentences, which are
|
| 101 |
+
in turn encoded as lists of word strings.
|
| 102 |
+
:rtype: list(list(list(str)))
|
| 103 |
+
"""
|
| 104 |
+
if self._sent_tokenizer is None:
|
| 105 |
+
raise ValueError("No sentence tokenizer for this corpus")
|
| 106 |
+
|
| 107 |
+
return concat(
|
| 108 |
+
[
|
| 109 |
+
self.CorpusView(path, self._read_para_block, encoding=enc)
|
| 110 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 111 |
+
]
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
def _read_word_block(self, stream):
|
| 115 |
+
words = []
|
| 116 |
+
for i in range(20): # Read 20 lines at a time.
|
| 117 |
+
words.extend(self._word_tokenizer.tokenize(stream.readline()))
|
| 118 |
+
return words
|
| 119 |
+
|
| 120 |
+
def _read_sent_block(self, stream):
|
| 121 |
+
sents = []
|
| 122 |
+
for para in self._para_block_reader(stream):
|
| 123 |
+
sents.extend(
|
| 124 |
+
[
|
| 125 |
+
self._word_tokenizer.tokenize(sent)
|
| 126 |
+
for sent in self._sent_tokenizer.tokenize(para)
|
| 127 |
+
]
|
| 128 |
+
)
|
| 129 |
+
return sents
|
| 130 |
+
|
| 131 |
+
def _read_para_block(self, stream):
|
| 132 |
+
paras = []
|
| 133 |
+
for para in self._para_block_reader(stream):
|
| 134 |
+
paras.append(
|
| 135 |
+
[
|
| 136 |
+
self._word_tokenizer.tokenize(sent)
|
| 137 |
+
for sent in self._sent_tokenizer.tokenize(para)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
return paras
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
|
| 144 |
+
"""
|
| 145 |
+
A reader for plaintext corpora whose documents are divided into
|
| 146 |
+
categories based on their file identifiers.
|
| 147 |
+
"""
|
| 148 |
+
|
| 149 |
+
def __init__(self, *args, **kwargs):
|
| 150 |
+
"""
|
| 151 |
+
Initialize the corpus reader. Categorization arguments
|
| 152 |
+
(``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
|
| 153 |
+
the ``CategorizedCorpusReader`` constructor. The remaining arguments
|
| 154 |
+
are passed to the ``PlaintextCorpusReader`` constructor.
|
| 155 |
+
"""
|
| 156 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 157 |
+
PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# FIXME: Is there a better way? How to not hardcode this?
|
| 161 |
+
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
|
| 162 |
+
# override the `sent_tokenizer`.
|
| 163 |
+
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
|
| 164 |
+
def __init__(self, *args, **kwargs):
|
| 165 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 166 |
+
kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
|
| 167 |
+
"tokenizers/punkt/portuguese.pickle"
|
| 168 |
+
)
|
| 169 |
+
PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class EuroparlCorpusReader(PlaintextCorpusReader):
|
| 173 |
+
|
| 174 |
+
"""
|
| 175 |
+
Reader for Europarl corpora that consist of plaintext documents.
|
| 176 |
+
Documents are divided into chapters instead of paragraphs as
|
| 177 |
+
for regular plaintext documents. Chapters are separated using blank
|
| 178 |
+
lines. Everything is inherited from ``PlaintextCorpusReader`` except
|
| 179 |
+
that:
|
| 180 |
+
|
| 181 |
+
- Since the corpus is pre-processed and pre-tokenized, the
|
| 182 |
+
word tokenizer should just split the line at whitespaces.
|
| 183 |
+
- For the same reason, the sentence tokenizer should just
|
| 184 |
+
split the paragraph at line breaks.
|
| 185 |
+
- There is a new 'chapters()' method that returns chapters instead
|
| 186 |
+
instead of paragraphs.
|
| 187 |
+
- The 'paras()' method inherited from PlaintextCorpusReader is
|
| 188 |
+
made non-functional to remove any confusion between chapters
|
| 189 |
+
and paragraphs for Europarl.
|
| 190 |
+
"""
|
| 191 |
+
|
| 192 |
+
def _read_word_block(self, stream):
|
| 193 |
+
words = []
|
| 194 |
+
for i in range(20): # Read 20 lines at a time.
|
| 195 |
+
words.extend(stream.readline().split())
|
| 196 |
+
return words
|
| 197 |
+
|
| 198 |
+
def _read_sent_block(self, stream):
|
| 199 |
+
sents = []
|
| 200 |
+
for para in self._para_block_reader(stream):
|
| 201 |
+
sents.extend([sent.split() for sent in para.splitlines()])
|
| 202 |
+
return sents
|
| 203 |
+
|
| 204 |
+
def _read_para_block(self, stream):
|
| 205 |
+
paras = []
|
| 206 |
+
for para in self._para_block_reader(stream):
|
| 207 |
+
paras.append([sent.split() for sent in para.splitlines()])
|
| 208 |
+
return paras
|
| 209 |
+
|
| 210 |
+
def chapters(self, fileids=None):
|
| 211 |
+
"""
|
| 212 |
+
:return: the given file(s) as a list of
|
| 213 |
+
chapters, each encoded as a list of sentences, which are
|
| 214 |
+
in turn encoded as lists of word strings.
|
| 215 |
+
:rtype: list(list(list(str)))
|
| 216 |
+
"""
|
| 217 |
+
return concat(
|
| 218 |
+
[
|
| 219 |
+
self.CorpusView(fileid, self._read_para_block, encoding=enc)
|
| 220 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 221 |
+
]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
def paras(self, fileids=None):
|
| 225 |
+
raise NotImplementedError(
|
| 226 |
+
"The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
|
| 227 |
+
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PP Attachment Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read lines from the Prepositional Phrase Attachment Corpus.
|
| 11 |
+
|
| 12 |
+
The PP Attachment Corpus contains several files having the format:
|
| 13 |
+
|
| 14 |
+
sentence_id verb noun1 preposition noun2 attachment
|
| 15 |
+
|
| 16 |
+
For example:
|
| 17 |
+
|
| 18 |
+
42960 gives authority to administration V
|
| 19 |
+
46742 gives inventors of microchip N
|
| 20 |
+
|
| 21 |
+
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
|
| 22 |
+
|
| 23 |
+
(VP gives (NP authority) (PP to administration))
|
| 24 |
+
(VP gives (NP inventors (PP of microchip)))
|
| 25 |
+
|
| 26 |
+
The corpus contains the following files:
|
| 27 |
+
|
| 28 |
+
training: training set
|
| 29 |
+
devset: development test set, used for algorithm development.
|
| 30 |
+
test: test set, used to report results
|
| 31 |
+
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
|
| 32 |
+
|
| 33 |
+
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
|
| 34 |
+
Phrase Attachment. Proceedings of the ARPA Human Language Technology
|
| 35 |
+
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
|
| 36 |
+
|
| 37 |
+
The PP Attachment Corpus is distributed with NLTK with the permission
|
| 38 |
+
of the author.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
from nltk.corpus.reader.api import *
|
| 42 |
+
from nltk.corpus.reader.util import *
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class PPAttachment:
|
| 46 |
+
def __init__(self, sent, verb, noun1, prep, noun2, attachment):
|
| 47 |
+
self.sent = sent
|
| 48 |
+
self.verb = verb
|
| 49 |
+
self.noun1 = noun1
|
| 50 |
+
self.prep = prep
|
| 51 |
+
self.noun2 = noun2
|
| 52 |
+
self.attachment = attachment
|
| 53 |
+
|
| 54 |
+
def __repr__(self):
|
| 55 |
+
return (
|
| 56 |
+
"PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
|
| 57 |
+
"noun2=%r, attachment=%r)"
|
| 58 |
+
% (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class PPAttachmentCorpusReader(CorpusReader):
|
| 63 |
+
"""
|
| 64 |
+
sentence_id verb noun1 preposition noun2 attachment
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
def attachments(self, fileids):
|
| 68 |
+
return concat(
|
| 69 |
+
[
|
| 70 |
+
StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
|
| 71 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 72 |
+
]
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
def tuples(self, fileids):
|
| 76 |
+
return concat(
|
| 77 |
+
[
|
| 78 |
+
StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
|
| 79 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 80 |
+
]
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
def _read_tuple_block(self, stream):
|
| 84 |
+
line = stream.readline()
|
| 85 |
+
if line:
|
| 86 |
+
return [tuple(line.split())]
|
| 87 |
+
else:
|
| 88 |
+
return []
|
| 89 |
+
|
| 90 |
+
def _read_obj_block(self, stream):
|
| 91 |
+
line = stream.readline()
|
| 92 |
+
if line:
|
| 93 |
+
return [PPAttachment(*line.split())]
|
| 94 |
+
else:
|
| 95 |
+
return []
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PropBank Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from functools import total_ordering
|
| 10 |
+
from xml.etree import ElementTree
|
| 11 |
+
|
| 12 |
+
from nltk.corpus.reader.api import *
|
| 13 |
+
from nltk.corpus.reader.util import *
|
| 14 |
+
from nltk.internals import raise_unorderable_types
|
| 15 |
+
from nltk.tree import Tree
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PropbankCorpusReader(CorpusReader):
|
| 19 |
+
"""
|
| 20 |
+
Corpus reader for the propbank corpus, which augments the Penn
|
| 21 |
+
Treebank with information about the predicate argument structure
|
| 22 |
+
of every verb instance. The corpus consists of two parts: the
|
| 23 |
+
predicate-argument annotations themselves, and a set of "frameset
|
| 24 |
+
files" which define the argument labels used by the annotations,
|
| 25 |
+
on a per-verb basis. Each "frameset file" contains one or more
|
| 26 |
+
predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
|
| 27 |
+
divided into coarse-grained word senses called "rolesets". For
|
| 28 |
+
each "roleset", the frameset file provides descriptions of the
|
| 29 |
+
argument roles, along with examples.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self,
|
| 34 |
+
root,
|
| 35 |
+
propfile,
|
| 36 |
+
framefiles="",
|
| 37 |
+
verbsfile=None,
|
| 38 |
+
parse_fileid_xform=None,
|
| 39 |
+
parse_corpus=None,
|
| 40 |
+
encoding="utf8",
|
| 41 |
+
):
|
| 42 |
+
"""
|
| 43 |
+
:param root: The root directory for this corpus.
|
| 44 |
+
:param propfile: The name of the file containing the predicate-
|
| 45 |
+
argument annotations (relative to ``root``).
|
| 46 |
+
:param framefiles: A list or regexp specifying the frameset
|
| 47 |
+
fileids for this corpus.
|
| 48 |
+
:param parse_fileid_xform: A transform that should be applied
|
| 49 |
+
to the fileids in this corpus. This should be a function
|
| 50 |
+
of one argument (a fileid) that returns a string (the new
|
| 51 |
+
fileid).
|
| 52 |
+
:param parse_corpus: The corpus containing the parse trees
|
| 53 |
+
corresponding to this corpus. These parse trees are
|
| 54 |
+
necessary to resolve the tree pointers used by propbank.
|
| 55 |
+
"""
|
| 56 |
+
# If framefiles is specified as a regexp, expand it.
|
| 57 |
+
if isinstance(framefiles, str):
|
| 58 |
+
framefiles = find_corpus_fileids(root, framefiles)
|
| 59 |
+
framefiles = list(framefiles)
|
| 60 |
+
# Initialize the corpus reader.
|
| 61 |
+
CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
|
| 62 |
+
|
| 63 |
+
# Record our frame fileids & prop file.
|
| 64 |
+
self._propfile = propfile
|
| 65 |
+
self._framefiles = framefiles
|
| 66 |
+
self._verbsfile = verbsfile
|
| 67 |
+
self._parse_fileid_xform = parse_fileid_xform
|
| 68 |
+
self._parse_corpus = parse_corpus
|
| 69 |
+
|
| 70 |
+
def instances(self, baseform=None):
|
| 71 |
+
"""
|
| 72 |
+
:return: a corpus view that acts as a list of
|
| 73 |
+
``PropBankInstance`` objects, one for each noun in the corpus.
|
| 74 |
+
"""
|
| 75 |
+
kwargs = {}
|
| 76 |
+
if baseform is not None:
|
| 77 |
+
kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
|
| 78 |
+
return StreamBackedCorpusView(
|
| 79 |
+
self.abspath(self._propfile),
|
| 80 |
+
lambda stream: self._read_instance_block(stream, **kwargs),
|
| 81 |
+
encoding=self.encoding(self._propfile),
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
def lines(self):
|
| 85 |
+
"""
|
| 86 |
+
:return: a corpus view that acts as a list of strings, one for
|
| 87 |
+
each line in the predicate-argument annotation file.
|
| 88 |
+
"""
|
| 89 |
+
return StreamBackedCorpusView(
|
| 90 |
+
self.abspath(self._propfile),
|
| 91 |
+
read_line_block,
|
| 92 |
+
encoding=self.encoding(self._propfile),
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
def roleset(self, roleset_id):
|
| 96 |
+
"""
|
| 97 |
+
:return: the xml description for the given roleset.
|
| 98 |
+
"""
|
| 99 |
+
baseform = roleset_id.split(".")[0]
|
| 100 |
+
framefile = "frames/%s.xml" % baseform
|
| 101 |
+
if framefile not in self._framefiles:
|
| 102 |
+
raise ValueError("Frameset file for %s not found" % roleset_id)
|
| 103 |
+
|
| 104 |
+
# n.b.: The encoding for XML fileids is specified by the file
|
| 105 |
+
# itself; so we ignore self._encoding here.
|
| 106 |
+
with self.abspath(framefile).open() as fp:
|
| 107 |
+
etree = ElementTree.parse(fp).getroot()
|
| 108 |
+
for roleset in etree.findall("predicate/roleset"):
|
| 109 |
+
if roleset.attrib["id"] == roleset_id:
|
| 110 |
+
return roleset
|
| 111 |
+
raise ValueError(f"Roleset {roleset_id} not found in {framefile}")
|
| 112 |
+
|
| 113 |
+
def rolesets(self, baseform=None):
|
| 114 |
+
"""
|
| 115 |
+
:return: list of xml descriptions for rolesets.
|
| 116 |
+
"""
|
| 117 |
+
if baseform is not None:
|
| 118 |
+
framefile = "frames/%s.xml" % baseform
|
| 119 |
+
if framefile not in self._framefiles:
|
| 120 |
+
raise ValueError("Frameset file for %s not found" % baseform)
|
| 121 |
+
framefiles = [framefile]
|
| 122 |
+
else:
|
| 123 |
+
framefiles = self._framefiles
|
| 124 |
+
|
| 125 |
+
rsets = []
|
| 126 |
+
for framefile in framefiles:
|
| 127 |
+
# n.b.: The encoding for XML fileids is specified by the file
|
| 128 |
+
# itself; so we ignore self._encoding here.
|
| 129 |
+
with self.abspath(framefile).open() as fp:
|
| 130 |
+
etree = ElementTree.parse(fp).getroot()
|
| 131 |
+
rsets.append(etree.findall("predicate/roleset"))
|
| 132 |
+
return LazyConcatenation(rsets)
|
| 133 |
+
|
| 134 |
+
def verbs(self):
|
| 135 |
+
"""
|
| 136 |
+
:return: a corpus view that acts as a list of all verb lemmas
|
| 137 |
+
in this corpus (from the verbs.txt file).
|
| 138 |
+
"""
|
| 139 |
+
return StreamBackedCorpusView(
|
| 140 |
+
self.abspath(self._verbsfile),
|
| 141 |
+
read_line_block,
|
| 142 |
+
encoding=self.encoding(self._verbsfile),
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
def _read_instance_block(self, stream, instance_filter=lambda inst: True):
|
| 146 |
+
block = []
|
| 147 |
+
|
| 148 |
+
# Read 100 at a time.
|
| 149 |
+
for i in range(100):
|
| 150 |
+
line = stream.readline().strip()
|
| 151 |
+
if line:
|
| 152 |
+
inst = PropbankInstance.parse(
|
| 153 |
+
line, self._parse_fileid_xform, self._parse_corpus
|
| 154 |
+
)
|
| 155 |
+
if instance_filter(inst):
|
| 156 |
+
block.append(inst)
|
| 157 |
+
|
| 158 |
+
return block
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
######################################################################
|
| 162 |
+
# { Propbank Instance & related datatypes
|
| 163 |
+
######################################################################
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class PropbankInstance:
|
| 167 |
+
def __init__(
|
| 168 |
+
self,
|
| 169 |
+
fileid,
|
| 170 |
+
sentnum,
|
| 171 |
+
wordnum,
|
| 172 |
+
tagger,
|
| 173 |
+
roleset,
|
| 174 |
+
inflection,
|
| 175 |
+
predicate,
|
| 176 |
+
arguments,
|
| 177 |
+
parse_corpus=None,
|
| 178 |
+
):
|
| 179 |
+
|
| 180 |
+
self.fileid = fileid
|
| 181 |
+
"""The name of the file containing the parse tree for this
|
| 182 |
+
instance's sentence."""
|
| 183 |
+
|
| 184 |
+
self.sentnum = sentnum
|
| 185 |
+
"""The sentence number of this sentence within ``fileid``.
|
| 186 |
+
Indexing starts from zero."""
|
| 187 |
+
|
| 188 |
+
self.wordnum = wordnum
|
| 189 |
+
"""The word number of this instance's predicate within its
|
| 190 |
+
containing sentence. Word numbers are indexed starting from
|
| 191 |
+
zero, and include traces and other empty parse elements."""
|
| 192 |
+
|
| 193 |
+
self.tagger = tagger
|
| 194 |
+
"""An identifier for the tagger who tagged this instance; or
|
| 195 |
+
``'gold'`` if this is an adjuticated instance."""
|
| 196 |
+
|
| 197 |
+
self.roleset = roleset
|
| 198 |
+
"""The name of the roleset used by this instance's predicate.
|
| 199 |
+
Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
|
| 200 |
+
look up information about the roleset."""
|
| 201 |
+
|
| 202 |
+
self.inflection = inflection
|
| 203 |
+
"""A ``PropbankInflection`` object describing the inflection of
|
| 204 |
+
this instance's predicate."""
|
| 205 |
+
|
| 206 |
+
self.predicate = predicate
|
| 207 |
+
"""A ``PropbankTreePointer`` indicating the position of this
|
| 208 |
+
instance's predicate within its containing sentence."""
|
| 209 |
+
|
| 210 |
+
self.arguments = tuple(arguments)
|
| 211 |
+
"""A list of tuples (argloc, argid), specifying the location
|
| 212 |
+
and identifier for each of the predicate's argument in the
|
| 213 |
+
containing sentence. Argument identifiers are strings such as
|
| 214 |
+
``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
|
| 215 |
+
the predicate."""
|
| 216 |
+
|
| 217 |
+
self.parse_corpus = parse_corpus
|
| 218 |
+
"""A corpus reader for the parse trees corresponding to the
|
| 219 |
+
instances in this propbank corpus."""
|
| 220 |
+
|
| 221 |
+
@property
|
| 222 |
+
def baseform(self):
|
| 223 |
+
"""The baseform of the predicate."""
|
| 224 |
+
return self.roleset.split(".")[0]
|
| 225 |
+
|
| 226 |
+
@property
|
| 227 |
+
def sensenumber(self):
|
| 228 |
+
"""The sense number of the predicate."""
|
| 229 |
+
return self.roleset.split(".")[1]
|
| 230 |
+
|
| 231 |
+
@property
|
| 232 |
+
def predid(self):
|
| 233 |
+
"""Identifier of the predicate."""
|
| 234 |
+
return "rel"
|
| 235 |
+
|
| 236 |
+
def __repr__(self):
|
| 237 |
+
return "<PropbankInstance: {}, sent {}, word {}>".format(
|
| 238 |
+
self.fileid,
|
| 239 |
+
self.sentnum,
|
| 240 |
+
self.wordnum,
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
def __str__(self):
|
| 244 |
+
s = "{} {} {} {} {} {}".format(
|
| 245 |
+
self.fileid,
|
| 246 |
+
self.sentnum,
|
| 247 |
+
self.wordnum,
|
| 248 |
+
self.tagger,
|
| 249 |
+
self.roleset,
|
| 250 |
+
self.inflection,
|
| 251 |
+
)
|
| 252 |
+
items = self.arguments + ((self.predicate, "rel"),)
|
| 253 |
+
for (argloc, argid) in sorted(items):
|
| 254 |
+
s += f" {argloc}-{argid}"
|
| 255 |
+
return s
|
| 256 |
+
|
| 257 |
+
def _get_tree(self):
|
| 258 |
+
if self.parse_corpus is None:
|
| 259 |
+
return None
|
| 260 |
+
if self.fileid not in self.parse_corpus.fileids():
|
| 261 |
+
return None
|
| 262 |
+
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
|
| 263 |
+
|
| 264 |
+
tree = property(
|
| 265 |
+
_get_tree,
|
| 266 |
+
doc="""
|
| 267 |
+
The parse tree corresponding to this instance, or None if
|
| 268 |
+
the corresponding tree is not available.""",
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
@staticmethod
|
| 272 |
+
def parse(s, parse_fileid_xform=None, parse_corpus=None):
|
| 273 |
+
pieces = s.split()
|
| 274 |
+
if len(pieces) < 7:
|
| 275 |
+
raise ValueError("Badly formatted propbank line: %r" % s)
|
| 276 |
+
|
| 277 |
+
# Divide the line into its basic pieces.
|
| 278 |
+
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
|
| 279 |
+
rel = [p for p in pieces[6:] if p.endswith("-rel")]
|
| 280 |
+
args = [p for p in pieces[6:] if not p.endswith("-rel")]
|
| 281 |
+
if len(rel) != 1:
|
| 282 |
+
raise ValueError("Badly formatted propbank line: %r" % s)
|
| 283 |
+
|
| 284 |
+
# Apply the fileid selector, if any.
|
| 285 |
+
if parse_fileid_xform is not None:
|
| 286 |
+
fileid = parse_fileid_xform(fileid)
|
| 287 |
+
|
| 288 |
+
# Convert sentence & word numbers to ints.
|
| 289 |
+
sentnum = int(sentnum)
|
| 290 |
+
wordnum = int(wordnum)
|
| 291 |
+
|
| 292 |
+
# Parse the inflection
|
| 293 |
+
inflection = PropbankInflection.parse(inflection)
|
| 294 |
+
|
| 295 |
+
# Parse the predicate location.
|
| 296 |
+
predicate = PropbankTreePointer.parse(rel[0][:-4])
|
| 297 |
+
|
| 298 |
+
# Parse the arguments.
|
| 299 |
+
arguments = []
|
| 300 |
+
for arg in args:
|
| 301 |
+
argloc, argid = arg.split("-", 1)
|
| 302 |
+
arguments.append((PropbankTreePointer.parse(argloc), argid))
|
| 303 |
+
|
| 304 |
+
# Put it all together.
|
| 305 |
+
return PropbankInstance(
|
| 306 |
+
fileid,
|
| 307 |
+
sentnum,
|
| 308 |
+
wordnum,
|
| 309 |
+
tagger,
|
| 310 |
+
roleset,
|
| 311 |
+
inflection,
|
| 312 |
+
predicate,
|
| 313 |
+
arguments,
|
| 314 |
+
parse_corpus,
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class PropbankPointer:
|
| 319 |
+
"""
|
| 320 |
+
A pointer used by propbank to identify one or more constituents in
|
| 321 |
+
a parse tree. ``PropbankPointer`` is an abstract base class with
|
| 322 |
+
three concrete subclasses:
|
| 323 |
+
|
| 324 |
+
- ``PropbankTreePointer`` is used to point to single constituents.
|
| 325 |
+
- ``PropbankSplitTreePointer`` is used to point to 'split'
|
| 326 |
+
constituents, which consist of a sequence of two or more
|
| 327 |
+
``PropbankTreePointer`` pointers.
|
| 328 |
+
- ``PropbankChainTreePointer`` is used to point to entire trace
|
| 329 |
+
chains in a tree. It consists of a sequence of pieces, which
|
| 330 |
+
can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
|
| 331 |
+
"""
|
| 332 |
+
|
| 333 |
+
def __init__(self):
|
| 334 |
+
if self.__class__ == PropbankPointer:
|
| 335 |
+
raise NotImplementedError()
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class PropbankChainTreePointer(PropbankPointer):
|
| 339 |
+
def __init__(self, pieces):
|
| 340 |
+
self.pieces = pieces
|
| 341 |
+
"""A list of the pieces that make up this chain. Elements may
|
| 342 |
+
be either ``PropbankSplitTreePointer`` or
|
| 343 |
+
``PropbankTreePointer`` pointers."""
|
| 344 |
+
|
| 345 |
+
def __str__(self):
|
| 346 |
+
return "*".join("%s" % p for p in self.pieces)
|
| 347 |
+
|
| 348 |
+
def __repr__(self):
|
| 349 |
+
return "<PropbankChainTreePointer: %s>" % self
|
| 350 |
+
|
| 351 |
+
def select(self, tree):
|
| 352 |
+
if tree is None:
|
| 353 |
+
raise ValueError("Parse tree not available")
|
| 354 |
+
return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
class PropbankSplitTreePointer(PropbankPointer):
|
| 358 |
+
def __init__(self, pieces):
|
| 359 |
+
self.pieces = pieces
|
| 360 |
+
"""A list of the pieces that make up this chain. Elements are
|
| 361 |
+
all ``PropbankTreePointer`` pointers."""
|
| 362 |
+
|
| 363 |
+
def __str__(self):
|
| 364 |
+
return ",".join("%s" % p for p in self.pieces)
|
| 365 |
+
|
| 366 |
+
def __repr__(self):
|
| 367 |
+
return "<PropbankSplitTreePointer: %s>" % self
|
| 368 |
+
|
| 369 |
+
def select(self, tree):
|
| 370 |
+
if tree is None:
|
| 371 |
+
raise ValueError("Parse tree not available")
|
| 372 |
+
return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
@total_ordering
|
| 376 |
+
class PropbankTreePointer(PropbankPointer):
|
| 377 |
+
"""
|
| 378 |
+
wordnum:height*wordnum:height*...
|
| 379 |
+
wordnum:height,
|
| 380 |
+
|
| 381 |
+
"""
|
| 382 |
+
|
| 383 |
+
def __init__(self, wordnum, height):
|
| 384 |
+
self.wordnum = wordnum
|
| 385 |
+
self.height = height
|
| 386 |
+
|
| 387 |
+
@staticmethod
|
| 388 |
+
def parse(s):
|
| 389 |
+
# Deal with chains (xx*yy*zz)
|
| 390 |
+
pieces = s.split("*")
|
| 391 |
+
if len(pieces) > 1:
|
| 392 |
+
return PropbankChainTreePointer(
|
| 393 |
+
[PropbankTreePointer.parse(elt) for elt in pieces]
|
| 394 |
+
)
|
| 395 |
+
|
| 396 |
+
# Deal with split args (xx,yy,zz)
|
| 397 |
+
pieces = s.split(",")
|
| 398 |
+
if len(pieces) > 1:
|
| 399 |
+
return PropbankSplitTreePointer(
|
| 400 |
+
[PropbankTreePointer.parse(elt) for elt in pieces]
|
| 401 |
+
)
|
| 402 |
+
|
| 403 |
+
# Deal with normal pointers.
|
| 404 |
+
pieces = s.split(":")
|
| 405 |
+
if len(pieces) != 2:
|
| 406 |
+
raise ValueError("bad propbank pointer %r" % s)
|
| 407 |
+
return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
|
| 408 |
+
|
| 409 |
+
def __str__(self):
|
| 410 |
+
return f"{self.wordnum}:{self.height}"
|
| 411 |
+
|
| 412 |
+
def __repr__(self):
|
| 413 |
+
return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
|
| 414 |
+
|
| 415 |
+
def __eq__(self, other):
|
| 416 |
+
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
|
| 417 |
+
other = other.pieces[0]
|
| 418 |
+
|
| 419 |
+
if not isinstance(other, PropbankTreePointer):
|
| 420 |
+
return self is other
|
| 421 |
+
|
| 422 |
+
return self.wordnum == other.wordnum and self.height == other.height
|
| 423 |
+
|
| 424 |
+
def __ne__(self, other):
|
| 425 |
+
return not self == other
|
| 426 |
+
|
| 427 |
+
def __lt__(self, other):
|
| 428 |
+
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
|
| 429 |
+
other = other.pieces[0]
|
| 430 |
+
|
| 431 |
+
if not isinstance(other, PropbankTreePointer):
|
| 432 |
+
return id(self) < id(other)
|
| 433 |
+
|
| 434 |
+
return (self.wordnum, -self.height) < (other.wordnum, -other.height)
|
| 435 |
+
|
| 436 |
+
def select(self, tree):
|
| 437 |
+
if tree is None:
|
| 438 |
+
raise ValueError("Parse tree not available")
|
| 439 |
+
return tree[self.treepos(tree)]
|
| 440 |
+
|
| 441 |
+
def treepos(self, tree):
|
| 442 |
+
"""
|
| 443 |
+
Convert this pointer to a standard 'tree position' pointer,
|
| 444 |
+
given that it points to the given tree.
|
| 445 |
+
"""
|
| 446 |
+
if tree is None:
|
| 447 |
+
raise ValueError("Parse tree not available")
|
| 448 |
+
stack = [tree]
|
| 449 |
+
treepos = []
|
| 450 |
+
|
| 451 |
+
wordnum = 0
|
| 452 |
+
while True:
|
| 453 |
+
# tree node:
|
| 454 |
+
if isinstance(stack[-1], Tree):
|
| 455 |
+
# Select the next child.
|
| 456 |
+
if len(treepos) < len(stack):
|
| 457 |
+
treepos.append(0)
|
| 458 |
+
else:
|
| 459 |
+
treepos[-1] += 1
|
| 460 |
+
# Update the stack.
|
| 461 |
+
if treepos[-1] < len(stack[-1]):
|
| 462 |
+
stack.append(stack[-1][treepos[-1]])
|
| 463 |
+
else:
|
| 464 |
+
# End of node's child list: pop up a level.
|
| 465 |
+
stack.pop()
|
| 466 |
+
treepos.pop()
|
| 467 |
+
# word node:
|
| 468 |
+
else:
|
| 469 |
+
if wordnum == self.wordnum:
|
| 470 |
+
return tuple(treepos[: len(treepos) - self.height - 1])
|
| 471 |
+
else:
|
| 472 |
+
wordnum += 1
|
| 473 |
+
stack.pop()
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
class PropbankInflection:
|
| 477 |
+
# { Inflection Form
|
| 478 |
+
INFINITIVE = "i"
|
| 479 |
+
GERUND = "g"
|
| 480 |
+
PARTICIPLE = "p"
|
| 481 |
+
FINITE = "v"
|
| 482 |
+
# { Inflection Tense
|
| 483 |
+
FUTURE = "f"
|
| 484 |
+
PAST = "p"
|
| 485 |
+
PRESENT = "n"
|
| 486 |
+
# { Inflection Aspect
|
| 487 |
+
PERFECT = "p"
|
| 488 |
+
PROGRESSIVE = "o"
|
| 489 |
+
PERFECT_AND_PROGRESSIVE = "b"
|
| 490 |
+
# { Inflection Person
|
| 491 |
+
THIRD_PERSON = "3"
|
| 492 |
+
# { Inflection Voice
|
| 493 |
+
ACTIVE = "a"
|
| 494 |
+
PASSIVE = "p"
|
| 495 |
+
# { Inflection
|
| 496 |
+
NONE = "-"
|
| 497 |
+
# }
|
| 498 |
+
|
| 499 |
+
def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
|
| 500 |
+
self.form = form
|
| 501 |
+
self.tense = tense
|
| 502 |
+
self.aspect = aspect
|
| 503 |
+
self.person = person
|
| 504 |
+
self.voice = voice
|
| 505 |
+
|
| 506 |
+
def __str__(self):
|
| 507 |
+
return self.form + self.tense + self.aspect + self.person + self.voice
|
| 508 |
+
|
| 509 |
+
def __repr__(self):
|
| 510 |
+
return "<PropbankInflection: %s>" % self
|
| 511 |
+
|
| 512 |
+
_VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
|
| 513 |
+
|
| 514 |
+
@staticmethod
|
| 515 |
+
def parse(s):
|
| 516 |
+
if not isinstance(s, str):
|
| 517 |
+
raise TypeError("expected a string")
|
| 518 |
+
if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
|
| 519 |
+
raise ValueError("Bad propbank inflection string %r" % s)
|
| 520 |
+
return PropbankInflection(*s)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Pros and Cons Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for the Pros and Cons dataset.
|
| 10 |
+
|
| 11 |
+
- Pros and Cons dataset information -
|
| 12 |
+
|
| 13 |
+
Contact: Bing Liu, liub@cs.uic.edu
|
| 14 |
+
https://www.cs.uic.edu/~liub
|
| 15 |
+
|
| 16 |
+
Distributed with permission.
|
| 17 |
+
|
| 18 |
+
Related papers:
|
| 19 |
+
|
| 20 |
+
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
| 21 |
+
Proceedings of the 22nd International Conference on Computational Linguistics
|
| 22 |
+
(Coling-2008), Manchester, 18-22 August, 2008.
|
| 23 |
+
|
| 24 |
+
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
|
| 25 |
+
Opinions on the Web". Proceedings of the 14th international World Wide Web
|
| 26 |
+
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
|
| 27 |
+
"""
|
| 28 |
+
import re
|
| 29 |
+
|
| 30 |
+
from nltk.corpus.reader.api import *
|
| 31 |
+
from nltk.tokenize import *
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def _resolve_fileids(self, fileids, categories):
        """
        Map *categories* onto fileids (via ``CategorizedCorpusReader._resolve``)
        and normalize the result to a list of fileids.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        # Parse up to 20 lines of "<Pros>...</Pros>" / "<Cons>...</Cons>"
        # mark-up into word-tokenized sentences.
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # EOF: readline() will keep returning '' so remaining
                # iterations would be wasted work.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        # Flatten one block of tokenized sentences into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Product Reviews Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
|
| 10 |
+
|
| 11 |
+
Customer Review Corpus information
|
| 12 |
+
==================================
|
| 13 |
+
|
| 14 |
+
Annotated by: Minqing Hu and Bing Liu, 2004.
|
| 15 |
+
Department of Computer Science
|
| 16 |
+
University of Illinois at Chicago
|
| 17 |
+
|
| 18 |
+
Contact: Bing Liu, liub@cs.uic.edu
|
| 19 |
+
https://www.cs.uic.edu/~liub
|
| 20 |
+
|
| 21 |
+
Distributed with permission.
|
| 22 |
+
|
| 23 |
+
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
|
| 24 |
+
annotated customer reviews of 5 and 9 products from amazon.com.
|
| 25 |
+
|
| 26 |
+
Related papers:
|
| 27 |
+
|
| 28 |
+
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
| 29 |
+
Proceedings of the ACM SIGKDD International Conference on Knowledge
|
| 30 |
+
Discovery & Data Mining (KDD-04), 2004.
|
| 31 |
+
|
| 32 |
+
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
|
| 33 |
+
Proceedings of the Nineteenth National Conference on Artificial Intelligence
|
| 34 |
+
(AAAI-2004), 2004.
|
| 35 |
+
|
| 36 |
+
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
|
| 37 |
+
Opinion Mining." Proceedings of First ACM International Conference on Web
|
| 38 |
+
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
|
| 39 |
+
Stanford, California, USA.
|
| 40 |
+
|
| 41 |
+
Symbols used in the annotated reviews:
|
| 42 |
+
|
| 43 |
+
:[t]: the title of the review: Each [t] tag starts a review.
|
| 44 |
+
:xxxx[+|-n]: xxxx is a product feature.
|
| 45 |
+
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
|
| 46 |
+
Note that the strength is quite subjective.
|
| 47 |
+
You may want ignore it, but only considering + and -
|
| 48 |
+
:[-n]: Negative opinion
|
| 49 |
+
:##: start of each sentence. Each line is a sentence.
|
| 50 |
+
:[u]: feature not appeared in the sentence.
|
| 51 |
+
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
|
| 52 |
+
:[s]: suggestion or recommendation.
|
| 53 |
+
:[cc]: comparison with a competing product from a different brand.
|
| 54 |
+
:[cs]: comparison with a competing product from the same brand.
|
| 55 |
+
|
| 56 |
+
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
|
| 57 |
+
provide separation between different reviews. This is due to the fact that
|
| 58 |
+
the dataset was specifically designed for aspect/feature-based sentiment
|
| 59 |
+
analysis, for which sentence-level annotation is sufficient. For document-
|
| 60 |
+
level classification and analysis, this peculiarity should be taken into
|
| 61 |
+
consideration.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
import re
|
| 65 |
+
|
| 66 |
+
from nltk.corpus.reader.api import *
|
| 67 |
+
from nltk.tokenize import *
|
| 68 |
+
|
| 69 |
+
# Regexps for the Customer Review Corpus annotation mark-up (see module
# docstring for the full tag legend).
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for line in self.review_lines
            for feature in line.features
        ]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return list(line.sent for line in self.review_lines)

    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        # Tokenized sentence, plus (feature, score) tuples and annotation notes.
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return (
            f"ReviewLine(features={self.features}, "
            f"notes={self.notes}, sent={self.sent})"
        )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # Lazily reads blocks of the file on demand instead of loading it whole.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        # Conventional README filename for this corpus distribution.
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        # Block reader: collect every (feature, score) pair found in up to
        # 20 lines of the stream.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        # Block reader: returns a one-element list holding the next Review,
        # or [] at end of file. A review spans from one "[t]" title line to
        # the next.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            # Remember the position before reading so we can rewind if the
            # line turns out to start the NEXT review.
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                # SENT captures the raw text after "##"; tokenize it.
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        # Block reader: flatten the next review into its list of sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        # Block reader: tokens of every "##" sentence in up to 20 lines.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: RTE Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
|
| 10 |
+
|
| 11 |
+
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
|
| 12 |
+
were regularized.
|
| 13 |
+
|
| 14 |
+
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
|
| 15 |
+
gold standard annotated files.
|
| 16 |
+
|
| 17 |
+
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
|
| 18 |
+
example is taken from RTE3::
|
| 19 |
+
|
| 20 |
+
<pair id="1" entailment="YES" task="IE" length="short" >
|
| 21 |
+
|
| 22 |
+
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
|
| 23 |
+
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
|
| 24 |
+
company Baikalfinansgroup which was later bought by the Russian
|
| 25 |
+
state-owned oil company Rosneft .</t>
|
| 26 |
+
|
| 27 |
+
<h>Baikalfinansgroup was sold to Rosneft.</h>
|
| 28 |
+
</pair>
|
| 29 |
+
|
| 30 |
+
In order to provide globally unique IDs for each pair, a new attribute
|
| 31 |
+
``challenge`` has been added to the root element ``entailment-corpus`` of each
|
| 32 |
+
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
|
| 33 |
+
challenge number and 'n' is the pair ID.
|
| 34 |
+
"""
|
| 35 |
+
from nltk.corpus.reader.api import *
|
| 36 |
+
from nltk.corpus.reader.util import *
|
| 37 |
+
from nltk.corpus.reader.xmldocs import *
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raises KeyError: if the label is not one of TRUE/FALSE/YES/NO
        (case-insensitive).
    """
    return {"TRUE": 1, "YES": 1, "FALSE": 0, "NO": 0}[value_string.upper()]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        attrib = pair.attrib
        self.challenge = challenge
        # id/text/hyp always come from the XML element itself; the first
        # child is <t> (text) and the second is <h> (hypothesis).
        self.id = attrib["id"]
        self.gid = f"{self.challenge}-{self.id}"
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 uses "value", RTE2/3 use "entailment"; fall back to the
        # explicit `value` argument when neither attribute is present.
        if "value" in attrib:
            self.value = norm(attrib["value"])
        elif "entailment" in attrib:
            self.value = norm(attrib["entailment"])
        else:
            self.value = value
        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        if not self.challenge:
            return "<RTEPair: id=%s>" % self.id
        return f"<RTEPair: gid={self.challenge}-{self.id}>"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into an RTEPair.

        This uses the ``getiterator()`` method from the ElementTree package to
        find all the ``<pair>`` elements.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # The optional "challenge" attribute on <entailment-corpus> gives the
        # RTE challenge number; absent on some files.
        challenge = doc.attrib.get("challenge")
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        fileid_list = [fileids] if isinstance(fileids, str) else fileids
        return concat([self._read_etree(self.xml(fid)) for fid in fileid_list])
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: SemCor Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the SemCor Corpus.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
__docformat__ = "epytext en"
|
| 13 |
+
|
| 14 |
+
from nltk.corpus.reader.api import *
|
| 15 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
| 16 |
+
from nltk.tree import Tree
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SemcorCorpusReader(XMLCorpusReader):
|
| 20 |
+
"""
|
| 21 |
+
Corpus reader for the SemCor Corpus.
|
| 22 |
+
For access to the complete XML data structure, use the ``xml()``
|
| 23 |
+
method. For access to simple word lists and tagged word lists, use
|
| 24 |
+
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(self, root, fileids, wordnet, lazy=True):
|
| 28 |
+
XMLCorpusReader.__init__(self, root, fileids)
|
| 29 |
+
self._lazy = lazy
|
| 30 |
+
self._wordnet = wordnet
|
| 31 |
+
|
| 32 |
+
def words(self, fileids=None):
|
| 33 |
+
"""
|
| 34 |
+
:return: the given file(s) as a list of words and punctuation symbols.
|
| 35 |
+
:rtype: list(str)
|
| 36 |
+
"""
|
| 37 |
+
return self._items(fileids, "word", False, False, False)
|
| 38 |
+
|
| 39 |
+
def chunks(self, fileids=None):
|
| 40 |
+
"""
|
| 41 |
+
:return: the given file(s) as a list of chunks,
|
| 42 |
+
each of which is a list of words and punctuation symbols
|
| 43 |
+
that form a unit.
|
| 44 |
+
:rtype: list(list(str))
|
| 45 |
+
"""
|
| 46 |
+
return self._items(fileids, "chunk", False, False, False)
|
| 47 |
+
|
| 48 |
+
def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
|
| 49 |
+
"""
|
| 50 |
+
:return: the given file(s) as a list of tagged chunks, represented
|
| 51 |
+
in tree form.
|
| 52 |
+
:rtype: list(Tree)
|
| 53 |
+
|
| 54 |
+
:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
|
| 55 |
+
to indicate the kind of tags to include. Semantic tags consist of
|
| 56 |
+
WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
|
| 57 |
+
without a specific entry in WordNet. (Named entities of type 'other'
|
| 58 |
+
have no lemma. Other chunks not in WordNet have no semantic tag.
|
| 59 |
+
Punctuation tokens have `None` for their part of speech tag.)
|
| 60 |
+
"""
|
| 61 |
+
return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
|
| 62 |
+
|
| 63 |
+
def sents(self, fileids=None):
|
| 64 |
+
"""
|
| 65 |
+
:return: the given file(s) as a list of sentences, each encoded
|
| 66 |
+
as a list of word strings.
|
| 67 |
+
:rtype: list(list(str))
|
| 68 |
+
"""
|
| 69 |
+
return self._items(fileids, "word", True, False, False)
|
| 70 |
+
|
| 71 |
+
def chunk_sents(self, fileids=None):
|
| 72 |
+
"""
|
| 73 |
+
:return: the given file(s) as a list of sentences, each encoded
|
| 74 |
+
as a list of chunks.
|
| 75 |
+
:rtype: list(list(list(str)))
|
| 76 |
+
"""
|
| 77 |
+
return self._items(fileids, "chunk", True, False, False)
|
| 78 |
+
|
| 79 |
+
def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
|
| 80 |
+
"""
|
| 81 |
+
:return: the given file(s) as a list of sentences. Each sentence
|
| 82 |
+
is represented as a list of tagged chunks (in tree form).
|
| 83 |
+
:rtype: list(list(Tree))
|
| 84 |
+
|
| 85 |
+
:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
|
| 86 |
+
to indicate the kind of tags to include. Semantic tags consist of
|
| 87 |
+
WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
|
| 88 |
+
without a specific entry in WordNet. (Named entities of type 'other'
|
| 89 |
+
have no lemma. Other chunks not in WordNet have no semantic tag.
|
| 90 |
+
Punctuation tokens have `None` for their part of speech tag.)
|
| 91 |
+
"""
|
| 92 |
+
return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
|
| 93 |
+
|
| 94 |
+
def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
|
| 95 |
+
if unit == "word" and not bracket_sent:
|
| 96 |
+
# the result of the SemcorWordView may be a multiword unit, so the
|
| 97 |
+
# LazyConcatenation will make sure the sentence is flattened
|
| 98 |
+
_ = lambda *args: LazyConcatenation(
|
| 99 |
+
(SemcorWordView if self._lazy else self._words)(*args)
|
| 100 |
+
)
|
| 101 |
+
else:
|
| 102 |
+
_ = SemcorWordView if self._lazy else self._words
|
| 103 |
+
return concat(
|
| 104 |
+
[
|
| 105 |
+
_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
|
| 106 |
+
for fileid in self.abspaths(fileids)
|
| 107 |
+
]
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        # Eager path: parse the whole XML file at once (see _items for the
        # lazy alternative, SemcorWordView).
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # _word returns a list of word strings in this mode
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                # keep the sentence number from the XML "snum" attribute
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
|
| 146 |
+
|
| 147 |
+
    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one <wf>/<punc> XML element into a token, a list of word
        strings, or a (possibly tagged) chunk, depending on ``unit``.

        :param xmlword: the XML element for a single word form.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :param wordnet: the wordnet corpus module, used to resolve sense keys.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            # first field of lexsn is the synset type (1..5 -> n,v,a,r,s)
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        # NOTE(review): 'redef' is computed but never used below.
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            # A token is either the bare string or a tuple of string + tags.
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            # Multiword expressions are joined with underscores in the XML.
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                # The chunk proper: a POS-labelled subtree, or the bare words.
                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _all_xmlwords_in(elt, result=None):
|
| 226 |
+
if result is None:
|
| 227 |
+
result = []
|
| 228 |
+
for child in elt:
|
| 229 |
+
if child.tag in ("wf", "punc"):
|
| 230 |
+
result.append(child)
|
| 231 |
+
else:
|
| 232 |
+
_all_xmlwords_in(child, result)
|
| 233 |
+
return result
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
class SemcorSentence(list):
    """A sentence, stored as a plain list of its items, that additionally
    remembers its identifier from the source XML in the ``num`` attribute."""

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    (The original docstring said "BNC corpus", apparently a copy-paste slip;
    this view is constructed only by SemcorCorpusReader in this module.)
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        # Iterate over whole sentences, or over the individual word/punct
        # elements, depending on whether sentence bracketing is requested.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on the tagspec chosen in __init__.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Delegate to the reader's static parser so that the lazy (view) and
        # eager (_words) code paths produce identical items.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # word mode yields lists of strings; flatten them
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        # Keep the sentence number from the XML "snum" attribute.
        return SemcorSentence(elt.attrib["snum"], sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Senseval 2 Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read from the Senseval 2 Corpus.
|
| 11 |
+
|
| 12 |
+
SENSEVAL [http://www.senseval.org/]
|
| 13 |
+
Evaluation exercises for Word Sense Disambiguation.
|
| 14 |
+
Organized by ACL-SIGLEX [https://www.siglex.org/]
|
| 15 |
+
|
| 16 |
+
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
|
| 17 |
+
https://www.d.umn.edu/~tpederse/data.html
|
| 18 |
+
Distributed with permission.
|
| 19 |
+
|
| 20 |
+
The NLTK version of the Senseval 2 files uses well-formed XML.
|
| 21 |
+
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
|
| 22 |
+
is tagged with a sense identifier, and supplied with context.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import re
|
| 26 |
+
from xml.etree import ElementTree
|
| 27 |
+
|
| 28 |
+
from nltk.corpus.reader.api import *
|
| 29 |
+
from nltk.corpus.reader.util import *
|
| 30 |
+
from nltk.tokenize import *
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SensevalInstance:
    """One sense-tagged occurrence of an ambiguous word.

    Attributes: ``word`` (the lexical item), ``position`` (index of the head
    word within ``context``), ``context`` (the surrounding tokens), and
    ``senses`` (a tuple of sense identifiers).
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        # normalize to a tuple regardless of the sequence type passed in
        self.senses = tuple(senses)

    def __repr__(self):
        return (
            f"SensevalInstance(word={self.word!r}, position={self.position!r}, "
            f"context={self.context!r}, senses={self.senses!r})"
        )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the NLTK version of the Senseval 2 data."""

    def instances(self, fileids=None):
        """
        :return: a corpus view of SensevalInstance objects drawn from the
            given file(s); defaults to all files.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _entry(self, tree):
        # NOTE(review): not referenced anywhere in this module's visible code;
        # looks like a leftover helper -- confirm before removing.
        elts = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                elts.append((sense, context))
        return elts
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class SensevalCorpusView(StreamBackedCorpusView):
    """Stream-backed view that yields one SensevalInstance per <instance>
    element found in the (cleaned-up) pseudo-XML data file."""

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists mapping lexelt index -> stream position / name,
        # grown lazily as read_block encounters new <lexelt> headers.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF: we must not be in the middle of an instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # re-reading a region we've already indexed: sanity-check
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # repair the pseudo-XML before handing it to ElementTree
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert one parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        # unwrap the compound and treat its first child below
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some santiy checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    # text between this element and the next is plain tokens
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML so that the result can
    be parsed by ElementTree. Each substitution below handles one known
    malformation in the source data.
    """
    # <~> or <^> => ~ or ^
    text = re.sub(r"<([~\^])>", r"\1", text)
    # fix lone &
    # NOTE(review): as written the replacement equals the matched text, so
    # this substitution is a no-op; upstream NLTK escapes the ampersand here
    # (replacement r"\1&amp;\2") -- confirm against the original source.
    text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
    # fix """
    text = re.sub(r'"""', "'\"'", text)
    # fix <s snum=dd> => <s snum="dd"/>  (quote the attribute, self-close)
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # fix foreign word tag
    text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
    # remove <&I .>
    text = re.sub(r"<\&I[^>]*>", "", text)
    # fix <{word}>
    text = re.sub(r"<{([^}]+)}>", r"\1", text)
    # remove <@>, <p>, </p>
    text = re.sub(r"<(@|/?p)>", r"", text)
    # remove <&M .> and <&T .> and <&Ms .>
    text = re.sub(r"<&\w+ \.>", r"", text)
    # remove <!DOCTYPE... > lines
    text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
    # remove <[hi]> and <[/p]> etc
    text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
    # take the entity-like thing out of the angle brackets: <...>
    text = re.sub(r"<(\&\w+;)>", r"\1", text)
    # and remove the & for those patterns that aren't regular XML
    text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
    text = re.sub(
        r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
    )
    # same fix for a quoted double-quote token
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
    return text
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: SentiWordNet
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Christopher Potts <cgpotts@stanford.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
An NLTK interface for SentiWordNet
|
| 10 |
+
|
| 11 |
+
SentiWordNet is a lexical resource for opinion mining.
|
| 12 |
+
SentiWordNet assigns to each synset of WordNet three
|
| 13 |
+
sentiment scores: positivity, negativity, and objectivity.
|
| 14 |
+
|
| 15 |
+
For details about SentiWordNet see:
|
| 16 |
+
http://sentiwordnet.isti.cnr.it/
|
| 17 |
+
|
| 18 |
+
>>> from nltk.corpus import sentiwordnet as swn
|
| 19 |
+
>>> print(swn.senti_synset('breakdown.n.03'))
|
| 20 |
+
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
| 21 |
+
>>> list(swn.senti_synsets('slow'))
|
| 22 |
+
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
|
| 23 |
+
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
|
| 24 |
+
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
|
| 25 |
+
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
|
| 26 |
+
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
|
| 27 |
+
SentiSynset('behind.r.03')]
|
| 28 |
+
>>> happy = swn.senti_synsets('happy', 'a')
|
| 29 |
+
>>> happy0 = list(happy)[0]
|
| 30 |
+
>>> happy0.pos_score()
|
| 31 |
+
0.875
|
| 32 |
+
>>> happy0.neg_score()
|
| 33 |
+
0.0
|
| 34 |
+
>>> happy0.obj_score()
|
| 35 |
+
0.125
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
import re
|
| 39 |
+
|
| 40 |
+
from nltk.corpus.reader import CorpusReader
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class SentiWordNetCorpusReader(CorpusReader):
    """Corpus reader for the SentiWordNet data file.

    Parses the single source file into ``self._db``, a dict mapping
    ``(pos, offset)`` synset keys to ``(pos_score, neg_score)`` pairs;
    the objectivity score is derived as ``1 - (pos + neg)`` by SentiSynset.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Load the source file into self._db, skipping '#' comment lines.

        :raises ValueError: if a non-comment line does not have exactly six
            tab-separated fields.
        """
        # Close the stream explicitly; the original left it to the GC.
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            fp.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # wrong number of fields -> sequence unpacking raises ValueError
                # (narrowed from the original overly-broad `except BaseException`)
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """Look up a SentiSynset.

        Accepts either a ``(pos, offset)`` pair or a single synset-name
        string such as ``'breakdown.n.03'``. Returns None if the synset is
        not in the SentiWordNet database.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == "s":
                # satellite adjectives are looked up under 'a'
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """Return an iterator of SentiSynsets for all WordNet synsets of
        *string* (optionally restricted to part of speech *pos*) that have
        SentiWordNet entries."""
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # drop the None results for synsets without SentiWordNet entries
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class SentiSynset:
    """Sentiment scores attached to a single WordNet synset.

    Holds a positivity and a negativity score; the objectivity score is
    derived as the remaining probability mass, 1 - (pos + neg).
    """

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (pos_score + neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the derived objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Sinica Treebank Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Sinica Treebank Corpus Sample
|
| 10 |
+
|
| 11 |
+
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
|
| 12 |
+
|
| 13 |
+
10,000 parsed sentences, drawn from the Academia Sinica Balanced
|
| 14 |
+
Corpus of Modern Chinese. Parse tree notation is based on
|
| 15 |
+
Information-based Case Grammar. Tagset documentation is available
|
| 16 |
+
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
|
| 17 |
+
|
| 18 |
+
Language and Knowledge Processing Group, Institute of Information
|
| 19 |
+
Science, Academia Sinica
|
| 20 |
+
|
| 21 |
+
The data is distributed with the Natural Language Toolkit under the terms of
|
| 22 |
+
the Creative Commons Attribution-NonCommercial-ShareAlike License
|
| 23 |
+
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
|
| 24 |
+
|
| 25 |
+
References:
|
| 26 |
+
|
| 27 |
+
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
|
| 28 |
+
The Construction of Sinica Treebank. Computational Linguistics and
|
| 29 |
+
Chinese Language Processing, 4, pp 87-104.
|
| 30 |
+
|
| 31 |
+
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
|
| 32 |
+
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
|
| 33 |
+
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
|
| 34 |
+
Chinese Language Processing Workshop, Association for Computational
|
| 35 |
+
Linguistics.
|
| 36 |
+
|
| 37 |
+
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
|
| 38 |
+
Extraction, Proceedings of IJCNLP-04, pp560-565.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
from nltk.corpus.reader.api import *
|
| 42 |
+
from nltk.corpus.reader.util import *
|
| 43 |
+
from nltk.tag import map_tag
|
| 44 |
+
from nltk.tree import sinica_parse
|
| 45 |
+
|
| 46 |
+
IDENTIFIER = re.compile(r"^#\S+\s")
|
| 47 |
+
APPENDIX = re.compile(r"(?<=\))#.*$")
|
| 48 |
+
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
|
| 49 |
+
WORD = re.compile(r":[^:()|]+:([^:()|]+)")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One sentence per line; strip the leading "#<id> " marker and any
        # trailing ")#..." appendix before parsing.
        sent = stream.readline()
        sent = IDENTIFIER.sub("", sent)
        sent = APPENDIX.sub("", sent)
        return [sent]

    def _parse(self, sent):
        # Convert the bracketed Sinica notation into an nltk Tree.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD captures (tag, word) pairs; flip them to (word, tag).
        tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
        # Optionally map the native tags into the requested tagset.
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
            ]
        return tagged_sent

    def _word(self, sent):
        # WORD captures only the word part of each :tag:word: unit.
        return WORD.findall(sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: String Category Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read tuples from a corpus consisting of categorized strings.
|
| 11 |
+
For example, from the question classification corpus:
|
| 12 |
+
|
| 13 |
+
NUM:dist How far is it from Denver to Aspen ?
|
| 14 |
+
LOC:city What county is Modesto , California in ?
|
| 15 |
+
HUM:desc Who was Galileo ?
|
| 16 |
+
DESC:def What is an atom ?
|
| 17 |
+
NUM:date When did Hawaii become a state ?
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from nltk.corpus.reader.api import *
|
| 21 |
+
|
| 22 |
+
# based on PPAttachmentCorpusReader
|
| 23 |
+
from nltk.corpus.reader.util import *
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# [xx] Should the order of the tuple be reversed -- in most other places
|
| 27 |
+
# in nltk, we use the form (data, tag) -- e.g., tagged words and
|
| 28 |
+
# labeled texts for classifiers.
|
| 29 |
+
class StringCategoryCorpusReader(CorpusReader):
    """Reader for corpora of categorized strings, one record per line in the
    form ``<category><delimiter><string>``."""

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return a view of (category, string) tuples from the given files
        (all files by default)."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_tuple_block(self, stream):
        line = stream.readline().strip()
        if line:
            # split at the first delimiter only, so the string part may
            # itself contain the delimiter
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Switchboard Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
from nltk.corpus.reader.api import *
|
| 10 |
+
from nltk.corpus.reader.util import *
|
| 11 |
+
from nltk.tag import map_tag, str2tuple
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id. Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        super().__init__(words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        # Elements are either plain words or (word, tag) pairs.
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            text = " ".join("{}/{}".format(*pair) for pair in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.

    The corpus consists of a single "tagged" file.  That file is used
    even for the non-tagged access methods, since its text is already
    tokenized.  Data can be retrieved as individual words, as
    utterances ("turns"), or as whole discourses (lists of turns).
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The name of the tagset used by this corpus;
            tags are mapped out of this tagset when a ``tagged_*``
            method is given a different target tagset.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """Return a corpus view of all words, untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """Return a corpus view of all ``(word, tag)`` pairs."""
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """Return a corpus view of utterances (``SwitchboardTurn``), untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """Return a corpus view of utterances whose words are tagged."""
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """Return a corpus view of discourses (lists of untagged turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """Return a corpus view of discourses (lists of tagged turns)."""
        # NOTE: the default was ``tagset=False``; changed to ``None`` for
        # consistency with the other tagged_* methods.  Both values are
        # falsy, so behavior for callers relying on the default is
        # unchanged (no tag mapping is performed).
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse's turns into one word list.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # An utterance line looks like "A.42: word/TAG word/TAG ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """
        Parse one ``speaker.id: text`` line into a ``SwitchboardTurn``.

        :param include_tag: If false, strip the POS tags from the words.
        :param tagset: If given and different from the corpus tagset,
            map each tag into this target tagset.
        :raises ValueError: If the line does not match the expected format.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Tagged Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Jacob Perkins <japerk@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
A reader for corpora whose documents contain part-of-speech-tagged words.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from nltk.corpus.reader.api import *
|
| 17 |
+
from nltk.corpus.reader.timit import read_timit_block
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.tag import map_tag, str2tuple
|
| 20 |
+
from nltk.tokenize import *
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

        word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: The string separating a word from its tag.
        :param word_tokenizer: Tokenizer that splits a sentence string
            into word/tag strings.
        :param sent_tokenizer: Tokenizer that splits a paragraph string
            into sentence strings.
        :param para_block_reader: Function reading one paragraph block
            at a time from a stream.
        :param tagset: The name of the tagset used by this corpus, used
            for tag mapping in the ``tagged_*`` methods.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def _views(self, fileids, tagged, group_by_sent, group_by_para, tagset=None):
        """
        Build one ``TaggedCorpusView`` per file and concatenate them.
        Consolidates the construction logic previously duplicated
        across all six public accessor methods.
        """
        # Map tags only for tagged views, and only when a different
        # target tagset was requested.
        if tagged and tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapping_function,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(
            fileids, tagged=False, group_by_sent=False, group_by_para=False
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(
            fileids, tagged=False, group_by_sent=True, group_by_para=False
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(
            fileids, tagged=False, group_by_sent=True, group_by_para=True
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=False, group_by_para=False,
            tagset=tagset,
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=False,
            tagset=tagset,
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=True,
            tagset=tagset,
        )
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed
        by the ``CategorizedCorpusReader`` constructor; all remaining
        arguments are forwarded to ``TaggedCorpusReader``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        # Resolve categories down to a concrete fileid list first.
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  Flags supplied at
    construction time control whether the documents are divided up by
    sentence and/or paragraph, and whether part-of-speech tags are
    included or omitted.  ``TaggedCorpusView`` objects are typically
    created by ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                # Split each token into a (word, tag) pair.
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                # Optionally map tags into a different tagset.
                mapper = self._tag_mapping_function
                if mapper:
                    sent = [(word, mapper(tag)) for (word, tag) in sent]
                # Drop the tags entirely for untagged views.
                if not self._tagged:
                    sent = [word for (word, tag) in sent]
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # One word per line, so words are split by lines and each
        # "sentence" token is a full line (".*\n").
        TaggedCorpusReader.__init__(
            self,
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # A block runs up to a line carrying the end-of-sentence tag "_.".
        return read_regexp_block(stream, r".*", r".*_\.")
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        # Force the TIMIT-specific paragraph block reader; all other
        # arguments pass straight through to TaggedCorpusReader.
        TaggedCorpusReader.__init__(
            self, *args, para_block_reader=read_timit_block, **kwargs
        )

    def paras(self):
        # Paragraph structure is not meaningful for TIMIT data.
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        raise NotImplementedError("use tagged_sents() instead")
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: TIMIT Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2007 NLTK Project
|
| 4 |
+
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Jacob Perkins <japerk@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
# [xx] this docstring is out-of-date:
|
| 11 |
+
"""
|
| 12 |
+
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
|
| 13 |
+
|
| 14 |
+
This corpus contains selected portion of the TIMIT corpus.
|
| 15 |
+
|
| 16 |
+
- 16 speakers from 8 dialect regions
|
| 17 |
+
- 1 male and 1 female from each dialect region
|
| 18 |
+
- total 130 sentences (10 sentences per speaker. Note that some
|
| 19 |
+
sentences are shared among other speakers, especially sa1 and sa2
|
| 20 |
+
are spoken by all speakers.)
|
| 21 |
+
- total 160 recording of sentences (10 recordings per speaker)
|
| 22 |
+
- audio format: NIST Sphere, single channel, 16kHz sampling,
|
| 23 |
+
16 bit sample, PCM encoding
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
Module contents
|
| 27 |
+
===============
|
| 28 |
+
|
| 29 |
+
The timit corpus reader provides 4 functions and 4 data items.
|
| 30 |
+
|
| 31 |
+
- utterances
|
| 32 |
+
|
| 33 |
+
List of utterances in the corpus. There are total 160 utterances,
|
| 34 |
+
each of which corresponds to a unique utterance of a speaker.
|
| 35 |
+
Here's an example of an utterance identifier in the list::
|
| 36 |
+
|
| 37 |
+
dr1-fvmh0/sx206
|
| 38 |
+
- _---- _---
|
| 39 |
+
| | | | |
|
| 40 |
+
| | | | |
|
| 41 |
+
| | | | `--- sentence number
|
| 42 |
+
| | | `----- sentence type (a:all, i:shared, x:exclusive)
|
| 43 |
+
| | `--------- speaker ID
|
| 44 |
+
| `------------ sex (m:male, f:female)
|
| 45 |
+
`-------------- dialect region (1..8)
|
| 46 |
+
|
| 47 |
+
- speakers
|
| 48 |
+
|
| 49 |
+
List of speaker IDs. An example of speaker ID::
|
| 50 |
+
|
| 51 |
+
dr1-fvmh0
|
| 52 |
+
|
| 53 |
+
Note that if you split an item ID with colon and take the first element of
|
| 54 |
+
the result, you will get a speaker ID.
|
| 55 |
+
|
| 56 |
+
>>> itemid = 'dr1-fvmh0/sx206'
|
| 57 |
+
>>> spkrid , sentid = itemid.split('/')
|
| 58 |
+
>>> spkrid
|
| 59 |
+
'dr1-fvmh0'
|
| 60 |
+
|
| 61 |
+
The second element of the result is a sentence ID.
|
| 62 |
+
|
| 63 |
+
- dictionary()
|
| 64 |
+
|
| 65 |
+
Phonetic dictionary of words contained in this corpus. This is a Python
|
| 66 |
+
dictionary from words to phoneme lists.
|
| 67 |
+
|
| 68 |
+
- spkrinfo()
|
| 69 |
+
|
| 70 |
+
Speaker information table. It's a Python dictionary from speaker IDs to
|
| 71 |
+
records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
|
| 72 |
+
Each record is a dictionary from field names to values, and the fields are
|
| 73 |
+
as follows::
|
| 74 |
+
|
| 75 |
+
id speaker ID as defined in the original TIMIT speaker info table
|
| 76 |
+
sex speaker gender (M:male, F:female)
|
| 77 |
+
dr speaker dialect region (1:new england, 2:northern,
|
| 78 |
+
3:north midland, 4:south midland, 5:southern, 6:new york city,
|
| 79 |
+
7:western, 8:army brat (moved around))
|
| 80 |
+
use corpus type (TRN:training, TST:test)
|
| 81 |
+
in this sample corpus only TRN is available
|
| 82 |
+
recdate recording date
|
| 83 |
+
birthdate speaker birth date
|
| 84 |
+
ht speaker height
|
| 85 |
+
race speaker race (WHT:white, BLK:black, AMR:american indian,
|
| 86 |
+
SPN:spanish-american, ORN:oriental,???:unknown)
|
| 87 |
+
edu speaker education level (HS:high school, AS:associate degree,
|
| 88 |
+
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
|
| 89 |
+
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
|
| 90 |
+
comments comments by the recorder
|
| 91 |
+
|
| 92 |
+
The 4 functions are as follows.
|
| 93 |
+
|
| 94 |
+
- tokenized(sentences=items, offset=False)
|
| 95 |
+
|
| 96 |
+
Given a list of items, returns an iterator of a list of word lists,
|
| 97 |
+
each of which corresponds to an item (sentence). If offset is set to True,
|
| 98 |
+
each element of the word list is a tuple of word(string), start offset and
|
| 99 |
+
end offset, where offset is represented as a number of 16kHz samples.
|
| 100 |
+
|
| 101 |
+
- phonetic(sentences=items, offset=False)
|
| 102 |
+
|
| 103 |
+
Given a list of items, returns an iterator of a list of phoneme lists,
|
| 104 |
+
each of which corresponds to an item (sentence). If offset is set to True,
|
| 105 |
+
each element of the phoneme list is a tuple of word(string), start offset
|
| 106 |
+
and end offset, where offset is represented as a number of 16kHz samples.
|
| 107 |
+
|
| 108 |
+
- audiodata(item, start=0, end=None)
|
| 109 |
+
|
| 110 |
+
Given an item, returns a chunk of audio samples formatted into a string.
|
| 111 |
+
When the function is called, if start and end are omitted, the entire
|
| 112 |
+
samples of the recording will be returned. If only end is omitted,
|
| 113 |
+
samples from the start offset to the end of the recording will be returned.
|
| 114 |
+
|
| 115 |
+
- play(data)
|
| 116 |
+
|
| 117 |
+
Play the given audio samples. The audio samples can be obtained from the
|
| 118 |
+
timit.audiodata function.
|
| 119 |
+
|
| 120 |
+
"""
|
| 121 |
+
import sys
|
| 122 |
+
import time
|
| 123 |
+
|
| 124 |
+
from nltk.corpus.reader.api import *
|
| 125 |
+
from nltk.internals import import_from_stdlib
|
| 126 |
+
from nltk.tree import Tree
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class TimitCorpusReader(CorpusReader):
|
| 130 |
+
"""
|
| 131 |
+
Reader for the TIMIT corpus (or any other corpus with the same
|
| 132 |
+
file layout and use of file formats). The corpus root directory
|
| 133 |
+
should contain the following files:
|
| 134 |
+
|
| 135 |
+
- timitdic.txt: dictionary of standard transcriptions
|
| 136 |
+
- spkrinfo.txt: table of speaker information
|
| 137 |
+
|
| 138 |
+
In addition, the root directory should contain one subdirectory
|
| 139 |
+
for each speaker, containing three files for each utterance:
|
| 140 |
+
|
| 141 |
+
- <utterance-id>.txt: text content of utterances
|
| 142 |
+
- <utterance-id>.wrd: tokenized text content of utterances
|
| 143 |
+
- <utterance-id>.phn: phonetic transcription of utterances
|
| 144 |
+
- <utterance-id>.wav: utterance sound file
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
_FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
|
| 148 |
+
"""A regexp matching fileids that are used by this corpus reader."""
|
| 149 |
+
_UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
|
| 150 |
+
|
| 151 |
+
    def __init__(self, root, encoding="utf8"):
        """
        Construct a new TIMIT corpus reader in the given directory.
        :param root: The root directory for this corpus.
        :param encoding: Text encoding for the corpus files; ``.wav``
            files are always read as raw bytes (see below).
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, str):
            encoding = [(r".*\.wav", None), (".*", encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        # Strip the ".txt" extension to obtain bare utterance ids.
        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]
        """A list of the utterance identifiers for all utterances in
        this corpus."""

        # Speaker info table is parsed lazily on first use (see spkrinfo()).
        self._speakerinfo = None
        self._root = root
        # Sorted list of distinct speaker ids (the part before "/").
        self.speakers = sorted({u.split("/")[0] for u in self._utterances})
|
| 173 |
+
|
| 174 |
+
def fileids(self, filetype=None):
|
| 175 |
+
"""
|
| 176 |
+
Return a list of file identifiers for the files that make up
|
| 177 |
+
this corpus.
|
| 178 |
+
|
| 179 |
+
:param filetype: If specified, then ``filetype`` indicates that
|
| 180 |
+
only the files that have the given type should be
|
| 181 |
+
returned. Accepted values are: ``txt``, ``wrd``, ``phn``,
|
| 182 |
+
``wav``, or ``metadata``,
|
| 183 |
+
"""
|
| 184 |
+
if filetype is None:
|
| 185 |
+
return CorpusReader.fileids(self)
|
| 186 |
+
elif filetype in ("txt", "wrd", "phn", "wav"):
|
| 187 |
+
return [f"{u}.{filetype}" for u in self._utterances]
|
| 188 |
+
elif filetype == "metadata":
|
| 189 |
+
return ["timitdic.txt", "spkrinfo.txt"]
|
| 190 |
+
else:
|
| 191 |
+
raise ValueError("Bad value for filetype: %r" % filetype)
|
| 192 |
+
|
| 193 |
+
def utteranceids(
|
| 194 |
+
self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
|
| 195 |
+
):
|
| 196 |
+
"""
|
| 197 |
+
:return: A list of the utterance identifiers for all
|
| 198 |
+
utterances in this corpus, or for the given speaker, dialect
|
| 199 |
+
region, gender, sentence type, or sentence number, if
|
| 200 |
+
specified.
|
| 201 |
+
"""
|
| 202 |
+
if isinstance(dialect, str):
|
| 203 |
+
dialect = [dialect]
|
| 204 |
+
if isinstance(sex, str):
|
| 205 |
+
sex = [sex]
|
| 206 |
+
if isinstance(spkrid, str):
|
| 207 |
+
spkrid = [spkrid]
|
| 208 |
+
if isinstance(sent_type, str):
|
| 209 |
+
sent_type = [sent_type]
|
| 210 |
+
if isinstance(sentid, str):
|
| 211 |
+
sentid = [sentid]
|
| 212 |
+
|
| 213 |
+
utterances = self._utterances[:]
|
| 214 |
+
if dialect is not None:
|
| 215 |
+
utterances = [u for u in utterances if u[2] in dialect]
|
| 216 |
+
if sex is not None:
|
| 217 |
+
utterances = [u for u in utterances if u[4] in sex]
|
| 218 |
+
if spkrid is not None:
|
| 219 |
+
utterances = [u for u in utterances if u[:9] in spkrid]
|
| 220 |
+
if sent_type is not None:
|
| 221 |
+
utterances = [u for u in utterances if u[11] in sent_type]
|
| 222 |
+
if sentid is not None:
|
| 223 |
+
utterances = [u for u in utterances if u[10:] in spkrid]
|
| 224 |
+
return utterances
|
| 225 |
+
|
| 226 |
+
def transcription_dict(self):
|
| 227 |
+
"""
|
| 228 |
+
:return: A dictionary giving the 'standard' transcription for
|
| 229 |
+
each word.
|
| 230 |
+
"""
|
| 231 |
+
_transcriptions = {}
|
| 232 |
+
with self.open("timitdic.txt") as fp:
|
| 233 |
+
for line in fp:
|
| 234 |
+
if not line.strip() or line[0] == ";":
|
| 235 |
+
continue
|
| 236 |
+
m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
|
| 237 |
+
if not m:
|
| 238 |
+
raise ValueError("Bad line: %r" % line)
|
| 239 |
+
_transcriptions[m.group(1)] = m.group(2).split()
|
| 240 |
+
return _transcriptions
|
| 241 |
+
|
| 242 |
+
def spkrid(self, utterance):
|
| 243 |
+
return utterance.split("/")[0]
|
| 244 |
+
|
| 245 |
+
def sentid(self, utterance):
|
| 246 |
+
return utterance.split("/")[1]
|
| 247 |
+
|
| 248 |
+
def utterance(self, spkrid, sentid):
|
| 249 |
+
return f"{spkrid}/{sentid}"
|
| 250 |
+
|
| 251 |
+
def spkrutteranceids(self, speaker):
|
| 252 |
+
"""
|
| 253 |
+
:return: A list of all utterances associated with a given
|
| 254 |
+
speaker.
|
| 255 |
+
"""
|
| 256 |
+
return [
|
| 257 |
+
utterance
|
| 258 |
+
for utterance in self._utterances
|
| 259 |
+
if utterance.startswith(speaker + "/")
|
| 260 |
+
]
|
| 261 |
+
|
| 262 |
+
    def spkrinfo(self, speaker):
        """
        :return: The ``SpeakerInfo`` record for the given speaker
            (an utterance id may also be passed; its speaker portion
            is used).
        """
        # Accept a full utterance id as well as a bare speaker id.
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        # Lazily parse spkrinfo.txt on first lookup, caching one
        # SpeakerInfo record per speaker.
        if self._speakerinfo is None:
            self._speakerinfo = {}
            with self.open("spkrinfo.txt") as fp:
                for line in fp:
                    # Skip blank lines and ";"-prefixed comment lines.
                    if not line.strip() or line[0] == ";":
                        continue
                    # At most 10 fields; the last (comments) may itself
                    # contain whitespace, hence the maxsplit of 9.
                    rec = line.strip().split(None, 9)
                    # Build the "dr<region>-<sex><id>" speaker-id key.
                    key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}"
                    self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]
|
| 280 |
+
|
| 281 |
+
def phones(self, utterances=None):
    """Return a flat list of phone labels read from the ``.phn`` files."""
    phone_list = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            phone_list.extend(ln.split()[-1] for ln in fp if ln.strip())
    return phone_list
|
| 289 |
+
|
| 290 |
+
def phone_times(self, utterances=None):
    """Return ``(phone, start, end)`` triples from the ``.phn`` files.

    offset is represented as a number of 16kHz samples!
    """
    results = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once per line instead of three times.
                fields = line.split()
                results.append((fields[2], int(fields[0]), int(fields[1])))
    return results
|
| 307 |
+
|
| 308 |
+
def words(self, utterances=None):
    """Return a flat list of words read from the ``.wrd`` files."""
    word_list = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            word_list.extend(ln.split()[-1] for ln in fp if ln.strip())
    return word_list
|
| 316 |
+
|
| 317 |
+
def word_times(self, utterances=None):
    """Return ``(word, start, end)`` triples from the ``.wrd`` files.

    Offsets are expressed as numbers of 16kHz samples.
    """
    results = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once per line instead of three times.
                fields = line.split()
                results.append((fields[2], int(fields[0]), int(fields[1])))
    return results
|
| 331 |
+
|
| 332 |
+
def sents(self, utterances=None):
    """Return one word-list per utterance, read from the ``.wrd`` files."""
    sentences = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            words = [ln.split()[-1] for ln in fp if ln.strip()]
        sentences.append(words)
    return sentences
|
| 338 |
+
|
| 339 |
+
def sent_times(self, utterances=None):
    """Return ``(sentence-text, start, end)`` triples from the ``.txt`` files.

    Offsets are expressed as numbers of 16kHz samples.
    """
    # TODO: Check this
    results = []
    for fileid in self._utterance_fileids(utterances, ".txt"):
        # Use a context manager so each stream is closed promptly;
        # the original comprehension leaked open file handles.
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once: first two fields are sample offsets,
                # the remainder is the sentence text.
                start, end, text = line.split(None, 2)
                results.append((text.strip(), int(start), int(end)))
    return results
|
| 351 |
+
|
| 352 |
+
def phone_trees(self, utterances=None):
    # Build, for each sentence, a Tree("S", ...) whose children are
    # word subtrees (Tree(word, [phones...])) plus any phones that
    # fall outside every word (e.g. leading/trailing silence).
    # Alignment is purely time-based: a phone belongs to whichever
    # word/sentence interval its end time falls inside.
    if utterances is None:
        utterances = self._utterances
    if isinstance(utterances, str):
        utterances = [utterances]

    trees = []
    for utterance in utterances:
        word_times = self.word_times(utterance)
        phone_times = self.phone_times(utterance)
        sent_times = self.sent_times(utterance)

        # The three lists are consumed front-to-front as parallel
        # time-ordered queues; pop(0) order is essential here.
        while sent_times:
            (sent, sent_start, sent_end) = sent_times.pop(0)
            trees.append(Tree("S", []))
            # Phones ending before the first word starts attach
            # directly to the sentence node (sentence-initial silence).
            while (
                word_times and phone_times and phone_times[0][2] <= word_times[0][1]
            ):
                trees[-1].append(phone_times.pop(0)[0])
            # Consume each word of this sentence, attaching the phones
            # whose end time falls within the word's span.
            while word_times and word_times[0][2] <= sent_end:
                (word, word_start, word_end) = word_times.pop(0)
                trees[-1].append(Tree(word, []))
                while phone_times and phone_times[0][2] <= word_end:
                    trees[-1][-1].append(phone_times.pop(0)[0])
            # Remaining phones inside this sentence (sentence-final silence).
            while phone_times and phone_times[0][2] <= sent_end:
                trees[-1].append(phone_times.pop(0)[0])
    return trees
|
| 379 |
+
|
| 380 |
+
# [xx] NOTE: This is currently broken -- we're assuming that the
# fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
# fileids.
def wav(self, utterance, start=0, end=None):
    # Return RIFF-encoded audio bytes for frames [start:end) of the
    # utterance.  Offsets are in frames; end=None means "to the end".
    # nltk.chunk conflicts with the stdlib module 'chunk'
    wave = import_from_stdlib("wave")

    w = wave.open(self.open(utterance + ".wav"), "rb")

    if end is None:
        end = w.getnframes()

    # Skip past frames before start, then read the frames we want
    w.readframes(start)
    frames = w.readframes(end - start)

    # Open a new temporary file -- the wave module requires
    # an actual file, and won't work w/ stringio. :(
    tf = tempfile.TemporaryFile()
    out = wave.open(tf, "w")

    # Write the parameters & data to the new file.
    out.setparams(w.getparams())
    out.writeframes(frames)
    out.close()

    # Read the data back from the file, and return it.  The
    # file will automatically be deleted when we return.
    tf.seek(0)
    return tf.read()
|
| 410 |
+
|
| 411 |
+
def audiodata(self, utterance, start=0, end=None):
    """Return raw sample bytes for the utterance, sliced by frame offsets.

    ``start`` and ``end`` are frame indices (2 bytes per frame); the
    44-byte RIFF header is skipped.  ``end=None`` reads to the end.
    """
    assert end is None or end > start
    header_size = 44
    with self.open(utterance + ".wav") as fp:
        if end is None:
            data = fp.read()
        else:
            # Read only as much as needed: header + end frames.
            data = fp.read(header_size + end * 2)
    return data[header_size + start * 2 :]
|
| 420 |
+
|
| 421 |
+
def _utterance_fileids(self, utterances, extension):
|
| 422 |
+
if utterances is None:
|
| 423 |
+
utterances = self._utterances
|
| 424 |
+
if isinstance(utterances, str):
|
| 425 |
+
utterances = [utterances]
|
| 426 |
+
return [f"{u}{extension}" for u in utterances]
|
| 427 |
+
|
| 428 |
+
def play(self, utterance, start=0, end=None):
    """
    Play the given audio sample.

    Tries ``ossaudiodev`` first, then ``pygame``; prints an error to
    stderr if neither backend is available.

    :param utterance: The utterance id of the sample to play
    :param start: Frame offset at which playback begins.
    :param end: Frame offset at which playback stops, or None for
        the end of the sample.
    """
    # Method 1: os audio dev.
    try:
        import ossaudiodev

        try:
            dsp = ossaudiodev.open("w")
            # 16-bit little-endian mono at 16kHz, matching TIMIT audio.
            dsp.setfmt(ossaudiodev.AFMT_S16_LE)
            dsp.channels(1)
            dsp.speed(16000)
            dsp.write(self.audiodata(utterance, start, end))
            dsp.close()
        except OSError as e:
            print(
                (
                    "can't acquire the audio device; please "
                    "activate your audio device."
                ),
                file=sys.stderr,
            )
            print("system error message:", str(e), file=sys.stderr)
            return
    except ImportError:
        pass

    # Method 2: pygame
    try:
        # FIXME: this won't work under python 3
        import pygame.mixer
        import StringIO

        pygame.mixer.init(16000)
        f = StringIO.StringIO(self.wav(utterance, start, end))
        pygame.mixer.Sound(f).play()
        # Busy-wait until playback completes.
        while pygame.mixer.get_busy():
            time.sleep(0.01)
        return
    except ImportError:
        pass

    # Method 3: complain. :)
    print(
        ("you must install pygame or ossaudiodev " "for audio playback."),
        file=sys.stderr,
    )
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
class SpeakerInfo:
    """Per-speaker metadata record parsed from ``spkrinfo.txt``."""

    # Attribute names, in the order they appear in spkrinfo.txt records.
    _FIELDS = (
        "id", "sex", "dr", "use", "recdate",
        "birthdate", "ht", "race", "edu", "comments",
    )

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        values = (id, sex, dr, use, recdate, birthdate, ht, race, edu, comments)
        for name, value in zip(self._FIELDS, values):
            setattr(self, name, value)

    def __repr__(self):
        args = ", ".join(f"{name}={getattr(self, name)!r}" for name in self._FIELDS)
        return "SpeakerInfo(%s)" % args
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.
    """
    line = stream.readline()
    if not line:
        return []
    _number, sentence = line.split(" ", 1)
    return [sentence]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Toolbox Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Greg Aumann <greg_aumann@sil.org>
|
| 5 |
+
# Stuart Robinson <Stuart.Robinson@mpi.nl>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
Module for reading, writing and manipulating
|
| 12 |
+
Toolbox databases and settings fileids.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from nltk.corpus.reader.api import *
|
| 16 |
+
from nltk.corpus.reader.util import *
|
| 17 |
+
from nltk.toolbox import ToolboxData
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox (Shoebox) databases and settings files."""

    def xml(self, fileids, key=None):
        """Return the XML parse of the given files, concatenated."""
        parsed = [
            ToolboxData(path, enc).parse(key=key)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(parsed)

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding="utf8",
        errors="strict",
        unicode_fields=None,
    ):
        """Return the (marker, contents) fields of the given files."""
        per_file = []
        for (fileid, enc) in self.abspaths(fileids, include_encoding=True):
            field_iter = ToolboxData(fileid, enc).fields(
                strip, unwrap, encoding, errors, unicode_fields
            )
            per_file.append(list(field_iter))
        return concat(per_file)

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """Group fields into (headword, fields) entries.

        The record marker may be overridden with a ``key`` keyword
        argument; fields preceding the first record marker are discarded.
        """
        key = kwargs.pop("key", "lx")  # 'lx' is the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                entries[-1][-1].append((marker, contents))
        return entries

    def words(self, fileids, key="lx"):
        """Return the contents of every field whose marker equals ``key``."""
        return [contents for marker, contents in self.fields(fileids) if marker == key]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def demo():
    """Placeholder demo; the toolbox reader has no standalone demo yet."""
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Run the (currently empty) demo when executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Twitter Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A reader for corpora that consist of Tweets. It is assumed that the Tweets
|
| 10 |
+
have been serialised into line-delimited JSON.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 17 |
+
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
|
| 18 |
+
from nltk.tokenize import TweetTokenizer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))

    """

    # The corpus view class used by this reader.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        # (Zip members cannot be size-checked this way and are skipped.)
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        views = [
            self.CorpusView(path, self._read_tweets, encoding=enc)
            for (path, enc, fileid) in self.abspaths(fileids, True, True)
        ]
        return concat(views)

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        texts = []
        for tweet in self.docs(fileids):
            try:
                text = tweet["text"]
            except KeyError:
                # Tweets without a 'text' field are silently skipped.
                continue
            if isinstance(text, bytes):
                text = text.decode(self.encoding)
            texts.append(text)
        return texts

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        """
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(text) for text in self.strings(fileids)]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
        Reads at most 10 Tweets per block.
        """
        tweets = []
        while len(tweets) < 10:
            line = stream.readline()
            if not line:
                break
            tweets.append(json.loads(line))
        return tweets
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UDHR corpus reader. It mostly deals with encodings.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
| 6 |
+
from nltk.corpus.reader.util import find_corpus_fileids
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class UdhrCorpusReader(PlaintextCorpusReader):
    # Plaintext reader for the UDHR corpus; the bulk of the work is
    # assigning the correct character encoding to each file.

    # (fileid regexp, encoding) pairs; used as the reader's encoding map
    # so each fileid is decoded with the first matching entry.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        ("Czech_Cesky-UTF8", "cp1250"),  # yeah
        ("Polish-Latin2", "cp1250"),
        ("Polish_Polski-Latin2", "cp1250"),
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids excluded from the corpus view entirely.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        # Everything except README and dotfiles; SKIP entries are dropped
        # before handing the fileid list to PlaintextCorpusReader.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        super().__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py
ADDED
|
@@ -0,0 +1,867 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Corpus Reader Utilities
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import bisect
|
| 10 |
+
import os
|
| 11 |
+
import pickle
|
| 12 |
+
import re
|
| 13 |
+
import tempfile
|
| 14 |
+
from functools import reduce
|
| 15 |
+
from xml.etree import ElementTree
|
| 16 |
+
|
| 17 |
+
from nltk.data import (
|
| 18 |
+
FileSystemPathPointer,
|
| 19 |
+
PathPointer,
|
| 20 |
+
SeekableUnicodeStreamReader,
|
| 21 |
+
ZipFilePathPointer,
|
| 22 |
+
)
|
| 23 |
+
from nltk.internals import slice_bounds
|
| 24 |
+
from nltk.tokenize import wordpunct_tokenize
|
| 25 |
+
from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
|
| 26 |
+
|
| 27 |
+
######################################################################
|
| 28 |
+
# { Corpus View
|
| 29 |
+
######################################################################
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class StreamBackedCorpusView(AbstractLazySequence):
|
| 33 |
+
"""
|
| 34 |
+
A 'view' of a corpus file, which acts like a sequence of tokens:
|
| 35 |
+
it can be accessed by index, iterated over, etc. However, the
|
| 36 |
+
tokens are only constructed as-needed -- the entire corpus is
|
| 37 |
+
never stored in memory at once.
|
| 38 |
+
|
| 39 |
+
The constructor to ``StreamBackedCorpusView`` takes two arguments:
|
| 40 |
+
a corpus fileid (specified as a string or as a ``PathPointer``);
|
| 41 |
+
and a block reader. A "block reader" is a function that reads
|
| 42 |
+
zero or more tokens from a stream, and returns them as a list. A
|
| 43 |
+
very simple example of a block reader is:
|
| 44 |
+
|
| 45 |
+
>>> def simple_block_reader(stream):
|
| 46 |
+
... return stream.readline().split()
|
| 47 |
+
|
| 48 |
+
This simple block reader reads a single line at a time, and
|
| 49 |
+
returns a single token (consisting of a string) for each
|
| 50 |
+
whitespace-separated substring on the line.
|
| 51 |
+
|
| 52 |
+
When deciding how to define the block reader for a given
|
| 53 |
+
corpus, careful consideration should be given to the size of
|
| 54 |
+
blocks handled by the block reader. Smaller block sizes will
|
| 55 |
+
increase the memory requirements of the corpus view's internal
|
| 56 |
+
data structures (by 2 integers per block). On the other hand,
|
| 57 |
+
larger block sizes may decrease performance for random access to
|
| 58 |
+
the corpus. (But note that larger block sizes will *not*
|
| 59 |
+
decrease performance for iteration.)
|
| 60 |
+
|
| 61 |
+
Internally, ``CorpusView`` maintains a partial mapping from token
|
| 62 |
+
index to file position, with one entry per block. When a token
|
| 63 |
+
with a given index *i* is requested, the ``CorpusView`` constructs
|
| 64 |
+
it as follows:
|
| 65 |
+
|
| 66 |
+
1. First, it searches the toknum/filepos mapping for the token
|
| 67 |
+
index closest to (but less than or equal to) *i*.
|
| 68 |
+
|
| 69 |
+
2. Then, starting at the file position corresponding to that
|
| 70 |
+
index, it reads one block at a time using the block reader
|
| 71 |
+
until it reaches the requested token.
|
| 72 |
+
|
| 73 |
+
The toknum/filepos mapping is created lazily: it is initially
|
| 74 |
+
empty, but every time a new block is read, the block's
|
| 75 |
+
initial token is added to the mapping. (Thus, the toknum/filepos
|
| 76 |
+
map has one entry per block.)
|
| 77 |
+
|
| 78 |
+
In order to increase efficiency for random access patterns that
|
| 79 |
+
have high degrees of locality, the corpus view may cache one or
|
| 80 |
+
more blocks.
|
| 81 |
+
|
| 82 |
+
:note: Each ``CorpusView`` object internally maintains an open file
|
| 83 |
+
object for its underlying corpus file. This file should be
|
| 84 |
+
automatically closed when the ``CorpusView`` is garbage collected,
|
| 85 |
+
but if you wish to close it manually, use the ``close()``
|
| 86 |
+
method. If you access a ``CorpusView``'s items after it has been
|
| 87 |
+
closed, the file object will be automatically re-opened.
|
| 88 |
+
|
| 89 |
+
:warning: If the contents of the file are modified during the
|
| 90 |
+
lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
|
| 91 |
+
is undefined.
|
| 92 |
+
|
| 93 |
+
:warning: If a unicode encoding is specified when constructing a
|
| 94 |
+
``CorpusView``, then the block reader may only call
|
| 95 |
+
``stream.seek()`` with offsets that have been returned by
|
| 96 |
+
``stream.tell()``; in particular, calling ``stream.seek()`` with
|
| 97 |
+
relative offsets, or with offsets based on string lengths, may
|
| 98 |
+
lead to incorrect behavior.
|
| 99 |
+
|
| 100 |
+
:ivar _block_reader: The function used to read
|
| 101 |
+
a single block from the underlying file stream.
|
| 102 |
+
:ivar _toknum: A list containing the token index of each block
|
| 103 |
+
that has been processed. In particular, ``_toknum[i]`` is the
|
| 104 |
+
token index of the first token in block ``i``. Together
|
| 105 |
+
with ``_filepos``, this forms a partial mapping between token
|
| 106 |
+
indices and file positions.
|
| 107 |
+
:ivar _filepos: A list containing the file position of each block
|
| 108 |
+
that has been processed. In particular, ``_toknum[i]`` is the
|
| 109 |
+
file position of the first character in block ``i``. Together
|
| 110 |
+
with ``_toknum``, this forms a partial mapping between token
|
| 111 |
+
indices and file positions.
|
| 112 |
+
:ivar _stream: The stream used to access the underlying corpus file.
|
| 113 |
+
:ivar _len: The total number of tokens in the corpus, if known;
|
| 114 |
+
or None, if the number of tokens is not yet known.
|
| 115 |
+
:ivar _eofpos: The character position of the last character in the
|
| 116 |
+
file. This is calculated when the corpus view is initialized,
|
| 117 |
+
and is used to decide when the end of file has been reached.
|
| 118 |
+
:ivar _cache: A cache of the most recently read block. It
|
| 119 |
+
is encoded as a tuple (start_toknum, end_toknum, tokens), where
|
| 120 |
+
start_toknum is the token index of the first token in the block;
|
| 121 |
+
end_toknum is the token index of the first token not in the
|
| 122 |
+
block; and tokens is a list of the tokens in the block.
|
| 123 |
+
"""
|
| 124 |
+
|
| 125 |
+
def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
    """
    Create a new corpus view, based on the file ``fileid``, and
    read with ``block_reader``.  See the class documentation
    for more information.

    :param fileid: The path to the file that is read by this
        corpus view.  ``fileid`` can either be a string or a
        ``PathPointer``.

    :param block_reader: Optional callable used to read a single
        block of tokens from the stream; if given, it replaces
        ``self.read_block`` for this instance.

    :param startpos: The file position at which the view will
        start reading.  This can be used to skip over preface
        sections.

    :param encoding: The unicode encoding that should be used to
        read the file's contents.  If no encoding is specified,
        then the file's contents will be read as a non-unicode
        string (i.e., a str).

    :raises ValueError: if the file cannot be stat'ed/accessed.
    """
    if block_reader:
        # Per-instance override of the class-level read_block method.
        self.read_block = block_reader
    # Initialize our toknum/filepos mapping.
    self._toknum = [0]
    self._filepos = [startpos]
    self._encoding = encoding
    # We don't know our length (number of tokens) yet.
    self._len = None

    self._fileid = fileid
    self._stream = None  # opened lazily by _open()

    self._current_toknum = None
    """This variable is set to the index of the next token that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current token number."""

    self._current_blocknum = None
    """This variable is set to the index of the next block that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current block number."""

    # Find the length of the file (in bytes); used by iterate_from()
    # to detect end-of-file.
    try:
        if isinstance(self._fileid, PathPointer):
            self._eofpos = self._fileid.file_size()
        else:
            self._eofpos = os.stat(self._fileid).st_size
    except Exception as exc:
        raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc

    # Maintain a cache of the most recently read block, to
    # increase efficiency of random access.
    self._cache = (-1, -1, None)
|
| 182 |
+
|
| 183 |
+
# Read-only accessor for the underlying file identifier.
fileid = property(
    lambda self: self._fileid,
    doc="""
    The fileid of the file that is accessed by this view.

    :type: str or PathPointer""",
)
|
| 190 |
+
|
| 191 |
+
def read_block(self, stream):
    """
    Read a block from the input stream.  Abstract: subclasses must
    override this, or a ``block_reader`` callable must be supplied
    to the constructor (which replaces this method per-instance).

    :return: a block of tokens from the input stream
    :rtype: list(any)
    :param stream: an input stream
    :type stream: stream
    """
    raise NotImplementedError("Abstract Method")
|
| 201 |
+
|
| 202 |
+
def _open(self):
    """
    Open the file stream associated with this corpus view.  This
    will be called if any value is read from the view while its
    file stream is closed.
    """
    if isinstance(self._fileid, PathPointer):
        self._stream = self._fileid.open(self._encoding)
    elif self._encoding:
        # Wrap the raw byte stream so tell()/seek() offsets stay
        # meaningful even though characters are decoded on the fly.
        self._stream = SeekableUnicodeStreamReader(
            open(self._fileid, "rb"), self._encoding
        )
    else:
        # No encoding: hand back raw bytes.
        self._stream = open(self._fileid, "rb")
|
| 216 |
+
|
| 217 |
+
def close(self):
    """
    Close the file stream associated with this corpus view.  This
    can be useful if you are worried about running out of file
    handles (although the stream should automatically be closed
    upon garbage collection of the corpus view).  If the corpus
    view is accessed after it is closed, it will be automatically
    re-opened.
    """
    # Detach the stream first, then close the detached handle;
    # self._stream always ends up None either way.
    stream, self._stream = self._stream, None
    if stream is not None:
        stream.close()
|
| 229 |
+
|
| 230 |
+
def __enter__(self):
    """Context-manager entry: return the view itself."""
    return self
|
| 232 |
+
|
| 233 |
+
def __exit__(self, exc_type, exc_value, exc_traceback):
    """
    Context-manager exit: close the underlying stream.

    Parameters are the standard ``__exit__`` exception triple; they
    were previously named ``type``/``value``/``traceback``, shadowing
    the ``type`` builtin (renaming is safe -- the ``with`` machinery
    passes them positionally).  Returns None (falsy), so exceptions
    raised inside the ``with`` block are never suppressed.
    """
    self.close()
|
| 235 |
+
|
| 236 |
+
def __len__(self):
    """Return the number of tokens, reading to EOF on first use."""
    if self._len is None:
        # iterate_from() sets self._len when it reaches the end
        # of the file:
        for tok in self.iterate_from(self._toknum[-1]):
            pass
    return self._len
|
| 243 |
+
|
| 244 |
+
def __getitem__(self, i):
    """
    Return the token at index ``i`` (or a ``LazySubsequence`` for a
    slice), serving from the most-recently-read block cache when
    possible.

    :raises IndexError: if ``i`` is out of range.
    """
    if isinstance(i, slice):
        start, stop = slice_bounds(self, i)
        # Check if it's in the cache.
        offset = self._cache[0]
        if offset <= start and stop <= self._cache[1]:
            return self._cache[2][start - offset : stop - offset]
        # Construct & return the result lazily.
        return LazySubsequence(self, start, stop)
    else:
        # Handle negative indices (may force a full length scan).
        if i < 0:
            i += len(self)
        if i < 0:
            raise IndexError("index out of range")
        # Check if it's in the cache.
        offset = self._cache[0]
        if offset <= i < self._cache[1]:
            return self._cache[2][i - offset]
        # Use iterate_from to extract it.
        try:
            return next(self.iterate_from(i))
        except StopIteration as e:
            raise IndexError("index out of range") from e
|
| 268 |
+
|
| 269 |
+
# If we wanted to be thread-safe, then this method would need to
|
| 270 |
+
# do some locking.
|
| 271 |
+
def iterate_from(self, start_tok):
    """
    Generate tokens starting at token index ``start_tok``, reading
    blocks on demand and extending the token-index/file-position
    mapping as new blocks are seen.  Closes the stream at EOF.
    """
    # Start by feeding from the cache, if possible.
    if self._cache[0] <= start_tok < self._cache[1]:
        for tok in self._cache[2][start_tok - self._cache[0] :]:
            yield tok
            # Advance so the file-reading phase below resumes after
            # the cached tokens we already produced.
            start_tok += 1

    # Decide where in the file we should start.  If `start` is in
    # our mapping, then we can jump straight to the correct block;
    # otherwise, start at the last block we've processed.
    if start_tok < self._toknum[-1]:
        block_index = bisect.bisect_right(self._toknum, start_tok) - 1
        toknum = self._toknum[block_index]
        filepos = self._filepos[block_index]
    else:
        block_index = len(self._toknum) - 1
        toknum = self._toknum[-1]
        filepos = self._filepos[-1]

    # Open the stream, if it's not open already.
    if self._stream is None:
        self._open()

    # If the file is empty, the while loop will never run.
    # This *seems* to be all the state we need to set:
    if self._eofpos == 0:
        self._len = 0

    # Each iteration through this loop, we read a single block
    # from the stream.
    while filepos < self._eofpos:
        # Read the next block.
        self._stream.seek(filepos)
        self._current_toknum = toknum
        self._current_blocknum = block_index
        tokens = self.read_block(self._stream)
        assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
            "block reader %s() should return list or tuple."
            % self.read_block.__name__
        )
        num_toks = len(tokens)
        new_filepos = self._stream.tell()
        assert (
            new_filepos > filepos
        ), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
            self.read_block.__name__,
            filepos,
        )

        # Update our cache.
        self._cache = (toknum, toknum + num_toks, list(tokens))

        # Update our mapping.
        assert toknum <= self._toknum[-1]
        if num_toks > 0:
            block_index += 1
            if toknum == self._toknum[-1]:
                # First time we've seen this block: extend the map.
                assert new_filepos > self._filepos[-1]  # monotonic!
                self._filepos.append(new_filepos)
                self._toknum.append(toknum + num_toks)
            else:
                # Re-reading a known block: check for consistency.
                assert (
                    new_filepos == self._filepos[block_index]
                ), "inconsistent block reader (num chars read)"
                assert (
                    toknum + num_toks == self._toknum[block_index]
                ), "inconsistent block reader (num tokens returned)"

        # If we reached the end of the file, then update self._len
        if new_filepos == self._eofpos:
            self._len = toknum + num_toks
        # Generate the tokens in this block (but skip any tokens
        # before start_tok).  Note that between yields, our state
        # may be modified.
        for tok in tokens[max(0, start_tok - toknum) :]:
            yield tok
        # If we're at the end of the file, then we're done.
        assert new_filepos <= self._eofpos
        if new_filepos == self._eofpos:
            break
        # Update our indices
        toknum += num_toks
        filepos = new_filepos

    # If we reach this point, then we should know our length.
    assert self._len is not None
    # Enforce closing of stream once we reached end of file
    # We should have reached EOF once we're out of the while loop.
    self.close()
|
| 361 |
+
|
| 362 |
+
# Use concat for these, so we can use a ConcatenatedCorpusView
|
| 363 |
+
# when possible.
|
| 364 |
+
def __add__(self, other):
    """Return ``concat([self, other])`` (lazy concatenation)."""
    return concat([self, other])
|
| 366 |
+
|
| 367 |
+
def __radd__(self, other):
    """Return ``concat([other, self])`` (lazy concatenation)."""
    return concat([other, self])
|
| 369 |
+
|
| 370 |
+
def __mul__(self, count):
    """Return this view lazily repeated ``count`` times."""
    return concat([self] * count)
|
| 372 |
+
|
| 373 |
+
def __rmul__(self, count):
    """Return this view lazily repeated ``count`` times."""
    return concat([self] * count)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        """Return the total token count, iterating to EOF on first use."""
        if len(self._offsets) <= len(self._pieces):
            # Offsets table is incomplete: iterate to the end of the
            # corpus, which fills it in as a side effect.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        """Close every underlying subview's stream."""
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        """Generate tokens from global index ``start_tok`` onward,
        walking the subviews in order and keeping at most one open."""
        # Locate the subview that contains start_tok.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table (we now know this piece's length).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.

    :raises ValueError: if ``docs`` is empty, or if no concatenation
        strategy is known for the document types.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    doc_classes = {doc.__class__ for doc in docs}

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    if all(
        issubclass(cls, (StreamBackedCorpusView, ConcatenatedCorpusView))
        for cls in doc_classes
    ):
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation.
    if all(issubclass(cls, AbstractLazySequence) for cls in doc_classes):
        return LazyConcatenation(docs)

    # Otherwise, see what we can do with a single homogeneous type:
    if len(doc_classes) == 1:
        (doc_class,) = doc_classes

        if issubclass(doc_class, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(doc_class, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        # NOTE(review): ``iselement`` expects an element *instance*,
        # but ``doc_class`` is a class object -- this branch may be
        # unreachable with stdlib ElementTree; confirm before relying
        # on it.
        if ElementTree.iselement(doc_class):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % doc_classes)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
######################################################################
|
| 487 |
+
# { Corpus View for Pickled Sequences
|
| 488 |
+
######################################################################
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

    >>> from nltk.corpus.reader.util import PickleCorpusView
    >>> from nltk.util import LazyMap
    >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
    >>> PickleCorpusView.write(feature_corpus, some_fileid)  # doctest: +SKIP
    >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
    """

    # Number of pickled objects returned per call to read_block().
    BLOCK_SIZE = 100
    # Pickle protocol; -1 selects the highest protocol available.
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        """Unpickle and return up to ``BLOCK_SIZE`` objects from ``stream``."""
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        ``PickledCorpusView`` is garbage-collected.)
        """
        # Use a default so a partially-constructed instance (e.g. if
        # __init__ raised before setting the flag) cannot trigger an
        # AttributeError from inside the finalizer.
        if getattr(self, "_delete_on_gc", False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except OSError:
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        """
        Pickle every item of ``sequence`` to ``output_file``.

        :param output_file: either an open binary file object, or a
            filename.  A file opened here from a filename is closed
            before returning (previously it was leaked); a file object
            supplied by the caller is left open for the caller to manage.
        """
        opened_here = False
        if isinstance(output_file, str):
            output_file = open(output_file, "wb")
            opened_here = True
        try:
            for item in sequence:
                pickle.dump(item, output_file, cls.PROTOCOL)
        finally:
            if opened_here:
                output_file.close()

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        :raises ValueError: if the temporary file cannot be created.
        """
        try:
            fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
            output_file = os.fdopen(fd, "wb")
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except OSError as e:
            raise ValueError("Error while creating temp file: %s" % e) from e
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
######################################################################
|
| 574 |
+
# { Block Readers
|
| 575 |
+
######################################################################
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-separated tokens as a flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens += stream.readline().split()
    return tokens
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    ``wordpunct_tokenize`` tokens as a flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens += wordpunct_tokenize(stream.readline())
    return tokens
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
def read_line_block(stream):
    """Read up to 20 lines from ``stream``, returning them with their
    trailing newlines stripped; stops early at end of file."""
    lines = []
    count = 0
    while count < 20:
        line = stream.readline()
        if not line:  # end of file
            break
        lines.append(line.rstrip("\n"))
        count += 1
    return lines
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream``.
    Returns ``[paragraph]`` (newlines preserved), or ``[]`` at EOF."""
    paragraph = ""
    while True:
        line = stream.readline()
        if not line:
            # End of file: emit what we collected, if anything.
            return [paragraph] if paragraph else []
        if not line.strip():
            # Blank line: terminates a non-empty paragraph; leading
            # blank lines are simply skipped.
            if paragraph:
                return [paragraph]
        else:
            # Content line: accumulate.
            paragraph += line
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
def read_alignedsent_block(stream):
    """
    Read one aligned-sentence record from ``stream``: content lines are
    accumulated until a line starting with an alignment pattern
    (``\\d+-\\d+``) ends the record.  Comment lines (starting with
    ``=``) and blank lines are skipped.  Returns ``[record]``, or
    ``[]`` at end of file.

    Bug fix: the end-of-file check now runs *before* ``line[0]`` is
    inspected; previously an empty readline() result at EOF raised
    ``IndexError`` instead of returning.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file (must be checked first: line may be ""):
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator / blank lines are skipped:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        # An alignment line terminates the record.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Skip ahead until a line matches the start pattern.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Collect lines until the token ends (next start match, end match,
    # or EOF).
    collected = [line]
    while True:
        prev_pos = stream.tell()
        line = stream.readline()
        if not line:
            # End of file terminates the token.
            return ["".join(collected)]
        if end_re is not None and re.match(end_re, line):
            # Explicit end marker terminates the token (marker consumed).
            return ["".join(collected)]
        if end_re is None and re.match(start_re, line):
            # A new token starts here: rewind so the next call sees it.
            stream.seek(prev_pos)
            return ["".join(collected)]
        # Anything else is part of the token.
        collected.append(line)
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start + offset)
            else:
                # For unicode streams, seek by the *encoded* byte length
                # of the consumed prefix, not by character count.
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                # The parser ran off the end of the block: read more
                # data and retry.
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
|
| 743 |
+
|
| 744 |
+
|
| 745 |
+
def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return len(m.group()) * " "
|
| 749 |
+
|
| 750 |
+
|
| 751 |
+
def _parse_sexpr_block(block):
    """
    Partition ``block`` into complete s-expression strings.

    :return: a tuple ``(tokens, end)``, where ``tokens`` is the list of
        s-expression strings found, and ``end`` is the offset just past
        the last complete s-expression.
    :raises ValueError: "Block too small" when ``block`` ends in the
        middle of its *first* s-expression (callers catch this, read
        more data, and retry).
    """
    tokens = []
    start = end = 0

    while end < len(block):
        # Find the start of the next token (first non-whitespace char).
        m = re.compile(r"\S").search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != "(":
            # Token runs until the next whitespace or open paren.
            m2 = re.compile(r"[\s(]").search(block, start)
            if m2:
                end = m2.start()
            else:
                # Token may continue past the end of this block.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        # Case 2: parenthesized sexpr.
        else:
            # Scan parens, tracking nesting depth until it closes.
            nesting = 0
            for m in re.compile(r"[()]").finditer(block, start):
                if m.group() == "(":
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                # Parens never balanced: sexpr continues past the block.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        tokens.append(block[start:end])

    return tokens, end
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
######################################################################
|
| 794 |
+
# { Finding Corpus Items
|
| 795 |
+
######################################################################
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
def find_corpus_fileids(root, regexp):
    """
    Return a sorted list of the fileids (relative to ``root``) whose
    full relative paths match ``regexp`` (anchored at the end).
    ``root`` must be a ``ZipFilePathPointer`` or a
    ``FileSystemPathPointer``.
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    # Anchor the pattern so it must match the entire relative path.
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        for dirname, subdirs, fileids in os.walk(root.path):
            # Relative-path prefix ("sub/dir/") for files in dirname.
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
def _path_from(parent, child):
    """Return the list of path components leading from directory
    ``parent`` down to directory ``child``."""
    # Drop a trailing separator from parent, if present.
    head, tail = os.path.split(parent)
    if tail == "":
        parent = head
    components = []
    while child != parent:
        child, component = os.path.split(child)
        components.append(component)
        # Guard against walking past the filesystem root forever.
        assert os.path.split(child)[0] != child
    components.reverse()
    return components
|
| 843 |
+
|
| 844 |
+
|
| 845 |
+
######################################################################
|
| 846 |
+
# { Paragraph structure in Treebank files
|
| 847 |
+
######################################################################
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
def tagged_treebank_para_block_reader(stream):
    """
    Read the next paragraph from a tagged-Treebank-format stream.
    Paragraphs are delimited by separator lines of ``=`` characters.
    Returns ``[paragraph]``, or ``[]`` at end of file.
    """
    paragraph = ""
    while True:
        line = stream.readline()
        if re.match(r"======+\s*$", line):
            # Separator line ends a (non-blank) paragraph.
            if paragraph.strip():
                return [paragraph]
        elif line == "":
            # End of file: emit whatever was collected.
            return [paragraph] if paragraph.strip() else []
        else:
            # Content line: accumulate.
            paragraph += line
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Verbnet Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
An NLTK interface to the VerbNet verb lexicon
|
| 10 |
+
|
| 11 |
+
For details about VerbNet see:
|
| 12 |
+
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import re
|
| 16 |
+
import textwrap
|
| 17 |
+
from collections import defaultdict
|
| 18 |
+
|
| 19 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class VerbnetCorpusReader(XMLCorpusReader):
    """
    An NLTK interface to the VerbNet verb lexicon.

    From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
    on-line verb lexicon currently available for English. It is a hierarchical
    domain-independent, broad-coverage verb lexicon with mappings to other
    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
    (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."

    For details about VerbNet see:
    https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
    """

    # No unicode encoding param, since the data files are all XML.
    def __init__(self, root, fileids, wrap_etree=False):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

        self._lemma_to_class = defaultdict(list)
        """A dictionary mapping from verb lemma strings to lists of
        VerbNet class identifiers."""

        self._wordnet_to_class = defaultdict(list)
        """A dictionary mapping from wordnet identifier strings to
        lists of VerbNet class identifiers."""

        self._class_to_fileid = {}
        """A dictionary mapping from class identifiers to
        corresponding file identifiers.  The keys of this dictionary
        provide a complete list of all classes and subclasses."""

        self._shortid_to_longid = {}

        # Initialize the dictionaries.  Use the quick (regexp-based)
        # method instead of the slow (xml-based) method, because it
        # runs 2-30 times faster.
        self._quick_index()

    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
    """Regular expression that matches (and decomposes) longids"""

    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
    """Regular expression that matches shortids"""

    _INDEX_RE = re.compile(
        r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
    )
    """Regular expression used by ``_index()`` to quickly scan the corpus
    for basic information."""

    def lemmas(self, vnclass=None):
        """
        Return a list of all verb lemmas that appear in any class, or
        in the ``classid`` if specified.
        """
        if vnclass is None:
            return sorted(self._lemma_to_class.keys())
        else:
            # [xx] should this include subclass members?
            if isinstance(vnclass, str):
                vnclass = self.vnclass(vnclass)
            return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]

    def wordnetids(self, vnclass=None):
        """
        Return a list of all wordnet identifiers that appear in any
        class, or in ``classid`` if specified.
        """
        if vnclass is None:
            return sorted(self._wordnet_to_class.keys())
        else:
            # [xx] should this include subclass members?
            if isinstance(vnclass, str):
                vnclass = self.vnclass(vnclass)
            # Each MEMBER's "wn" attribute holds zero or more
            # space-separated wordnet sense keys; flatten them all.
            return sum(
                (
                    member.get("wn", "").split()
                    for member in vnclass.findall("MEMBERS/MEMBER")
                ),
                [],
            )

    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
        """
        Return a list of the VerbNet class identifiers.  If a file
        identifier is specified, then return only the VerbNet class
        identifiers for classes (and subclasses) defined by that file.
        If a lemma is specified, then return only VerbNet class
        identifiers for classes that contain that lemma as a member.
        If a wordnetid is specified, then return only identifiers for
        classes that contain that wordnetid as a member.  If a classid
        is specified, then return only identifiers for subclasses of
        the specified VerbNet class.
        If nothing is specified, return all classids within VerbNet
        """
        if fileid is not None:
            return [c for (c, f) in self._class_to_fileid.items() if f == fileid]
        elif lemma is not None:
            return self._lemma_to_class[lemma]
        elif wordnetid is not None:
            return self._wordnet_to_class[wordnetid]
        elif classid is not None:
            # Only direct subclasses are returned, not the full
            # transitive closure of sub-subclasses.
            xmltree = self.vnclass(classid)
            return [
                subclass.get("ID")
                for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
            ]
        else:
            return sorted(self._class_to_fileid.keys())

    def vnclass(self, fileid_or_classid):
        """Returns VerbNet class ElementTree

        Return an ElementTree containing the xml for the specified
        VerbNet class.

        :param fileid_or_classid: An identifier specifying which class
            should be returned.  Can be a file identifier (such as
            ``'put-9.1.xml'``), or a VerbNet class identifier (such as
            ``'put-9.1'``) or a short VerbNet class identifier (such as
            ``'9.1'``).
        :raises ValueError: If the identifier is not a known fileid,
            longid, or shortid.
        """
        # File identifier: just return the xml.
        if fileid_or_classid in self._fileids:
            return self.xml(fileid_or_classid)

        # Class identifier: get the xml, and find the right elt.
        classid = self.longid(fileid_or_classid)
        if classid in self._class_to_fileid:
            # ``classid`` is already a longid, so look it up directly.
            fileid = self._class_to_fileid[classid]
            tree = self.xml(fileid)
            if classid == tree.get("ID"):
                return tree
            else:
                for subclass in tree.findall(".//VNSUBCLASS"):
                    if classid == subclass.get("ID"):
                        return subclass
                else:
                    assert False  # we saw it during _index()!

        else:
            raise ValueError(f"Unknown identifier {fileid_or_classid}")

    def fileids(self, vnclass_ids=None):
        """
        Return a list of fileids that make up this corpus.  If
        ``vnclass_ids`` is specified, then return the fileids that make
        up the specified VerbNet class(es).
        """
        if vnclass_ids is None:
            return self._fileids
        elif isinstance(vnclass_ids, str):
            return [self._class_to_fileid[self.longid(vnclass_ids)]]
        else:
            return [
                self._class_to_fileid[self.longid(vnclass_id)]
                for vnclass_id in vnclass_ids
            ]

    def frames(self, vnclass):
        """Given a VerbNet class, this method returns VerbNet frames

        The members returned are:
        1) Example
        2) Description
        3) Syntax
        4) Semantics

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: frames - a list of frame dictionaries
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)
        frames = []
        vnframes = vnclass.findall("FRAMES/FRAME")
        for vnframe in vnframes:
            frames.append(
                {
                    "example": self._get_example_within_frame(vnframe),
                    "description": self._get_description_within_frame(vnframe),
                    "syntax": self._get_syntactic_list_within_frame(vnframe),
                    "semantics": self._get_semantics_within_frame(vnframe),
                }
            )
        return frames

    def subclasses(self, vnclass):
        """Returns subclass ids, if any exist

        Given a VerbNet class, this method returns subclass ids (if they exist)
        in a list of strings.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: list of subclasses
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        subclasses = [
            subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
        ]
        return subclasses

    def themroles(self, vnclass):
        """Returns thematic roles participating in a VerbNet class

        Members returned as part of roles are-
        1) Type
        2) Modifiers

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: themroles: A list of thematic roles in the VerbNet class
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        themroles = []
        for trole in vnclass.findall("THEMROLES/THEMROLE"):
            themroles.append(
                {
                    "type": trole.get("type"),
                    "modifiers": [
                        {"value": restr.get("Value"), "type": restr.get("type")}
                        for restr in trole.findall("SELRESTRS/SELRESTR")
                    ],
                }
            )
        return themroles

    ######################################################################
    # { Index Initialization
    ######################################################################

    def _index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This is fast if ElementTree
        uses the C implementation (<0.1 secs), but quite slow (>10 secs)
        if only the python implementation is available.
        """
        for fileid in self._fileids:
            self._index_helper(self.xml(fileid), fileid)

    def _index_helper(self, xmltree, fileid):
        """Helper for ``_index()``: index one class (recursively
        including its subclasses) against the file that defines it."""
        vnclass = xmltree.get("ID")
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        for member in xmltree.findall("MEMBERS/MEMBER"):
            self._lemma_to_class[member.get("name")].append(vnclass)
            for wn in member.get("wn", "").split():
                self._wordnet_to_class[wn].append(vnclass)
        for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
            self._index_helper(subclass, fileid)

    def _quick_index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This doesn't do proper xml parsing,
        but is good enough to find everything in the standard VerbNet
        corpus -- and it runs about 30 times faster than xml parsing
        (with the python ElementTree; only 2-3 times faster
        if ElementTree uses the C implementation).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._fileids:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass
            with self.open(fileid) as fp:
                for m in self._INDEX_RE.finditer(fp.read()):
                    groups = m.groups()
                    if groups[0] is not None:
                        # A <MEMBER> element: record lemma and wn keys
                        # against the class currently being scanned.
                        self._lemma_to_class[groups[0]].append(vnclass)
                        for wn in groups[1].split():
                            self._wordnet_to_class[wn].append(vnclass)
                    elif groups[2] is not None:
                        self._class_to_fileid[groups[2]] = fileid
                        vnclass = groups[2]  # for <MEMBER> elts.
                        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                    else:
                        assert False, "unexpected match condition"

    ######################################################################
    # { Identifier conversion
    ######################################################################

    def longid(self, shortid):
        """Returns longid of a VerbNet class

        Given a short VerbNet class identifier (eg '37.10'), map it
        to a long id (eg 'confess-37.10').  If ``shortid`` is already a
        long id, then return it as-is"""
        if self._LONGID_RE.match(shortid):
            return shortid  # it's already a longid.
        elif not self._SHORTID_RE.match(shortid):
            raise ValueError("vnclass identifier %r not found" % shortid)
        try:
            return self._shortid_to_longid[shortid]
        except KeyError as e:
            raise ValueError("vnclass identifier %r not found" % shortid) from e

    def shortid(self, longid):
        """Returns shortid of a VerbNet class

        Given a long VerbNet class identifier (eg 'confess-37.10'),
        map it to a short id (eg '37.10').  If ``longid`` is already a
        short id, then return it as-is."""
        if self._SHORTID_RE.match(longid):
            return longid  # it's already a shortid.
        m = self._LONGID_RE.match(longid)
        if m:
            return m.group(2)
        else:
            raise ValueError("vnclass identifier %r not found" % longid)

    ######################################################################
    # { Frame access utility functions
    ######################################################################

    def _get_semantics_within_frame(self, vnframe):
        """Returns semantics within a single frame

        A utility function to retrieve semantics within a frame in VerbNet
        Members of the semantics dictionary:
        1) Predicate value
        2) Arguments

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: semantics: semantics dictionary
        """
        semantics_within_single_frame = []
        for pred in vnframe.findall("SEMANTICS/PRED"):
            arguments = [
                {"type": arg.get("type"), "value": arg.get("value")}
                for arg in pred.findall("ARGS/ARG")
            ]
            semantics_within_single_frame.append(
                {
                    "predicate_value": pred.get("value"),
                    "arguments": arguments,
                    # A bool="!" attribute marks a negated predicate.
                    "negated": pred.get("bool") == "!",
                }
            )
        return semantics_within_single_frame

    def _get_example_within_frame(self, vnframe):
        """Returns example within a frame

        A utility function to retrieve an example within a frame in VerbNet.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: example_text: The example sentence for this particular frame,
            or the empty string if the frame has no example.
        """
        example_element = vnframe.find("EXAMPLES/EXAMPLE")
        if example_element is not None:
            # An empty <EXAMPLE/> element has text=None; normalize to "".
            example_text = example_element.text or ""
        else:
            example_text = ""
        return example_text

    def _get_description_within_frame(self, vnframe):
        """Returns member description within frame

        A utility function to retrieve a description of participating members
        within a frame in VerbNet.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: description: a description dictionary with members - primary and secondary
        """
        description_element = vnframe.find("DESCRIPTION")
        return {
            "primary": description_element.attrib["primary"],
            "secondary": description_element.get("secondary", ""),
        }

    def _get_syntactic_list_within_frame(self, vnframe):
        """Returns semantics within a frame

        A utility function to retrieve semantics within a frame in VerbNet.
        Members of the syntactic dictionary:
        1) POS Tag
        2) Modifiers

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: syntax_within_single_frame
        """
        syntax_within_single_frame = []
        for elt in vnframe.find("SYNTAX"):
            pos_tag = elt.tag
            modifiers = dict()
            modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
            modifiers["selrestrs"] = [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SELRESTRS/SELRESTR")
            ]
            modifiers["synrestrs"] = [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SYNRESTRS/SYNRESTR")
            ]
            syntax_within_single_frame.append(
                {"pos_tag": pos_tag, "modifiers": modifiers}
            )
        return syntax_within_single_frame

    ######################################################################
    # { Pretty Printing
    ######################################################################

    def pprint(self, vnclass):
        """Returns pretty printed version of a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        s = vnclass.get("ID") + "\n"
        s += self.pprint_subclasses(vnclass, indent="  ") + "\n"
        s += self.pprint_members(vnclass, indent="  ") + "\n"
        s += "  Thematic roles:\n"
        s += self.pprint_themroles(vnclass, indent="    ") + "\n"
        s += "  Frames:\n"
        s += self.pprint_frames(vnclass, indent="    ")
        return s

    def pprint_subclasses(self, vnclass, indent=""):
        """Returns pretty printed version of subclasses of VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's subclasses.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        subclasses = self.subclasses(vnclass)
        if not subclasses:
            subclasses = ["(none)"]
        s = "Subclasses: " + " ".join(subclasses)
        return textwrap.fill(
            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
        )

    def pprint_members(self, vnclass, indent=""):
        """Returns pretty printed version of members in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's member verbs.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        members = self.lemmas(vnclass)
        if not members:
            members = ["(none)"]
        s = "Members: " + " ".join(members)
        return textwrap.fill(
            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
        )

    def pprint_themroles(self, vnclass, indent=""):
        """Returns pretty printed version of thematic roles in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's thematic roles.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        pieces = []
        for themrole in self.themroles(vnclass):
            piece = indent + "* " + themrole.get("type")
            modifiers = [
                modifier["value"] + modifier["type"]
                for modifier in themrole["modifiers"]
            ]
            if modifiers:
                piece += "[{}]".format(" ".join(modifiers))
            pieces.append(piece)
        return "\n".join(pieces)

    def pprint_frames(self, vnclass, indent=""):
        """Returns pretty version of all frames in a VerbNet class

        Return a string containing a pretty-printed representation of
        the list of frames within the VerbNet class.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)
        pieces = []
        for vnframe in self.frames(vnclass):
            pieces.append(self._pprint_single_frame(vnframe, indent))
        return "\n".join(pieces)

    def _pprint_single_frame(self, vnframe, indent=""):
        """Returns pretty printed version of a single frame in a VerbNet class

        Returns a string containing a pretty-printed representation of
        the given frame.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
        frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
        frame_string += (
            self._pprint_syntax_within_frame(vnframe, indent + "  Syntax: ") + "\n"
        )
        frame_string += indent + "  Semantics:\n"
        frame_string += self._pprint_semantics_within_frame(vnframe, indent + "    ")
        return frame_string

    def _pprint_example_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of example within frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame example, or the empty string if the
        frame has no example.

        :param vnframe: An ElementTree containing the xml contents of
            a Verbnet frame.
        """
        if vnframe["example"]:
            return indent + " Example: " + vnframe["example"]
        # Previously this method fell off the end and returned None,
        # which made _pprint_single_frame raise TypeError on
        # ``... + "\n"`` for frames without an example.
        return ""

    def _pprint_description_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of a VerbNet frame description

        Return a string containing a pretty-printed representation of
        the given VerbNet frame description.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        description = indent + vnframe["description"]["primary"]
        if vnframe["description"]["secondary"]:
            description += " ({})".format(vnframe["description"]["secondary"])
        return description

    def _pprint_syntax_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of syntax within a frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame syntax.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        pieces = []
        for element in vnframe["syntax"]:
            piece = element["pos_tag"]
            modifier_list = []
            if "value" in element["modifiers"] and element["modifiers"]["value"]:
                modifier_list.append(element["modifiers"]["value"])
            modifier_list += [
                "{}{}".format(restr["value"], restr["type"])
                for restr in (
                    element["modifiers"]["selrestrs"]
                    + element["modifiers"]["synrestrs"]
                )
            ]
            if modifier_list:
                piece += "[{}]".format(" ".join(modifier_list))
            pieces.append(piece)

        return indent + " ".join(pieces)

    def _pprint_semantics_within_frame(self, vnframe, indent=""):
        """Returns a pretty printed version of semantics within frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame semantics.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        pieces = []
        for predicate in vnframe["semantics"]:
            arguments = [argument["value"] for argument in predicate["arguments"]]
            pieces.append(
                f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
            )
        return "\n".join(f"{indent}* {piece}" for piece in pieces)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Word List Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
from nltk.corpus.reader.api import *
|
| 9 |
+
from nltk.corpus.reader.util import *
|
| 10 |
+
from nltk.tokenize import line_tokenize
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """Return the word list, skipping any line that begins with
        ``ignore_lines_startswith`` (blank lines by default)."""
        all_lines = line_tokenize(self.raw(fileids))
        return [ln for ln in all_lines if not ln.startswith(ignore_lines_startswith)]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SwadeshCorpusReader(WordListCorpusReader):
    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        selected = fileids if fileids else self.fileids()
        per_file = (self.words(f) for f in selected)
        # Align the word lists position-by-position across languages.
        return list(zip(*per_file))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    """

    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also, add the lang IDs as the keys.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list words for the specified language(s).
        """
        # If *lang* in list of languages available, allocate apt fileid.
        # Otherwise, the function returns non-breaking prefixes for
        # all languages when fileids==None.
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        prefixes = []
        for line in line_tokenize(self.raw(fileids)):
            # Lines starting with the marker (comments by default) are skipped.
            if not line.startswith(ignore_lines_startswith):
                prefixes.append(line)
        return prefixes
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class UnicharsCorpusReader(WordListCorpusReader):
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # These are categories similar to the Perl Unicode Properties
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        This module returns a list of characters from the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters given the specific unicode character category
        """
        # A recognized category maps to its single data file; otherwise
        # fall through with whatever fileids the caller supplied.
        if category in self.available_categories:
            fileids = [category + ".txt"]
        raw_text = self.raw(fileids).strip()
        return list(raw_text)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class MWAPPDBCorpusReader(WordListCorpusReader):
|
| 145 |
+
"""
|
| 146 |
+
This class is used to read the list of word pairs from the subset of lexical
|
| 147 |
+
pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
|
| 148 |
+
Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):
|
| 149 |
+
|
| 150 |
+
- http://acl2014.org/acl2014/Q14/pdf/Q14-1017
|
| 151 |
+
- https://www.aclweb.org/anthology/S14-2039
|
| 152 |
+
- https://www.aclweb.org/anthology/S15-2027
|
| 153 |
+
|
| 154 |
+
The original source of the full PPDB corpus can be found on
|
| 155 |
+
https://www.cis.upenn.edu/~ccb/ppdb/
|
| 156 |
+
|
| 157 |
+
:return: a list of tuples of similar lexical terms.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
|
| 161 |
+
|
| 162 |
+
def entries(self, fileids=mwa_ppdb_xxxl_file):
|
| 163 |
+
"""
|
| 164 |
+
:return: a tuple of synonym word pairs.
|
| 165 |
+
"""
|
| 166 |
+
return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py
ADDED
|
@@ -0,0 +1,2489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: WordNet
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Edward Loper <edloper@gmail.com>
|
| 7 |
+
# Nitin Madnani <nmadnani@ets.org>
|
| 8 |
+
# Nasruddin A’aidil Shari
|
| 9 |
+
# Sim Wei Ying Geraldine
|
| 10 |
+
# Soe Lynn
|
| 11 |
+
# Francis Bond <bond@ieee.org>
|
| 12 |
+
# Eric Kafe <kafe.eric@gmail.com>
|
| 13 |
+
|
| 14 |
+
# URL: <https://www.nltk.org/>
|
| 15 |
+
# For license information, see LICENSE.TXT
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
An NLTK interface for WordNet
|
| 19 |
+
|
| 20 |
+
WordNet is a lexical database of English.
|
| 21 |
+
Using synsets, helps find conceptual relationships between words
|
| 22 |
+
such as hypernyms, hyponyms, synonyms, antonyms etc.
|
| 23 |
+
|
| 24 |
+
For details about WordNet see:
|
| 25 |
+
https://wordnet.princeton.edu/
|
| 26 |
+
|
| 27 |
+
This module also allows you to find lemmas in languages
|
| 28 |
+
other than English from the Open Multilingual Wordnet
|
| 29 |
+
http://compling.hss.ntu.edu.sg/omw/
|
| 30 |
+
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
import math
|
| 34 |
+
import os
|
| 35 |
+
import re
|
| 36 |
+
import warnings
|
| 37 |
+
from collections import defaultdict, deque
|
| 38 |
+
from functools import total_ordering
|
| 39 |
+
from itertools import chain, islice
|
| 40 |
+
from operator import itemgetter
|
| 41 |
+
|
| 42 |
+
from nltk.corpus.reader import CorpusReader
|
| 43 |
+
from nltk.internals import deprecated
|
| 44 |
+
from nltk.probability import FreqDist
|
| 45 |
+
from nltk.util import binary_search_file as _binary_search_file
|
| 46 |
+
|
| 47 |
+
######################################################################
|
| 48 |
+
# Table of Contents
|
| 49 |
+
######################################################################
|
| 50 |
+
# - Constants
|
| 51 |
+
# - Data Classes
|
| 52 |
+
# - WordNetError
|
| 53 |
+
# - Lemma
|
| 54 |
+
# - Synset
|
| 55 |
+
# - WordNet Corpus Reader
|
| 56 |
+
# - WordNet Information Content Corpus Reader
|
| 57 |
+
# - Similarity Metrics
|
| 58 |
+
# - Demo
|
| 59 |
+
|
| 60 |
+
######################################################################
|
| 61 |
+
# Constants
|
| 62 |
+
######################################################################
|
| 63 |
+
|
| 64 |
+
#: Positive infinity (for similarity functions)
|
| 65 |
+
_INF = 1e300
|
| 66 |
+
|
| 67 |
+
# { Part-of-speech constants
|
| 68 |
+
ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
|
| 69 |
+
# }
|
| 70 |
+
|
| 71 |
+
POS_LIST = [NOUN, VERB, ADJ, ADV]
|
| 72 |
+
|
| 73 |
+
# A table of strings that are used to express verb frames.
|
| 74 |
+
VERB_FRAME_STRINGS = (
|
| 75 |
+
None,
|
| 76 |
+
"Something %s",
|
| 77 |
+
"Somebody %s",
|
| 78 |
+
"It is %sing",
|
| 79 |
+
"Something is %sing PP",
|
| 80 |
+
"Something %s something Adjective/Noun",
|
| 81 |
+
"Something %s Adjective/Noun",
|
| 82 |
+
"Somebody %s Adjective",
|
| 83 |
+
"Somebody %s something",
|
| 84 |
+
"Somebody %s somebody",
|
| 85 |
+
"Something %s somebody",
|
| 86 |
+
"Something %s something",
|
| 87 |
+
"Something %s to somebody",
|
| 88 |
+
"Somebody %s on something",
|
| 89 |
+
"Somebody %s somebody something",
|
| 90 |
+
"Somebody %s something to somebody",
|
| 91 |
+
"Somebody %s something from somebody",
|
| 92 |
+
"Somebody %s somebody with something",
|
| 93 |
+
"Somebody %s somebody of something",
|
| 94 |
+
"Somebody %s something on somebody",
|
| 95 |
+
"Somebody %s somebody PP",
|
| 96 |
+
"Somebody %s something PP",
|
| 97 |
+
"Somebody %s PP",
|
| 98 |
+
"Somebody's (body part) %s",
|
| 99 |
+
"Somebody %s somebody to INFINITIVE",
|
| 100 |
+
"Somebody %s somebody INFINITIVE",
|
| 101 |
+
"Somebody %s that CLAUSE",
|
| 102 |
+
"Somebody %s to somebody",
|
| 103 |
+
"Somebody %s to INFINITIVE",
|
| 104 |
+
"Somebody %s whether INFINITIVE",
|
| 105 |
+
"Somebody %s somebody into V-ing something",
|
| 106 |
+
"Somebody %s something with something",
|
| 107 |
+
"Somebody %s INFINITIVE",
|
| 108 |
+
"Somebody %s VERB-ing",
|
| 109 |
+
"It %s that CLAUSE",
|
| 110 |
+
"Something %s INFINITIVE",
|
| 111 |
+
# OEWN additions:
|
| 112 |
+
"Somebody %s at something",
|
| 113 |
+
"Somebody %s for something",
|
| 114 |
+
"Somebody %s on somebody",
|
| 115 |
+
"Somebody %s out of somebody",
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
SENSENUM_RE = re.compile(r"\.[\d]+\.")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
######################################################################
|
| 122 |
+
# Data Classes
|
| 123 |
+
######################################################################
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class WordNetError(Exception):
|
| 127 |
+
"""An exception class for wordnet-related errors."""
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@total_ordering
|
| 131 |
+
class _WordNetObject:
|
| 132 |
+
"""A common base class for lemmas and synsets."""
|
| 133 |
+
|
| 134 |
+
def hypernyms(self):
|
| 135 |
+
return self._related("@")
|
| 136 |
+
|
| 137 |
+
def _hypernyms(self):
|
| 138 |
+
return self._related("@")
|
| 139 |
+
|
| 140 |
+
def instance_hypernyms(self):
|
| 141 |
+
return self._related("@i")
|
| 142 |
+
|
| 143 |
+
def _instance_hypernyms(self):
|
| 144 |
+
return self._related("@i")
|
| 145 |
+
|
| 146 |
+
def hyponyms(self):
|
| 147 |
+
return self._related("~")
|
| 148 |
+
|
| 149 |
+
def instance_hyponyms(self):
|
| 150 |
+
return self._related("~i")
|
| 151 |
+
|
| 152 |
+
def member_holonyms(self):
|
| 153 |
+
return self._related("#m")
|
| 154 |
+
|
| 155 |
+
def substance_holonyms(self):
|
| 156 |
+
return self._related("#s")
|
| 157 |
+
|
| 158 |
+
def part_holonyms(self):
|
| 159 |
+
return self._related("#p")
|
| 160 |
+
|
| 161 |
+
def member_meronyms(self):
|
| 162 |
+
return self._related("%m")
|
| 163 |
+
|
| 164 |
+
def substance_meronyms(self):
|
| 165 |
+
return self._related("%s")
|
| 166 |
+
|
| 167 |
+
def part_meronyms(self):
|
| 168 |
+
return self._related("%p")
|
| 169 |
+
|
| 170 |
+
def topic_domains(self):
|
| 171 |
+
return self._related(";c")
|
| 172 |
+
|
| 173 |
+
def in_topic_domains(self):
|
| 174 |
+
return self._related("-c")
|
| 175 |
+
|
| 176 |
+
def region_domains(self):
|
| 177 |
+
return self._related(";r")
|
| 178 |
+
|
| 179 |
+
def in_region_domains(self):
|
| 180 |
+
return self._related("-r")
|
| 181 |
+
|
| 182 |
+
def usage_domains(self):
|
| 183 |
+
return self._related(";u")
|
| 184 |
+
|
| 185 |
+
def in_usage_domains(self):
|
| 186 |
+
return self._related("-u")
|
| 187 |
+
|
| 188 |
+
def attributes(self):
|
| 189 |
+
return self._related("=")
|
| 190 |
+
|
| 191 |
+
def entailments(self):
|
| 192 |
+
return self._related("*")
|
| 193 |
+
|
| 194 |
+
def causes(self):
|
| 195 |
+
return self._related(">")
|
| 196 |
+
|
| 197 |
+
def also_sees(self):
|
| 198 |
+
return self._related("^")
|
| 199 |
+
|
| 200 |
+
def verb_groups(self):
|
| 201 |
+
return self._related("$")
|
| 202 |
+
|
| 203 |
+
def similar_tos(self):
|
| 204 |
+
return self._related("&")
|
| 205 |
+
|
| 206 |
+
def __hash__(self):
|
| 207 |
+
return hash(self._name)
|
| 208 |
+
|
| 209 |
+
def __eq__(self, other):
|
| 210 |
+
return self._name == other._name
|
| 211 |
+
|
| 212 |
+
def __ne__(self, other):
|
| 213 |
+
return self._name != other._name
|
| 214 |
+
|
| 215 |
+
def __lt__(self, other):
|
| 216 |
+
return self._name < other._name
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
class Lemma(_WordNetObject):
|
| 220 |
+
"""
|
| 221 |
+
The lexical entry for a single morphological form of a
|
| 222 |
+
sense-disambiguated word.
|
| 223 |
+
|
| 224 |
+
Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
|
| 225 |
+
<word> is the morphological stem identifying the synset
|
| 226 |
+
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
|
| 227 |
+
<number> is the sense number, counting from 0.
|
| 228 |
+
<lemma> is the morphological form of interest
|
| 229 |
+
|
| 230 |
+
Note that <word> and <lemma> can be different, e.g. the Synset
|
| 231 |
+
'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
|
| 232 |
+
'salt.n.03.salinity'.
|
| 233 |
+
|
| 234 |
+
Lemma attributes, accessible via methods with the same name:
|
| 235 |
+
|
| 236 |
+
- name: The canonical name of this lemma.
|
| 237 |
+
- synset: The synset that this lemma belongs to.
|
| 238 |
+
- syntactic_marker: For adjectives, the WordNet string identifying the
|
| 239 |
+
syntactic position relative modified noun. See:
|
| 240 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 241 |
+
For all other parts of speech, this attribute is None.
|
| 242 |
+
- count: The frequency of this lemma in wordnet.
|
| 243 |
+
|
| 244 |
+
Lemma methods:
|
| 245 |
+
|
| 246 |
+
Lemmas have the following methods for retrieving related Lemmas. They
|
| 247 |
+
correspond to the names for the pointer symbols defined here:
|
| 248 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 249 |
+
These methods all return lists of Lemmas:
|
| 250 |
+
|
| 251 |
+
- antonyms
|
| 252 |
+
- hypernyms, instance_hypernyms
|
| 253 |
+
- hyponyms, instance_hyponyms
|
| 254 |
+
- member_holonyms, substance_holonyms, part_holonyms
|
| 255 |
+
- member_meronyms, substance_meronyms, part_meronyms
|
| 256 |
+
- topic_domains, region_domains, usage_domains
|
| 257 |
+
- attributes
|
| 258 |
+
- derivationally_related_forms
|
| 259 |
+
- entailments
|
| 260 |
+
- causes
|
| 261 |
+
- also_sees
|
| 262 |
+
- verb_groups
|
| 263 |
+
- similar_tos
|
| 264 |
+
- pertainyms
|
| 265 |
+
"""
|
| 266 |
+
|
| 267 |
+
__slots__ = [
|
| 268 |
+
"_wordnet_corpus_reader",
|
| 269 |
+
"_name",
|
| 270 |
+
"_syntactic_marker",
|
| 271 |
+
"_synset",
|
| 272 |
+
"_frame_strings",
|
| 273 |
+
"_frame_ids",
|
| 274 |
+
"_lexname_index",
|
| 275 |
+
"_lex_id",
|
| 276 |
+
"_lang",
|
| 277 |
+
"_key",
|
| 278 |
+
]
|
| 279 |
+
|
| 280 |
+
def __init__(
|
| 281 |
+
self,
|
| 282 |
+
wordnet_corpus_reader,
|
| 283 |
+
synset,
|
| 284 |
+
name,
|
| 285 |
+
lexname_index,
|
| 286 |
+
lex_id,
|
| 287 |
+
syntactic_marker,
|
| 288 |
+
):
|
| 289 |
+
self._wordnet_corpus_reader = wordnet_corpus_reader
|
| 290 |
+
self._name = name
|
| 291 |
+
self._syntactic_marker = syntactic_marker
|
| 292 |
+
self._synset = synset
|
| 293 |
+
self._frame_strings = []
|
| 294 |
+
self._frame_ids = []
|
| 295 |
+
self._lexname_index = lexname_index
|
| 296 |
+
self._lex_id = lex_id
|
| 297 |
+
self._lang = "eng"
|
| 298 |
+
|
| 299 |
+
self._key = None # gets set later.
|
| 300 |
+
|
| 301 |
+
def name(self):
|
| 302 |
+
return self._name
|
| 303 |
+
|
| 304 |
+
def syntactic_marker(self):
|
| 305 |
+
return self._syntactic_marker
|
| 306 |
+
|
| 307 |
+
def synset(self):
|
| 308 |
+
return self._synset
|
| 309 |
+
|
| 310 |
+
def frame_strings(self):
|
| 311 |
+
return self._frame_strings
|
| 312 |
+
|
| 313 |
+
def frame_ids(self):
|
| 314 |
+
return self._frame_ids
|
| 315 |
+
|
| 316 |
+
def lang(self):
|
| 317 |
+
return self._lang
|
| 318 |
+
|
| 319 |
+
def key(self):
|
| 320 |
+
return self._key
|
| 321 |
+
|
| 322 |
+
def __repr__(self):
|
| 323 |
+
tup = type(self).__name__, self._synset._name, self._name
|
| 324 |
+
return "%s('%s.%s')" % tup
|
| 325 |
+
|
| 326 |
+
def _related(self, relation_symbol):
|
| 327 |
+
get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
|
| 328 |
+
if (self._name, relation_symbol) not in self._synset._lemma_pointers:
|
| 329 |
+
return []
|
| 330 |
+
return [
|
| 331 |
+
get_synset(pos, offset)._lemmas[lemma_index]
|
| 332 |
+
for pos, offset, lemma_index in self._synset._lemma_pointers[
|
| 333 |
+
self._name, relation_symbol
|
| 334 |
+
]
|
| 335 |
+
]
|
| 336 |
+
|
| 337 |
+
def count(self):
|
| 338 |
+
"""Return the frequency count for this Lemma"""
|
| 339 |
+
return self._wordnet_corpus_reader.lemma_count(self)
|
| 340 |
+
|
| 341 |
+
def antonyms(self):
|
| 342 |
+
return self._related("!")
|
| 343 |
+
|
| 344 |
+
def derivationally_related_forms(self):
|
| 345 |
+
return self._related("+")
|
| 346 |
+
|
| 347 |
+
def pertainyms(self):
|
| 348 |
+
return self._related("\\")
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
class Synset(_WordNetObject):
|
| 352 |
+
"""Create a Synset from a "<lemma>.<pos>.<number>" string where:
|
| 353 |
+
<lemma> is the word's morphological stem
|
| 354 |
+
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
|
| 355 |
+
<number> is the sense number, counting from 0.
|
| 356 |
+
|
| 357 |
+
Synset attributes, accessible via methods with the same name:
|
| 358 |
+
|
| 359 |
+
- name: The canonical name of this synset, formed using the first lemma
|
| 360 |
+
of this synset. Note that this may be different from the name
|
| 361 |
+
passed to the constructor if that string used a different lemma to
|
| 362 |
+
identify the synset.
|
| 363 |
+
- pos: The synset's part of speech, matching one of the module level
|
| 364 |
+
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
|
| 365 |
+
- lemmas: A list of the Lemma objects for this synset.
|
| 366 |
+
- definition: The definition for this synset.
|
| 367 |
+
- examples: A list of example strings for this synset.
|
| 368 |
+
- offset: The offset in the WordNet dict file of this synset.
|
| 369 |
+
- lexname: The name of the lexicographer file containing this synset.
|
| 370 |
+
|
| 371 |
+
Synset methods:
|
| 372 |
+
|
| 373 |
+
Synsets have the following methods for retrieving related Synsets.
|
| 374 |
+
They correspond to the names for the pointer symbols defined here:
|
| 375 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 376 |
+
These methods all return lists of Synsets.
|
| 377 |
+
|
| 378 |
+
- hypernyms, instance_hypernyms
|
| 379 |
+
- hyponyms, instance_hyponyms
|
| 380 |
+
- member_holonyms, substance_holonyms, part_holonyms
|
| 381 |
+
- member_meronyms, substance_meronyms, part_meronyms
|
| 382 |
+
- attributes
|
| 383 |
+
- entailments
|
| 384 |
+
- causes
|
| 385 |
+
- also_sees
|
| 386 |
+
- verb_groups
|
| 387 |
+
- similar_tos
|
| 388 |
+
|
| 389 |
+
Additionally, Synsets support the following methods specific to the
|
| 390 |
+
hypernym relation:
|
| 391 |
+
|
| 392 |
+
- root_hypernyms
|
| 393 |
+
- common_hypernyms
|
| 394 |
+
- lowest_common_hypernyms
|
| 395 |
+
|
| 396 |
+
Note that Synsets do not support the following relations because
|
| 397 |
+
these are defined by WordNet as lexical relations:
|
| 398 |
+
|
| 399 |
+
- antonyms
|
| 400 |
+
- derivationally_related_forms
|
| 401 |
+
- pertainyms
|
| 402 |
+
"""
|
| 403 |
+
|
| 404 |
+
__slots__ = [
|
| 405 |
+
"_pos",
|
| 406 |
+
"_offset",
|
| 407 |
+
"_name",
|
| 408 |
+
"_frame_ids",
|
| 409 |
+
"_lemmas",
|
| 410 |
+
"_lemma_names",
|
| 411 |
+
"_definition",
|
| 412 |
+
"_examples",
|
| 413 |
+
"_lexname",
|
| 414 |
+
"_pointers",
|
| 415 |
+
"_lemma_pointers",
|
| 416 |
+
"_max_depth",
|
| 417 |
+
"_min_depth",
|
| 418 |
+
]
|
| 419 |
+
|
| 420 |
+
def __init__(self, wordnet_corpus_reader):
    """Create an empty Synset bound to *wordnet_corpus_reader*.

    All data attributes are filled in later by
    ``WordNetCorpusReader._synset_from_pos_and_line()``.
    """
    self._wordnet_corpus_reader = wordnet_corpus_reader

    # Scalar fields (set during parsing).
    self._pos = None
    self._offset = None
    self._name = None
    self._definition = None
    self._lexname = None  # lexicographer file name
    self._all_hypernyms = None  # lazily-filled cache, see common_hypernyms()

    # List-valued fields (set during parsing).
    self._frame_ids = []
    self._lemmas = []
    self._lemma_names = []
    self._examples = []

    # Relation maps: pointer symbol -> set/list of targets.
    self._pointers = defaultdict(set)
    self._lemma_pointers = defaultdict(list)
|
| 438 |
+
|
| 439 |
+
def pos(self):
    """Return the part-of-speech tag of this synset."""
    return self._pos
|
| 441 |
+
|
| 442 |
+
def offset(self):
    """Return the offset of this synset in the WordNet data file."""
    return self._offset
|
| 444 |
+
|
| 445 |
+
def name(self):
    """Return the canonical name of this synset, e.g. ``'dog.n.01'``."""
    return self._name
|
| 447 |
+
|
| 448 |
+
def frame_ids(self):
    """Return the verb frame id list associated with this synset."""
    return self._frame_ids
|
| 450 |
+
|
| 451 |
+
def _doc(self, doc_type, default, lang="eng"):
    """Helper for ``definition()`` and ``examples()``.

    Returns *default* for English, the language-specific entry for a
    known non-English language, or None when the language is unknown or
    has no entry for this synset.
    """
    corpus = self._wordnet_corpus_reader
    if lang not in corpus.langs():
        return None
    if lang == "eng":
        return default
    corpus._load_lang_data(lang)
    key = corpus.ss2of(self)
    table = corpus._lang_data[lang][corpus.lg_attrs.index(doc_type)]
    return table[key] if key in table else None
|
| 466 |
+
|
| 467 |
+
def definition(self, lang="eng"):
    """Return this synset's definition in the requested language."""
    return self._doc("def", self._definition, lang=lang)
|
| 470 |
+
|
| 471 |
+
def examples(self, lang="eng"):
    """Return this synset's example sentences in the requested language."""
    return self._doc("exe", self._examples, lang=lang)
|
| 474 |
+
|
| 475 |
+
def lexname(self):
    """Return the name of the lexicographer file containing this synset."""
    return self._lexname
|
| 477 |
+
|
| 478 |
+
def _needs_root(self):
    """Return True when path metrics need a simulated root for this synset.

    Nouns in modern WordNet versions share a single root; everything
    else (all non-nouns, and nouns in WordNet 1.6) needs a fake root.
    """
    has_real_root = (
        self._pos == NOUN
        and self._wordnet_corpus_reader.get_version() != "1.6"
    )
    return not has_real_root
|
| 483 |
+
|
| 484 |
+
def lemma_names(self, lang="eng"):
    """Return all the lemma names associated with the synset for *lang*."""
    if lang == "eng":
        return self._lemma_names
    reader = self._wordnet_corpus_reader
    reader._load_lang_data(lang)
    key = reader.ss2of(self)
    index = reader._lang_data[lang][0]
    return index[key] if key in index else []
|
| 496 |
+
|
| 497 |
+
def lemmas(self, lang="eng"):
    """Return all the lemma objects associated with the synset.

    For ``lang != "eng"`` the lemmas are constructed on the fly from the
    language-specific lemma names.  Returns an empty list for an unnamed
    synset (previously this path fell off the end of the function and
    implicitly returned ``None``, breaking callers that iterate the
    result).
    """
    if lang == "eng":
        return self._lemmas
    result = []
    if self._name:
        reader = self._wordnet_corpus_reader
        reader._load_lang_data(lang)
        # Loop-invariant: the lexname index is the same for every lemma.
        lex_index = reader._lexnames.index(self.lexname())
        for lemma_name in self.lemma_names(lang):
            lemma = Lemma(reader, self, lemma_name, lex_index, 0, None)
            lemma._lang = lang
            result.append(lemma)
    return result
|
| 517 |
+
|
| 518 |
+
def root_hypernyms(self):
    """Get the topmost hypernyms of this synset in WordNet.

    Walks the (possibly cyclic) hypernym graph and collects every
    reachable synset that has no hypernyms of its own.
    """
    roots = []
    visited = set()
    stack = [self]
    while stack:
        synset = stack.pop()
        if synset in visited:
            continue
        visited.add(synset)
        parents = synset.hypernyms() + synset.instance_hypernyms()
        if parents:
            stack.extend(parents)
        else:
            roots.append(synset)
    return roots
|
| 536 |
+
|
| 537 |
+
# Simpler implementation which makes incorrect assumption that
|
| 538 |
+
# hypernym hierarchy is acyclic:
|
| 539 |
+
#
|
| 540 |
+
# if not self.hypernyms():
|
| 541 |
+
# return [self]
|
| 542 |
+
# else:
|
| 543 |
+
# return list(set(root for h in self.hypernyms()
|
| 544 |
+
# for root in h.root_hypernyms()))
|
| 545 |
+
def max_depth(self):
    """
    :return: The length of the longest hypernym path from this
        synset to the root.  Cached on the instance after first use.
    """
    if "_max_depth" not in self.__dict__:
        parents = self.hypernyms() + self.instance_hypernyms()
        self._max_depth = (
            1 + max(p.max_depth() for p in parents) if parents else 0
        )
    return self._max_depth
|
| 558 |
+
|
| 559 |
+
def min_depth(self):
    """
    :return: The length of the shortest hypernym path from this
        synset to the root.  Cached on the instance after first use.
    """
    if "_min_depth" not in self.__dict__:
        parents = self.hypernyms() + self.instance_hypernyms()
        self._min_depth = (
            1 + min(p.min_depth() for p in parents) if parents else 0
        )
    return self._min_depth
|
| 572 |
+
|
| 573 |
+
def closure(self, rel, depth=-1):
    """
    Yield the transitive closure of this synset under the *rel*
    relationship, breadth-first, discarding cycles.  *depth* bounds the
    traversal (-1 means unlimited).  The starting synset itself is not
    yielded.

    Example::

        >>> from nltk.corpus import wordnet as wn  # doctest: +SKIP
        >>> list(wn.synset('computer.n.01').closure(
        ...     lambda s: s.topic_domains()))  # doctest: +SKIP
        [Synset('computer_science.n.01')]
    """
    from nltk.util import acyclic_breadth_first

    for reached in acyclic_breadth_first(self, rel, depth):
        if reached != self:
            yield reached
|
| 607 |
+
|
| 608 |
+
from nltk.util import acyclic_depth_first as acyclic_tree
|
| 609 |
+
from nltk.util import unweighted_minimum_spanning_tree as mst
|
| 610 |
+
|
| 611 |
+
# Also add this shortcut?
|
| 612 |
+
# from nltk.util import unweighted_minimum_spanning_digraph as umsd
|
| 613 |
+
|
| 614 |
+
def tree(self, rel, depth=-1, cut_mark=None):
    """
    Return the full *rel* relation tree rooted at this synset, including
    self, discarding cycles but keeping duplicate branches.

    :param rel: a callable mapping a synset to its related synsets.
    :param depth: recursion bound; -1 means unlimited.
    :param cut_mark: when given, marks where branches were truncated.
    :return: a nested list of the form ``[node, [child-tree], ...]``.
    """
    from nltk.util import acyclic_branches_depth_first

    return acyclic_branches_depth_first(self, rel, depth, cut_mark)
|
| 660 |
+
|
| 661 |
+
def hypernym_paths(self):
    """
    Get the path(s) from this synset to the root, where each path is a
    list of the synset nodes traversed on the way to the root.

    :return: A list of lists, where each list gives the node sequence
        connecting the initial ``Synset`` node and a root node.
    """
    parents = self.hypernyms() + self.instance_hypernyms()
    if not parents:
        return [[self]]
    paths = []
    for parent in parents:
        for ancestors in parent.hypernym_paths():
            ancestors.append(self)
            paths.append(ancestors)
    return paths
|
| 680 |
+
|
| 681 |
+
def common_hypernyms(self, other):
    """
    Find all synsets that are hypernyms of this synset and the
    other synset.  The full hypernym set is cached on each synset.

    :type other: Synset
    :param other: other input synset.
    :return: The synsets that are hypernyms of both synsets.
    """
    for synset in (self, other):
        if not synset._all_hypernyms:
            synset._all_hypernyms = {
                hyp
                for level in synset._iter_hypernym_lists()
                for hyp in level
            }
    return list(self._all_hypernyms.intersection(other._all_hypernyms))
|
| 703 |
+
|
| 704 |
+
def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
    """
    Get a list of the lowest synset(s) that both synsets have as a
    hypernym.  With ``use_min_depth=False`` (the default) "lowest" means
    greatest ``max_depth``; with ``use_min_depth=True`` the NLTK2
    behavior based on ``min_depth`` is reproduced.

    This is an implementation of Ted Pedersen's "Lowest Common
    Subsumer" method from the Perl Wordnet module; it can return
    ``self`` or ``other`` if one subsumes the other.

    :type other: Synset
    :param other: other input synset
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting the otherwise
        disjoint taxonomies (needed for verbs, and for nouns only in
        WordNet 1.6).  False by default.
    :type use_min_depth: bool
    :param use_min_depth: mimic NLTK2 by ranking candidates with
        ``min_depth`` instead of ``max_depth``; retained for backwards
        compatibility.
    :return: The synsets that are the lowest common hypernyms of both
        synsets.
    """
    candidates = self.common_hypernyms(other)
    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        fake_root.hypernyms = lambda: []
        fake_root.instance_hypernyms = lambda: []
        candidates.append(fake_root)

    if use_min_depth:
        depths = [s.min_depth() for s in candidates]
    else:
        depths = [s.max_depth() for s in candidates]
    if not depths:  # no common hypernyms at all
        return []
    deepest = max(depths)
    return sorted(s for s, d in zip(candidates, depths) if d == deepest)
|
| 762 |
+
|
| 763 |
+
def hypernym_distances(self, distance=0, simulate_root=False):
    """
    Get the path(s) from this synset to the root, counting the distance
    of each node from the initial node on the way.  A set of
    (synset, distance) tuples is returned.

    :type distance: int
    :param distance: the distance (number of edges) from this hypernym
        to the original ``Synset`` on which this method was called.
    :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
        a hypernym of the first ``Synset``.
    """
    reached = {(self, distance)}
    for parent in self._hypernyms() + self._instance_hypernyms():
        reached |= parent.hypernym_distances(distance + 1, simulate_root=False)
    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        deepest = max(reached, key=itemgetter(1))[1]
        reached.add((fake_root, deepest + 1))
    return reached
|
| 784 |
+
|
| 785 |
+
def _shortest_hypernym_paths(self, simulate_root):
    """Map every reachable hypernym to its shortest distance from self (BFS)."""
    if self._name == "*ROOT*":
        return {self: 0}

    distances = {}
    frontier = deque([(self, 0)])
    while frontier:
        synset, depth = frontier.popleft()
        if synset in distances:  # already reached by a shorter path
            continue
        distances[synset] = depth
        next_depth = depth + 1
        for parent in synset._hypernyms() + synset._instance_hypernyms():
            frontier.append((parent, next_depth))

    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        distances[fake_root] = max(distances.values()) + 1

    return distances
|
| 808 |
+
|
| 809 |
+
def shortest_path_distance(self, other, simulate_root=False):
    """
    Returns the distance of the shortest path linking the two synsets
    (if one exists).  The ancestor node common to both synsets that can
    be reached with the minimum number of traversals is used.  If no
    ancestor nodes are common, None is returned.  If a node is compared
    with itself 0 is returned.

    :type other: Synset
    :param other: The Synset to which the shortest path will be found.
    :return: The number of edges in the shortest path connecting the
        two nodes, or None if no path exists.
    """
    if self == other:
        return 0

    ancestors_self = self._shortest_hypernym_paths(simulate_root)
    ancestors_other = other._shortest_hypernym_paths(simulate_root)

    # Shortest connecting path through any shared ancestor; None when
    # the synsets share no ancestor at all.
    return min(
        (
            d + ancestors_other[ancestor]
            for ancestor, d in ancestors_self.items()
            if ancestor in ancestors_other
        ),
        default=None,
    )
|
| 840 |
+
|
| 841 |
+
# interface to similarity methods
|
| 842 |
+
def path_similarity(self, other, verbose=False, simulate_root=True):
    """
    Path Distance Similarity:
    Score in (0, 1] based on the shortest is-a path between the senses;
    1 means identity.  None is returned when no path exists (possible
    for verbs, whose taxonomies are disjoint, unless *simulate_root*
    adds a fake shared root -- it does by default).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when either synset needs one (True by default).
    :return: similarity score in (0, 1], or None if no connecting path
        could be found.
    """
    need_fake_root = simulate_root and (
        self._needs_root() or other._needs_root()
    )
    distance = self.shortest_path_distance(other, simulate_root=need_fake_root)
    if distance is None or distance < 0:
        return None
    return 1.0 / (distance + 1)
|
| 876 |
+
|
| 877 |
+
def lch_similarity(self, other, verbose=False, simulate_root=True):
    """
    Leacock Chodorow Similarity:
    Score based on the shortest connecting path p and the maximum
    taxonomy depth d, computed as -log(p / 2d).  Both synsets must have
    the same part of speech.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when this synset needs one (True by default).
    :return: similarity score (normally > 0), or None if no connecting
        path could be found or the taxonomy depth is zero.
    :raises WordNetError: if the parts of speech differ.
    """
    if self._pos != other._pos:
        raise WordNetError(
            "Computing the lch similarity requires "
            "%s and %s to have the same part of speech." % (self, other)
        )

    need_root = self._needs_root()
    reader = self._wordnet_corpus_reader
    if self._pos not in reader._max_depth:
        reader._compute_max_depth(self._pos, need_root)
    taxonomy_depth = reader._max_depth[self._pos]

    distance = self.shortest_path_distance(
        other, simulate_root=simulate_root and need_root
    )
    if distance is None or distance < 0 or taxonomy_depth == 0:
        return None
    return -math.log((distance + 1) / (2.0 * taxonomy_depth))
|
| 924 |
+
|
| 925 |
+
def wup_similarity(self, other, verbose=False, simulate_root=True):
    """
    Wu-Palmer Similarity:
    Score based on the depths of the two senses and of their Least
    Common Subsumer (most specific ancestor).  Where multiple LCS
    candidates exist, the one with the longest path to the root is
    used, and the longer of its root paths is used for the depth.

    Note: scores may differ slightly from Pedersen's Perl
    implementation for nouns; with *simulate_root* they almost always
    agree for verbs.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when either synset needs one (True by default).
    :return: similarity score (normally > 0), or None if no connecting
        path between the two senses can be found.
    """
    need_root = self._needs_root() or other._needs_root()
    use_fake_root = simulate_root and need_root

    # use_min_depth=True preserves NLTK2 behavior; possibly more
    # accurate results without it -- untested.
    subsumers = self.lowest_common_hypernyms(
        other, simulate_root=use_fake_root, use_min_depth=True
    )
    if not subsumers:
        return None
    subsumer = self if self in subsumers else subsumers[0]

    # +1 because the path-length calculations count both endpoints.
    depth = subsumer.max_depth() + 1
    # (No extra +1 for non-nouns: the imaginary root is handled by
    # simulate_root.)

    len1 = self.shortest_path_distance(subsumer, simulate_root=use_fake_root)
    len2 = other.shortest_path_distance(subsumer, simulate_root=use_fake_root)
    if len1 is None or len2 is None:
        return None
    return (2.0 * depth) / (len1 + depth + len2 + depth)
|
| 1001 |
+
|
| 1002 |
+
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    The Information Content (IC) of the Least Common Subsumer of the
    two synsets.  Synsets whose LCS is the taxonomy root score 0.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score.
    """
    _, _, lcs_ic = _lcs_ic(self, other, ic)
    return lcs_ic
|
| 1021 |
+
|
| 1022 |
+
def jcn_similarity(self, other, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    1 / (IC(s1) + IC(s2) - 2 * IC(lcs)), where lcs is the Least Common
    Subsumer of the two synsets.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score.
    """
    if self == other:
        return _INF

    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)

    # Root synsets, or zero-frequency synsets (sparse data), score 0.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_difference = ic1 + ic2 - 2 * lcs_ic
    # Identical IC profiles give a degenerate (infinite) score.
    return _INF if ic_difference == 0 else 1 / ic_difference
|
| 1055 |
+
|
| 1056 |
+
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    2 * IC(lcs) / (IC(s1) + IC(s2)), where lcs is the Least Common
    Subsumer of the two synsets.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score in the range 0 to 1.
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return (2.0 * lcs_ic) / (ic1 + ic2)
|
| 1075 |
+
|
| 1076 |
+
def _iter_hypernym_lists(self):
    """
    Yield successive "levels" of hypernyms: first ``[self]``, then its
    direct (instance) hypernyms, then theirs, and so on, never
    revisiting a synset.
    """
    level = [self]
    visited = set()
    while level:
        visited.update(level)
        yield level
        level = [
            parent
            for synset in level
            for parent in synset.hypernyms() + synset.instance_hypernyms()
            if parent not in visited
        ]
|
| 1093 |
+
|
| 1094 |
+
def __repr__(self):
    """Return e.g. ``Synset('dog.n.01')``."""
    return f"{type(self).__name__}('{self._name}')"
|
| 1096 |
+
|
| 1097 |
+
def _related(self, relation_symbol, sort=True):
    """Return the synsets reached from this one via *relation_symbol* pointers."""
    if relation_symbol not in self._pointers:
        return []
    resolve = self._wordnet_corpus_reader.synset_from_pos_and_offset
    related = [
        resolve(pos, offset)
        for pos, offset in self._pointers[relation_symbol]
    ]
    return sorted(related) if sort else related
|
| 1106 |
+
|
| 1107 |
+
|
| 1108 |
+
######################################################################
|
| 1109 |
+
# WordNet Corpus Reader
|
| 1110 |
+
######################################################################
|
| 1111 |
+
|
| 1112 |
+
|
| 1113 |
+
class WordNetCorpusReader(CorpusReader):
|
| 1114 |
+
"""
|
| 1115 |
+
A corpus reader used to access wordnet or its variants.
|
| 1116 |
+
"""
|
| 1117 |
+
|
| 1118 |
+
_ENCODING = "utf8"
|
| 1119 |
+
|
| 1120 |
+
# { Part-of-speech constants
|
| 1121 |
+
ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
|
| 1122 |
+
# }
|
| 1123 |
+
|
| 1124 |
+
# { Filename constants
|
| 1125 |
+
_FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
|
| 1126 |
+
# }
|
| 1127 |
+
|
| 1128 |
+
# { Part of speech constants
|
| 1129 |
+
_pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
|
| 1130 |
+
_pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
|
| 1131 |
+
# }
|
| 1132 |
+
|
| 1133 |
+
#: A list of file identifiers for all the fileids used by this
|
| 1134 |
+
#: corpus reader.
|
| 1135 |
+
_FILES = (
|
| 1136 |
+
"cntlist.rev",
|
| 1137 |
+
"lexnames",
|
| 1138 |
+
"index.sense",
|
| 1139 |
+
"index.adj",
|
| 1140 |
+
"index.adv",
|
| 1141 |
+
"index.noun",
|
| 1142 |
+
"index.verb",
|
| 1143 |
+
"data.adj",
|
| 1144 |
+
"data.adv",
|
| 1145 |
+
"data.noun",
|
| 1146 |
+
"data.verb",
|
| 1147 |
+
"adj.exc",
|
| 1148 |
+
"adv.exc",
|
| 1149 |
+
"noun.exc",
|
| 1150 |
+
"verb.exc",
|
| 1151 |
+
)
|
| 1152 |
+
|
| 1153 |
+
    def __init__(self, root, omw_reader):
        """
        Construct a new wordnet corpus reader, with the given root
        directory.

        :param root: path to the unpacked WordNet data files.
        :param omw_reader: corpus reader holding Open Multilingual Wordnet
            data, or None to disable the multilingual functions.
        """

        super().__init__(root, self._FILES, encoding=self._ENCODING)

        # An index that provides the file offset
        # Map from lemma -> pos -> synset_index -> offset
        self._lemma_pos_offset_map = defaultdict(dict)

        # A cache so we don't have to reconstruct synsets
        # Map from pos -> offset -> synset
        self._synset_offset_cache = defaultdict(dict)

        # A lookup for the maximum depth of each part of speech. Useful for
        # the lch similarity metric.
        self._max_depth = defaultdict(dict)

        # Corpus reader containing omw data.
        self._omw_reader = omw_reader

        # Corpus reader containing extended_omw data (set by add_exomw()).
        self._exomw_reader = None

        # lang id -> provenance subdirectory; English lives at the root.
        self.provenances = defaultdict(str)
        self.provenances["eng"] = ""

        if self._omw_reader is None:
            warnings.warn(
                "The multilingual functions are not available with this Wordnet version"
            )

        self.omw_langs = set()

        # A cache to store the wordnet data of multiple languages
        self._lang_data = defaultdict(list)

        self._data_file_map = {}
        self._exception_map = {}
        self._lexnames = []
        self._key_count_file = None
        self._key_synset_file = None

        # Load the lexnames (index is asserted to match line number)
        with self.open("lexnames") as fp:
            for i, line in enumerate(fp):
                index, lexname, _ = line.split()
                assert int(index) == i
                self._lexnames.append(lexname)

        # Load the indices for lemmas and synset offsets
        self._load_lemma_pos_offset_map()

        # load the exception file data into memory
        self._load_exception_map()

        # Bookkeeping for the WN3.0 mapping: sources with no target /
        # sources with multiple candidate targets.
        self.nomap = []
        self.splits = {}

        # map from WordNet 3.0 for OMW data
        self.map30 = self.map_wn30()

        # Language data attributes
        self.lg_attrs = ["lemma", "none", "def", "exe"]
|
| 1219 |
+
|
| 1220 |
+
    def index_sense(self, version=None):
        """Read sense key to synset id mapping from index.sense file in corpus directory

        :param version: optional corpus name (e.g. "wordnet") of another
            installed WordNet version whose index should be read instead
            of this reader's own.
        :return: dict mapping each sense key to an "<offset>-<pos>" id.
        """
        fn = "index.sense"
        if version:
            # Load the index file from a different installed corpus.
            from nltk.corpus import CorpusReader, LazyCorpusLoader

            ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn)
        else:
            ixreader = self
        with ixreader.open(fn) as fp:
            sensekey_map = {}
            for line in fp:
                fields = line.strip().split()
                sensekey = fields[0]
                # The ss_type digit (between '%' and the first ':') selects
                # the one-letter pos tag.
                pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])]
                sensekey_map[sensekey] = f"{fields[1]}-{pos}"
        return sensekey_map
|
| 1237 |
+
|
| 1238 |
+
def map_to_many(self):
|
| 1239 |
+
sensekey_map1 = self.index_sense("wordnet")
|
| 1240 |
+
sensekey_map2 = self.index_sense()
|
| 1241 |
+
synset_to_many = {}
|
| 1242 |
+
for synsetid in set(sensekey_map1.values()):
|
| 1243 |
+
synset_to_many[synsetid] = []
|
| 1244 |
+
for sensekey in set(sensekey_map1.keys()).intersection(
|
| 1245 |
+
set(sensekey_map2.keys())
|
| 1246 |
+
):
|
| 1247 |
+
source = sensekey_map1[sensekey]
|
| 1248 |
+
target = sensekey_map2[sensekey]
|
| 1249 |
+
synset_to_many[source].append(target)
|
| 1250 |
+
return synset_to_many
|
| 1251 |
+
|
| 1252 |
+
def map_to_one(self):
|
| 1253 |
+
synset_to_many = self.map_to_many()
|
| 1254 |
+
synset_to_one = {}
|
| 1255 |
+
for source in synset_to_many:
|
| 1256 |
+
candidates_bag = synset_to_many[source]
|
| 1257 |
+
if candidates_bag:
|
| 1258 |
+
candidates_set = set(candidates_bag)
|
| 1259 |
+
if len(candidates_set) == 1:
|
| 1260 |
+
target = candidates_bag[0]
|
| 1261 |
+
else:
|
| 1262 |
+
counts = []
|
| 1263 |
+
for candidate in candidates_set:
|
| 1264 |
+
counts.append((candidates_bag.count(candidate), candidate))
|
| 1265 |
+
self.splits[source] = counts
|
| 1266 |
+
target = max(counts)[1]
|
| 1267 |
+
synset_to_one[source] = target
|
| 1268 |
+
if source[-1] == "s":
|
| 1269 |
+
# Add a mapping from "a" to target for applications like omw,
|
| 1270 |
+
# where only Lithuanian and Slovak use the "s" ss_type.
|
| 1271 |
+
synset_to_one[f"{source[:-1]}a"] = target
|
| 1272 |
+
else:
|
| 1273 |
+
self.nomap.append(source)
|
| 1274 |
+
return synset_to_one
|
| 1275 |
+
|
| 1276 |
+
def map_wn30(self):
|
| 1277 |
+
"""Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
|
| 1278 |
+
if self.get_version() == "3.0":
|
| 1279 |
+
return None
|
| 1280 |
+
else:
|
| 1281 |
+
return self.map_to_one()
|
| 1282 |
+
|
| 1283 |
+
# Open Multilingual WordNet functions, contributed by
|
| 1284 |
+
# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
|
| 1285 |
+
|
| 1286 |
+
def of2ss(self, of):
|
| 1287 |
+
"""take an id and return the synsets"""
|
| 1288 |
+
return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
|
| 1289 |
+
|
| 1290 |
+
def ss2of(self, ss):
|
| 1291 |
+
"""return the ID of the synset"""
|
| 1292 |
+
if ss:
|
| 1293 |
+
return f"{ss.offset():08d}-{ss.pos()}"
|
| 1294 |
+
|
| 1295 |
+
    def _load_lang_data(self, lang):
        """load the wordnet data of the requested language from the file to
        the cache, _lang_data

        :raises WordNetError: if the language id is not registered.
        """

        if lang in self._lang_data:
            return  # already cached

        # Register the base OMW languages on first multilingual access.
        if self._omw_reader and not self.omw_langs:
            self.add_omw()

        if lang not in self.langs():
            raise WordNetError("Language is not supported.")

        # Languages only provided by the extended OMW are read from that
        # reader; everything else comes from the base OMW reader.
        if self._exomw_reader and lang not in self.omw_langs:
            reader = self._exomw_reader
        else:
            reader = self._omw_reader

        prov = self.provenances[lang]
        if prov in ["cldr", "wikt"]:
            prov2 = prov
        else:
            prov2 = "data"

        # e.g. "wikt/wn-wikt-fr.tab"; a "_prov" suffix on lang is dropped
        # when forming the file name.
        with reader.open(f"{prov}/wn-{prov2}-{lang.split('_')[0]}.tab") as fp:
            self.custom_lemmas(fp, lang)
        self.disable_custom_lemmas(lang)
|
| 1322 |
+
|
| 1323 |
+
def add_provs(self, reader):
|
| 1324 |
+
"""Add languages from Multilingual Wordnet to the provenance dictionary"""
|
| 1325 |
+
fileids = reader.fileids()
|
| 1326 |
+
for fileid in fileids:
|
| 1327 |
+
prov, langfile = os.path.split(fileid)
|
| 1328 |
+
file_name, file_extension = os.path.splitext(langfile)
|
| 1329 |
+
if file_extension == ".tab":
|
| 1330 |
+
lang = file_name.split("-")[-1]
|
| 1331 |
+
if lang in self.provenances or prov in ["cldr", "wikt"]:
|
| 1332 |
+
# We already have another resource for this lang,
|
| 1333 |
+
# so we need to further specify the lang id:
|
| 1334 |
+
lang = f"{lang}_{prov}"
|
| 1335 |
+
self.provenances[lang] = prov
|
| 1336 |
+
|
| 1337 |
+
def add_omw(self):
|
| 1338 |
+
self.add_provs(self._omw_reader)
|
| 1339 |
+
self.omw_langs = set(self.provenances.keys())
|
| 1340 |
+
|
| 1341 |
+
    def add_exomw(self):
        """
        Add languages from Extended OMW

        >>> import nltk
        >>> from nltk.corpus import wordnet as wn
        >>> wn.add_exomw()
        >>> print(wn.synset('intrinsically.r.01').lemmas(lang="eng_wikt"))
        [Lemma('intrinsically.r.01.per_se'), Lemma('intrinsically.r.01.as_such')]
        """
        from nltk.corpus import extended_omw

        # Base OMW languages must be registered first so that provenance
        # collisions get disambiguated with a "_prov" suffix.
        self.add_omw()
        self._exomw_reader = extended_omw
        self.add_provs(self._exomw_reader)
|
| 1356 |
+
|
| 1357 |
+
def langs(self):
|
| 1358 |
+
"""return a list of languages supported by Multilingual Wordnet"""
|
| 1359 |
+
return list(self.provenances.keys())
|
| 1360 |
+
|
| 1361 |
+
    def _load_lemma_pos_offset_map(self):
        """Parse the four index.<pos> files into
        self._lemma_pos_offset_map (lemma -> pos -> list of offsets);
        ADJ entries are mirrored under ADJ_SAT.

        :raises WordNetError: on a malformed index line.
        """
        for suffix in self._FILEMAP.values():

            # parse each line of the file (ignoring comment lines)
            with self.open("index.%s" % suffix) as fp:
                for i, line in enumerate(fp):
                    # license/header lines start with whitespace
                    if line.startswith(" "):
                        continue

                    _iter = iter(line.split())

                    def _next_token():
                        return next(_iter)

                    try:

                        # get the lemma and part-of-speech
                        lemma = _next_token()
                        pos = _next_token()

                        # get the number of synsets for this lemma
                        n_synsets = int(_next_token())
                        assert n_synsets > 0

                        # get and ignore the pointer symbols for all synsets of
                        # this lemma
                        n_pointers = int(_next_token())
                        [_next_token() for _ in range(n_pointers)]

                        # same as number of synsets
                        n_senses = int(_next_token())
                        assert n_synsets == n_senses

                        # get and ignore number of senses ranked according to
                        # frequency
                        _next_token()

                        # get synset offsets
                        synset_offsets = [int(_next_token()) for _ in range(n_synsets)]

                    # raise more informative error with file name and line number
                    except (AssertionError, ValueError) as e:
                        tup = ("index.%s" % suffix), (i + 1), e
                        raise WordNetError("file %s, line %i: %s" % tup) from e

                    # map lemmas and parts of speech to synsets
                    self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                    if pos == ADJ:
                        # satellite adjectives share the adjective index
                        self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
|
| 1410 |
+
|
| 1411 |
+
def _load_exception_map(self):
|
| 1412 |
+
# load the exception file data into memory
|
| 1413 |
+
for pos, suffix in self._FILEMAP.items():
|
| 1414 |
+
self._exception_map[pos] = {}
|
| 1415 |
+
with self.open("%s.exc" % suffix) as fp:
|
| 1416 |
+
for line in fp:
|
| 1417 |
+
terms = line.split()
|
| 1418 |
+
self._exception_map[pos][terms[0]] = terms[1:]
|
| 1419 |
+
self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
|
| 1420 |
+
|
| 1421 |
+
def _compute_max_depth(self, pos, simulate_root):
|
| 1422 |
+
"""
|
| 1423 |
+
Compute the max depth for the given part of speech. This is
|
| 1424 |
+
used by the lch similarity metric.
|
| 1425 |
+
"""
|
| 1426 |
+
depth = 0
|
| 1427 |
+
for ii in self.all_synsets(pos):
|
| 1428 |
+
try:
|
| 1429 |
+
depth = max(depth, ii.max_depth())
|
| 1430 |
+
except RuntimeError:
|
| 1431 |
+
print(ii)
|
| 1432 |
+
if simulate_root:
|
| 1433 |
+
depth += 1
|
| 1434 |
+
self._max_depth[pos] = depth
|
| 1435 |
+
|
| 1436 |
+
    def get_version(self):
        """Parse the WordNet version number (e.g. "3.0") out of the
        copyright line in the ADJ data file, rewinding the file before
        returning.

        NOTE(review): if no copyright line matches, this falls through and
        returns None with the file pointer left at EOF — confirm callers
        tolerate that.
        """
        fh = self._data_file(ADJ)
        fh.seek(0)
        for line in fh:
            match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line)
            if match is not None:
                version = match.group(1)
                fh.seek(0)
                return version
|
| 1445 |
+
|
| 1446 |
+
#############################################################
|
| 1447 |
+
# Loading Lemmas
|
| 1448 |
+
#############################################################
|
| 1449 |
+
|
| 1450 |
+
def lemma(self, name, lang="eng"):
|
| 1451 |
+
"""Return lemma object that matches the name"""
|
| 1452 |
+
# cannot simply split on first '.',
|
| 1453 |
+
# e.g.: '.45_caliber.a.01..45_caliber'
|
| 1454 |
+
separator = SENSENUM_RE.search(name).end()
|
| 1455 |
+
|
| 1456 |
+
synset_name, lemma_name = name[: separator - 1], name[separator:]
|
| 1457 |
+
|
| 1458 |
+
synset = self.synset(synset_name)
|
| 1459 |
+
for lemma in synset.lemmas(lang):
|
| 1460 |
+
if lemma._name == lemma_name:
|
| 1461 |
+
return lemma
|
| 1462 |
+
raise WordNetError(f"No lemma {lemma_name!r} in {synset_name!r}")
|
| 1463 |
+
|
| 1464 |
+
def lemma_from_key(self, key):
|
| 1465 |
+
# Keys are case sensitive and always lower-case
|
| 1466 |
+
key = key.lower()
|
| 1467 |
+
|
| 1468 |
+
lemma_name, lex_sense = key.split("%")
|
| 1469 |
+
pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
|
| 1470 |
+
pos = self._pos_names[int(pos_number)]
|
| 1471 |
+
|
| 1472 |
+
# open the key -> synset file if necessary
|
| 1473 |
+
if self._key_synset_file is None:
|
| 1474 |
+
self._key_synset_file = self.open("index.sense")
|
| 1475 |
+
|
| 1476 |
+
# Find the synset for the lemma.
|
| 1477 |
+
synset_line = _binary_search_file(self._key_synset_file, key)
|
| 1478 |
+
if not synset_line:
|
| 1479 |
+
raise WordNetError("No synset found for key %r" % key)
|
| 1480 |
+
offset = int(synset_line.split()[1])
|
| 1481 |
+
synset = self.synset_from_pos_and_offset(pos, offset)
|
| 1482 |
+
# return the corresponding lemma
|
| 1483 |
+
for lemma in synset._lemmas:
|
| 1484 |
+
if lemma._key == key:
|
| 1485 |
+
return lemma
|
| 1486 |
+
raise WordNetError("No lemma found for for key %r" % key)
|
| 1487 |
+
|
| 1488 |
+
#############################################################
|
| 1489 |
+
# Loading Synsets
|
| 1490 |
+
#############################################################
|
| 1491 |
+
    def synset(self, name):
        """Return the synset with the given dotted name, e.g. 'dog.n.01'
        (lemma.pos.two-digit sense number).

        :raises WordNetError: if the lemma/pos pair is unknown, the sense
            index is out of range, or a satellite was requested but only a
            plain adjective exists.
        """
        # split name into lemma, part of speech and synset number
        lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
        synset_index = int(synset_index_str) - 1

        # get the offset for this synset
        try:
            offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
        except KeyError as e:
            raise WordNetError(f"No lemma {lemma!r} with part of speech {pos!r}") from e
        except IndexError as e:
            n_senses = len(self._lemma_pos_offset_map[lemma][pos])
            raise WordNetError(
                f"Lemma {lemma!r} with part of speech {pos!r} only "
                f"has {n_senses} {'sense' if n_senses == 1 else 'senses'}"
            ) from e

        # load synset information from the appropriate file
        synset = self.synset_from_pos_and_offset(pos, offset)

        # some basic sanity checks on loaded attributes
        if pos == "s" and synset._pos == "a":
            message = (
                "Adjective satellite requested but only plain "
                "adjective found for lemma %r"
            )
            raise WordNetError(message % lemma)
        assert synset._pos == pos or (pos == "a" and synset._pos == "s")

        # Return the synset object.
        return synset
|
| 1522 |
+
|
| 1523 |
+
def _data_file(self, pos):
|
| 1524 |
+
"""
|
| 1525 |
+
Return an open file pointer for the data file for the given
|
| 1526 |
+
part of speech.
|
| 1527 |
+
"""
|
| 1528 |
+
if pos == ADJ_SAT:
|
| 1529 |
+
pos = ADJ
|
| 1530 |
+
if self._data_file_map.get(pos) is None:
|
| 1531 |
+
fileid = "data.%s" % self._FILEMAP[pos]
|
| 1532 |
+
self._data_file_map[pos] = self.open(fileid)
|
| 1533 |
+
return self._data_file_map[pos]
|
| 1534 |
+
|
| 1535 |
+
def synset_from_pos_and_offset(self, pos, offset):
|
| 1536 |
+
"""
|
| 1537 |
+
- pos: The synset's part of speech, matching one of the module level
|
| 1538 |
+
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v').
|
| 1539 |
+
- offset: The byte offset of this synset in the WordNet dict file
|
| 1540 |
+
for this pos.
|
| 1541 |
+
|
| 1542 |
+
>>> from nltk.corpus import wordnet as wn
|
| 1543 |
+
>>> print(wn.synset_from_pos_and_offset('n', 1740))
|
| 1544 |
+
Synset('entity.n.01')
|
| 1545 |
+
"""
|
| 1546 |
+
# Check to see if the synset is in the cache
|
| 1547 |
+
if offset in self._synset_offset_cache[pos]:
|
| 1548 |
+
return self._synset_offset_cache[pos][offset]
|
| 1549 |
+
|
| 1550 |
+
data_file = self._data_file(pos)
|
| 1551 |
+
data_file.seek(offset)
|
| 1552 |
+
data_file_line = data_file.readline()
|
| 1553 |
+
# If valid, the offset equals the 8-digit 0-padded integer found at the start of the line:
|
| 1554 |
+
line_offset = data_file_line[:8]
|
| 1555 |
+
if (
|
| 1556 |
+
line_offset.isalnum()
|
| 1557 |
+
and line_offset == f"{'0'*(8-len(str(offset)))}{str(offset)}"
|
| 1558 |
+
):
|
| 1559 |
+
synset = self._synset_from_pos_and_line(pos, data_file_line)
|
| 1560 |
+
assert synset._offset == offset
|
| 1561 |
+
self._synset_offset_cache[pos][offset] = synset
|
| 1562 |
+
else:
|
| 1563 |
+
synset = None
|
| 1564 |
+
warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.")
|
| 1565 |
+
data_file.seek(0)
|
| 1566 |
+
return synset
|
| 1567 |
+
|
| 1568 |
+
    @deprecated("Use public method synset_from_pos_and_offset() instead")
    def _synset_from_pos_and_offset(self, *args, **kwargs):
        """
        Hack to help people like the readers of
        https://stackoverflow.com/a/27145655/1709587
        who were using this function before it was officially a public method

        Delegates unchanged to synset_from_pos_and_offset().
        """
        return self.synset_from_pos_and_offset(*args, **kwargs)
|
| 1576 |
+
|
| 1577 |
+
    def _synset_from_pos_and_line(self, pos, data_file_line):
        """Parse one line of a data.<pos> file into a fully populated
        Synset: gloss (definition + examples), lemmas, pointers, verb
        frames, per-lemma sense keys and the canonical dotted name.

        :raises WordNetError: if the line cannot be parsed.
        """
        # Construct a new (empty) synset.
        synset = Synset(self)

        # parse the entry for this synset
        try:

            # parse out the definitions and examples from the gloss
            columns_str, gloss = data_file_line.strip().split("|")
            definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
            examples = re.findall(r'"([^"]*)"', gloss)
            for example in examples:
                synset._examples.append(example)

            synset._definition = definition.strip("; ")

            # split the other info into fields
            _iter = iter(columns_str.split())

            def _next_token():
                return next(_iter)

            # get the offset
            synset._offset = int(_next_token())

            # determine the lexicographer file name
            lexname_index = int(_next_token())
            synset._lexname = self._lexnames[lexname_index]

            # get the part of speech
            synset._pos = _next_token()

            # create Lemma objects for each lemma (count is hexadecimal)
            n_lemmas = int(_next_token(), 16)
            for _ in range(n_lemmas):
                # get the lemma name
                lemma_name = _next_token()
                # get the lex_id (used for sense_keys)
                lex_id = int(_next_token(), 16)
                # If the lemma has a syntactic marker, extract it.
                m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
                lemma_name, syn_mark = m.groups()
                # create the lemma object
                lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
                synset._lemmas.append(lemma)
                synset._lemma_names.append(lemma._name)

            # collect the pointer tuples
            n_pointers = int(_next_token())
            for _ in range(n_pointers):
                symbol = _next_token()
                offset = int(_next_token())
                pos = _next_token()
                lemma_ids_str = _next_token()
                if lemma_ids_str == "0000":
                    # "0000" marks a synset-level pointer
                    synset._pointers[symbol].add((pos, offset))
                else:
                    # lemma-level pointer: source/target lemma numbers are
                    # packed as two 1-based hex bytes
                    source_index = int(lemma_ids_str[:2], 16) - 1
                    target_index = int(lemma_ids_str[2:], 16) - 1
                    source_lemma_name = synset._lemmas[source_index]._name
                    lemma_pointers = synset._lemma_pointers
                    tups = lemma_pointers[source_lemma_name, symbol]
                    tups.append((pos, offset, target_index))

            # read the verb frames
            try:
                frame_count = int(_next_token())
            except StopIteration:
                # non-verb entries have no frame section
                pass
            else:
                for _ in range(frame_count):
                    # read the plus sign
                    plus = _next_token()
                    assert plus == "+"
                    # read the frame and lemma number
                    frame_number = int(_next_token())
                    frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
                    lemma_number = int(_next_token(), 16)
                    # lemma number of 00 means all words in the synset
                    if lemma_number == 0:
                        synset._frame_ids.append(frame_number)
                        for lemma in synset._lemmas:
                            lemma._frame_ids.append(frame_number)
                            lemma._frame_strings.append(frame_string_fmt % lemma._name)
                    # only a specific word in the synset
                    else:
                        lemma = synset._lemmas[lemma_number - 1]
                        lemma._frame_ids.append(frame_number)
                        lemma._frame_strings.append(frame_string_fmt % lemma._name)

        # raise a more informative error with line text
        except ValueError as e:
            raise WordNetError(f"line {data_file_line!r}: {e}") from e

        # set sense keys for Lemma objects - note that this has to be
        # done afterwards so that the relations are available
        for lemma in synset._lemmas:
            if synset._pos == ADJ_SAT:
                # satellites borrow head word/id from their head synset
                head_lemma = synset.similar_tos()[0]._lemmas[0]
                head_name = head_lemma._name
                head_id = "%02d" % head_lemma._lex_id
            else:
                head_name = head_id = ""
            tup = (
                lemma._name,
                WordNetCorpusReader._pos_numbers[synset._pos],
                lemma._lexname_index,
                lemma._lex_id,
                head_name,
                head_id,
            )
            lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()

        # the canonical name is based on the first lemma
        lemma_name = synset._lemmas[0]._name.lower()
        offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
        sense_index = offsets.index(synset._offset)
        tup = lemma_name, synset._pos, sense_index + 1
        synset._name = "%s.%s.%02i" % tup

        return synset
|
| 1698 |
+
|
| 1699 |
+
def synset_from_sense_key(self, sense_key):
|
| 1700 |
+
"""
|
| 1701 |
+
Retrieves synset based on a given sense_key. Sense keys can be
|
| 1702 |
+
obtained from lemma.key()
|
| 1703 |
+
|
| 1704 |
+
From https://wordnet.princeton.edu/documentation/senseidx5wn:
|
| 1705 |
+
A sense_key is represented as::
|
| 1706 |
+
|
| 1707 |
+
lemma % lex_sense (e.g. 'dog%1:18:01::')
|
| 1708 |
+
|
| 1709 |
+
where lex_sense is encoded as::
|
| 1710 |
+
|
| 1711 |
+
ss_type:lex_filenum:lex_id:head_word:head_id
|
| 1712 |
+
|
| 1713 |
+
:lemma: ASCII text of word/collocation, in lower case
|
| 1714 |
+
:ss_type: synset type for the sense (1 digit int)
|
| 1715 |
+
The synset type is encoded as follows::
|
| 1716 |
+
|
| 1717 |
+
1 NOUN
|
| 1718 |
+
2 VERB
|
| 1719 |
+
3 ADJECTIVE
|
| 1720 |
+
4 ADVERB
|
| 1721 |
+
5 ADJECTIVE SATELLITE
|
| 1722 |
+
:lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
|
| 1723 |
+
:lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
|
| 1724 |
+
:head_word: lemma of the first word in satellite's head synset
|
| 1725 |
+
Only used if sense is in an adjective satellite synset
|
| 1726 |
+
:head_id: uniquely identifies sense in a lexicographer file when paired with head_word
|
| 1727 |
+
Only used if head_word is present (2 digit int)
|
| 1728 |
+
|
| 1729 |
+
>>> import nltk
|
| 1730 |
+
>>> from nltk.corpus import wordnet as wn
|
| 1731 |
+
>>> print(wn.synset_from_sense_key("drive%1:04:03::"))
|
| 1732 |
+
Synset('drive.n.06')
|
| 1733 |
+
|
| 1734 |
+
>>> print(wn.synset_from_sense_key("driving%1:04:03::"))
|
| 1735 |
+
Synset('drive.n.06')
|
| 1736 |
+
"""
|
| 1737 |
+
return self.lemma_from_key(sense_key).synset()
|
| 1738 |
+
|
| 1739 |
+
#############################################################
|
| 1740 |
+
# Retrieve synsets and lemmas.
|
| 1741 |
+
#############################################################
|
| 1742 |
+
|
| 1743 |
+
    def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
        """Load all synsets with a given lemma and part of speech tag.
        If no pos is specified, all synsets for all parts of speech
        will be loaded.
        If lang is specified, all the synsets associated with the lemma name
        of that language will be returned.

        :param check_exceptions: passed through to morphy when generating
            candidate base forms (English only).
        """
        lemma = lemma.lower()

        if lang == "eng":
            get_synset = self.synset_from_pos_and_offset
            index = self._lemma_pos_offset_map
            if pos is None:
                pos = POS_LIST
            # morphy yields candidate base forms for each pos; look each
            # one up in the lemma index.
            return [
                get_synset(p, offset)
                for p in pos
                for form in self._morphy(lemma, p, check_exceptions)
                for offset in index[form].get(p, [])
            ]

        else:
            self._load_lang_data(lang)
            synset_list = []
            # _lang_data[lang][1] maps lemma -> list of "<offset>-<pos>" ids.
            if lemma in self._lang_data[lang][1]:
                for l in self._lang_data[lang][1][lemma]:
                    if pos is not None and l[-1] != pos:
                        continue
                    synset_list.append(self.of2ss(l))
            return synset_list
|
| 1773 |
+
|
| 1774 |
+
def lemmas(self, lemma, pos=None, lang="eng"):
|
| 1775 |
+
"""Return all Lemma objects with a name matching the specified lemma
|
| 1776 |
+
name and part of speech tag. Matches any part of speech tag if none is
|
| 1777 |
+
specified."""
|
| 1778 |
+
|
| 1779 |
+
lemma = lemma.lower()
|
| 1780 |
+
if lang == "eng":
|
| 1781 |
+
return [
|
| 1782 |
+
lemma_obj
|
| 1783 |
+
for synset in self.synsets(lemma, pos)
|
| 1784 |
+
for lemma_obj in synset.lemmas()
|
| 1785 |
+
if lemma_obj.name().lower() == lemma
|
| 1786 |
+
]
|
| 1787 |
+
|
| 1788 |
+
else:
|
| 1789 |
+
self._load_lang_data(lang)
|
| 1790 |
+
lemmas = []
|
| 1791 |
+
syn = self.synsets(lemma, lang=lang)
|
| 1792 |
+
for s in syn:
|
| 1793 |
+
if pos is not None and s.pos() != pos:
|
| 1794 |
+
continue
|
| 1795 |
+
for lemma_obj in s.lemmas(lang=lang):
|
| 1796 |
+
if lemma_obj.name().lower() == lemma:
|
| 1797 |
+
lemmas.append(lemma_obj)
|
| 1798 |
+
return lemmas
|
| 1799 |
+
|
| 1800 |
+
def all_lemma_names(self, pos=None, lang="eng"):
|
| 1801 |
+
"""Return all lemma names for all synsets for the given
|
| 1802 |
+
part of speech tag and language or languages. If pos is
|
| 1803 |
+
not specified, all synsets for all parts of speech will
|
| 1804 |
+
be used."""
|
| 1805 |
+
|
| 1806 |
+
if lang == "eng":
|
| 1807 |
+
if pos is None:
|
| 1808 |
+
return iter(self._lemma_pos_offset_map)
|
| 1809 |
+
else:
|
| 1810 |
+
return (
|
| 1811 |
+
lemma
|
| 1812 |
+
for lemma in self._lemma_pos_offset_map
|
| 1813 |
+
if pos in self._lemma_pos_offset_map[lemma]
|
| 1814 |
+
)
|
| 1815 |
+
else:
|
| 1816 |
+
self._load_lang_data(lang)
|
| 1817 |
+
lemma = []
|
| 1818 |
+
for i in self._lang_data[lang][0]:
|
| 1819 |
+
if pos is not None and i[-1] != pos:
|
| 1820 |
+
continue
|
| 1821 |
+
lemma.extend(self._lang_data[lang][0][i])
|
| 1822 |
+
|
| 1823 |
+
lemma = iter(set(lemma))
|
| 1824 |
+
return lemma
|
| 1825 |
+
|
| 1826 |
+
    def all_omw_synsets(self, pos=None, lang=None):
        """Yield the synsets of the given OMW language, optionally
        restricted to one pos.

        NOTE(review): for an unsupported lang the early ``return None``
        just ends the generator (callers get an empty iterator, not
        None) — confirm that is the intended contract.
        """
        if lang not in self.langs():
            return None
        self._load_lang_data(lang)
        for of in self._lang_data[lang][0]:
            if not pos or of[-1] == pos:
                ss = self.of2ss(of)
                if ss:
                    yield ss

        # else:
        # A few OMW offsets don't exist in Wordnet 3.0.
        # warnings.warn(f"Language {lang}: no synset found for {of}")
|
| 1839 |
+
|
| 1840 |
+
def all_synsets(self, pos=None, lang="eng"):
|
| 1841 |
+
"""Iterate over all synsets with a given part of speech tag.
|
| 1842 |
+
If no pos is specified, all synsets for all parts of speech
|
| 1843 |
+
will be loaded.
|
| 1844 |
+
"""
|
| 1845 |
+
if lang == "eng":
|
| 1846 |
+
return self.all_eng_synsets(pos=pos)
|
| 1847 |
+
else:
|
| 1848 |
+
return self.all_omw_synsets(pos=pos, lang=lang)
|
| 1849 |
+
|
| 1850 |
+
    def all_eng_synsets(self, pos=None):
        """Yield every English synset for the given pos (or for all
        parts of speech when pos is None), reading each data.<pos>
        file sequentially and reusing the offset cache.
        """
        if pos is None:
            pos_tags = self._FILEMAP.keys()
        else:
            pos_tags = [pos]

        cache = self._synset_offset_cache
        from_pos_and_line = self._synset_from_pos_and_line

        # generate all synsets for each part of speech
        for pos_tag in pos_tags:
            # Open the file for reading.  Note that we can not re-use
            # the file pointers from self._data_file_map here, because
            # we're defining an iterator, and those file pointers might
            # be moved while we're not looking.
            if pos_tag == ADJ_SAT:
                pos_file = ADJ
            else:
                pos_file = pos_tag
            fileid = "data.%s" % self._FILEMAP[pos_file]
            data_file = self.open(fileid)

            try:
                # generate synsets for each line in the POS file
                offset = data_file.tell()
                line = data_file.readline()
                while line:
                    # data lines start at column 0; header lines are indented
                    if not line[0].isspace():
                        if offset in cache[pos_tag]:
                            # See if the synset is cached
                            synset = cache[pos_tag][offset]
                        else:
                            # Otherwise, parse the line
                            synset = from_pos_and_line(pos_tag, line)
                            cache[pos_tag][offset] = synset

                        # adjective satellites are in the same file as
                        # adjectives so only yield the synset if it's actually
                        # a satellite
                        if pos_tag == ADJ_SAT and synset._pos == ADJ_SAT:
                            yield synset
                        # for all other POS tags, yield all synsets (this means
                        # that adjectives also include adjective satellites)
                        elif pos_tag != ADJ_SAT:
                            yield synset
                    offset = data_file.tell()
                    line = data_file.readline()

            # close the extra file handle we opened (the bare except
            # re-raises, so no error is swallowed here)
            except:
                data_file.close()
                raise
            else:
                data_file.close()
|
| 1904 |
+
|
| 1905 |
+
    def words(self, lang="eng"):
        """return lemmas of the given language as list of words

        NOTE(review): this actually returns whatever iterator/generator
        all_lemma_names() produces, not a list — confirm callers don't
        index or re-iterate it.
        """
        return self.all_lemma_names(lang=lang)
|
| 1908 |
+
|
| 1909 |
+
def synonyms(self, word, lang="eng"):
|
| 1910 |
+
"""return nested list with the synonyms of the different senses of word in the given language"""
|
| 1911 |
+
return [
|
| 1912 |
+
sorted(list(set(ss.lemma_names(lang=lang)) - {word}))
|
| 1913 |
+
for ss in self.synsets(word, lang=lang)
|
| 1914 |
+
]
|
| 1915 |
+
|
| 1916 |
+
def doc(self, file="README", lang="eng"):
|
| 1917 |
+
"""Return the contents of readme, license or citation file
|
| 1918 |
+
use lang=lang to get the file for an individual language"""
|
| 1919 |
+
if lang == "eng":
|
| 1920 |
+
reader = self
|
| 1921 |
+
else:
|
| 1922 |
+
reader = self._omw_reader
|
| 1923 |
+
if lang in self.langs():
|
| 1924 |
+
file = f"{os.path.join(self.provenances[lang],file)}"
|
| 1925 |
+
try:
|
| 1926 |
+
with reader.open(file) as fp:
|
| 1927 |
+
return fp.read()
|
| 1928 |
+
except:
|
| 1929 |
+
if lang in self._lang_data:
|
| 1930 |
+
return f"Cannot determine {file} for {lang}"
|
| 1931 |
+
else:
|
| 1932 |
+
return f"Language {lang} is not supported."
|
| 1933 |
+
|
| 1934 |
+
def license(self, lang="eng"):
    """Return the contents of LICENSE (for omw).

    Use ``lang=lang`` to get the license for an individual language.
    """
    return self.doc(file="LICENSE", lang=lang)
|
| 1938 |
+
|
| 1939 |
+
def readme(self, lang="eng"):
    """Return the contents of README (for omw).

    Use ``lang=lang`` to get the readme for an individual language.
    """
    return self.doc(file="README", lang=lang)
|
| 1943 |
+
|
| 1944 |
+
def citation(self, lang="eng"):
    """Return the contents of the citation.bib file (for omw).

    Use ``lang=lang`` to get the citation for an individual language.
    """
    return self.doc(file="citation.bib", lang=lang)
|
| 1948 |
+
|
| 1949 |
+
#############################################################
|
| 1950 |
+
# Misc
|
| 1951 |
+
#############################################################
|
| 1952 |
+
def lemma_count(self, lemma):
    """Return the frequency count for this Lemma."""
    # Counts are currently available for English only.
    if lemma._lang != "eng":
        return 0
    # Lazily open the count file the first time a count is requested.
    if self._key_count_file is None:
        self._key_count_file = self.open("cntlist.rev")
    # Locate the sense key in the (sorted) counts file; the count is
    # the last whitespace-separated field of the matching line.
    line = _binary_search_file(self._key_count_file, lemma._key)
    return int(line.rsplit(" ", 1)[-1]) if line else 0
|
| 1966 |
+
|
| 1967 |
+
# Thin delegating wrappers so similarity metrics can be called on the
# corpus reader as well as on Synset objects.  Docstrings are copied
# from the corresponding Synset methods.

def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

path_similarity.__doc__ = Synset.path_similarity.__doc__

def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

lch_similarity.__doc__ = Synset.lch_similarity.__doc__

def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

wup_similarity.__doc__ = Synset.wup_similarity.__doc__

def res_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose=verbose)

res_similarity.__doc__ = Synset.res_similarity.__doc__

def jcn_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose=verbose)

jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__

def lin_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose=verbose)

lin_similarity.__doc__ = Synset.lin_similarity.__doc__
|
| 1996 |
+
|
| 1997 |
+
#############################################################
|
| 1998 |
+
# Morphy
|
| 1999 |
+
#############################################################
|
| 2000 |
+
# Morphy, adapted from Oliver Steele's pywordnet
|
| 2001 |
+
def morphy(self, form, pos=None, check_exceptions=True):
    """
    Find a possible base form for the given form, with the given
    part of speech, by checking WordNet's list of exceptional
    forms, and by recursively stripping affixes for this part of
    speech until a form in WordNet is found.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.morphy('dogs'))
    dog
    >>> print(wn.morphy('churches'))
    church
    >>> print(wn.morphy('aardwolves'))
    aardwolf
    >>> print(wn.morphy('abaci'))
    abacus
    >>> wn.morphy('hardrock', wn.ADV)
    >>> print(wn.morphy('book', wn.NOUN))
    book
    >>> wn.morphy('book', wn.ADJ)
    """
    if pos is None:
        # No POS given: try every part of speech, lazily, in order.
        analyses = chain.from_iterable(
            self._morphy(form, p) for p in POS_LIST
        )
    else:
        analyses = self._morphy(form, pos, check_exceptions)

    # Return the first analysis found, or None if there is none.
    return next(iter(analyses), None)
|
| 2035 |
+
|
| 2036 |
+
# Morphy's suffix "rules of detachment": each (old, new) pair means
# "if the form ends with `old`, strip it and append `new`" to produce
# a candidate base form.  NOTE(review): list order appears significant
# (rules are applied in order by _morphy), so do not reorder.
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN: [
        ("s", ""),
        ("ses", "s"),
        ("ves", "f"),
        ("xes", "x"),
        ("zes", "z"),
        ("ches", "ch"),
        ("shes", "sh"),
        ("men", "man"),
        ("ies", "y"),
    ],
    VERB: [
        ("s", ""),
        ("ies", "y"),
        ("es", "e"),
        ("es", ""),
        ("ed", "e"),
        ("ed", ""),
        ("ing", "e"),
        ("ing", ""),
    ],
    ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
    # Adverbs have no detachment rules.
    ADV: [],
}

# Adjective satellites use the same rules as adjectives.
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
|
| 2063 |
+
|
| 2064 |
+
def _morphy(self, form, pos, check_exceptions=True):
    # Morphy algorithm (after jordanbg).  Given an original string x:
    #   1. Apply the substitution rules once to get y1, y2, y3, ...
    #   2. Return all candidates found in the database.
    #   3. If there are no matches, keep applying rules until a match
    #      is found or no further candidates can be generated.
    exceptions = self._exception_map[pos]
    substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]

    def apply_rules(candidates):
        # One round of suffix rewriting over every candidate form.
        return [
            cand[: -len(old)] + new
            for cand in candidates
            for old, new in substitutions
            if cand.endswith(old)
        ]

    def filter_forms(candidates):
        # Keep only forms WordNet knows for this POS, preserving order
        # and dropping duplicates (dict preserves insertion order).
        known = dict.fromkeys(
            cand
            for cand in candidates
            if pos in self._lemma_pos_offset_map.get(cand, ())
        )
        return list(known)

    # 0. Check the exception lists (irregular forms).
    if check_exceptions and form in exceptions:
        return filter_forms([form] + exceptions[form])

    # 1. Apply the rules once to the input.
    candidates = apply_rules([form])

    # 2. Return everything already in the database (including the
    # original form itself).
    matches = filter_forms([form] + candidates)
    if matches:
        return matches

    # 3. Keep rewriting until we hit the database or run out of
    # candidate forms.
    while candidates:
        candidates = apply_rules(candidates)
        matches = filter_forms(candidates)
        if matches:
            return matches

    # Nothing could be found.
    return []
|
| 2116 |
+
|
| 2117 |
+
#############################################################
|
| 2118 |
+
# Create information content from corpus
|
| 2119 |
+
#############################################################
|
| 2120 |
+
def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
    """
    Creates an information content lookup dictionary from a corpus.

    :type corpus: CorpusReader
    :param corpus: The corpus from which we create an information
        content dictionary.
    :type weight_senses_equally: bool
    :param weight_senses_equally: If this is True, gives all
        possible senses equal weight rather than dividing by the
        number of possible senses.  (If a word has 3 synsets, each
        sense gets 0.3333 per appearance when this is False, 1.0 when
        it is true.)
    :param smoothing: How much do we smooth synset counts (default is 1.0)
    :type smoothing: float
    :return: An information content dictionary
    """
    # Raw token frequencies over the whole corpus.
    counts = FreqDist()
    for ww in corpus.words():
        counts[ww] += 1

    # One float-valued table per part of speech; key 0 holds the
    # (artificial) root count for that POS.
    ic = {}
    for pp in POS_LIST:
        ic[pp] = defaultdict(float)

    # Initialize the counts with the smoothing value
    if smoothing > 0.0:
        for pp in POS_LIST:
            ic[pp][0] = smoothing
        for ss in self.all_synsets():
            pos = ss._pos
            # Adjective satellites are tallied under ADJ.
            if pos == ADJ_SAT:
                pos = ADJ
            ic[pos][ss._offset] = smoothing

    for ww in counts:
        possible_synsets = self.synsets(ww)
        if len(possible_synsets) == 0:
            continue

        # Distribute weight among possible synsets
        weight = float(counts[ww])
        if not weight_senses_equally:
            weight /= float(len(possible_synsets))

        for ss in possible_synsets:
            pos = ss._pos
            if pos == ADJ_SAT:
                pos = ADJ
            # Each occurrence also counts for every hypernym of the
            # synset, because synset counts are cumulative up the tree.
            for level in ss._iter_hypernym_lists():
                for hh in level:
                    ic[pos][hh._offset] += weight
            # Add the weight to the root
            ic[pos][0] += weight
    return ic
|
| 2175 |
+
|
| 2176 |
+
def custom_lemmas(self, tab_file, lang):
    """
    Reads a custom tab file containing mappings of lemmas in the given
    language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
    WordNet functions to then be used with that language.

    See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
    documentation on the Multilingual WordNet tab file format.

    :param tab_file: Tab file as a file or file-like object
    :type: lang str
    :param: lang ISO 639-3 code of the language of the tab file
    """
    # Language codes may carry a variant suffix (e.g. "zh_s"); only
    # the 3-letter base code is validated here.
    lg = lang.split("_")[0]
    if len(lg) != 3:
        raise ValueError("lang should be a (3 character) ISO 639-3 code")
    # Four tables, indexed by position in self.lg_attrs
    # (presumably [lemma-by-offset, lemma-by-name, ...] — TODO confirm
    # against the lg_attrs definition).
    self._lang_data[lang] = [
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
    ]
    for line in tab_file.readlines():
        if isinstance(line, bytes):
            # Support byte-stream files (e.g. as returned by Python 2's
            # open() function) as well as text-stream ones
            line = line.decode("utf-8")
        if not line.startswith("#"):
            # Expected format: offset-pos <TAB> label <TAB> ... <TAB> value
            triple = line.strip().split("\t")
            if len(triple) < 3:
                continue
            offset_pos, label = triple[:2]
            val = triple[-1]
            if self.map30:
                if offset_pos in self.map30:
                    # Map offset_pos to current Wordnet version:
                    offset_pos = self.map30[offset_pos]
                else:
                    # Some OMW offsets were never in Wordnet:
                    if (
                        offset_pos not in self.nomap
                        and offset_pos.replace("a", "s") not in self.nomap
                    ):
                        warnings.warn(
                            f"{lang}: invalid offset {offset_pos} in '{line}'"
                        )
                    continue
            elif offset_pos[-1] == "a":
                wnss = self.of2ss(offset_pos)
                if wnss and wnss.pos() == "s":  # Wordnet pos is "s"
                    # Label OMW adjective satellites back to their Wordnet pos ("s")
                    offset_pos = self.ss2of(wnss)
            # The label may be qualified with a language prefix
            # ("lg:attr"); unqualified labels apply to every language.
            pair = label.split(":")
            attr = pair[-1]
            if len(pair) == 1 or pair[0] == lg:
                if attr == "lemma":
                    # Lemmas are stored lowercased with spaces joined
                    # by underscores, mapping name -> offsets.
                    val = val.strip().replace(" ", "_")
                    self._lang_data[lang][1][val.lower()].append(offset_pos)
                if attr in self.lg_attrs:
                    # And in the attribute's own table, offset -> values.
                    self._lang_data[lang][self.lg_attrs.index(attr)][
                        offset_pos
                    ].append(val)
|
| 2238 |
+
|
| 2239 |
+
def disable_custom_lemmas(self, lang):
    """Prevent synsets from being mistakenly added."""
    # Freeze each per-attribute defaultdict so that lookups of unknown
    # keys raise KeyError instead of silently inserting empty lists.
    for table in self._lang_data[lang][: len(self.lg_attrs)]:
        table.default_factory = None
|
| 2243 |
+
|
| 2244 |
+
######################################################################
|
| 2245 |
+
# Visualize WordNet relation graphs using Graphviz
|
| 2246 |
+
######################################################################
|
| 2247 |
+
|
| 2248 |
+
def digraph(
    self,
    inputs,
    rel=lambda s: s.hypernyms(),
    pos=None,
    maxdepth=-1,
    shapes=None,
    attr=None,
    verbose=False,
):
    """
    Produce a graphical representation from 'inputs' (a list of
    start nodes, which can be a mix of Synsets, Lemmas and/or words),
    and a synset relation, for drawing with the 'dot' graph visualisation
    program from the Graphviz package.

    Return a string in the DOT graph file language, which can then be
    converted to an image by nltk.parse.dependencygraph.dot2img(dot_string).

    Optional Parameters:
    :rel: Wordnet synset relation
    :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r'
    :maxdepth: limit the longest path
    :shapes: dictionary of strings that trigger a specified shape
    :attr: dictionary with global graph attributes
    :verbose: warn about cycles

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.digraph([wn.synset('dog.n.01')]))
    digraph G {
    "Synset('animal.n.01')" -> "Synset('organism.n.01')";
    "Synset('canine.n.02')" -> "Synset('carnivore.n.01')";
    "Synset('carnivore.n.01')" -> "Synset('placental.n.01')";
    "Synset('chordate.n.01')" -> "Synset('animal.n.01')";
    "Synset('dog.n.01')" -> "Synset('canine.n.02')";
    "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')";
    "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')";
    "Synset('living_thing.n.01')" -> "Synset('whole.n.02')";
    "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')";
    "Synset('object.n.01')" -> "Synset('physical_entity.n.01')";
    "Synset('organism.n.01')" -> "Synset('living_thing.n.01')";
    "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')";
    "Synset('placental.n.01')" -> "Synset('mammal.n.01')";
    "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')";
    "Synset('whole.n.02')" -> "Synset('object.n.01')";
    }
    <BLANKLINE>
    """
    # Imported locally to avoid a circular import at module load time
    # — TODO confirm; nltk.util may import from this module.
    from nltk.util import edge_closure, edges2dot

    synsets = set()
    edges = set()
    # Avoid mutable default arguments by creating fresh dicts here.
    if not shapes:
        shapes = dict()
    if not attr:
        attr = dict()

    def add_lemma(lem):
        # A Lemma contributes its synset plus a lemma->synset edge.
        ss = lem.synset()
        synsets.add(ss)
        edges.add((lem, ss))

    # Normalize the mixed input list to a set of start synsets.
    for node in inputs:
        typ = type(node)
        if typ == Synset:
            synsets.add(node)
        elif typ == Lemma:
            add_lemma(node)
        elif typ == str:
            for lemma in self.lemmas(node, pos):
                add_lemma(lemma)

    # Expand each start synset along `rel` (up to maxdepth) and
    # render the accumulated edge set as DOT.
    for ss in synsets:
        edges = edges.union(edge_closure(ss, rel, maxdepth, verbose))
    dot_string = edges2dot(sorted(list(edges)), shapes=shapes, attr=attr)
    return dot_string
|
| 2324 |
+
|
| 2325 |
+
|
| 2326 |
+
######################################################################
|
| 2327 |
+
# WordNet Information Content Corpus Reader
|
| 2328 |
+
######################################################################
|
| 2329 |
+
|
| 2330 |
+
|
| 2331 |
+
class WordNetICCorpusReader(CorpusReader):
    """
    A corpus reader for the WordNet information content corpus.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, encoding="utf8")

    # this load function would be more efficient if the data was pickled
    # Note that we can't use NLTK's frequency distributions because
    # synsets are overlapping (each instance of a synset also counts
    # as an instance of its hypernyms)
    def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary.  This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        synsets to information content values.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {}
        ic[NOUN] = defaultdict(float)
        ic[VERB] = defaultdict(float)
        with self.open(icfile) as fp:
            for num, line in enumerate(fp):
                if num == 0:  # skip the header
                    continue
                # Each record is "<offset><pos-char> <value> [ROOT]".
                fields = line.split()
                offset = int(fields[0][:-1])
                value = float(fields[1])
                pos = _get_pos(fields[0])
                if len(fields) == 3 and fields[2] == "ROOT":
                    # Store root count.
                    ic[pos][0] += value
                # Zero-valued entries are omitted; defaultdict returns
                # 0.0 for them on lookup anyway.
                if value != 0:
                    ic[pos][offset] = value
        return ic
|
| 2371 |
+
|
| 2372 |
+
|
| 2373 |
+
######################################################################
|
| 2374 |
+
# Similarity metrics
|
| 2375 |
+
######################################################################
|
| 2376 |
+
|
| 2377 |
+
# TODO: Add in the option to manually add a new root node; this will be
|
| 2378 |
+
# useful for verb similarity as there exist multiple verb taxonomies.
|
| 2379 |
+
|
| 2380 |
+
# More information about the metrics is available at
|
| 2381 |
+
# http://marimba.d.umn.edu/similarity/measures.html
|
| 2382 |
+
|
| 2383 |
+
|
| 2384 |
+
# Module-level similarity helpers.  Each simply delegates to the
# corresponding Synset method; docstrings are copied below.


def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(synset2, verbose, simulate_root)


def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(synset2, verbose, simulate_root)


def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(synset2, verbose, simulate_root)


def res_similarity(synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose)


def jcn_similarity(synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose)


def lin_similarity(synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose)


path_similarity.__doc__ = Synset.path_similarity.__doc__
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
res_similarity.__doc__ = Synset.res_similarity.__doc__
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
|
| 2416 |
+
|
| 2417 |
+
|
| 2418 |
+
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.  If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content object (as returned by ``load_ic()``).
    :return: The information content of the two synsets and their most
        informative subsumer
    """
    # The metric is undefined across parts of speech.
    if synset1._pos != synset2._pos:
        raise WordNetError(
            "Computing the least common subsumer requires "
            "%s and %s to have the same part of speech." % (synset1, synset2)
        )

    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    subsumers = synset1.common_hypernyms(synset2)
    # No explicit common subsumer -> artificial root with IC 0.
    subsumer_ic = max(
        (information_content(s, ic) for s in subsumers), default=0
    )

    if verbose:
        print("> LCS Subsumer by content:", subsumer_ic)

    return ic1, ic2, subsumer_ic
|
| 2453 |
+
|
| 2454 |
+
|
| 2455 |
+
# Utility functions
|
| 2456 |
+
|
| 2457 |
+
|
| 2458 |
+
def information_content(synset, ic):
    """Return the information content of *synset* under the IC table *ic*."""
    # Adjective satellites are tabulated under ADJ.
    pos = ADJ if synset._pos == ADJ_SAT else synset._pos
    try:
        icpos = ic[pos]
    except KeyError as e:
        msg = "Information content file has no entries for part-of-speech: %s"
        raise WordNetError(msg % pos) from e

    counts = icpos[synset._offset]
    # A zero count means the synset was never observed: infinite IC.
    # Otherwise IC = -log(p), with p relative to the root count at key 0.
    return _INF if counts == 0 else -math.log(counts / icpos[0])
|
| 2473 |
+
|
| 2474 |
+
|
| 2475 |
+
# get the part of speech (NOUN or VERB) from the information content record
|
| 2476 |
+
# (each identifier has a 'n' or 'v' suffix)
|
| 2477 |
+
|
| 2478 |
+
|
| 2479 |
+
def _get_pos(field):
    """Return NOUN or VERB from an information-content record identifier.

    Each identifier carries an 'n' or 'v' suffix.
    """
    suffix = field[-1]
    if suffix == "n":
        return NOUN
    if suffix == "v":
        return VERB
    raise ValueError(
        "Unidentified part of speech in WordNet Information Content file "
        "for field %s" % field
    )
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: XML Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for corpora whose documents are xml files.
|
| 10 |
+
|
| 11 |
+
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import codecs
|
| 15 |
+
from xml.etree import ElementTree
|
| 16 |
+
|
| 17 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.data import SeekableUnicodeStreamReader
|
| 20 |
+
from nltk.internals import ElementWrapper
|
| 21 |
+
from nltk.tokenize import WordPunctTokenizer
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # When True, parsed elements are wrapped in ElementWrapper
        # before being returned by xml().
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """Return the XML document for *fileid* as an ElementTree element
        (wrapped in an ``ElementWrapper`` if requested at construction).
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # Element.getiterator() was removed in Python 3.9; Element.iter()
        # has existed since Python 2.7, so use it unconditionally instead
        # of the old try/bare-except fallback (which also masked
        # unrelated errors raised during iteration setup).
        out = []
        for node in elt.iter():
            text = node.text
            if text is not None:
                # Support byte-valued text nodes as well as str ones.
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                out.extend(word_tokenizer.tokenize(text))
        return out
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class XMLCorpusView(StreamBackedCorpusView):
|
| 82 |
+
"""
|
| 83 |
+
A corpus view that selects out specified elements from an XML
|
| 84 |
+
file, and provides a flat list-like interface for accessing them.
|
| 85 |
+
(Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
|
| 86 |
+
but may be used by subclasses of ``XMLCorpusReader``.)
|
| 87 |
+
|
| 88 |
+
Every XML corpus view has a "tag specification", indicating what
|
| 89 |
+
XML elements should be included in the view; and each (non-nested)
|
| 90 |
+
element that matches this specification corresponds to one item in
|
| 91 |
+
the view. Tag specifications are regular expressions over tag
|
| 92 |
+
paths, where a tag path is a list of element tag names, separated
|
| 93 |
+
by '/', indicating the ancestry of the element. Some examples:
|
| 94 |
+
|
| 95 |
+
- ``'foo'``: A top-level element whose tag is ``foo``.
|
| 96 |
+
- ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
|
| 97 |
+
is a top-level element whose tag is ``foo``.
|
| 98 |
+
- ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
|
| 99 |
+
in the xml tree.
|
| 100 |
+
- ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
|
| 101 |
+
appearing anywhere in the xml tree.
|
| 102 |
+
|
| 103 |
+
The view items are generated from the selected XML elements via
|
| 104 |
+
the method ``handle_elt()``. By default, this method returns the
|
| 105 |
+
element as-is (i.e., as an ElementTree object); but it can be
|
| 106 |
+
overridden, either via subclassing or via the ``elt_handler``
|
| 107 |
+
constructor parameter.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
#: If true, then display debugging output to stdout when reading
|
| 111 |
+
#: blocks.
|
| 112 |
+
_DEBUG = False
|
| 113 |
+
|
| 114 |
+
#: The number of characters read at a time by this corpus reader.
|
| 115 |
+
_BLOCK_SIZE = 1024
|
| 116 |
+
|
| 117 |
+
def __init__(self, fileid, tagspec, elt_handler=None):
    """
    Create a new corpus view based on a specified XML file.

    Note that the ``XMLCorpusView`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is
    specified by the XML files themselves.

    :type tagspec: str
    :param tagspec: A tag specification, indicating what XML
        elements should be included in the view.  Each non-nested
        element that matches this specification corresponds to one
        item in the view.

    :param elt_handler: A function used to transform each element
        to a value for the view.  If no handler is specified, then
        ``self.handle_elt()`` is called, which returns the element
        as an ElementTree object.  The signature of elt_handler is::

            elt_handler(elt, tagspec) -> value
    """
    # A caller-supplied handler shadows the default handle_elt method
    # on this instance only.
    if elt_handler:
        self.handle_elt = elt_handler

    # Anchor the pattern so the tag path must match in full.
    self._tagspec = re.compile(tagspec + r"\Z")
    """The tag specification for this corpus view."""

    self._tag_context = {0: ()}
    """A dictionary mapping from file positions (as returned by
    ``stream.seek()`` to XML contexts.  An XML context is a
    tuple of XML tag names, indicating which tags have not yet
    been closed."""

    # Sniff the file's encoding (BOM / XML declaration) before handing
    # it to the stream-backed base class.
    encoding = self._detect_encoding(fileid)
    StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
|
| 152 |
+
|
| 153 |
+
def _detect_encoding(self, fileid):
    """
    Sniff the character encoding of the XML file ``fileid``.

    A Unicode byte-order mark takes precedence; otherwise an
    ``encoding="..."`` (or ``encoding='...'``) attribute in the XML
    declaration on the first line is used.  Falls back to ``"utf-8"``
    when neither is present.

    :param fileid: A path string or ``PathPointer`` naming the XML file.
    :return: The detected encoding name.
    :rtype: str
    """
    if isinstance(fileid, PathPointer):
        # Open *before* the try block: if open() itself raises, the
        # original code's finally clause referenced an unbound
        # ``infile``, masking the real error with an UnboundLocalError.
        infile = fileid.open()
        try:
            s = infile.readline()
        finally:
            infile.close()
    else:
        with open(fileid, "rb") as infile:
            s = infile.readline()
    # Check UTF-32 BOMs before UTF-16: the UTF-32-LE BOM
    # (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE), so the
    # reverse order misdetects UTF-32-LE files as UTF-16-LE.
    if s.startswith(codecs.BOM_UTF32_BE):
        return "utf-32-be"
    if s.startswith(codecs.BOM_UTF32_LE):
        return "utf-32-le"
    if s.startswith(codecs.BOM_UTF16_BE):
        return "utf-16-be"
    if s.startswith(codecs.BOM_UTF16_LE):
        return "utf-16-le"
    if s.startswith(codecs.BOM_UTF8):
        return "utf-8"
    m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1).decode()
    m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1).decode()
    # No BOM and no declaration -- default to UTF-8.
    return "utf-8"
|
| 181 |
+
|
| 182 |
+
def handle_elt(self, elt, context):
    """
    Convert an element into an appropriate value for inclusion in
    the view.

    Unless overridden by a subclass or by the ``elt_handler``
    constructor argument, this is the identity transform: the
    element itself is used as the view value.

    :type elt: ElementTree
    :param elt: The element that should be converted.

    :type context: str
    :param context: A string composed of element tags separated by
        forward slashes, indicating the XML context of the given
        element.  For example, the string ``'foo/bar/baz'``
        indicates that the element is a ``baz`` element whose
        parent is a ``bar`` element and whose grandparent is a
        top-level ``foo`` element.

    :return: The view value corresponding to ``elt``.
    """
    return elt
|
| 203 |
+
|
| 204 |
+
#: A regular expression that matches XML fragments that do not
#: contain any un-closed tags.  Note the CDATA alternative is
#: written without escaped brackets -- preserved as-is; it still
#: matches the CDATA forms seen in practice.  TODO confirm upstream.
_VALID_XML_RE = re.compile(
    r"""
    [^<]*
    (
      ((<!--.*?-->) | # comment
       (<![CDATA[.*?]]) | # raw character data
       (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
       (<[^!>][^>]*>)) # tag or PI
      [^<]*)*
    \Z""",
    re.DOTALL | re.VERBOSE,
)

#: A regular expression used to extract the tag name from a start tag,
#: end tag, or empty-elt tag string.  An optional leading '/' (with
#: surrounding whitespace) is skipped so end tags yield the bare name.
_XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

#: A regular expression used to find all start-tags, end-tags, and
#: empty-elt tags in an XML file.  This regexp is more lenient than
#: the XML spec -- e.g., it allows spaces in some places where the
#: spec does not.  The named groups are consumed by ``read_block``.
_XML_PIECE = re.compile(
    r"""
    # Include these so we can skip them:
    (?P<COMMENT> <!--.*?--> )|
    (?P<CDATA> <![CDATA[.*?]]> )|
    (?P<PI> <\?.*?\?> )|
    (?P<DOCTYPE> <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
    # These are the ones we actually care about:
    (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
    (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
    (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
    re.DOTALL | re.VERBOSE,
)
|
| 240 |
+
|
| 241 |
+
def _read_xml_fragment(self, stream):
    """
    Read a string from the given stream that does not contain any
    un-closed tags.  In particular, this function first reads a
    block from the stream of size ``self._BLOCK_SIZE``.  It then
    checks if that block contains an un-closed tag.  If it does,
    then this function either backtracks to the last '<', or reads
    another block.

    :raises ValueError: If a stray '>' is found, or if the file ends
        inside an un-closed tag.
    """
    fragment = ""

    if isinstance(stream, SeekableUnicodeStreamReader):
        startpos = stream.tell()
    while True:
        # Read a block and add it to the fragment.
        xml_block = stream.read(self._BLOCK_SIZE)
        fragment += xml_block

        # Do we have a well-formed xml fragment?
        if self._VALID_XML_RE.match(fragment):
            return fragment

        # Do we have a fragment that will never be well-formed?
        # (Search once and reuse the match -- the original code ran
        # the same re.search twice.)
        bracket = re.search("[<>]", fragment)
        if bracket.group(0) == ">":
            pos = stream.tell() - (len(fragment) - bracket.end())
            raise ValueError('Unexpected ">" near char %s' % pos)

        # End of file?
        if not xml_block:
            raise ValueError("Unexpected end of file: tag not closed")

        # If not, then we must be in the middle of a <..tag..>.
        # If appropriate, backtrack to the most recent '<'
        # character.
        last_open_bracket = fragment.rfind("<")
        if last_open_bracket > 0:
            if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                if isinstance(stream, SeekableUnicodeStreamReader):
                    # Character-accurate reposition for decoded streams.
                    stream.seek(startpos)
                    stream.char_seek_forward(last_open_bracket)
                else:
                    stream.seek(-(len(fragment) - last_open_bracket), 1)
                return fragment[:last_open_bracket]

        # Otherwise, read another block. (i.e., return to the
        # top of the loop.)
|
| 290 |
+
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Read from ``stream`` until we find at least one element that
    matches ``tagspec``, and return the result of applying
    ``elt_handler`` to each element found.

    :param stream: The character stream to read from; its position
        must be one recorded in ``self._tag_context``.
    :param tagspec: Compiled or string regex matched against the
        slash-joined tag context; defaults to ``self._tagspec``.
    :param elt_handler: Per-element transform; defaults to
        ``self.handle_elt``.
    """
    if tagspec is None:
        tagspec = self._tagspec
    if elt_handler is None:
        elt_handler = self.handle_elt

    # Use a stack of strings to keep track of our context:
    context = list(self._tag_context.get(stream.tell()))
    # NOTE(review): list(None) would already raise TypeError above,
    # so this assert can never fire -- kept as in the original.
    assert context is not None  # check this -- could it ever happen?

    elts = []

    elt_start = None  # where does the elt start
    elt_depth = None  # what context depth
    elt_text = ""

    # Loop until at least one element is collected AND we are not in
    # the middle of an element spanning a fragment boundary.
    while elts == [] or elt_start is not None:
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        xml_fragment = self._read_xml_fragment(stream)

        # End of file.
        if not xml_fragment:
            if elt_start is None:
                break
            else:
                raise ValueError("Unexpected end of file")

        # Process each <tag> in the xml fragment.
        for piece in self._XML_PIECE.finditer(xml_fragment):
            if self._DEBUG:
                print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

            if piece.group("START_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Keep context up-to-date.
                context.append(name)
                # Is this one of the elts we're looking for?
                if elt_start is None:
                    if re.match(tagspec, "/".join(context)):
                        elt_start = piece.start()
                        elt_depth = len(context)

            elif piece.group("END_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # sanity checks:
                if not context:
                    raise ValueError("Unmatched tag </%s>" % name)
                if name != context[-1]:
                    raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                # Is this the end of an element?
                if elt_start is not None and elt_depth == len(context):
                    elt_text += xml_fragment[elt_start : piece.end()]
                    elts.append((elt_text, "/".join(context)))
                    elt_start = elt_depth = None
                    elt_text = ""
                # Keep context up-to-date
                context.pop()

            elif piece.group("EMPTY_ELT_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # An empty element is complete by itself -- record it
                # directly if it matches the tagspec.
                if elt_start is None:
                    if re.match(tagspec, "/".join(context) + "/" + name):
                        elts.append((piece.group(), "/".join(context) + "/" + name))

        # If we ended the fragment inside an element:
        if elt_start is not None:
            # If we haven't found any elements yet, then keep
            # looping until we do.
            if elts == []:
                elt_text += xml_fragment[elt_start:]
                elt_start = 0

            # If we've found at least one element, then try
            # backtracking to the start of the element that we're
            # inside of.
            else:
                # take back the last start-tag, and return what
                # we've gotten so far (elts is non-empty).
                if self._DEBUG:
                    print(" " * 36 + "(backtrack)")
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(elt_start)
                else:
                    stream.seek(-(len(xml_fragment) - elt_start), 1)
                context = context[: elt_depth - 1]
                elt_start = elt_depth = None
                elt_text = ""

    # Update the _tag_context dict so a later read resuming at this
    # position knows which tags are still open.
    pos = stream.tell()
    if pos in self._tag_context:
        assert tuple(context) == self._tag_context[pos]
    else:
        self._tag_context[pos] = tuple(context)

    # Entity-escape non-ASCII characters so ElementTree can parse the
    # collected text regardless of the source encoding.
    return [
        elt_handler(
            ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
            context,
        )
        for (elt, context) in elts
    ]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2015 NLTK Project
|
| 4 |
+
# Author: Selina Dennis <selina@tranzfusion.net>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
|
| 10 |
+
English Prose (YCOE), a 1.5 million word syntactically-annotated
|
| 11 |
+
corpus of Old English prose texts. The corpus is distributed by the
|
| 12 |
+
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
|
| 13 |
+
with NLTK.
|
| 14 |
+
|
| 15 |
+
The YCOE corpus is divided into 100 files, each representing
|
| 16 |
+
an Old English prose text. Tags used within each text complies
|
| 17 |
+
to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
|
| 23 |
+
from nltk.corpus.reader.api import *
|
| 24 |
+
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
| 25 |
+
from nltk.corpus.reader.tagged import TaggedCorpusReader
|
| 26 |
+
from nltk.corpus.reader.util import *
|
| 27 |
+
from nltk.tokenize import RegexpTokenizer
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    Each document is stored as two files: a ``.pos`` file of tagged
    text (read by a :class:`YCOETaggedCorpusReader`) and a ``.psd``
    file of bracketed parse trees (read by a
    :class:`YCOEParseCorpusReader`).
    """

    def __init__(self, root, encoding="utf8"):
        # First pass: initialize with an empty fileid list so that
        # ``self.root`` is available while building the sub-readers.
        CorpusReader.__init__(self, root, [], encoding)

        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Second pass: re-initialize with the real fileid list.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: report the specific offending file id; the
                # original interpolated the entire ``fileids`` list.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :raises ValueError: If a file identifier (rather than a
            document identifier) was given, or if the document is
            unknown.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in (".pos", ".psd"):
                        # Bug fix: close the parenthesis opened in
                        # the error message.
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers.)"
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Remove (CODE ...) and (ID ...) annotation nodes before
        # handing the string to the standard bracket parser.
        cleaned = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree reduced to an empty pair of brackets has no content.
        if re.match(r"\s*\(\s*\)\s*$", cleaned):
            return None
        return BracketParseCorpusReader._parse(self, cleaned)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged-corpus reader for the '.pos' files of the YCOE corpus."""

    def __init__(self, root, items, encoding="utf8"):
        # Sentence boundaries follow tokens tagged '/.'; tokens tagged
        # _CODE or _ID are annotation artifacts, dropped as gaps.
        # NOTE(review): ``encoding`` is accepted but deliberately not
        # forwarded to TaggedCorpusReader, matching the original --
        # YCOECorpusReader passes ".pos" positionally into this slot.
        sentence_splitter = RegexpTokenizer(
            r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*", gaps=True
        )
        TaggedCorpusReader.__init__(
            self, root, items, sep="_", sent_tokenizer=sentence_splitter
        )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
#: A list of all documents and their titles in ycoe.
|
| 155 |
+
documents = {
|
| 156 |
+
"coadrian.o34": "Adrian and Ritheus",
|
| 157 |
+
"coaelhom.o3": "Ælfric, Supplemental Homilies",
|
| 158 |
+
"coaelive.o3": "Ælfric's Lives of Saints",
|
| 159 |
+
"coalcuin": "Alcuin De virtutibus et vitiis",
|
| 160 |
+
"coalex.o23": "Alexander's Letter to Aristotle",
|
| 161 |
+
"coapollo.o3": "Apollonius of Tyre",
|
| 162 |
+
"coaugust": "Augustine",
|
| 163 |
+
"cobede.o2": "Bede's History of the English Church",
|
| 164 |
+
"cobenrul.o3": "Benedictine Rule",
|
| 165 |
+
"coblick.o23": "Blickling Homilies",
|
| 166 |
+
"coboeth.o2": "Boethius' Consolation of Philosophy",
|
| 167 |
+
"cobyrhtf.o3": "Byrhtferth's Manual",
|
| 168 |
+
"cocanedgD": "Canons of Edgar (D)",
|
| 169 |
+
"cocanedgX": "Canons of Edgar (X)",
|
| 170 |
+
"cocathom1.o3": "Ælfric's Catholic Homilies I",
|
| 171 |
+
"cocathom2.o3": "Ælfric's Catholic Homilies II",
|
| 172 |
+
"cochad.o24": "Saint Chad",
|
| 173 |
+
"cochdrul": "Chrodegang of Metz, Rule",
|
| 174 |
+
"cochristoph": "Saint Christopher",
|
| 175 |
+
"cochronA.o23": "Anglo-Saxon Chronicle A",
|
| 176 |
+
"cochronC": "Anglo-Saxon Chronicle C",
|
| 177 |
+
"cochronD": "Anglo-Saxon Chronicle D",
|
| 178 |
+
"cochronE.o34": "Anglo-Saxon Chronicle E",
|
| 179 |
+
"cocura.o2": "Cura Pastoralis",
|
| 180 |
+
"cocuraC": "Cura Pastoralis (Cotton)",
|
| 181 |
+
"codicts.o34": "Dicts of Cato",
|
| 182 |
+
"codocu1.o1": "Documents 1 (O1)",
|
| 183 |
+
"codocu2.o12": "Documents 2 (O1/O2)",
|
| 184 |
+
"codocu2.o2": "Documents 2 (O2)",
|
| 185 |
+
"codocu3.o23": "Documents 3 (O2/O3)",
|
| 186 |
+
"codocu3.o3": "Documents 3 (O3)",
|
| 187 |
+
"codocu4.o24": "Documents 4 (O2/O4)",
|
| 188 |
+
"coeluc1": "Honorius of Autun, Elucidarium 1",
|
| 189 |
+
"coeluc2": "Honorius of Autun, Elucidarium 1",
|
| 190 |
+
"coepigen.o3": "Ælfric's Epilogue to Genesis",
|
| 191 |
+
"coeuphr": "Saint Euphrosyne",
|
| 192 |
+
"coeust": "Saint Eustace and his companions",
|
| 193 |
+
"coexodusP": "Exodus (P)",
|
| 194 |
+
"cogenesiC": "Genesis (C)",
|
| 195 |
+
"cogregdC.o24": "Gregory's Dialogues (C)",
|
| 196 |
+
"cogregdH.o23": "Gregory's Dialogues (H)",
|
| 197 |
+
"coherbar": "Pseudo-Apuleius, Herbarium",
|
| 198 |
+
"coinspolD.o34": "Wulfstan's Institute of Polity (D)",
|
| 199 |
+
"coinspolX": "Wulfstan's Institute of Polity (X)",
|
| 200 |
+
"cojames": "Saint James",
|
| 201 |
+
"colacnu.o23": "Lacnunga",
|
| 202 |
+
"colaece.o2": "Leechdoms",
|
| 203 |
+
"colaw1cn.o3": "Laws, Cnut I",
|
| 204 |
+
"colaw2cn.o3": "Laws, Cnut II",
|
| 205 |
+
"colaw5atr.o3": "Laws, Æthelred V",
|
| 206 |
+
"colaw6atr.o3": "Laws, Æthelred VI",
|
| 207 |
+
"colawaf.o2": "Laws, Alfred",
|
| 208 |
+
"colawafint.o2": "Alfred's Introduction to Laws",
|
| 209 |
+
"colawger.o34": "Laws, Gerefa",
|
| 210 |
+
"colawine.ox2": "Laws, Ine",
|
| 211 |
+
"colawnorthu.o3": "Northumbra Preosta Lagu",
|
| 212 |
+
"colawwllad.o4": "Laws, William I, Lad",
|
| 213 |
+
"coleofri.o4": "Leofric",
|
| 214 |
+
"colsigef.o3": "Ælfric's Letter to Sigefyrth",
|
| 215 |
+
"colsigewB": "Ælfric's Letter to Sigeweard (B)",
|
| 216 |
+
"colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
|
| 217 |
+
"colwgeat": "Ælfric's Letter to Wulfgeat",
|
| 218 |
+
"colwsigeT": "Ælfric's Letter to Wulfsige (T)",
|
| 219 |
+
"colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
|
| 220 |
+
"colwstan1.o3": "Ælfric's Letter to Wulfstan I",
|
| 221 |
+
"colwstan2.o3": "Ælfric's Letter to Wulfstan II",
|
| 222 |
+
"comargaC.o34": "Saint Margaret (C)",
|
| 223 |
+
"comargaT": "Saint Margaret (T)",
|
| 224 |
+
"comart1": "Martyrology, I",
|
| 225 |
+
"comart2": "Martyrology, II",
|
| 226 |
+
"comart3.o23": "Martyrology, III",
|
| 227 |
+
"comarvel.o23": "Marvels of the East",
|
| 228 |
+
"comary": "Mary of Egypt",
|
| 229 |
+
"coneot": "Saint Neot",
|
| 230 |
+
"conicodA": "Gospel of Nicodemus (A)",
|
| 231 |
+
"conicodC": "Gospel of Nicodemus (C)",
|
| 232 |
+
"conicodD": "Gospel of Nicodemus (D)",
|
| 233 |
+
"conicodE": "Gospel of Nicodemus (E)",
|
| 234 |
+
"coorosiu.o2": "Orosius",
|
| 235 |
+
"cootest.o3": "Heptateuch",
|
| 236 |
+
"coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
|
| 237 |
+
"coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
|
| 238 |
+
"coprefcura.o2": "Preface to the Cura Pastoralis",
|
| 239 |
+
"coprefgen.o3": "Ælfric's Preface to Genesis",
|
| 240 |
+
"copreflives.o3": "Ælfric's Preface to Lives of Saints",
|
| 241 |
+
"coprefsolilo": "Preface to Augustine's Soliloquies",
|
| 242 |
+
"coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
|
| 243 |
+
"corood": "History of the Holy Rood-Tree",
|
| 244 |
+
"cosevensl": "Seven Sleepers",
|
| 245 |
+
"cosolilo": "St. Augustine's Soliloquies",
|
| 246 |
+
"cosolsat1.o4": "Solomon and Saturn I",
|
| 247 |
+
"cosolsat2": "Solomon and Saturn II",
|
| 248 |
+
"cotempo.o3": "Ælfric's De Temporibus Anni",
|
| 249 |
+
"coverhom": "Vercelli Homilies",
|
| 250 |
+
"coverhomE": "Vercelli Homilies (E)",
|
| 251 |
+
"coverhomL": "Vercelli Homilies (L)",
|
| 252 |
+
"covinceB": "Saint Vincent (Bodley 343)",
|
| 253 |
+
"covinsal": "Vindicta Salvatoris",
|
| 254 |
+
"cowsgosp.o3": "West-Saxon Gospels",
|
| 255 |
+
"cowulf.o34": "Wulfstan's Homilies",
|
| 256 |
+
}
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to MaltParser
|
| 2 |
+
#
|
| 3 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 4 |
+
# Contributor: Liling Tan, Mustufain, osamamukhtar11
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
import inspect
|
| 11 |
+
import os
|
| 12 |
+
import subprocess
|
| 13 |
+
import sys
|
| 14 |
+
import tempfile
|
| 15 |
+
|
| 16 |
+
from nltk.data import ZipFilePathPointer
|
| 17 |
+
from nltk.internals import find_dir, find_file, find_jars_within_path
|
| 18 |
+
from nltk.parse.api import ParserI
|
| 19 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 20 |
+
from nltk.parse.util import taggedsents_to_conll
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def malt_regex_tagger():
    """Return a naive regex-based POS tagging function.

    Used by :class:`MaltParser` as the fallback tagger when the caller
    supplies none.  Rules are tried in order; the final ``.*`` rule
    tags any otherwise-unmatched token as ``NN``.
    """
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger(
        [
            (r"\.$", "."),
            (r"\,$", ","),
            (r"\?$", "?"),  # fullstop, comma, Qmark
            (r"\($", "("),
            (r"\)$", ")"),  # round brackets
            (r"\[$", "["),
            (r"\]$", "]"),  # square brackets
            (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
            (r"(The|the|A|a|An|an)$", "DT"),  # articles
            (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
            (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possessive
            (r"(my|Your|your|Yours|yours)$", "PRP$"),  # possessive
            (r"(on|On|in|In|at|At|since|Since)$", "IN"),  # time prepositions
            (r"(for|For|ago|Ago|before|Before)$", "IN"),  # time prepositions
            (r"(till|Till|until|Until)$", "IN"),  # time prepositions
            (r"(by|By|beside|Beside)$", "IN"),  # space prepositions
            (r"(under|Under|below|Below)$", "IN"),  # space prepositions
            (r"(over|Over|above|Above)$", "IN"),  # space prepositions
            (r"(across|Across|through|Through)$", "IN"),  # space prepositions
            (r"(into|Into|towards|Towards)$", "IN"),  # space prepositions
            (r"(onto|Onto|from|From)$", "IN"),  # space prepositions
            (r".*able$", "JJ"),  # adjectives
            (r".*ness$", "NN"),  # nouns formed from adjectives
            (r".*ly$", "RB"),  # adverbs
            (r".*s$", "NNS"),  # plural nouns
            (r".*ing$", "VBG"),  # gerunds
            (r".*ed$", "VBD"),  # past tense verbs
            (r".*", "NN"),  # nouns (default)
        ]
    )
    return _tagger.tag
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    :param parser_dirname: Either a full path to the maltparser
        directory, or a bare directory name resolved via the
        ``MALT_PARSER`` environment variable.
    :return: A list of paths to all .jar files found under the
        maltparser directory.
    :raises AssertionError: If the required dependency jars or a
        ``maltparser-*.jar`` are missing.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
    # Check that the found directory contains all the necessary .jar files.
    # (The original code first assigned a dead ["", "", ""] placeholder
    # to ``malt_dependencies`` that was immediately overwritten -- removed.)
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}

    assert malt_dependencies.issubset(_jars)
    assert any(
        filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
    )
    return list(_malt_jars)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def find_malt_model(model_filename):
    """
    Locate a pre-trained MaltParser model.

    :param model_filename: ``None`` for the default temporary model
        name; an existing path is returned unchanged; any other name
        is resolved via the ``MALT_MODEL`` environment variable.
    :return: The model path, or the default model name ``'malt_temp.mco'``.
    """
    # Guard clauses replace the original if/elif/else chain.
    if model_filename is None:
        return "malt_temp.mco"
    if os.path.exists(model_filename):  # A full path was given.
        return model_filename
    # Otherwise search the locations named by MALT_MODEL.
    return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - (optionally) a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
    >>> from nltk.parse import malt
    >>> # With MALT_PARSER and MALT_MODEL environment set.
    >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
    >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
    (shot I (elephant an) (in (pajamas my)) .)
    >>> # Without MALT_PARSER and MALT_MODEL environment.
    >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
    >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
    (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname="",
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
            one can use when calling Maltparser, usually this is the heapsize
            limits, e.g. `additional_java_args=['-Xmx1024m']`
            (see https://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """
        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model.  The sentinel "malt_temp.mco" means "no
        # pre-trained model"; in that case train() must be called first.
        self.model = find_malt_model(model_filename)
        self._trained = self.model != "malt_temp.mco"
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger.
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        :raises Exception: if the parser has not been trained, or if the
            MaltParser subprocess exits with a non-zero status.
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        # delete=False so the files survive close() and can be handed to the
        # external MaltParser process; they are removed explicitly below.
        with tempfile.NamedTemporaryFile(
            prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            with tempfile.NamedTemporaryFile(
                prefix="malt_output.conll.",
                dir=self.working_dir,
                mode="w",
                delete=False,
            ) as output_file:
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(
                    input_file.name, output_file.name, mode="parse"
                )

                # This is a maltparser quirk, it needs to be run
                # where the model file is. otherwise it goes into an awkward
                # missing .jars or strange -w working_dir problem.
                _current_path = os.getcwd()  # Remembers the current path.
                try:  # Change to modelfile path
                    os.chdir(os.path.split(self.model)[0])
                except OSError:
                    # The model path may have no directory component (e.g. the
                    # default "malt_temp.mco"); stay where we are.  A narrow
                    # OSError replaces the original bare `except:`, which also
                    # swallowed KeyboardInterrupt/SystemExit.
                    pass
                try:
                    ret = self._execute(cmd, verbose)  # Run command.
                finally:
                    # Always restore the working directory, even if the
                    # subprocess invocation itself raises.
                    os.chdir(_current_path)

                if ret != 0:
                    raise Exception(
                        "MaltParser parsing (%s) failed with exit "
                        "code %d" % (" ".join(cmd), ret)
                    )

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split("\n\n"):
                        yield (
                            iter(
                                [
                                    DependencyGraph(
                                        tree_str, top_relation_label=top_relation_label
                                    )
                                ]
                            )
                        )

        # NOTE: as this method is a generator, the cleanup below only runs
        # once the caller has exhausted the yielded parses.
        os.remove(input_file.name)
        os.remove(output_file.name)

    def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :return: iter(DependencyGraph)
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file (only used when
            ``mode="parse"``)
        :type outputfilename: str
        :param mode: either ``"parse"`` or ``"learn"``
        :type mode: str
        :return: the argv list for the java invocation
        :rtype: list(str)
        """
        cmd = ["java"]
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ";" if sys.platform.startswith("win") else ":"
        cmd += [
            "-cp",
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ["org.maltparser.Malt"]  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            cmd += ["-c", os.path.split(self.model)[-1]]
        else:  # when learning
            cmd += ["-c", self.model]

        cmd += ["-i", inputfilename]
        if mode == "parse":
            cmd += ["-o", outputfilename]
        cmd += ["-m", mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        """Run *cmd* as a subprocess; suppress its output unless verbose."""
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        """
        # Write the conll_str to malt_train.conll file in /tmp/
        with tempfile.NamedTemporaryFile(
            prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
            input_file.write(str(input_str))
        # Trains the model with the malt_train.conll
        self.train_from_file(input_file.name, verbose=verbose)
        # Removes the malt_train.conll once training finishes.
        os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :type conll_file: str
        :raises Exception: if the MaltParser subprocess exits with a
            non-zero status.
        """
        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            with tempfile.NamedTemporaryFile(
                prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
            ) as input_file:
                with conll_file.open() as conll_input_file:
                    conll_str = conll_input_file.read()
                    input_file.write(str(conll_str))
                return self.train_from_file(input_file.name, verbose=verbose)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (" ".join(cmd), ret)
            )
        self._trained = True
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
if __name__ == "__main__":
    """
    A demonstration function to show how NLTK users can use the malt parser API.

    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> mp = MaltParser()
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print(parsed_sent1.tree())
    (sees John Mary .)
    >>> print(parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.9.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    """

    # Run the doctests embedded in this module.  The demo above exercises the
    # external MaltParser binary and requires MALT_PARSER/MALT_MODEL to be set.
    import doctest

    doctest.testmod()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py
ADDED
|
@@ -0,0 +1,772 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Dependency Grammars
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Jason Narad <jason.narad@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import math
|
| 12 |
+
|
| 13 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
#################################################################
|
| 18 |
+
# DependencyScorerI - Interface for Graph-Edge Weight Calculation
|
| 19 |
+
#################################################################
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DependencyScorerI:
    """
    Interface for assigning weights to the edges of a weighted dependency
    graph.  A ``ProbabilisticNonprojectiveParser`` uses an implementation of
    this interface to initialize the edge weights of a ``DependencyGraph``.
    Although such a scorer would typically be a trained binary classifier,
    any object that can produce a multidimensional list representation of
    the edge weights may implement the interface; no particular fields are
    required.
    """

    def __init__(self):
        # Guard against direct instantiation of the abstract interface;
        # subclasses pass through without error.
        if type(self) is DependencyScorerI:
            raise TypeError("DependencyScorerI is an abstract interface")

    def train(self, graphs):
        """
        Train the scorer on a collection of dependency graphs.

        :type graphs: list(DependencyGraph)
        :param graphs: Dependency graphs used as training material.
            Typically the edges present in the graphs serve as positive
            examples, and the absent edges as negative ones.
        """
        raise NotImplementedError()

    def score(self, graph):
        """
        Score every potential edge of *graph*.

        :type graph: DependencyGraph
        :param graph: A dependency graph whose set of edges need to be
            scored.
        :rtype: A three-dimensional list of numbers.
        :return: ``scores[head][dep]`` is the list of scores for arcs from
            node ``head`` to node ``dep``; a node's number is given by its
            'address' field.  For illustration, the score list matching
            Fig.2 of Keith Hall's 'K-best Spanning Tree Parsing' paper::

                scores = [[[], [5],  [1],  [1]],
                          [[], [],   [11], [4]],
                          [[], [10], [],   [5]],
                          [[], [8],  [8],  []]]

            When backed by a MaxEnt classifier, each score would be the
            classifier's confidence that the edge belongs with the positive
            training examples.
        """
        raise NotImplementedError()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
#################################################################
|
| 77 |
+
# NaiveBayesDependencyScorer
|
| 78 |
+
#################################################################
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class NaiveBayesDependencyScorer(DependencyScorerI):
    """
    A dependency scorer built around a MaxEnt classifier. In this
    particular class that classifier is a ``NaiveBayesClassifier``.
    It uses head-word, head-tag, child-word, and child-tag features
    for classification.

    >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2

    >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
    >>> npp = ProbabilisticNonprojectiveParser()
    >>> npp.train(graphs, NaiveBayesDependencyScorer())
    >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
    >>> len(list(parses))
    1

    """

    def __init__(self):
        pass  # Do nothing without throwing error

    def train(self, graphs):
        """
        Trains a ``NaiveBayesClassifier`` using the edges present in
        graphs list as positive examples, the edges not present as
        negative examples. Uses a feature vector of head-word,
        head-tag, child-word, and child-tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """
        from nltk.classify import NaiveBayesClassifier

        # Create labeled training examples: one example per (head, child)
        # node pair, labeled "T" when the arc exists in the gold graph.
        labeled_examples = []
        for graph in graphs:
            for head_node in graph.nodes.values():
                for child_index, child_node in graph.nodes.items():
                    if child_index in head_node["deps"]:
                        label = "T"
                    else:
                        label = "F"
                    labeled_examples.append(
                        (
                            dict(
                                a=head_node["word"],
                                b=head_node["tag"],
                                c=child_node["word"],
                                d=child_node["tag"],
                            ),
                            label,
                        )
                    )

        self.classifier = NaiveBayesClassifier.train(labeled_examples)

    def score(self, graph):
        """
        Converts the graph into a feature-based representation of
        each edge, and then assigns a score to each based on the
        confidence of the classifier in assigning it to the
        positive label. Scores are returned in a multidimensional list.

        :type graph: DependencyGraph
        :param graph: A dependency graph to score.
        :rtype: 3 dimensional list
        :return: Edge scores for the graph parameter.
        """
        # Convert graph to feature representation: one feature dict per
        # ordered (head, child) node pair, in row-major order.
        edges = []
        for head_node in graph.nodes.values():
            for child_node in graph.nodes.values():
                edges.append(
                    dict(
                        a=head_node["word"],
                        b=head_node["tag"],
                        c=child_node["word"],
                        d=child_node["tag"],
                    )
                )

        # Score edges, regrouping the flat classifier output back into one
        # row per head node (each row has len(graph.nodes) entries).
        edge_scores = []
        row = []
        count = 0
        for pdist in self.classifier.prob_classify_many(edges):
            logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
            # smoothing in case the probability = 0
            row.append([math.log(pdist.prob("T") + 0.00000000001)])
            count += 1
            if count == len(graph.nodes):
                edge_scores.append(row)
                row = []
                count = 0
        return edge_scores
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
#################################################################
|
| 180 |
+
# A Scorer for Demo Purposes
|
| 181 |
+
#################################################################
|
| 182 |
+
# A short class necessary to show parsing example from paper
|
| 183 |
+
class DemoScorer(DependencyScorerI):
    """Toy scorer used for demonstration purposes: ``score`` returns the
    fixed weight matrix of the worked example in Keith Hall's 'K-best
    Spanning Tree Parsing' paper, and ``train`` is a no-op."""

    def train(self, graphs):
        # No actual training happens; just announce the call.
        print("Training...")

    def score(self, graph):
        # Hard-coded scores for Keith Hall 'K-best Spanning Tree Parsing'
        # paper, one row per head node.
        row0 = [[], [5], [1], [1]]
        row1 = [[], [], [11], [4]]
        row2 = [[], [10], [], [5]]
        row3 = [[], [8], [8], []]
        return [row0, row1, row2, row3]
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
#################################################################
|
| 198 |
+
# Non-Projective Probabilistic Parsing
|
| 199 |
+
#################################################################
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class ProbabilisticNonprojectiveParser:
|
| 203 |
+
"""A probabilistic non-projective dependency parser.
|
| 204 |
+
|
| 205 |
+
Nonprojective dependencies allows for "crossing branches" in the parse tree
|
| 206 |
+
which is necessary for representing particular linguistic phenomena, or even
|
| 207 |
+
typical parses in some languages. This parser follows the MST parsing
|
| 208 |
+
algorithm, outlined in McDonald(2005), which likens the search for the best
|
| 209 |
+
non-projective parse to finding the maximum spanning tree in a weighted
|
| 210 |
+
directed graph.
|
| 211 |
+
|
| 212 |
+
>>> class Scorer(DependencyScorerI):
|
| 213 |
+
... def train(self, graphs):
|
| 214 |
+
... pass
|
| 215 |
+
...
|
| 216 |
+
... def score(self, graph):
|
| 217 |
+
... return [
|
| 218 |
+
... [[], [5], [1], [1]],
|
| 219 |
+
... [[], [], [11], [4]],
|
| 220 |
+
... [[], [10], [], [5]],
|
| 221 |
+
... [[], [8], [8], []],
|
| 222 |
+
... ]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
>>> npp = ProbabilisticNonprojectiveParser()
|
| 226 |
+
>>> npp.train([], Scorer())
|
| 227 |
+
|
| 228 |
+
>>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
|
| 229 |
+
>>> len(list(parses))
|
| 230 |
+
1
|
| 231 |
+
|
| 232 |
+
Rule based example
|
| 233 |
+
|
| 234 |
+
>>> from nltk.grammar import DependencyGrammar
|
| 235 |
+
|
| 236 |
+
>>> grammar = DependencyGrammar.fromstring('''
|
| 237 |
+
... 'taught' -> 'play' | 'man'
|
| 238 |
+
... 'man' -> 'the' | 'in'
|
| 239 |
+
... 'in' -> 'corner'
|
| 240 |
+
... 'corner' -> 'the'
|
| 241 |
+
... 'play' -> 'golf' | 'dachshund' | 'to'
|
| 242 |
+
... 'dachshund' -> 'his'
|
| 243 |
+
... ''')
|
| 244 |
+
|
| 245 |
+
>>> ndp = NonprojectiveDependencyParser(grammar)
|
| 246 |
+
>>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
|
| 247 |
+
>>> len(list(parses))
|
| 248 |
+
4
|
| 249 |
+
|
| 250 |
+
"""
|
| 251 |
+
|
| 252 |
+
def __init__(self):
|
| 253 |
+
"""
|
| 254 |
+
Creates a new non-projective parser.
|
| 255 |
+
"""
|
| 256 |
+
logging.debug("initializing prob. nonprojective...")
|
| 257 |
+
|
| 258 |
+
def train(self, graphs, dependency_scorer):
|
| 259 |
+
"""
|
| 260 |
+
Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
|
| 261 |
+
and establishes this as the parser's scorer. This is used to
|
| 262 |
+
initialize the scores on a ``DependencyGraph`` during the parsing
|
| 263 |
+
procedure.
|
| 264 |
+
|
| 265 |
+
:type graphs: list(DependencyGraph)
|
| 266 |
+
:param graphs: A list of dependency graphs to train the scorer.
|
| 267 |
+
:type dependency_scorer: DependencyScorerI
|
| 268 |
+
:param dependency_scorer: A scorer which implements the
|
| 269 |
+
``DependencyScorerI`` interface.
|
| 270 |
+
"""
|
| 271 |
+
self._scorer = dependency_scorer
|
| 272 |
+
self._scorer.train(graphs)
|
| 273 |
+
|
| 274 |
+
def initialize_edge_scores(self, graph):
|
| 275 |
+
"""
|
| 276 |
+
Assigns a score to every edge in the ``DependencyGraph`` graph.
|
| 277 |
+
These scores are generated via the parser's scorer which
|
| 278 |
+
was assigned during the training process.
|
| 279 |
+
|
| 280 |
+
:type graph: DependencyGraph
|
| 281 |
+
:param graph: A dependency graph to assign scores to.
|
| 282 |
+
"""
|
| 283 |
+
self.scores = self._scorer.score(graph)
|
| 284 |
+
|
| 285 |
+
    def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
        """
        Takes a list of nodes that have been identified to belong to a cycle,
        and collapses them into one larger node. The arcs of all nodes in
        the graph must be updated to account for this.

        :type new_node: Node.
        :param new_node: A Node (Dictionary) to collapse the cycle nodes into.
        :type cycle_path: A list of integers.
        :param cycle_path: A list of node addresses, each of which is in the cycle.
        :type g_graph, b_graph, c_graph: DependencyGraph
        :param g_graph, b_graph, c_graph: Graphs which need to be updated.
        """
        # NOTE(review): only g_graph is modified here; b_graph and c_graph
        # are accepted but untouched — confirm whether that is intentional.
        logger.debug("Collapsing nodes...")
        # Collapse all cycle nodes into v_n+1 in G_Graph
        for cycle_node_index in cycle_path:
            g_graph.remove_by_address(cycle_node_index)
        # Insert the replacement node and repoint arcs that targeted any
        # removed cycle member at the new node's address.
        g_graph.add_node(new_node)
        g_graph.redirect_arcs(cycle_path, new_node["address"])
|
| 304 |
+
|
| 305 |
+
    def update_edge_scores(self, new_node, cycle_path):
        """
        Updates the edge scores to reflect a collapse operation into
        new_node.

        :type new_node: A Node.
        :param new_node: The node which cycle nodes are collapsed into.
        :type cycle_path: A list of integers.
        :param cycle_path: A list of node addresses that belong to the cycle.
        """
        logger.debug("cycle %s", cycle_path)

        # Expand collapsed members back to their original addresses so the
        # score matrix (indexed by original addresses) can be updated.
        cycle_path = self.compute_original_indexes(cycle_path)

        logger.debug("old cycle %s", cycle_path)
        logger.debug("Prior to update: %s", self.scores)

        # First pass: for every arc entering the cycle from outside,
        # subtract the weight of the best incoming cycle-internal arc.
        for i, row in enumerate(self.scores):
            for j, column in enumerate(self.scores[i]):
                logger.debug(self.scores[i][j])
                if j in cycle_path and i not in cycle_path and self.scores[i][j]:
                    subtract_val = self.compute_max_subtract_score(j, cycle_path)

                    logger.debug("%s - %s", self.scores[i][j], subtract_val)

                    new_vals = []
                    for cur_val in self.scores[i][j]:
                        new_vals.append(cur_val - subtract_val)

                    self.scores[i][j] = new_vals

        # Second pass: clear all arcs internal to the cycle, since the
        # collapsed node replaces them.
        for i, row in enumerate(self.scores):
            for j, cell in enumerate(self.scores[i]):
                if i in cycle_path and j in cycle_path:
                    self.scores[i][j] = []

        logger.debug("After update: %s", self.scores)
|
| 342 |
+
|
| 343 |
+
def compute_original_indexes(self, new_indexes):
|
| 344 |
+
"""
|
| 345 |
+
As nodes are collapsed into others, they are replaced
|
| 346 |
+
by the new node in the graph, but it's still necessary
|
| 347 |
+
to keep track of what these original nodes were. This
|
| 348 |
+
takes a list of node addresses and replaces any collapsed
|
| 349 |
+
node addresses with their original addresses.
|
| 350 |
+
|
| 351 |
+
:type new_indexes: A list of integers.
|
| 352 |
+
:param new_indexes: A list of node addresses to check for
|
| 353 |
+
subsumed nodes.
|
| 354 |
+
"""
|
| 355 |
+
swapped = True
|
| 356 |
+
while swapped:
|
| 357 |
+
originals = []
|
| 358 |
+
swapped = False
|
| 359 |
+
for new_index in new_indexes:
|
| 360 |
+
if new_index in self.inner_nodes:
|
| 361 |
+
for old_val in self.inner_nodes[new_index]:
|
| 362 |
+
if old_val not in originals:
|
| 363 |
+
originals.append(old_val)
|
| 364 |
+
swapped = True
|
| 365 |
+
else:
|
| 366 |
+
originals.append(new_index)
|
| 367 |
+
new_indexes = originals
|
| 368 |
+
return new_indexes
|
| 369 |
+
|
| 370 |
+
def compute_max_subtract_score(self, column_index, cycle_indexes):
|
| 371 |
+
"""
|
| 372 |
+
When updating scores the score of the highest-weighted incoming
|
| 373 |
+
arc is subtracted upon collapse. This returns the correct
|
| 374 |
+
amount to subtract from that edge.
|
| 375 |
+
|
| 376 |
+
:type column_index: integer.
|
| 377 |
+
:param column_index: A index representing the column of incoming arcs
|
| 378 |
+
to a particular node being updated
|
| 379 |
+
:type cycle_indexes: A list of integers.
|
| 380 |
+
:param cycle_indexes: Only arcs from cycle nodes are considered. This
|
| 381 |
+
is a list of such nodes addresses.
|
| 382 |
+
"""
|
| 383 |
+
max_score = -100000
|
| 384 |
+
for row_index in cycle_indexes:
|
| 385 |
+
for subtract_val in self.scores[row_index][column_index]:
|
| 386 |
+
if subtract_val > max_score:
|
| 387 |
+
max_score = subtract_val
|
| 388 |
+
return max_score
|
| 389 |
+
|
| 390 |
+
def best_incoming_arc(self, node_index):
|
| 391 |
+
"""
|
| 392 |
+
Returns the source of the best incoming arc to the
|
| 393 |
+
node with address: node_index
|
| 394 |
+
|
| 395 |
+
:type node_index: integer.
|
| 396 |
+
:param node_index: The address of the 'destination' node,
|
| 397 |
+
the node that is arced to.
|
| 398 |
+
"""
|
| 399 |
+
originals = self.compute_original_indexes([node_index])
|
| 400 |
+
logger.debug("originals: %s", originals)
|
| 401 |
+
|
| 402 |
+
max_arc = None
|
| 403 |
+
max_score = None
|
| 404 |
+
for row_index in range(len(self.scores)):
|
| 405 |
+
for col_index in range(len(self.scores[row_index])):
|
| 406 |
+
if col_index in originals and (
|
| 407 |
+
max_score is None or self.scores[row_index][col_index] > max_score
|
| 408 |
+
):
|
| 409 |
+
max_score = self.scores[row_index][col_index]
|
| 410 |
+
max_arc = row_index
|
| 411 |
+
logger.debug("%s, %s", row_index, col_index)
|
| 412 |
+
|
| 413 |
+
logger.debug(max_score)
|
| 414 |
+
|
| 415 |
+
for key in self.inner_nodes:
|
| 416 |
+
replaced_nodes = self.inner_nodes[key]
|
| 417 |
+
if max_arc in replaced_nodes:
|
| 418 |
+
return key
|
| 419 |
+
|
| 420 |
+
return max_arc
|
| 421 |
+
|
| 422 |
+
def original_best_arc(self, node_index):
|
| 423 |
+
originals = self.compute_original_indexes([node_index])
|
| 424 |
+
max_arc = None
|
| 425 |
+
max_score = None
|
| 426 |
+
max_orig = None
|
| 427 |
+
for row_index in range(len(self.scores)):
|
| 428 |
+
for col_index in range(len(self.scores[row_index])):
|
| 429 |
+
if col_index in originals and (
|
| 430 |
+
max_score is None or self.scores[row_index][col_index] > max_score
|
| 431 |
+
):
|
| 432 |
+
max_score = self.scores[row_index][col_index]
|
| 433 |
+
max_arc = row_index
|
| 434 |
+
max_orig = col_index
|
| 435 |
+
return [max_arc, max_orig]
|
| 436 |
+
|
| 437 |
+
def parse(self, tokens, tags):
|
| 438 |
+
"""
|
| 439 |
+
Parses a list of tokens in accordance to the MST parsing algorithm
|
| 440 |
+
for non-projective dependency parses. Assumes that the tokens to
|
| 441 |
+
be parsed have already been tagged and those tags are provided. Various
|
| 442 |
+
scoring methods can be used by implementing the ``DependencyScorerI``
|
| 443 |
+
interface and passing it to the training algorithm.
|
| 444 |
+
|
| 445 |
+
:type tokens: list(str)
|
| 446 |
+
:param tokens: A list of words or punctuation to be parsed.
|
| 447 |
+
:type tags: list(str)
|
| 448 |
+
:param tags: A list of tags corresponding by index to the words in the tokens list.
|
| 449 |
+
:return: An iterator of non-projective parses.
|
| 450 |
+
:rtype: iter(DependencyGraph)
|
| 451 |
+
"""
|
| 452 |
+
self.inner_nodes = {}
|
| 453 |
+
|
| 454 |
+
# Initialize g_graph
|
| 455 |
+
g_graph = DependencyGraph()
|
| 456 |
+
for index, token in enumerate(tokens):
|
| 457 |
+
g_graph.nodes[index + 1].update(
|
| 458 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
# Fully connect non-root nodes in g_graph
|
| 462 |
+
g_graph.connect_graph()
|
| 463 |
+
original_graph = DependencyGraph()
|
| 464 |
+
for index, token in enumerate(tokens):
|
| 465 |
+
original_graph.nodes[index + 1].update(
|
| 466 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 467 |
+
)
|
| 468 |
+
|
| 469 |
+
b_graph = DependencyGraph()
|
| 470 |
+
c_graph = DependencyGraph()
|
| 471 |
+
|
| 472 |
+
for index, token in enumerate(tokens):
|
| 473 |
+
c_graph.nodes[index + 1].update(
|
| 474 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 475 |
+
)
|
| 476 |
+
|
| 477 |
+
# Assign initial scores to g_graph edges
|
| 478 |
+
self.initialize_edge_scores(g_graph)
|
| 479 |
+
logger.debug(self.scores)
|
| 480 |
+
# Initialize a list of unvisited vertices (by node address)
|
| 481 |
+
unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
|
| 482 |
+
# Iterate over unvisited vertices
|
| 483 |
+
nr_vertices = len(tokens)
|
| 484 |
+
betas = {}
|
| 485 |
+
while unvisited_vertices:
|
| 486 |
+
# Mark current node as visited
|
| 487 |
+
current_vertex = unvisited_vertices.pop(0)
|
| 488 |
+
logger.debug("current_vertex: %s", current_vertex)
|
| 489 |
+
# Get corresponding node n_i to vertex v_i
|
| 490 |
+
current_node = g_graph.get_by_address(current_vertex)
|
| 491 |
+
logger.debug("current_node: %s", current_node)
|
| 492 |
+
# Get best in-edge node b for current node
|
| 493 |
+
best_in_edge = self.best_incoming_arc(current_vertex)
|
| 494 |
+
betas[current_vertex] = self.original_best_arc(current_vertex)
|
| 495 |
+
logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
|
| 496 |
+
# b_graph = Union(b_graph, b)
|
| 497 |
+
for new_vertex in [current_vertex, best_in_edge]:
|
| 498 |
+
b_graph.nodes[new_vertex].update(
|
| 499 |
+
{"word": "TEMP", "rel": "NTOP", "address": new_vertex}
|
| 500 |
+
)
|
| 501 |
+
b_graph.add_arc(best_in_edge, current_vertex)
|
| 502 |
+
# Beta(current node) = b - stored for parse recovery
|
| 503 |
+
# If b_graph contains a cycle, collapse it
|
| 504 |
+
cycle_path = b_graph.contains_cycle()
|
| 505 |
+
if cycle_path:
|
| 506 |
+
# Create a new node v_n+1 with address = len(nodes) + 1
|
| 507 |
+
new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
|
| 508 |
+
# c_graph = Union(c_graph, v_n+1)
|
| 509 |
+
c_graph.add_node(new_node)
|
| 510 |
+
# Collapse all nodes in cycle C into v_n+1
|
| 511 |
+
self.update_edge_scores(new_node, cycle_path)
|
| 512 |
+
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
|
| 513 |
+
for cycle_index in cycle_path:
|
| 514 |
+
c_graph.add_arc(new_node["address"], cycle_index)
|
| 515 |
+
# self.replaced_by[cycle_index] = new_node['address']
|
| 516 |
+
|
| 517 |
+
self.inner_nodes[new_node["address"]] = cycle_path
|
| 518 |
+
|
| 519 |
+
# Add v_n+1 to list of unvisited vertices
|
| 520 |
+
unvisited_vertices.insert(0, nr_vertices + 1)
|
| 521 |
+
|
| 522 |
+
# increment # of nodes counter
|
| 523 |
+
nr_vertices += 1
|
| 524 |
+
|
| 525 |
+
# Remove cycle nodes from b_graph; B = B - cycle c
|
| 526 |
+
for cycle_node_address in cycle_path:
|
| 527 |
+
b_graph.remove_by_address(cycle_node_address)
|
| 528 |
+
|
| 529 |
+
logger.debug("g_graph: %s", g_graph)
|
| 530 |
+
logger.debug("b_graph: %s", b_graph)
|
| 531 |
+
logger.debug("c_graph: %s", c_graph)
|
| 532 |
+
logger.debug("Betas: %s", betas)
|
| 533 |
+
logger.debug("replaced nodes %s", self.inner_nodes)
|
| 534 |
+
|
| 535 |
+
# Recover parse tree
|
| 536 |
+
logger.debug("Final scores: %s", self.scores)
|
| 537 |
+
|
| 538 |
+
logger.debug("Recovering parse...")
|
| 539 |
+
for i in range(len(tokens) + 1, nr_vertices + 1):
|
| 540 |
+
betas[betas[i][1]] = betas[i]
|
| 541 |
+
|
| 542 |
+
logger.debug("Betas: %s", betas)
|
| 543 |
+
for node in original_graph.nodes.values():
|
| 544 |
+
# TODO: It's dangerous to assume that deps it a dictionary
|
| 545 |
+
# because it's a default dictionary. Ideally, here we should not
|
| 546 |
+
# be concerned how dependencies are stored inside of a dependency
|
| 547 |
+
# graph.
|
| 548 |
+
node["deps"] = {}
|
| 549 |
+
for i in range(1, len(tokens) + 1):
|
| 550 |
+
original_graph.add_arc(betas[i][0], betas[i][1])
|
| 551 |
+
|
| 552 |
+
logger.debug("Done.")
|
| 553 |
+
yield original_graph
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
#################################################################
|
| 557 |
+
# Rule-based Non-Projective Parser
|
| 558 |
+
#################################################################
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
class NonprojectiveDependencyParser:
|
| 562 |
+
"""
|
| 563 |
+
A non-projective, rule-based, dependency parser. This parser
|
| 564 |
+
will return the set of all possible non-projective parses based on
|
| 565 |
+
the word-to-word relations defined in the parser's dependency
|
| 566 |
+
grammar, and will allow the branches of the parse tree to cross
|
| 567 |
+
in order to capture a variety of linguistic phenomena that a
|
| 568 |
+
projective parser will not.
|
| 569 |
+
"""
|
| 570 |
+
|
| 571 |
+
def __init__(self, dependency_grammar):
|
| 572 |
+
"""
|
| 573 |
+
Creates a new ``NonprojectiveDependencyParser``.
|
| 574 |
+
|
| 575 |
+
:param dependency_grammar: a grammar of word-to-word relations.
|
| 576 |
+
:type dependency_grammar: DependencyGrammar
|
| 577 |
+
"""
|
| 578 |
+
self._grammar = dependency_grammar
|
| 579 |
+
|
| 580 |
+
def parse(self, tokens):
|
| 581 |
+
"""
|
| 582 |
+
Parses the input tokens with respect to the parser's grammar. Parsing
|
| 583 |
+
is accomplished by representing the search-space of possible parses as
|
| 584 |
+
a fully-connected directed graph. Arcs that would lead to ungrammatical
|
| 585 |
+
parses are removed and a lattice is constructed of length n, where n is
|
| 586 |
+
the number of input tokens, to represent all possible grammatical
|
| 587 |
+
traversals. All possible paths through the lattice are then enumerated
|
| 588 |
+
to produce the set of non-projective parses.
|
| 589 |
+
|
| 590 |
+
param tokens: A list of tokens to parse.
|
| 591 |
+
type tokens: list(str)
|
| 592 |
+
return: An iterator of non-projective parses.
|
| 593 |
+
rtype: iter(DependencyGraph)
|
| 594 |
+
"""
|
| 595 |
+
# Create graph representation of tokens
|
| 596 |
+
self._graph = DependencyGraph()
|
| 597 |
+
|
| 598 |
+
for index, token in enumerate(tokens):
|
| 599 |
+
self._graph.nodes[index] = {
|
| 600 |
+
"word": token,
|
| 601 |
+
"deps": [],
|
| 602 |
+
"rel": "NTOP",
|
| 603 |
+
"address": index,
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
for head_node in self._graph.nodes.values():
|
| 607 |
+
deps = []
|
| 608 |
+
for dep_node in self._graph.nodes.values():
|
| 609 |
+
if (
|
| 610 |
+
self._grammar.contains(head_node["word"], dep_node["word"])
|
| 611 |
+
and head_node["word"] != dep_node["word"]
|
| 612 |
+
):
|
| 613 |
+
deps.append(dep_node["address"])
|
| 614 |
+
head_node["deps"] = deps
|
| 615 |
+
|
| 616 |
+
# Create lattice of possible heads
|
| 617 |
+
roots = []
|
| 618 |
+
possible_heads = []
|
| 619 |
+
for i, word in enumerate(tokens):
|
| 620 |
+
heads = []
|
| 621 |
+
for j, head in enumerate(tokens):
|
| 622 |
+
if (i != j) and self._grammar.contains(head, word):
|
| 623 |
+
heads.append(j)
|
| 624 |
+
if len(heads) == 0:
|
| 625 |
+
roots.append(i)
|
| 626 |
+
possible_heads.append(heads)
|
| 627 |
+
|
| 628 |
+
# Set roots to attempt
|
| 629 |
+
if len(roots) < 2:
|
| 630 |
+
if len(roots) == 0:
|
| 631 |
+
for i in range(len(tokens)):
|
| 632 |
+
roots.append(i)
|
| 633 |
+
|
| 634 |
+
# Traverse lattice
|
| 635 |
+
analyses = []
|
| 636 |
+
for _ in roots:
|
| 637 |
+
stack = []
|
| 638 |
+
analysis = [[] for i in range(len(possible_heads))]
|
| 639 |
+
i = 0
|
| 640 |
+
forward = True
|
| 641 |
+
while i >= 0:
|
| 642 |
+
if forward:
|
| 643 |
+
if len(possible_heads[i]) == 1:
|
| 644 |
+
analysis[i] = possible_heads[i][0]
|
| 645 |
+
elif len(possible_heads[i]) == 0:
|
| 646 |
+
analysis[i] = -1
|
| 647 |
+
else:
|
| 648 |
+
head = possible_heads[i].pop()
|
| 649 |
+
analysis[i] = head
|
| 650 |
+
stack.append([i, head])
|
| 651 |
+
if not forward:
|
| 652 |
+
index_on_stack = False
|
| 653 |
+
for stack_item in stack:
|
| 654 |
+
if stack_item[0] == i:
|
| 655 |
+
index_on_stack = True
|
| 656 |
+
orig_length = len(possible_heads[i])
|
| 657 |
+
|
| 658 |
+
if index_on_stack and orig_length == 0:
|
| 659 |
+
for j in range(len(stack) - 1, -1, -1):
|
| 660 |
+
stack_item = stack[j]
|
| 661 |
+
if stack_item[0] == i:
|
| 662 |
+
possible_heads[i].append(stack.pop(j)[1])
|
| 663 |
+
|
| 664 |
+
elif index_on_stack and orig_length > 0:
|
| 665 |
+
head = possible_heads[i].pop()
|
| 666 |
+
analysis[i] = head
|
| 667 |
+
stack.append([i, head])
|
| 668 |
+
forward = True
|
| 669 |
+
|
| 670 |
+
if i + 1 == len(possible_heads):
|
| 671 |
+
analyses.append(analysis[:])
|
| 672 |
+
forward = False
|
| 673 |
+
if forward:
|
| 674 |
+
i += 1
|
| 675 |
+
else:
|
| 676 |
+
i -= 1
|
| 677 |
+
|
| 678 |
+
# Filter parses
|
| 679 |
+
# ensure 1 root, every thing has 1 head
|
| 680 |
+
for analysis in analyses:
|
| 681 |
+
if analysis.count(-1) > 1:
|
| 682 |
+
# there are several root elements!
|
| 683 |
+
continue
|
| 684 |
+
|
| 685 |
+
graph = DependencyGraph()
|
| 686 |
+
graph.root = graph.nodes[analysis.index(-1) + 1]
|
| 687 |
+
|
| 688 |
+
for address, (token, head_index) in enumerate(
|
| 689 |
+
zip(tokens, analysis), start=1
|
| 690 |
+
):
|
| 691 |
+
head_address = head_index + 1
|
| 692 |
+
|
| 693 |
+
node = graph.nodes[address]
|
| 694 |
+
node.update({"word": token, "address": address})
|
| 695 |
+
|
| 696 |
+
if head_address == 0:
|
| 697 |
+
rel = "ROOT"
|
| 698 |
+
else:
|
| 699 |
+
rel = ""
|
| 700 |
+
graph.nodes[head_index + 1]["deps"][rel].append(address)
|
| 701 |
+
|
| 702 |
+
# TODO: check for cycles
|
| 703 |
+
yield graph
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
#################################################################
|
| 707 |
+
# Demos
|
| 708 |
+
#################################################################
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def demo():
|
| 712 |
+
# hall_demo()
|
| 713 |
+
nonprojective_conll_parse_demo()
|
| 714 |
+
rule_based_demo()
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
def hall_demo():
|
| 718 |
+
npp = ProbabilisticNonprojectiveParser()
|
| 719 |
+
npp.train([], DemoScorer())
|
| 720 |
+
for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
|
| 721 |
+
print(parse_graph)
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
def nonprojective_conll_parse_demo():
|
| 725 |
+
from nltk.parse.dependencygraph import conll_data2
|
| 726 |
+
|
| 727 |
+
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
| 728 |
+
npp = ProbabilisticNonprojectiveParser()
|
| 729 |
+
npp.train(graphs, NaiveBayesDependencyScorer())
|
| 730 |
+
for parse_graph in npp.parse(
|
| 731 |
+
["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
|
| 732 |
+
):
|
| 733 |
+
print(parse_graph)
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
def rule_based_demo():
|
| 737 |
+
from nltk.grammar import DependencyGrammar
|
| 738 |
+
|
| 739 |
+
grammar = DependencyGrammar.fromstring(
|
| 740 |
+
"""
|
| 741 |
+
'taught' -> 'play' | 'man'
|
| 742 |
+
'man' -> 'the' | 'in'
|
| 743 |
+
'in' -> 'corner'
|
| 744 |
+
'corner' -> 'the'
|
| 745 |
+
'play' -> 'golf' | 'dachshund' | 'to'
|
| 746 |
+
'dachshund' -> 'his'
|
| 747 |
+
"""
|
| 748 |
+
)
|
| 749 |
+
print(grammar)
|
| 750 |
+
ndp = NonprojectiveDependencyParser(grammar)
|
| 751 |
+
graphs = ndp.parse(
|
| 752 |
+
[
|
| 753 |
+
"the",
|
| 754 |
+
"man",
|
| 755 |
+
"in",
|
| 756 |
+
"the",
|
| 757 |
+
"corner",
|
| 758 |
+
"taught",
|
| 759 |
+
"his",
|
| 760 |
+
"dachshund",
|
| 761 |
+
"to",
|
| 762 |
+
"play",
|
| 763 |
+
"golf",
|
| 764 |
+
]
|
| 765 |
+
)
|
| 766 |
+
print("Graphs:")
|
| 767 |
+
for graph in graphs:
|
| 768 |
+
print(graph)
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
if __name__ == "__main__":
|
| 772 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py
ADDED
|
@@ -0,0 +1,684 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Recursive Descent Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from nltk.grammar import Nonterminal
|
| 10 |
+
from nltk.parse.api import ParserI
|
| 11 |
+
from nltk.tree import ImmutableTree, Tree
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Recursive Descent Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
class RecursiveDescentParser(ParserI):
|
| 18 |
+
"""
|
| 19 |
+
A simple top-down CFG parser that parses texts by recursively
|
| 20 |
+
expanding the fringe of a Tree, and matching it against a
|
| 21 |
+
text.
|
| 22 |
+
|
| 23 |
+
``RecursiveDescentParser`` uses a list of tree locations called a
|
| 24 |
+
"frontier" to remember which subtrees have not yet been expanded
|
| 25 |
+
and which leaves have not yet been matched against the text. Each
|
| 26 |
+
tree location consists of a list of child indices specifying the
|
| 27 |
+
path from the root of the tree to a subtree or a leaf; see the
|
| 28 |
+
reference documentation for Tree for more information
|
| 29 |
+
about tree locations.
|
| 30 |
+
|
| 31 |
+
When the parser begins parsing a text, it constructs a tree
|
| 32 |
+
containing only the start symbol, and a frontier containing the
|
| 33 |
+
location of the tree's root node. It then extends the tree to
|
| 34 |
+
cover the text, using the following recursive procedure:
|
| 35 |
+
|
| 36 |
+
- If the frontier is empty, and the text is covered by the tree,
|
| 37 |
+
then return the tree as a possible parse.
|
| 38 |
+
- If the frontier is empty, and the text is not covered by the
|
| 39 |
+
tree, then return no parses.
|
| 40 |
+
- If the first element of the frontier is a subtree, then
|
| 41 |
+
use CFG productions to "expand" it. For each applicable
|
| 42 |
+
production, add the expanded subtree's children to the
|
| 43 |
+
frontier, and recursively find all parses that can be
|
| 44 |
+
generated by the new tree and frontier.
|
| 45 |
+
- If the first element of the frontier is a token, then "match"
|
| 46 |
+
it against the next token from the text. Remove the token
|
| 47 |
+
from the frontier, and recursively find all parses that can be
|
| 48 |
+
generated by the new tree and frontier.
|
| 49 |
+
|
| 50 |
+
:see: ``nltk.grammar``
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
def __init__(self, grammar, trace=0):
|
| 54 |
+
"""
|
| 55 |
+
Create a new ``RecursiveDescentParser``, that uses ``grammar``
|
| 56 |
+
to parse texts.
|
| 57 |
+
|
| 58 |
+
:type grammar: CFG
|
| 59 |
+
:param grammar: The grammar used to parse texts.
|
| 60 |
+
:type trace: int
|
| 61 |
+
:param trace: The level of tracing that should be used when
|
| 62 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 63 |
+
and higher numbers will produce more verbose tracing
|
| 64 |
+
output.
|
| 65 |
+
"""
|
| 66 |
+
self._grammar = grammar
|
| 67 |
+
self._trace = trace
|
| 68 |
+
|
| 69 |
+
def grammar(self):
|
| 70 |
+
return self._grammar
|
| 71 |
+
|
| 72 |
+
def parse(self, tokens):
|
| 73 |
+
# Inherit docs from ParserI
|
| 74 |
+
|
| 75 |
+
tokens = list(tokens)
|
| 76 |
+
self._grammar.check_coverage(tokens)
|
| 77 |
+
|
| 78 |
+
# Start a recursive descent parse, with an initial tree
|
| 79 |
+
# containing just the start symbol.
|
| 80 |
+
start = self._grammar.start().symbol()
|
| 81 |
+
initial_tree = Tree(start, [])
|
| 82 |
+
frontier = [()]
|
| 83 |
+
if self._trace:
|
| 84 |
+
self._trace_start(initial_tree, frontier, tokens)
|
| 85 |
+
return self._parse(tokens, initial_tree, frontier)
|
| 86 |
+
|
| 87 |
+
def _parse(self, remaining_text, tree, frontier):
|
| 88 |
+
"""
|
| 89 |
+
Recursively expand and match each elements of ``tree``
|
| 90 |
+
specified by ``frontier``, to cover ``remaining_text``. Return
|
| 91 |
+
a list of all parses found.
|
| 92 |
+
|
| 93 |
+
:return: An iterator of all parses that can be generated by
|
| 94 |
+
matching and expanding the elements of ``tree``
|
| 95 |
+
specified by ``frontier``.
|
| 96 |
+
:rtype: iter(Tree)
|
| 97 |
+
:type tree: Tree
|
| 98 |
+
:param tree: A partial structure for the text that is
|
| 99 |
+
currently being parsed. The elements of ``tree``
|
| 100 |
+
that are specified by ``frontier`` have not yet been
|
| 101 |
+
expanded or matched.
|
| 102 |
+
:type remaining_text: list(str)
|
| 103 |
+
:param remaining_text: The portion of the text that is not yet
|
| 104 |
+
covered by ``tree``.
|
| 105 |
+
:type frontier: list(tuple(int))
|
| 106 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 107 |
+
all subtrees that have not yet been expanded, and all
|
| 108 |
+
leaves that have not yet been matched. This list sorted
|
| 109 |
+
in left-to-right order of location within the tree.
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# If the tree covers the text, and there's nothing left to
|
| 113 |
+
# expand, then we've found a complete parse; return it.
|
| 114 |
+
if len(remaining_text) == 0 and len(frontier) == 0:
|
| 115 |
+
if self._trace:
|
| 116 |
+
self._trace_succeed(tree, frontier)
|
| 117 |
+
yield tree
|
| 118 |
+
|
| 119 |
+
# If there's still text, but nothing left to expand, we failed.
|
| 120 |
+
elif len(frontier) == 0:
|
| 121 |
+
if self._trace:
|
| 122 |
+
self._trace_backtrack(tree, frontier)
|
| 123 |
+
|
| 124 |
+
# If the next element on the frontier is a tree, expand it.
|
| 125 |
+
elif isinstance(tree[frontier[0]], Tree):
|
| 126 |
+
yield from self._expand(remaining_text, tree, frontier)
|
| 127 |
+
|
| 128 |
+
# If the next element on the frontier is a token, match it.
|
| 129 |
+
else:
|
| 130 |
+
yield from self._match(remaining_text, tree, frontier)
|
| 131 |
+
|
| 132 |
+
def _match(self, rtext, tree, frontier):
|
| 133 |
+
"""
|
| 134 |
+
:rtype: iter(Tree)
|
| 135 |
+
:return: an iterator of all parses that can be generated by
|
| 136 |
+
matching the first element of ``frontier`` against the
|
| 137 |
+
first token in ``rtext``. In particular, if the first
|
| 138 |
+
element of ``frontier`` has the same type as the first
|
| 139 |
+
token in ``rtext``, then substitute the token into
|
| 140 |
+
``tree``; and return all parses that can be generated by
|
| 141 |
+
matching and expanding the remaining elements of
|
| 142 |
+
``frontier``. If the first element of ``frontier`` does not
|
| 143 |
+
have the same type as the first token in ``rtext``, then
|
| 144 |
+
return empty list.
|
| 145 |
+
|
| 146 |
+
:type tree: Tree
|
| 147 |
+
:param tree: A partial structure for the text that is
|
| 148 |
+
currently being parsed. The elements of ``tree``
|
| 149 |
+
that are specified by ``frontier`` have not yet been
|
| 150 |
+
expanded or matched.
|
| 151 |
+
:type rtext: list(str)
|
| 152 |
+
:param rtext: The portion of the text that is not yet
|
| 153 |
+
covered by ``tree``.
|
| 154 |
+
:type frontier: list of tuple of int
|
| 155 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 156 |
+
all subtrees that have not yet been expanded, and all
|
| 157 |
+
leaves that have not yet been matched.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
tree_leaf = tree[frontier[0]]
|
| 161 |
+
if len(rtext) > 0 and tree_leaf == rtext[0]:
|
| 162 |
+
# If it's a terminal that matches rtext[0], then substitute
|
| 163 |
+
# in the token, and continue parsing.
|
| 164 |
+
newtree = tree.copy(deep=True)
|
| 165 |
+
newtree[frontier[0]] = rtext[0]
|
| 166 |
+
if self._trace:
|
| 167 |
+
self._trace_match(newtree, frontier[1:], rtext[0])
|
| 168 |
+
yield from self._parse(rtext[1:], newtree, frontier[1:])
|
| 169 |
+
else:
|
| 170 |
+
# If it's a non-matching terminal, fail.
|
| 171 |
+
if self._trace:
|
| 172 |
+
self._trace_backtrack(tree, frontier, rtext[:1])
|
| 173 |
+
|
| 174 |
+
def _expand(self, remaining_text, tree, frontier, production=None):
|
| 175 |
+
"""
|
| 176 |
+
:rtype: iter(Tree)
|
| 177 |
+
:return: An iterator of all parses that can be generated by
|
| 178 |
+
expanding the first element of ``frontier`` with
|
| 179 |
+
``production``. In particular, if the first element of
|
| 180 |
+
``frontier`` is a subtree whose node type is equal to
|
| 181 |
+
``production``'s left hand side, then add a child to that
|
| 182 |
+
subtree for each element of ``production``'s right hand
|
| 183 |
+
side; and return all parses that can be generated by
|
| 184 |
+
matching and expanding the remaining elements of
|
| 185 |
+
``frontier``. If the first element of ``frontier`` is not a
|
| 186 |
+
subtree whose node type is equal to ``production``'s left
|
| 187 |
+
hand side, then return an empty list. If ``production`` is
|
| 188 |
+
not specified, then return a list of all parses that can
|
| 189 |
+
be generated by expanding the first element of ``frontier``
|
| 190 |
+
with *any* CFG production.
|
| 191 |
+
|
| 192 |
+
:type tree: Tree
|
| 193 |
+
:param tree: A partial structure for the text that is
|
| 194 |
+
currently being parsed. The elements of ``tree``
|
| 195 |
+
that are specified by ``frontier`` have not yet been
|
| 196 |
+
expanded or matched.
|
| 197 |
+
:type remaining_text: list(str)
|
| 198 |
+
:param remaining_text: The portion of the text that is not yet
|
| 199 |
+
covered by ``tree``.
|
| 200 |
+
:type frontier: list(tuple(int))
|
| 201 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 202 |
+
all subtrees that have not yet been expanded, and all
|
| 203 |
+
leaves that have not yet been matched.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
if production is None:
|
| 207 |
+
productions = self._grammar.productions()
|
| 208 |
+
else:
|
| 209 |
+
productions = [production]
|
| 210 |
+
|
| 211 |
+
for production in productions:
|
| 212 |
+
lhs = production.lhs().symbol()
|
| 213 |
+
if lhs == tree[frontier[0]].label():
|
| 214 |
+
subtree = self._production_to_tree(production)
|
| 215 |
+
if frontier[0] == ():
|
| 216 |
+
newtree = subtree
|
| 217 |
+
else:
|
| 218 |
+
newtree = tree.copy(deep=True)
|
| 219 |
+
newtree[frontier[0]] = subtree
|
| 220 |
+
new_frontier = [
|
| 221 |
+
frontier[0] + (i,) for i in range(len(production.rhs()))
|
| 222 |
+
]
|
| 223 |
+
if self._trace:
|
| 224 |
+
self._trace_expand(newtree, new_frontier, production)
|
| 225 |
+
yield from self._parse(
|
| 226 |
+
remaining_text, newtree, new_frontier + frontier[1:]
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
def _production_to_tree(self, production):
|
| 230 |
+
"""
|
| 231 |
+
:rtype: Tree
|
| 232 |
+
:return: The Tree that is licensed by ``production``.
|
| 233 |
+
In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
|
| 234 |
+
return a tree that has a node ``lhs.symbol``, and
|
| 235 |
+
``n`` children. For each nonterminal element
|
| 236 |
+
``elt[i]`` in the production, the tree token has a
|
| 237 |
+
childless subtree with node value ``elt[i].symbol``; and
|
| 238 |
+
for each terminal element ``elt[j]``, the tree token has
|
| 239 |
+
a leaf token with type ``elt[j]``.
|
| 240 |
+
|
| 241 |
+
:param production: The CFG production that licenses the tree
|
| 242 |
+
token that should be returned.
|
| 243 |
+
:type production: Production
|
| 244 |
+
"""
|
| 245 |
+
children = []
|
| 246 |
+
for elt in production.rhs():
|
| 247 |
+
if isinstance(elt, Nonterminal):
|
| 248 |
+
children.append(Tree(elt.symbol(), []))
|
| 249 |
+
else:
|
| 250 |
+
# This will be matched.
|
| 251 |
+
children.append(elt)
|
| 252 |
+
return Tree(production.lhs().symbol(), children)
|
| 253 |
+
|
| 254 |
+
def trace(self, trace=2):
|
| 255 |
+
"""
|
| 256 |
+
Set the level of tracing output that should be generated when
|
| 257 |
+
parsing a text.
|
| 258 |
+
|
| 259 |
+
:type trace: int
|
| 260 |
+
:param trace: The trace level. A trace level of ``0`` will
|
| 261 |
+
generate no tracing output; and higher trace levels will
|
| 262 |
+
produce more verbose tracing output.
|
| 263 |
+
:rtype: None
|
| 264 |
+
"""
|
| 265 |
+
self._trace = trace
|
| 266 |
+
|
| 267 |
+
def _trace_fringe(self, tree, treeloc=None):
|
| 268 |
+
"""
|
| 269 |
+
Print trace output displaying the fringe of ``tree``. The
|
| 270 |
+
fringe of ``tree`` consists of all of its leaves and all of
|
| 271 |
+
its childless subtrees.
|
| 272 |
+
|
| 273 |
+
:rtype: None
|
| 274 |
+
"""
|
| 275 |
+
|
| 276 |
+
if treeloc == ():
|
| 277 |
+
print("*", end=" ")
|
| 278 |
+
if isinstance(tree, Tree):
|
| 279 |
+
if len(tree) == 0:
|
| 280 |
+
print(repr(Nonterminal(tree.label())), end=" ")
|
| 281 |
+
for i in range(len(tree)):
|
| 282 |
+
if treeloc is not None and i == treeloc[0]:
|
| 283 |
+
self._trace_fringe(tree[i], treeloc[1:])
|
| 284 |
+
else:
|
| 285 |
+
self._trace_fringe(tree[i])
|
| 286 |
+
else:
|
| 287 |
+
print(repr(tree), end=" ")
|
| 288 |
+
|
| 289 |
+
def _trace_tree(self, tree, frontier, operation):
|
| 290 |
+
"""
|
| 291 |
+
Print trace output displaying the parser's current state.
|
| 292 |
+
|
| 293 |
+
:param operation: A character identifying the operation that
|
| 294 |
+
generated the current state.
|
| 295 |
+
:rtype: None
|
| 296 |
+
"""
|
| 297 |
+
if self._trace == 2:
|
| 298 |
+
print(" %c [" % operation, end=" ")
|
| 299 |
+
else:
|
| 300 |
+
print(" [", end=" ")
|
| 301 |
+
if len(frontier) > 0:
|
| 302 |
+
self._trace_fringe(tree, frontier[0])
|
| 303 |
+
else:
|
| 304 |
+
self._trace_fringe(tree)
|
| 305 |
+
print("]")
|
| 306 |
+
|
| 307 |
+
def _trace_start(self, tree, frontier, text):
|
| 308 |
+
print("Parsing %r" % " ".join(text))
|
| 309 |
+
if self._trace > 2:
|
| 310 |
+
print("Start:")
|
| 311 |
+
if self._trace > 1:
|
| 312 |
+
self._trace_tree(tree, frontier, " ")
|
| 313 |
+
|
| 314 |
+
def _trace_expand(self, tree, frontier, production):
|
| 315 |
+
if self._trace > 2:
|
| 316 |
+
print("Expand: %s" % production)
|
| 317 |
+
if self._trace > 1:
|
| 318 |
+
self._trace_tree(tree, frontier, "E")
|
| 319 |
+
|
| 320 |
+
def _trace_match(self, tree, frontier, tok):
|
| 321 |
+
if self._trace > 2:
|
| 322 |
+
print("Match: %r" % tok)
|
| 323 |
+
if self._trace > 1:
|
| 324 |
+
self._trace_tree(tree, frontier, "M")
|
| 325 |
+
|
| 326 |
+
def _trace_succeed(self, tree, frontier):
|
| 327 |
+
if self._trace > 2:
|
| 328 |
+
print("GOOD PARSE:")
|
| 329 |
+
if self._trace == 1:
|
| 330 |
+
print("Found a parse:\n%s" % tree)
|
| 331 |
+
if self._trace > 1:
|
| 332 |
+
self._trace_tree(tree, frontier, "+")
|
| 333 |
+
|
| 334 |
+
def _trace_backtrack(self, tree, frontier, toks=None):
|
| 335 |
+
if self._trace > 2:
|
| 336 |
+
if toks:
|
| 337 |
+
print("Backtrack: %r match failed" % toks[0])
|
| 338 |
+
else:
|
| 339 |
+
print("Backtrack")
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
##//////////////////////////////////////////////////////
|
| 343 |
+
## Stepping Recursive Descent Parser
|
| 344 |
+
##//////////////////////////////////////////////////////
|
| 345 |
+
class SteppingRecursiveDescentParser(RecursiveDescentParser):
|
| 346 |
+
"""
|
| 347 |
+
A ``RecursiveDescentParser`` that allows you to step through the
|
| 348 |
+
parsing process, performing a single operation at a time.
|
| 349 |
+
|
| 350 |
+
The ``initialize`` method is used to start parsing a text.
|
| 351 |
+
``expand`` expands the first element on the frontier using a single
|
| 352 |
+
CFG production, and ``match`` matches the first element on the
|
| 353 |
+
frontier against the next text token. ``backtrack`` undoes the most
|
| 354 |
+
recent expand or match operation. ``step`` performs a single
|
| 355 |
+
expand, match, or backtrack operation. ``parses`` returns the set
|
| 356 |
+
of parses that have been found by the parser.
|
| 357 |
+
|
| 358 |
+
:ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
|
| 359 |
+
containing the previous states of the parser. This history is
|
| 360 |
+
used to implement the ``backtrack`` operation.
|
| 361 |
+
:ivar _tried_e: A record of all productions that have been tried
|
| 362 |
+
for a given tree. This record is used by ``expand`` to perform
|
| 363 |
+
the next untried production.
|
| 364 |
+
:ivar _tried_m: A record of what tokens have been matched for a
|
| 365 |
+
given tree. This record is used by ``step`` to decide whether
|
| 366 |
+
or not to match a token.
|
| 367 |
+
:see: ``nltk.grammar``
|
| 368 |
+
"""
|
| 369 |
+
|
| 370 |
+
def __init__(self, grammar, trace=0):
|
| 371 |
+
super().__init__(grammar, trace)
|
| 372 |
+
self._rtext = None
|
| 373 |
+
self._tree = None
|
| 374 |
+
self._frontier = [()]
|
| 375 |
+
self._tried_e = {}
|
| 376 |
+
self._tried_m = {}
|
| 377 |
+
self._history = []
|
| 378 |
+
self._parses = []
|
| 379 |
+
|
| 380 |
+
# [XX] TEMPORARY HACK WARNING! This should be replaced with
|
| 381 |
+
# something nicer when we get the chance.
|
| 382 |
+
def _freeze(self, tree):
|
| 383 |
+
c = tree.copy()
|
| 384 |
+
# for pos in c.treepositions('leaves'):
|
| 385 |
+
# c[pos] = c[pos].freeze()
|
| 386 |
+
return ImmutableTree.convert(c)
|
| 387 |
+
|
| 388 |
+
def parse(self, tokens):
|
| 389 |
+
tokens = list(tokens)
|
| 390 |
+
self.initialize(tokens)
|
| 391 |
+
while self.step() is not None:
|
| 392 |
+
pass
|
| 393 |
+
return self.parses()
|
| 394 |
+
|
| 395 |
+
def initialize(self, tokens):
|
| 396 |
+
"""
|
| 397 |
+
Start parsing a given text. This sets the parser's tree to
|
| 398 |
+
the start symbol, its frontier to the root node, and its
|
| 399 |
+
remaining text to ``token['SUBTOKENS']``.
|
| 400 |
+
"""
|
| 401 |
+
|
| 402 |
+
self._rtext = tokens
|
| 403 |
+
start = self._grammar.start().symbol()
|
| 404 |
+
self._tree = Tree(start, [])
|
| 405 |
+
self._frontier = [()]
|
| 406 |
+
self._tried_e = {}
|
| 407 |
+
self._tried_m = {}
|
| 408 |
+
self._history = []
|
| 409 |
+
self._parses = []
|
| 410 |
+
if self._trace:
|
| 411 |
+
self._trace_start(self._tree, self._frontier, self._rtext)
|
| 412 |
+
|
| 413 |
+
def remaining_text(self):
|
| 414 |
+
"""
|
| 415 |
+
:return: The portion of the text that is not yet covered by the
|
| 416 |
+
tree.
|
| 417 |
+
:rtype: list(str)
|
| 418 |
+
"""
|
| 419 |
+
return self._rtext
|
| 420 |
+
|
| 421 |
+
def frontier(self):
|
| 422 |
+
"""
|
| 423 |
+
:return: A list of the tree locations of all subtrees that
|
| 424 |
+
have not yet been expanded, and all leaves that have not
|
| 425 |
+
yet been matched.
|
| 426 |
+
:rtype: list(tuple(int))
|
| 427 |
+
"""
|
| 428 |
+
return self._frontier
|
| 429 |
+
|
| 430 |
+
def tree(self):
|
| 431 |
+
"""
|
| 432 |
+
:return: A partial structure for the text that is
|
| 433 |
+
currently being parsed. The elements specified by the
|
| 434 |
+
frontier have not yet been expanded or matched.
|
| 435 |
+
:rtype: Tree
|
| 436 |
+
"""
|
| 437 |
+
return self._tree
|
| 438 |
+
|
| 439 |
+
def step(self):
|
| 440 |
+
"""
|
| 441 |
+
Perform a single parsing operation. If an untried match is
|
| 442 |
+
possible, then perform the match, and return the matched
|
| 443 |
+
token. If an untried expansion is possible, then perform the
|
| 444 |
+
expansion, and return the production that it is based on. If
|
| 445 |
+
backtracking is possible, then backtrack, and return True.
|
| 446 |
+
Otherwise, return None.
|
| 447 |
+
|
| 448 |
+
:return: None if no operation was performed; a token if a match
|
| 449 |
+
was performed; a production if an expansion was performed;
|
| 450 |
+
and True if a backtrack operation was performed.
|
| 451 |
+
:rtype: Production or String or bool
|
| 452 |
+
"""
|
| 453 |
+
# Try matching (if we haven't already)
|
| 454 |
+
if self.untried_match():
|
| 455 |
+
token = self.match()
|
| 456 |
+
if token is not None:
|
| 457 |
+
return token
|
| 458 |
+
|
| 459 |
+
# Try expanding.
|
| 460 |
+
production = self.expand()
|
| 461 |
+
if production is not None:
|
| 462 |
+
return production
|
| 463 |
+
|
| 464 |
+
# Try backtracking
|
| 465 |
+
if self.backtrack():
|
| 466 |
+
self._trace_backtrack(self._tree, self._frontier)
|
| 467 |
+
return True
|
| 468 |
+
|
| 469 |
+
# Nothing left to do.
|
| 470 |
+
return None
|
| 471 |
+
|
| 472 |
+
def expand(self, production=None):
|
| 473 |
+
"""
|
| 474 |
+
Expand the first element of the frontier. In particular, if
|
| 475 |
+
the first element of the frontier is a subtree whose node type
|
| 476 |
+
is equal to ``production``'s left hand side, then add a child
|
| 477 |
+
to that subtree for each element of ``production``'s right hand
|
| 478 |
+
side. If ``production`` is not specified, then use the first
|
| 479 |
+
untried expandable production. If all expandable productions
|
| 480 |
+
have been tried, do nothing.
|
| 481 |
+
|
| 482 |
+
:return: The production used to expand the frontier, if an
|
| 483 |
+
expansion was performed. If no expansion was performed,
|
| 484 |
+
return None.
|
| 485 |
+
:rtype: Production or None
|
| 486 |
+
"""
|
| 487 |
+
|
| 488 |
+
# Make sure we *can* expand.
|
| 489 |
+
if len(self._frontier) == 0:
|
| 490 |
+
return None
|
| 491 |
+
if not isinstance(self._tree[self._frontier[0]], Tree):
|
| 492 |
+
return None
|
| 493 |
+
|
| 494 |
+
# If they didn't specify a production, check all untried ones.
|
| 495 |
+
if production is None:
|
| 496 |
+
productions = self.untried_expandable_productions()
|
| 497 |
+
else:
|
| 498 |
+
productions = [production]
|
| 499 |
+
|
| 500 |
+
parses = []
|
| 501 |
+
for prod in productions:
|
| 502 |
+
# Record that we've tried this production now.
|
| 503 |
+
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
|
| 504 |
+
|
| 505 |
+
# Try expanding.
|
| 506 |
+
for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
|
| 507 |
+
return prod
|
| 508 |
+
|
| 509 |
+
# We didn't expand anything.
|
| 510 |
+
return None
|
| 511 |
+
|
| 512 |
+
def match(self):
|
| 513 |
+
"""
|
| 514 |
+
Match the first element of the frontier. In particular, if
|
| 515 |
+
the first element of the frontier has the same type as the
|
| 516 |
+
next text token, then substitute the text token into the tree.
|
| 517 |
+
|
| 518 |
+
:return: The token matched, if a match operation was
|
| 519 |
+
performed. If no match was performed, return None
|
| 520 |
+
:rtype: str or None
|
| 521 |
+
"""
|
| 522 |
+
|
| 523 |
+
# Record that we've tried matching this token.
|
| 524 |
+
tok = self._rtext[0]
|
| 525 |
+
self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
|
| 526 |
+
|
| 527 |
+
# Make sure we *can* match.
|
| 528 |
+
if len(self._frontier) == 0:
|
| 529 |
+
return None
|
| 530 |
+
if isinstance(self._tree[self._frontier[0]], Tree):
|
| 531 |
+
return None
|
| 532 |
+
|
| 533 |
+
for _result in self._match(self._rtext, self._tree, self._frontier):
|
| 534 |
+
# Return the token we just matched.
|
| 535 |
+
return self._history[-1][0][0]
|
| 536 |
+
return None
|
| 537 |
+
|
| 538 |
+
def backtrack(self):
|
| 539 |
+
"""
|
| 540 |
+
Return the parser to its state before the most recent
|
| 541 |
+
match or expand operation. Calling ``undo`` repeatedly return
|
| 542 |
+
the parser to successively earlier states. If no match or
|
| 543 |
+
expand operations have been performed, ``undo`` will make no
|
| 544 |
+
changes.
|
| 545 |
+
|
| 546 |
+
:return: true if an operation was successfully undone.
|
| 547 |
+
:rtype: bool
|
| 548 |
+
"""
|
| 549 |
+
if len(self._history) == 0:
|
| 550 |
+
return False
|
| 551 |
+
(self._rtext, self._tree, self._frontier) = self._history.pop()
|
| 552 |
+
return True
|
| 553 |
+
|
| 554 |
+
def expandable_productions(self):
|
| 555 |
+
"""
|
| 556 |
+
:return: A list of all the productions for which expansions
|
| 557 |
+
are available for the current parser state.
|
| 558 |
+
:rtype: list(Production)
|
| 559 |
+
"""
|
| 560 |
+
# Make sure we *can* expand.
|
| 561 |
+
if len(self._frontier) == 0:
|
| 562 |
+
return []
|
| 563 |
+
frontier_child = self._tree[self._frontier[0]]
|
| 564 |
+
if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
|
| 565 |
+
return []
|
| 566 |
+
|
| 567 |
+
return [
|
| 568 |
+
p
|
| 569 |
+
for p in self._grammar.productions()
|
| 570 |
+
if p.lhs().symbol() == frontier_child.label()
|
| 571 |
+
]
|
| 572 |
+
|
| 573 |
+
def untried_expandable_productions(self):
|
| 574 |
+
"""
|
| 575 |
+
:return: A list of all the untried productions for which
|
| 576 |
+
expansions are available for the current parser state.
|
| 577 |
+
:rtype: list(Production)
|
| 578 |
+
"""
|
| 579 |
+
|
| 580 |
+
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
|
| 581 |
+
return [p for p in self.expandable_productions() if p not in tried_expansions]
|
| 582 |
+
|
| 583 |
+
def untried_match(self):
|
| 584 |
+
"""
|
| 585 |
+
:return: Whether the first element of the frontier is a token
|
| 586 |
+
that has not yet been matched.
|
| 587 |
+
:rtype: bool
|
| 588 |
+
"""
|
| 589 |
+
|
| 590 |
+
if len(self._rtext) == 0:
|
| 591 |
+
return False
|
| 592 |
+
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
|
| 593 |
+
return self._rtext[0] not in tried_matches
|
| 594 |
+
|
| 595 |
+
def currently_complete(self):
|
| 596 |
+
"""
|
| 597 |
+
:return: Whether the parser's current state represents a
|
| 598 |
+
complete parse.
|
| 599 |
+
:rtype: bool
|
| 600 |
+
"""
|
| 601 |
+
return len(self._frontier) == 0 and len(self._rtext) == 0
|
| 602 |
+
|
| 603 |
+
def _parse(self, remaining_text, tree, frontier):
|
| 604 |
+
"""
|
| 605 |
+
A stub version of ``_parse`` that sets the parsers current
|
| 606 |
+
state to the given arguments. In ``RecursiveDescentParser``,
|
| 607 |
+
the ``_parse`` method is used to recursively continue parsing a
|
| 608 |
+
text. ``SteppingRecursiveDescentParser`` overrides it to
|
| 609 |
+
capture these recursive calls. It records the parser's old
|
| 610 |
+
state in the history (to allow for backtracking), and updates
|
| 611 |
+
the parser's new state using the given arguments. Finally, it
|
| 612 |
+
returns ``[1]``, which is used by ``match`` and ``expand`` to
|
| 613 |
+
detect whether their operations were successful.
|
| 614 |
+
|
| 615 |
+
:return: ``[1]``
|
| 616 |
+
:rtype: list of int
|
| 617 |
+
"""
|
| 618 |
+
self._history.append((self._rtext, self._tree, self._frontier))
|
| 619 |
+
self._rtext = remaining_text
|
| 620 |
+
self._tree = tree
|
| 621 |
+
self._frontier = frontier
|
| 622 |
+
|
| 623 |
+
# Is it a good parse? If so, record it.
|
| 624 |
+
if len(frontier) == 0 and len(remaining_text) == 0:
|
| 625 |
+
self._parses.append(tree)
|
| 626 |
+
self._trace_succeed(self._tree, self._frontier)
|
| 627 |
+
|
| 628 |
+
return [1]
|
| 629 |
+
|
| 630 |
+
def parses(self):
|
| 631 |
+
"""
|
| 632 |
+
:return: An iterator of the parses that have been found by this
|
| 633 |
+
parser so far.
|
| 634 |
+
:rtype: list of Tree
|
| 635 |
+
"""
|
| 636 |
+
return iter(self._parses)
|
| 637 |
+
|
| 638 |
+
def set_grammar(self, grammar):
|
| 639 |
+
"""
|
| 640 |
+
Change the grammar used to parse texts.
|
| 641 |
+
|
| 642 |
+
:param grammar: The new grammar.
|
| 643 |
+
:type grammar: CFG
|
| 644 |
+
"""
|
| 645 |
+
self._grammar = grammar
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
##//////////////////////////////////////////////////////
|
| 649 |
+
## Demonstration Code
|
| 650 |
+
##//////////////////////////////////////////////////////
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk import CFG, parse

    grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    # Show the grammar, then parse a sample sentence with full tracing.
    for production in grammar.productions():
        print(production)

    sent = "I saw a man in the park".split()
    parser = parse.RecursiveDescentParser(grammar, trace=2)
    for result in parser.parse(sent):
        print(result)


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py
ADDED
|
@@ -0,0 +1,479 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Shift-Reduce Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from nltk.grammar import Nonterminal
|
| 10 |
+
from nltk.parse.api import ParserI
|
| 11 |
+
from nltk.tree import Tree
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Shift/Reduce Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
class ShiftReduceParser(ParserI):
|
| 18 |
+
"""
|
| 19 |
+
A simple bottom-up CFG parser that uses two operations, "shift"
|
| 20 |
+
and "reduce", to find a single parse for a text.
|
| 21 |
+
|
| 22 |
+
``ShiftReduceParser`` maintains a stack, which records the
|
| 23 |
+
structure of a portion of the text. This stack is a list of
|
| 24 |
+
strings and Trees that collectively cover a portion of
|
| 25 |
+
the text. For example, while parsing the sentence "the dog saw
|
| 26 |
+
the man" with a typical grammar, ``ShiftReduceParser`` will produce
|
| 27 |
+
the following stack, which covers "the dog saw"::
|
| 28 |
+
|
| 29 |
+
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
|
| 30 |
+
|
| 31 |
+
``ShiftReduceParser`` attempts to extend the stack to cover the
|
| 32 |
+
entire text, and to combine the stack elements into a single tree,
|
| 33 |
+
producing a complete parse for the sentence.
|
| 34 |
+
|
| 35 |
+
Initially, the stack is empty. It is extended to cover the text,
|
| 36 |
+
from left to right, by repeatedly applying two operations:
|
| 37 |
+
|
| 38 |
+
- "shift" moves a token from the beginning of the text to the
|
| 39 |
+
end of the stack.
|
| 40 |
+
- "reduce" uses a CFG production to combine the rightmost stack
|
| 41 |
+
elements into a single Tree.
|
| 42 |
+
|
| 43 |
+
Often, more than one operation can be performed on a given stack.
|
| 44 |
+
In this case, ``ShiftReduceParser`` uses the following heuristics
|
| 45 |
+
to decide which operation to perform:
|
| 46 |
+
|
| 47 |
+
- Only shift if no reductions are available.
|
| 48 |
+
- If multiple reductions are available, then apply the reduction
|
| 49 |
+
whose CFG production is listed earliest in the grammar.
|
| 50 |
+
|
| 51 |
+
Note that these heuristics are not guaranteed to choose an
|
| 52 |
+
operation that leads to a parse of the text. Also, if multiple
|
| 53 |
+
parses exists, ``ShiftReduceParser`` will return at most one of
|
| 54 |
+
them.
|
| 55 |
+
|
| 56 |
+
:see: ``nltk.grammar``
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
def __init__(self, grammar, trace=0):
|
| 60 |
+
"""
|
| 61 |
+
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
|
| 62 |
+
parse texts.
|
| 63 |
+
|
| 64 |
+
:type grammar: Grammar
|
| 65 |
+
:param grammar: The grammar used to parse texts.
|
| 66 |
+
:type trace: int
|
| 67 |
+
:param trace: The level of tracing that should be used when
|
| 68 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 69 |
+
and higher numbers will produce more verbose tracing
|
| 70 |
+
output.
|
| 71 |
+
"""
|
| 72 |
+
self._grammar = grammar
|
| 73 |
+
self._trace = trace
|
| 74 |
+
self._check_grammar()
|
| 75 |
+
|
| 76 |
+
def grammar(self):
|
| 77 |
+
return self._grammar
|
| 78 |
+
|
| 79 |
+
def parse(self, tokens):
|
| 80 |
+
tokens = list(tokens)
|
| 81 |
+
self._grammar.check_coverage(tokens)
|
| 82 |
+
|
| 83 |
+
# initialize the stack.
|
| 84 |
+
stack = []
|
| 85 |
+
remaining_text = tokens
|
| 86 |
+
|
| 87 |
+
# Trace output.
|
| 88 |
+
if self._trace:
|
| 89 |
+
print("Parsing %r" % " ".join(tokens))
|
| 90 |
+
self._trace_stack(stack, remaining_text)
|
| 91 |
+
|
| 92 |
+
# iterate through the text, pushing the token onto
|
| 93 |
+
# the stack, then reducing the stack.
|
| 94 |
+
while len(remaining_text) > 0:
|
| 95 |
+
self._shift(stack, remaining_text)
|
| 96 |
+
while self._reduce(stack, remaining_text):
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
# Did we reduce everything?
|
| 100 |
+
if len(stack) == 1:
|
| 101 |
+
# Did we end up with the right category?
|
| 102 |
+
if stack[0].label() == self._grammar.start().symbol():
|
| 103 |
+
yield stack[0]
|
| 104 |
+
|
| 105 |
+
def _shift(self, stack, remaining_text):
|
| 106 |
+
"""
|
| 107 |
+
Move a token from the beginning of ``remaining_text`` to the
|
| 108 |
+
end of ``stack``.
|
| 109 |
+
|
| 110 |
+
:type stack: list(str and Tree)
|
| 111 |
+
:param stack: A list of strings and Trees, encoding
|
| 112 |
+
the structure of the text that has been parsed so far.
|
| 113 |
+
:type remaining_text: list(str)
|
| 114 |
+
:param remaining_text: The portion of the text that is not yet
|
| 115 |
+
covered by ``stack``.
|
| 116 |
+
:rtype: None
|
| 117 |
+
"""
|
| 118 |
+
stack.append(remaining_text[0])
|
| 119 |
+
remaining_text.remove(remaining_text[0])
|
| 120 |
+
if self._trace:
|
| 121 |
+
self._trace_shift(stack, remaining_text)
|
| 122 |
+
|
| 123 |
+
def _match_rhs(self, rhs, rightmost_stack):
|
| 124 |
+
"""
|
| 125 |
+
:rtype: bool
|
| 126 |
+
:return: true if the right hand side of a CFG production
|
| 127 |
+
matches the rightmost elements of the stack. ``rhs``
|
| 128 |
+
matches ``rightmost_stack`` if they are the same length,
|
| 129 |
+
and each element of ``rhs`` matches the corresponding
|
| 130 |
+
element of ``rightmost_stack``. A nonterminal element of
|
| 131 |
+
``rhs`` matches any Tree whose node value is equal
|
| 132 |
+
to the nonterminal's symbol. A terminal element of ``rhs``
|
| 133 |
+
matches any string whose type is equal to the terminal.
|
| 134 |
+
:type rhs: list(terminal and Nonterminal)
|
| 135 |
+
:param rhs: The right hand side of a CFG production.
|
| 136 |
+
:type rightmost_stack: list(string and Tree)
|
| 137 |
+
:param rightmost_stack: The rightmost elements of the parser's
|
| 138 |
+
stack.
|
| 139 |
+
"""
|
| 140 |
+
|
| 141 |
+
if len(rightmost_stack) != len(rhs):
|
| 142 |
+
return False
|
| 143 |
+
for i in range(len(rightmost_stack)):
|
| 144 |
+
if isinstance(rightmost_stack[i], Tree):
|
| 145 |
+
if not isinstance(rhs[i], Nonterminal):
|
| 146 |
+
return False
|
| 147 |
+
if rightmost_stack[i].label() != rhs[i].symbol():
|
| 148 |
+
return False
|
| 149 |
+
else:
|
| 150 |
+
if isinstance(rhs[i], Nonterminal):
|
| 151 |
+
return False
|
| 152 |
+
if rightmost_stack[i] != rhs[i]:
|
| 153 |
+
return False
|
| 154 |
+
return True
|
| 155 |
+
|
| 156 |
+
def _reduce(self, stack, remaining_text, production=None):
|
| 157 |
+
"""
|
| 158 |
+
Find a CFG production whose right hand side matches the
|
| 159 |
+
rightmost stack elements; and combine those stack elements
|
| 160 |
+
into a single Tree, with the node specified by the
|
| 161 |
+
production's left-hand side. If more than one CFG production
|
| 162 |
+
matches the stack, then use the production that is listed
|
| 163 |
+
earliest in the grammar. The new Tree replaces the
|
| 164 |
+
elements in the stack.
|
| 165 |
+
|
| 166 |
+
:rtype: Production or None
|
| 167 |
+
:return: If a reduction is performed, then return the CFG
|
| 168 |
+
production that the reduction is based on; otherwise,
|
| 169 |
+
return false.
|
| 170 |
+
:type stack: list(string and Tree)
|
| 171 |
+
:param stack: A list of strings and Trees, encoding
|
| 172 |
+
the structure of the text that has been parsed so far.
|
| 173 |
+
:type remaining_text: list(str)
|
| 174 |
+
:param remaining_text: The portion of the text that is not yet
|
| 175 |
+
covered by ``stack``.
|
| 176 |
+
"""
|
| 177 |
+
if production is None:
|
| 178 |
+
productions = self._grammar.productions()
|
| 179 |
+
else:
|
| 180 |
+
productions = [production]
|
| 181 |
+
|
| 182 |
+
# Try each production, in order.
|
| 183 |
+
for production in productions:
|
| 184 |
+
rhslen = len(production.rhs())
|
| 185 |
+
|
| 186 |
+
# check if the RHS of a production matches the top of the stack
|
| 187 |
+
if self._match_rhs(production.rhs(), stack[-rhslen:]):
|
| 188 |
+
|
| 189 |
+
# combine the tree to reflect the reduction
|
| 190 |
+
tree = Tree(production.lhs().symbol(), stack[-rhslen:])
|
| 191 |
+
stack[-rhslen:] = [tree]
|
| 192 |
+
|
| 193 |
+
# We reduced something
|
| 194 |
+
if self._trace:
|
| 195 |
+
self._trace_reduce(stack, production, remaining_text)
|
| 196 |
+
return production
|
| 197 |
+
|
| 198 |
+
# We didn't reduce anything
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
def trace(self, trace=2):
|
| 202 |
+
"""
|
| 203 |
+
Set the level of tracing output that should be generated when
|
| 204 |
+
parsing a text.
|
| 205 |
+
|
| 206 |
+
:type trace: int
|
| 207 |
+
:param trace: The trace level. A trace level of ``0`` will
|
| 208 |
+
generate no tracing output; and higher trace levels will
|
| 209 |
+
produce more verbose tracing output.
|
| 210 |
+
:rtype: None
|
| 211 |
+
"""
|
| 212 |
+
# 1: just show shifts.
|
| 213 |
+
# 2: show shifts & reduces
|
| 214 |
+
# 3: display which tokens & productions are shifed/reduced
|
| 215 |
+
self._trace = trace
|
| 216 |
+
|
| 217 |
+
def _trace_stack(self, stack, remaining_text, marker=" "):
|
| 218 |
+
"""
|
| 219 |
+
Print trace output displaying the given stack and text.
|
| 220 |
+
|
| 221 |
+
:rtype: None
|
| 222 |
+
:param marker: A character that is printed to the left of the
|
| 223 |
+
stack. This is used with trace level 2 to print 'S'
|
| 224 |
+
before shifted stacks and 'R' before reduced stacks.
|
| 225 |
+
"""
|
| 226 |
+
s = " " + marker + " [ "
|
| 227 |
+
for elt in stack:
|
| 228 |
+
if isinstance(elt, Tree):
|
| 229 |
+
s += repr(Nonterminal(elt.label())) + " "
|
| 230 |
+
else:
|
| 231 |
+
s += repr(elt) + " "
|
| 232 |
+
s += "* " + " ".join(remaining_text) + "]"
|
| 233 |
+
print(s)
|
| 234 |
+
|
| 235 |
+
def _trace_shift(self, stack, remaining_text):
|
| 236 |
+
"""
|
| 237 |
+
Print trace output displaying that a token has been shifted.
|
| 238 |
+
|
| 239 |
+
:rtype: None
|
| 240 |
+
"""
|
| 241 |
+
if self._trace > 2:
|
| 242 |
+
print("Shift %r:" % stack[-1])
|
| 243 |
+
if self._trace == 2:
|
| 244 |
+
self._trace_stack(stack, remaining_text, "S")
|
| 245 |
+
elif self._trace > 0:
|
| 246 |
+
self._trace_stack(stack, remaining_text)
|
| 247 |
+
|
| 248 |
+
def _trace_reduce(self, stack, production, remaining_text):
|
| 249 |
+
"""
|
| 250 |
+
Print trace output displaying that ``production`` was used to
|
| 251 |
+
reduce ``stack``.
|
| 252 |
+
|
| 253 |
+
:rtype: None
|
| 254 |
+
"""
|
| 255 |
+
if self._trace > 2:
|
| 256 |
+
rhs = " ".join(production.rhs())
|
| 257 |
+
print(f"Reduce {production.lhs()!r} <- {rhs}")
|
| 258 |
+
if self._trace == 2:
|
| 259 |
+
self._trace_stack(stack, remaining_text, "R")
|
| 260 |
+
elif self._trace > 1:
|
| 261 |
+
self._trace_stack(stack, remaining_text)
|
| 262 |
+
|
| 263 |
+
def _check_grammar(self):
|
| 264 |
+
"""
|
| 265 |
+
Check to make sure that all of the CFG productions are
|
| 266 |
+
potentially useful. If any productions can never be used,
|
| 267 |
+
then print a warning.
|
| 268 |
+
|
| 269 |
+
:rtype: None
|
| 270 |
+
"""
|
| 271 |
+
productions = self._grammar.productions()
|
| 272 |
+
|
| 273 |
+
# Any production whose RHS is an extension of another production's RHS
|
| 274 |
+
# will never be used.
|
| 275 |
+
for i in range(len(productions)):
|
| 276 |
+
for j in range(i + 1, len(productions)):
|
| 277 |
+
rhs1 = productions[i].rhs()
|
| 278 |
+
rhs2 = productions[j].rhs()
|
| 279 |
+
if rhs1[: len(rhs2)] == rhs2:
|
| 280 |
+
print("Warning: %r will never be used" % productions[i])
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
##//////////////////////////////////////////////////////
|
| 284 |
+
## Stepping Shift/Reduce Parser
|
| 285 |
+
##//////////////////////////////////////////////////////
|
| 286 |
+
class SteppingShiftReduceParser(ShiftReduceParser):
    """
    A ``ShiftReduceParser`` that can be driven one operation at a time,
    which makes it possible to step through the parsing process and even
    to swap in a different grammar midway through parsing a text.

    Call ``initialize`` to begin parsing a text.  ``shift`` performs a
    single shift operation and ``reduce`` a single reduce operation;
    ``step`` tries a reduce first and falls back to a shift.  ``parses``
    yields the parses found so far.

    :ivar _history: A list of ``(stack, remaining_text)`` pairs holding
        every earlier parser state; it backs the ``undo`` operation.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super().__init__(grammar, trace)
        self._stack = None
        self._remaining_text = None
        self._history = []

    def parse(self, tokens):
        # Drive the parser to completion one step at a time, then report
        # whatever complete parses were found.
        token_list = list(tokens)
        self.initialize(token_list)
        progressing = True
        while progressing:
            progressing = bool(self.step())
        return self.parses()

    def stack(self):
        """
        :return: The parser's stack.
        :rtype: list(str and Tree)
        """
        return self._stack

    def remaining_text(self):
        """
        :return: The portion of the text not yet covered by the stack.
        :rtype: list(str)
        """
        return self._remaining_text

    def initialize(self, tokens):
        """
        Start parsing a given text: reset the stack to ``[]``, set the
        remaining text to ``tokens``, and clear the undo history.
        """
        self._stack = []
        self._remaining_text = tokens
        self._history = []

    def step(self):
        """
        Perform a single parsing operation.  A reduction is attempted
        first; if one applies, the production it used is returned.
        Otherwise a shift is attempted, returning True on success and
        False if neither operation was possible.

        :return: False if nothing was done; True if a shift was
            performed; the CFG production used if a reduction was
            performed.
        :rtype: Production or bool
        """
        reduction = self.reduce()
        if reduction:
            return reduction
        return self.shift()

    def shift(self):
        """
        Move one token from the front of the remaining text to the top
        of the stack.  Does nothing when no tokens remain.

        :return: True if the shift operation was successful.
        :rtype: bool
        """
        if not self._remaining_text:
            return False
        # Snapshot the current state so the operation can be undone.
        self._history.append((self._stack[:], self._remaining_text[:]))
        self._shift(self._stack, self._remaining_text)
        return True

    def reduce(self, production=None):
        """
        Combine the rightmost stack elements into a single Tree using
        ``production``.  Does nothing if ``production`` does not match
        the rightmost stack elements.

        :return: The production used to reduce the stack, or None when
            no reduction was performed.
        :rtype: Production or None
        """
        # Snapshot first; roll the snapshot back if nothing happened.
        self._history.append((self._stack[:], self._remaining_text[:]))
        applied = self._reduce(self._stack, self._remaining_text, production)
        if not applied:
            self._history.pop()
        return applied

    def undo(self):
        """
        Restore the parser state preceding the most recent shift or
        reduce.  Repeated calls walk back through successively earlier
        states; with an empty history nothing changes.

        :return: true if an operation was successfully undone.
        :rtype: bool
        """
        if not self._history:
            return False
        self._stack, self._remaining_text = self._history.pop()
        return True

    def reducible_productions(self):
        """
        :return: A list of the productions for which reductions are
            available in the current parser state.
        :rtype: list(Production)
        """
        return [
            candidate
            for candidate in self._grammar.productions()
            if self._match_rhs(
                candidate.rhs(), self._stack[-len(candidate.rhs()) :]
            )
        ]

    def parses(self):
        """
        :return: An iterator over the parses found so far: yields the
            single stack element when the text is consumed and the stack
            holds one tree labeled with the grammar's start symbol.
        :rtype: iter(Tree)
        """
        if not self._remaining_text and len(self._stack) == 1:
            if self._stack[0].label() == self._grammar.start().symbol():
                yield self._stack[0]

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
##//////////////////////////////////////////////////////
|
| 446 |
+
## Demonstration Code
|
| 447 |
+
##//////////////////////////////////////////////////////
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def demo():
    """
    A demonstration of the shift-reduce parser.
    """

    from nltk import CFG, parse

    toy_grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    tokens = "I saw a man in the park".split()

    # trace=2 makes the parser print each shift/reduce step it takes.
    demo_parser = parse.ShiftReduceParser(toy_grammar, trace=2)
    for tree in demo_parser.parse(tokens):
        print(tree)
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# Run the shift-reduce demonstration when executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to the Stanford Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Xu <xxu@student.unimelb.edu.au>
|
| 5 |
+
#
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import tempfile
|
| 11 |
+
import warnings
|
| 12 |
+
from subprocess import PIPE
|
| 13 |
+
|
| 14 |
+
from nltk.internals import (
|
| 15 |
+
_java_options,
|
| 16 |
+
config_java,
|
| 17 |
+
find_jar_iter,
|
| 18 |
+
find_jars_within_path,
|
| 19 |
+
java,
|
| 20 |
+
)
|
| 21 |
+
from nltk.parse.api import ParserI
|
| 22 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 23 |
+
from nltk.tree import Tree
|
| 24 |
+
|
| 25 |
+
# Download page for the standalone Stanford lexicalized parser; shown to
# the user when the required jar files cannot be located.
_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser.

    Runs the parser as an external Java process: input sentences are
    written to a temporary file (or piped on stdin), the configured main
    class is invoked, and its textual output is parsed back into trees
    via the subclass hook ``_make_tree``.
    """

    # Regex patterns used to locate the parser code jar and models jar.
    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
    _JAR = r"stanford-parser\.jar"
    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

    # When True, input is piped to the Java process on stdin rather than
    # passed as a file-name argument (see _execute).
    _USE_STDIN = False
    # When True, _parse_trees_output treats a single blank line as a
    # tree separator and a double blank line as a sentence separator.
    _DOUBLE_SPACED_OUTPUT = False

    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        encoding="utf8",
        verbose=False,
        java_options="-mx4g",
        corenlp_options="",
    ):

        # find the most recent code and model jar
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            # NOTE(review): this lambda parameter shadows the
            # ``model_path`` argument above; it only receives jar paths.
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # self._classpath = (stanford_jar, model_jar)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options

    def _parse_trees_output(self, output_):
        """Split the parser's raw text output into per-sentence tree
        iterators.

        Blank lines delimit units: with ``_DOUBLE_SPACED_OUTPUT`` set,
        one blank line ends a tree and two consecutive blank lines end a
        sentence; otherwise each blank line ends a one-tree sentence.

        :param output_: raw decoded stdout of the Java process
        :rtype: iter(iter(Tree))
        """
        res = []
        cur_lines = []
        cur_trees = []
        blank = False
        for line in output_.splitlines(False):
            if line == "":
                if blank:
                    # Second blank line in a row: flush the sentence.
                    res.append(iter(cur_trees))
                    cur_trees = []
                    blank = False
                elif self._DOUBLE_SPACED_OUTPUT:
                    # First blank line: one tree of the sentence is done.
                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
                    cur_lines = []
                    blank = True
                else:
                    # Single-spaced output: blank line ends the sentence.
                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
                    cur_lines = []
            else:
                cur_lines.append(line)
                blank = False
        return iter(res)

    def parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this StanfordParser instance's
        tagger.
        If whitespaces exists inside a token, then the token will be treated as
        separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-escaper",
            "edu.stanford.nlp.process.PTBEscapingProcessor",
        ]
        return self._parse_trees_output(
            self._execute(
                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
            )
        )

    def raw_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by
        the Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        return next(self.raw_parse_sents([sentence], verbose))

    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
        ]
        return self._parse_trees_output(
            self._execute(cmd, "\n".join(sentences), verbose)
        )

    def tagged_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        return next(self.tagged_parse_sents([sentence], verbose))

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        """
        tag_separator = "/"
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-tagSeparator",
            tag_separator,
            "-tokenizerFactory",
            "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerMethod",
            "newCoreLabelTokenizerFactory",
        ]
        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
        return self._parse_trees_output(
            self._execute(
                cmd,
                "\n".join(
                    " ".join(tag_separator.join(tagged) for tagged in sentence)
                    for sentence in sentences
                ),
                verbose,
            )
        )

    def _execute(self, cmd, input_, verbose=False):
        """Run the Java main class on ``input_`` and return decoded stdout.

        ``input_`` is written to a temporary file which is either passed
        as the final command-line argument or piped on stdin, depending
        on ``_USE_STDIN``.  Java options are temporarily overridden and
        restored before returning.

        :param cmd: command-line arguments for the Java main class
        :param input_: the sentences to parse, str or bytes
        :rtype: str
        """
        encoding = self._encoding
        cmd.extend(["-encoding", encoding])
        if self.corenlp_options:
            cmd.extend(self.corenlp_options.split())

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, str) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(
                    cmd,
                    classpath=self._classpath,
                    stdin=input_file,
                    stdout=PIPE,
                    stderr=PIPE,
                )
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(
                    cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                )

            # Normalize non-breaking-space byte sequences before decoding.
            stdout = stdout.replace(b"\xc2\xa0", b" ")
            stdout = stdout.replace(b"\x00\xa0", b" ")
            stdout = stdout.decode(encoding)

        # Remove the temp file ourselves since it was created delete=False.
        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class StanfordParser(GenericStanfordParser):
    """
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    """

    # Ask the lexicalized parser for Penn-treebank bracketed output.
    _OUTPUT_FORMAT = "penn"

    def __init__(self, *args, **kwargs):
        # This wrapper is deprecated in favour of the CoreNLP server API.
        deprecation_message = (
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # A Penn-bracketed string loads directly as a constituency Tree.
        return Tree.fromstring(result)
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
class StanfordDependencyParser(GenericStanfordParser):
    """
    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    """

    # Request CoNLL-2007 dependency output instead of bracketed trees.
    _OUTPUT_FORMAT = "conll2007"

    def __init__(self, *args, **kwargs):
        # This wrapper is deprecated in favour of the CoreNLP server API.
        deprecation_message = (
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead."
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # CoNLL output is loaded as a DependencyGraph rooted at "root".
        return DependencyGraph(result, top_relation_label="root")
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
class StanfordNeuralDependencyParser(GenericStanfordParser):
    """
    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
    u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
    Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    """

    # CoreNLP pipeline configuration: CoNLL dependency output, input
    # piped on stdin, parses separated by blank lines.
    _OUTPUT_FORMAT = "conll"
    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        """Emit the deprecation warning, locate the CoreNLP jars via the
        base class, and append the pipeline annotators required for
        dependency parsing to ``corenlp_options``."""
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)
        # BUG FIX: previously the annotator flag was concatenated onto any
        # user-supplied ``corenlp_options`` with no separating whitespace,
        # fusing it with the preceding token (e.g. "-foo-annotators") and
        # producing a broken command line in ``_execute``.  Insert a space
        # when options are already present; the default (empty) case is
        # unchanged.
        if self.corenlp_options:
            self.corenlp_options += " "
        self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        """
        raise NotImplementedError(
            "tagged_parse[_sents] is not supported by "
            "StanfordNeuralDependencyParser; use "
            "parse[_sents] or raw_parse[_sents] instead."
        )

    def _make_tree(self, result):
        # CoNLL output is loaded as a DependencyGraph rooted at "ROOT".
        return DependencyGraph(result, top_relation_label="ROOT")
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py
ADDED
|
@@ -0,0 +1,794 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
|
| 2 |
+
#
|
| 3 |
+
# Author: Long Duong <longdt219@gmail.com>
|
| 4 |
+
#
|
| 5 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import pickle
|
| 10 |
+
import tempfile
|
| 11 |
+
from copy import deepcopy
|
| 12 |
+
from operator import itemgetter
|
| 13 |
+
from os import remove
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from numpy import array
|
| 17 |
+
from scipy import sparse
|
| 18 |
+
from sklearn import svm
|
| 19 |
+
from sklearn.datasets import load_svmlight_file
|
| 20 |
+
except ImportError:
|
| 21 |
+
pass
|
| 22 |
+
|
| 23 |
+
from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Configuration:
    """
    Class for holding configuration which is the partial analysis of the input sentence.
    The transition based parser aims at finding set of operators that transfer the initial
    configuration to the terminal configuration.

    The configuration includes:
        - Stack: for storing partially proceeded words
        - Buffer: for storing remaining input words
        - Set of arcs: for storing partially built dependency tree

    This class also provides a method to represent a configuration as list of features.
    """

    def __init__(self, dep_graph):
        """
        :param dep_graph: the representation of an input in the form of dependency graph.
        :type dep_graph: DependencyGraph where the dependencies are not specified.
        """
        # dep_graph.nodes maps addresses to token dicts; address 0 is the
        # artificial root, addresses 1..n are the sentence words.
        self.stack = [0]  # the root element starts on the stack
        self.buffer = list(range(1, len(dep_graph.nodes)))  # remaining words
        self.arcs = []  # collected (head, relation, dependent) arcs
        self._tokens = dep_graph.nodes
        self._max_address = len(self.buffer)

    def __str__(self):
        return f"Stack : {self.stack}  Buffer : {self.buffer}  Arcs : {self.arcs}".replace("  ", " ")

    def _check_informative(self, feat, flag=False):
        """
        Check whether a feature is informative.
        The flag controls whether "_" counts as informative or not.
        """
        if feat is None or feat == "":
            return False
        if feat == "_" and not flag:
            return False
        return True

    def _append_token_features(self, result, address, prefix, with_form, with_extras):
        """
        Append the features of the token at ``address`` to ``result``,
        tagged with ``prefix`` (e.g. "STK_0").  ``with_form`` enables the
        word-form feature; ``with_extras`` enables lemma and morphology.
        """
        token = self._tokens[address]
        if with_form and self._check_informative(token["word"], True):
            result.append(prefix + "_FORM_" + token["word"])
        if with_extras and "lemma" in token and self._check_informative(token["lemma"]):
            result.append(prefix + "_LEMMA_" + token["lemma"])
        if self._check_informative(token["tag"]):
            result.append(prefix + "_POS_" + token["tag"])
        if with_extras and "feats" in token and self._check_informative(token["feats"]):
            for feat in token["feats"].split("|"):
                result.append(prefix + "_FEATS_" + feat)

    def _extreme_dependents(self, head_address):
        """
        Return (leftmost-relation, rightmost-relation) among the already
        collected dependents of ``head_address``; "" when there is none.
        """
        leftmost = 1000000
        rightmost = -1
        rel_left = ""
        rel_right = ""
        for (parent, rel, child) in self.arcs:
            if parent != head_address:
                continue
            if child > parent and child > rightmost:
                rightmost = child
                rel_right = rel
            if child < parent and child < leftmost:
                leftmost = child
                rel_left = rel
        return rel_left, rel_right

    def extract_features(self):
        """
        Extract the set of features for the current configuration. Implement standard features as describe in
        Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre.
        Please note that these features are very basic.
        :return: list(str)
        """
        result = []
        # Todo : can come up with more complicated features set for better
        # performance.
        if self.stack:
            top = self.stack[-1]
            self._append_token_features(result, top, "STK_0", True, True)
            if len(self.stack) > 1:
                # Only the POS tag is used for the second stack element.
                self._append_token_features(result, self.stack[-2], "STK_1", False, False)
            rel_left, rel_right = self._extreme_dependents(top)
            if self._check_informative(rel_left):
                result.append("STK_0_LDEP_" + rel_left)
            if self._check_informative(rel_right):
                result.append("STK_0_RDEP_" + rel_right)

        if self.buffer:
            front = self.buffer[0]
            self._append_token_features(result, front, "BUF_0", True, True)
            if len(self.buffer) > 1:
                self._append_token_features(result, self.buffer[1], "BUF_1", True, False)
            if len(self.buffer) > 2:
                self._append_token_features(result, self.buffer[2], "BUF_2", False, False)
            if len(self.buffer) > 3:
                self._append_token_features(result, self.buffer[3], "BUF_3", False, False)
            rel_left, rel_right = self._extreme_dependents(front)
            if self._check_informative(rel_left):
                result.append("BUF_0_LDEP_" + rel_left)
            if self._check_informative(rel_right):
                result.append("BUF_0_RDEP_" + rel_right)

        return result
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class Transition:
    """
    This class defines a set of transition which is applied to a configuration to get another configuration
    Note that for different parsing algorithm, the transition is different.
    """

    # Transition identifiers as written into the training data / model.
    LEFT_ARC = "LEFTARC"
    RIGHT_ARC = "RIGHTARC"
    SHIFT = "SHIFT"
    REDUCE = "REDUCE"

    def __init__(self, alg_option):
        """
        :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
        :type alg_option: str
        """
        self._algo = alg_option
        supported = (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
        if alg_option not in supported:
            raise ValueError(
                " Currently we only support %s and %s "
                % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
            )

    def left_arc(self, conf, relation):
        """
        Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if conf.buffer[0] == 0:
            # The artificial root may never become a dependent.
            return -1

        idx_wi = conf.stack[-1]

        if self._algo == TransitionParser.ARC_EAGER:
            # Arc-eager precondition: the stack top must not already have
            # a head among the collected arcs.
            for (parent, rel, child) in conf.arcs:
                if child == idx_wi:
                    return -1

        conf.stack.pop()
        conf.arcs.append((conf.buffer[0], relation, idx_wi))

    def right_arc(self, conf, relation):
        """
        Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if self._algo == TransitionParser.ARC_STANDARD:
            # Pop the head from the stack and put it back at the front of
            # the buffer in place of its new dependent.
            head = conf.stack.pop()
            dependent = conf.buffer[0]
            conf.buffer[0] = head
            conf.arcs.append((head, relation, dependent))
        else:  # arc-eager
            head = conf.stack[-1]
            dependent = conf.buffer.pop(0)
            conf.stack.append(dependent)
            conf.arcs.append((head, relation, dependent))

    def reduce(self, conf):
        """
        Note that the algorithm for reduce is only available for arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if self._algo != TransitionParser.ARC_EAGER:
            return -1
        if not conf.stack:
            return -1

        top = conf.stack[-1]
        # Only reducible once the stack top has received a head.
        if not any(child == top for (parent, rel, child) in conf.arcs):
            return -1
        conf.stack.pop()

    def shift(self, conf):
        """
        Note that the algorithm for shift is the SAME for arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer:
            return -1
        conf.stack.append(conf.buffer.pop(0))
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class TransitionParser(ParserI):

    """
    Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager"
    """

    ARC_STANDARD = "arc-standard"
    ARC_EAGER = "arc-eager"

    def __init__(self, algorithm):
        """
        :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
        :type algorithm: str
        """
        if algorithm not in (self.ARC_STANDARD, self.ARC_EAGER):
            raise ValueError(
                " Currently we only support %s and %s "
                % (self.ARC_STANDARD, self.ARC_EAGER)
            )
        self._algorithm = algorithm

        # feature string -> feature id (libsvm column index)
        self._dictionary = {}
        # transition key (e.g. "LEFTARC:SBJ") -> class label id
        self._transition = {}
        # inverse of _transition: class label id -> transition key
        self._match_transition = {}

    def _get_dep_relation(self, idx_parent, idx_child, depgraph):
        """
        Return the gold relation label if ``idx_parent`` is the head of
        ``idx_child`` in ``depgraph``, otherwise ``None``.
        """
        p_node = depgraph.nodes[idx_parent]
        c_node = depgraph.nodes[idx_child]

        if c_node["word"] is None:
            return None  # Root word

        if c_node["head"] == p_node["address"]:
            return c_node["rel"]
        return None

    def _convert_to_binary_features(self, features):
        """
        :param features: list of feature string which is needed to convert to binary features
        :type features: list(str)
        :return : string of binary features in libsvm format which is 'featureID:value' pairs
        """
        unsorted_result = []
        for feature in features:
            # Assign a fresh id to each previously unseen feature.
            self._dictionary.setdefault(feature, len(self._dictionary))
            unsorted_result.append(self._dictionary[feature])

        # Default value of each feature is 1.0; libsvm requires the
        # column indices in ascending order.
        return " ".join(
            str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
        )

    def _is_projective(self, depgraph):
        """
        Return True if ``depgraph`` contains no crossing arcs.
        """
        arc_list = []
        for key in depgraph.nodes:
            node = depgraph.nodes[key]

            if "head" in node:
                childIdx = node["address"]
                parentIdx = node["head"]
                if parentIdx is not None:
                    arc_list.append((parentIdx, childIdx))

        # Use a set for O(1) membership tests in the nested loops below.
        arc_set = set(arc_list)
        for (parentIdx, childIdx) in arc_list:
            # Ensure that childIdx < parentIdx
            if childIdx > parentIdx:
                parentIdx, childIdx = childIdx, parentIdx
            # A crossing arc links a position strictly inside the span
            # (childIdx, parentIdx) with a position strictly outside it.
            for k in range(childIdx + 1, parentIdx):
                for m in range(len(depgraph.nodes)):
                    if (m < childIdx) or (m > parentIdx):
                        if (k, m) in arc_set or (m, k) in arc_set:
                            return False
        return True

    def _write_to_file(self, key, binary_features, input_file):
        """
        write the binary features to input file and update the transition dictionary
        """
        # Class labels start at 1, as conventional for libsvm files.
        self._transition.setdefault(key, len(self._transition) + 1)
        self._match_transition[self._transition[key]] = key

        input_str = str(self._transition[key]) + " " + binary_features + "\n"
        input_file.write(input_str.encode("utf-8"))

    def _create_training_examples_arc_std(self, depgraphs, input_file):
        """
        Create the training example in the libsvm format and write it to the input_file.
        Reference : Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009)
        """
        operation = Transition(self.ARC_STANDARD)
        count_proj = 0
        training_seq = []

        for depgraph in depgraphs:
            # The static oracle below is only valid for projective trees.
            if not self._is_projective(depgraph):
                continue

            count_proj += 1
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                b0 = conf.buffer[0]
                features = conf.extract_features()
                binary_features = self._convert_to_binary_features(features)

                if len(conf.stack) > 0:
                    s0 = conf.stack[len(conf.stack) - 1]
                    # Left-arc operation
                    rel = self._get_dep_relation(b0, s0, depgraph)
                    if rel is not None:
                        key = Transition.LEFT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.left_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Right-arc operation
                    rel = self._get_dep_relation(s0, b0, depgraph)
                    if rel is not None:
                        precondition = True
                        # Get the max-index of buffer
                        maxID = conf._max_address

                        # Arc-standard may attach b0 to its head only after
                        # b0 has collected all of its own dependents.
                        for w in range(maxID + 1):
                            if w != b0:
                                relw = self._get_dep_relation(b0, w, depgraph)
                                if relw is not None:
                                    if (b0, relw, w) not in conf.arcs:
                                        precondition = False

                        if precondition:
                            key = Transition.RIGHT_ARC + ":" + rel
                            self._write_to_file(key, binary_features, input_file)
                            operation.right_arc(conf, rel)
                            training_seq.append(key)
                            continue

                # Shift operation as the default
                key = Transition.SHIFT
                self._write_to_file(key, binary_features, input_file)
                operation.shift(conf)
                training_seq.append(key)

        print(" Number of training examples : " + str(len(depgraphs)))
        print(" Number of valid (projective) examples : " + str(count_proj))
        return training_seq

    def _create_training_examples_arc_eager(self, depgraphs, input_file):
        """
        Create the training example in the libsvm format and write it to the input_file.
        Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre
        """
        operation = Transition(self.ARC_EAGER)
        countProj = 0
        training_seq = []

        for depgraph in depgraphs:
            # The static oracle below is only valid for projective trees.
            if not self._is_projective(depgraph):
                continue

            countProj += 1
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                b0 = conf.buffer[0]
                features = conf.extract_features()
                binary_features = self._convert_to_binary_features(features)

                if len(conf.stack) > 0:
                    s0 = conf.stack[len(conf.stack) - 1]
                    # Left-arc operation
                    rel = self._get_dep_relation(b0, s0, depgraph)
                    if rel is not None:
                        key = Transition.LEFT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.left_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Right-arc operation
                    rel = self._get_dep_relation(s0, b0, depgraph)
                    if rel is not None:
                        key = Transition.RIGHT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.right_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # reduce operation: allowed once the stack top blocks a
                    # pending arc between b0 and some earlier word.
                    flag = False
                    for k in range(s0):
                        if self._get_dep_relation(k, b0, depgraph) is not None:
                            flag = True
                        if self._get_dep_relation(b0, k, depgraph) is not None:
                            flag = True
                    if flag:
                        key = Transition.REDUCE
                        self._write_to_file(key, binary_features, input_file)
                        operation.reduce(conf)
                        training_seq.append(key)
                        continue

                # Shift operation as the default
                key = Transition.SHIFT
                self._write_to_file(key, binary_features, input_file)
                operation.shift(conf)
                training_seq.append(key)

        print(" Number of training examples : " + str(len(depgraphs)))
        print(" Number of valid (projective) examples : " + str(countProj))
        return training_seq

    def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """
        # Create the temp file before entering the try-block so the
        # finally-clause can never reference an unbound name when the
        # creation itself fails (the original code raised NameError here).
        input_file = tempfile.NamedTemporaryFile(
            prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
        )
        try:
            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameter is set according to the paper:
            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
            # Todo : because of probability = True => very slow due to
            # cross-validation. Need to improve the speed here
            model = svm.SVC(
                kernel="poly",
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True,
            )

            model.fit(x_train, y_train)
            # Save the model (as pickle) via a context manager so the file
            # handle is closed deterministically instead of leaked.
            with open(modelfile, "wb") as model_out:
                pickle.dump(model, model_out)
        finally:
            # Closing an already-closed file is a no-op, so this is safe on
            # every path; then delete the temporary training file.
            input_file.close()
            remove(input_file.name)

    def parse(self, depgraphs, modelFile):
        """
        :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy
        :type depgraphs: list(DependencyGraph)
        :param modelfile: the model file
        :type modelfile: str
        :return: list (DependencyGraph) with the 'head' and 'rel' information
        """
        result = []
        # First load the model.  NOTE: pickle.load executes arbitrary code
        # from the file — only load model files you trust.
        with open(modelFile, "rb") as model_in:
            model = pickle.load(model_in)
        operation = Transition(self._algorithm)

        for depgraph in depgraphs:
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                features = conf.extract_features()
                col = []
                row = []
                data = []
                for feature in features:
                    # Features unseen at training time are silently dropped.
                    if feature in self._dictionary:
                        col.append(self._dictionary[feature])
                        row.append(0)
                        data.append(1.0)
                np_col = array(sorted(col))  # NB : index must be sorted
                np_row = array(row)
                np_data = array(data)

                x_test = sparse.csr_matrix(
                    (np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
                )

                # It's best to use decision function as follow BUT it's not supported yet for sparse SVM
                # Using decision function to build the votes array
                # dec_func = model.decision_function(x_test)[0]
                # votes = {}
                # k = 0
                # for i in range(len(model.classes_)):
                #    for j in range(i+1, len(model.classes_)):
                #        #if  dec_func[k] > 0:
                #            votes.setdefault(i,0)
                #            votes[i] +=1
                #        else:
                #           votes.setdefault(j,0)
                #           votes[j] +=1
                #        k +=1
                # Sort votes according to the values
                # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)

                # We will use predict_proba instead of decision_function
                prob_dict = {}
                pred_prob = model.predict_proba(x_test)[0]
                for i in range(len(pred_prob)):
                    prob_dict[i] = pred_prob[i]
                sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)

                # Try the transitions from most to least probable until one
                # satisfies its precondition; SHIFT is always valid while
                # the buffer is non-empty, so the loop makes progress.
                for (y_pred_idx, confidence) in sorted_Prob:
                    # y_pred = model.predict(x_test)[0]
                    # From the prediction match to the operation
                    y_pred = model.classes_[y_pred_idx]

                    if y_pred in self._match_transition:
                        strTransition = self._match_transition[y_pred]
                        baseTransition = strTransition.split(":")[0]

                        if baseTransition == Transition.LEFT_ARC:
                            if (
                                operation.left_arc(conf, strTransition.split(":")[1])
                                != -1
                            ):
                                break
                        elif baseTransition == Transition.RIGHT_ARC:
                            if (
                                operation.right_arc(conf, strTransition.split(":")[1])
                                != -1
                            ):
                                break
                        elif baseTransition == Transition.REDUCE:
                            if operation.reduce(conf) != -1:
                                break
                        elif baseTransition == Transition.SHIFT:
                            if operation.shift(conf) != -1:
                                break
                    else:
                        raise ValueError(
                            "The predicted transition is not recognized, expected errors"
                        )

            # Finish with operations build the dependency graph from Conf.arcs

            new_depgraph = deepcopy(depgraph)
            for key in new_depgraph.nodes:
                node = new_depgraph.nodes[key]
                node["rel"] = ""
                # With the default, all the token depend on the Root
                node["head"] = 0
            for (head, rel, child) in conf.arcs:
                c_node = new_depgraph.nodes[child]
                c_node["head"] = head
                c_node["rel"] = rel
            result.append(new_depgraph)

        return result
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def demo():
    """
    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
    >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
    >>> gold_sent = DependencyGraph(\"""
    ... Economic JJ 2 ATT
    ... news NN 3 SBJ
    ... has VBD 0 ROOT
    ... little JJ 5 ATT
    ... effect NN 3 OBJ
    ... on IN 5 ATT
    ... financial JJ 8 ATT
    ... markets NNS 6 PC
    ... . . 3 PU
    ... \""")

    >>> conf = Configuration(gold_sent)

    ###################### Check the Initial Feature ########################

    >>> print(', '.join(conf.extract_features()))
    STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ

    ###################### Check The Transition #######################
    Check the Initialized Configuration
    >>> print(conf)
    Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []

    A. Do some transition checks for ARC-STANDARD

    >>> operation = Transition('arc-standard')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,"SBJ")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")

    Middle Configuration and Features Check
    >>> print(conf)
    Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]

    >>> print(', '.join(conf.extract_features()))
    STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT

    >>> operation.right_arc(conf, "PC")
    >>> operation.right_arc(conf, "ATT")
    >>> operation.right_arc(conf, "OBJ")
    >>> operation.shift(conf)
    >>> operation.right_arc(conf, "PU")
    >>> operation.right_arc(conf, "ROOT")
    >>> operation.shift(conf)

    Terminated Configuration Check
    >>> print(conf)
    Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]


    B. Do some transition checks for ARC-EAGER

    >>> conf = Configuration(gold_sent)
    >>> operation = Transition('arc-eager')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'SBJ')
    >>> operation.right_arc(conf,'ROOT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'OBJ')
    >>> operation.right_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'PC')
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.right_arc(conf,'PU')
    >>> print(conf)
    Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]

    ###################### Check The Training Function #######################

    A. Check the ARC-STANDARD training
    >>> import tempfile
    >>> import os
    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)

    >>> parser_std = TransitionParser('arc-standard')
    >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
    Number of training examples : 1
    Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT

    >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
    Number of training examples : 1
    Number of valid (projective) examples : 1
    >>> input_file.close()
    >>> remove(input_file.name)

    B. Check the ARC-EAGER training

    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
    >>> parser_eager = TransitionParser('arc-eager')
    >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
    Number of training examples : 1
    Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU

    >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
    Number of training examples : 1
    Number of valid (projective) examples : 1

    >>> input_file.close()
    >>> remove(input_file.name)

    ###################### Check The Parsing Function ########################

    A. Check the ARC-STANDARD parser

    >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    B. Check the ARC-EAGER parser
    >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    Remove test temporary files
    >>> remove('temp.arceager.model')
    >>> remove('temp.arcstd.model')

    Note that result is very poor because of only one training example.
    """
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Parser Utility Functions
|
| 2 |
+
#
|
| 3 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 4 |
+
# Tom Aarsen <>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
"""
|
| 12 |
+
Utility functions for parsers.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from nltk.data import load
|
| 16 |
+
from nltk.grammar import CFG, PCFG, FeatureGrammar
|
| 17 |
+
from nltk.parse.chart import Chart, ChartParser
|
| 18 |
+
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
|
| 19 |
+
from nltk.parse.pchart import InsideChartParser
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_parser(
    grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.  Only used for CFGs and
        feature CFGs.  If None, the chart class depends on the grammar
        format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, or a subclass thereof.")

    # Probabilistic grammars use a bottom-up probabilistic parser and
    # honor the beam size; all other grammars use a chart-based parser.
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    if isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
    else:
        # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
    return parser(grammar, trace=trace, chart_class=chart_class)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def taggedsent_to_conll(sentence):
    """
    Convert a single POS-tagged sentence into CONLL format,
    yielding one tab-separated line per token.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
    ...     print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _

    :param sentence: A single input sentence to parse
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding a single sentence in CONLL format.
    """
    # CONLL columns: ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL, PHEAD, PDEPREL.
    # Unavailable fields are filled with the "_" placeholder; HEAD/DEPREL get
    # the dummy values "0"/"a" since no parse exists yet.
    for position, (token, postag) in enumerate(sentence, start=1):
        fields = (str(position), token, "_", postag, postag, "_", "0", "a", "_", "_")
        yield "\t".join(fields) + "\n"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def taggedsents_to_conll(sentences):
    """
    Convert a POS-tagged document stream
    (i.e. list of list of tuples, a list of sentences) and yield lines
    in CONLL format.  This module yields one line per word and two newlines
    for end of sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
    ...     if line:
    ...         print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>
    1 Is _ VBZ VBZ _ 0 a _ _
    2 that _ IN IN _ 0 a _ _
    3 right _ NN NN _ 0 a _ _
    4 ? _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to parse
    :type sentence: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    for tagged_sentence in sentences:
        # Emit every token line of the sentence, then a sentence separator
        # (two newlines, per the CONLL convention).
        for conll_line in taggedsent_to_conll(tagged_sentence):
            yield conll_line
        yield "\n\n"
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
######################################################################
|
| 144 |
+
# { Test Suites
|
| 145 |
+
######################################################################
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class TestGrammar:
    """
    Unit tests for CFG.

    Builds a parser from ``grammar`` and checks it against a test
    ``suite`` of sentences that should be accepted or rejected.
    """

    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar
        # Parser is constructed once and reused for every test sentence.
        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.

        :param show_trees: If True, print each parse tree found for an
            accepted sentence.
        :raises ValueError: If an ``accept`` sentence fails to parse, or a
            ``reject`` sentence receives a parse.
        """
        for test in self.suite:
            print(test["doc"] + ":", end=" ")
            # Initialize the per-test flags up front.  Previously they were
            # only assigned inside the sentence loop, so a test whose
            # "accept" or "reject" list was empty raised NameError at the
            # final check instead of simply not printing the success line.
            accepted = False
            rejected = False
            for key in ["accept", "reject"]:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == "accept":
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse'" % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse'" % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees is should have,

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuple of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    parsed = []
    for line in string.split("\n"):
        # Skip blank lines and comment lines.
        if not line or line[0] in comment_chars:
            continue
        # An optional "<bool>:" or "<int>:" prefix carries the expected result.
        prefix, sep, remainder = line.partition(":")
        expected = None
        if sep:
            if prefix in ("True", "true", "False", "false"):
                expected = prefix in ("True", "true")
            else:
                expected = int(prefix)
            line = remainder
        tokens = line.split()
        if not tokens:
            continue
        parsed.append((tokens, expected))
    return parsed
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Viterbi Probabilistic Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from functools import reduce
|
| 10 |
+
|
| 11 |
+
from nltk.parse.api import ParserI
|
| 12 |
+
from nltk.tree import ProbabilisticTree, Tree
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Viterbi PCFG Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ViterbiParser(ParserI):
|
| 20 |
+
"""
|
| 21 |
+
A bottom-up ``PCFG`` parser that uses dynamic programming to find
|
| 22 |
+
the single most likely parse for a text. The ``ViterbiParser`` parser
|
| 23 |
+
parses texts by filling in a "most likely constituent table".
|
| 24 |
+
This table records the most probable tree representation for any
|
| 25 |
+
given span and node value. In particular, it has an entry for
|
| 26 |
+
every start index, end index, and node value, recording the most
|
| 27 |
+
likely subtree that spans from the start index to the end index,
|
| 28 |
+
and has the given node value.
|
| 29 |
+
|
| 30 |
+
The ``ViterbiParser`` parser fills in this table incrementally. It starts
|
| 31 |
+
by filling in all entries for constituents that span one element
|
| 32 |
+
of text (i.e., entries where the end index is one greater than the
|
| 33 |
+
start index). After it has filled in all table entries for
|
| 34 |
+
constituents that span one element of text, it fills in the
|
| 35 |
+
entries for constitutants that span two elements of text. It
|
| 36 |
+
continues filling in the entries for constituents spanning larger
|
| 37 |
+
and larger portions of the text, until the entire table has been
|
| 38 |
+
filled. Finally, it returns the table entry for a constituent
|
| 39 |
+
spanning the entire text, whose node value is the grammar's start
|
| 40 |
+
symbol.
|
| 41 |
+
|
| 42 |
+
In order to find the most likely constituent with a given span and
|
| 43 |
+
node value, the ``ViterbiParser`` parser considers all productions that
|
| 44 |
+
could produce that node value. For each production, it finds all
|
| 45 |
+
children that collectively cover the span and have the node values
|
| 46 |
+
specified by the production's right hand side. If the probability
|
| 47 |
+
of the tree formed by applying the production to the children is
|
| 48 |
+
greater than the probability of the current entry in the table,
|
| 49 |
+
then the table is updated with this new tree.
|
| 50 |
+
|
| 51 |
+
A pseudo-code description of the algorithm used by
|
| 52 |
+
``ViterbiParser`` is:
|
| 53 |
+
|
| 54 |
+
| Create an empty most likely constituent table, *MLC*.
|
| 55 |
+
| For width in 1...len(text):
|
| 56 |
+
| For start in 1...len(text)-width:
|
| 57 |
+
| For prod in grammar.productions:
|
| 58 |
+
| For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
|
| 59 |
+
| where t[i].label()==prod.rhs[i],
|
| 60 |
+
| and the sequence covers [start:start+width]:
|
| 61 |
+
| old_p = MLC[start, start+width, prod.lhs]
|
| 62 |
+
| new_p = P(t[1])P(t[1])...P(t[n])P(prod)
|
| 63 |
+
| if new_p > old_p:
|
| 64 |
+
| new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
|
| 65 |
+
| MLC[start, start+width, prod.lhs] = new_tree
|
| 66 |
+
| Return MLC[0, len(text), start_symbol]
|
| 67 |
+
|
| 68 |
+
:type _grammar: PCFG
|
| 69 |
+
:ivar _grammar: The grammar used to parse sentences.
|
| 70 |
+
:type _trace: int
|
| 71 |
+
:ivar _trace: The level of tracing output that should be generated
|
| 72 |
+
when parsing a text.
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
def __init__(self, grammar, trace=0):
|
| 76 |
+
"""
|
| 77 |
+
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
|
| 78 |
+
parse texts.
|
| 79 |
+
|
| 80 |
+
:type grammar: PCFG
|
| 81 |
+
:param grammar: The grammar used to parse texts.
|
| 82 |
+
:type trace: int
|
| 83 |
+
:param trace: The level of tracing that should be used when
|
| 84 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 85 |
+
and higher numbers will produce more verbose tracing
|
| 86 |
+
output.
|
| 87 |
+
"""
|
| 88 |
+
self._grammar = grammar
|
| 89 |
+
self._trace = trace
|
| 90 |
+
|
| 91 |
+
    def grammar(self):
        """Return the grammar this parser uses (set in ``__init__``)."""
        return self._grammar
|
| 93 |
+
|
| 94 |
+
    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace
|
| 106 |
+
|
| 107 |
+
    def parse(self, tokens):
        # Inherit docs from ParserI
        """
        Yield the single most likely parse for ``tokens`` (or nothing, if
        no constituent covering the whole input has the grammar's start
        symbol), filling a most-likely-constituent table bottom-up.
        """
        tokens = list(tokens)
        # Raises an error if any token is not covered by the grammar.
        self._grammar.check_coverage(tokens)

        # The most likely constituent table. This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens. For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value. For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text (each token trivially covers its own length-1 span).
        if self._trace:
            print("Inserting tokens into the most likely" + " constituents table...")
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index + 1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        # Shorter spans must be complete before longer ones are attempted.
        for length in range(1, len(tokens) + 1):
            if self._trace:
                print(
                    "Finding the most likely constituents"
                    + " spanning %d text elements..." % length
                )
            for start in range(len(tokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, tokens)

        # Return the tree that spans the entire text & have the right cat
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is not None:
            yield tree
|
| 147 |
+
|
| 148 |
+
    def _add_constituents_spanning(self, span, constituents, tokens):
        """
        Find any constituents that might cover ``span``, and add them
        to the most likely constituents table.

        :rtype: None
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find possible constituents.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent.  I.e., the constituent should cover
            ``text[span[0]:span[1]]``, where ``text`` is the text
            that we are parsing.

        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  In particular,
            ``constituents(s,e,nv)`` is the most likely
            ``ProbabilisticTree`` that covers ``text[s:e]``
            and has a node value ``nv.symbol()``, where ``text``
            is the text that we are parsing.  When
            ``_add_constituents_spanning`` is called, ``constituents``
            should contain all possible constituents that are shorter
            than ``span``.

        :type tokens: list of tokens
        :param tokens: The text we are parsing.  This is only used for
            trace output.
        """
        # Since some of the grammar productions may be unary, we need to
        # repeatedly try all of the productions until none of them add any
        # new constituents.  (A unary production over this same span can
        # only fire after its child constituent has been entered.)
        changed = True
        while changed:
            changed = False

            # Find all ways instantiations of the grammar productions that
            # cover the span.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, add a new
            # ProbabilisticTree whose probability is the product
            # of the childrens' probabilities and the production's
            # probability.
            for (production, children) in instantiations:
                # Tokens carry no probability; only subtree probs multiply in.
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If it's new a constituent, then add it to the
                # constituents dictionary.
                c = constituents.get((span[0], span[1], production.lhs()))
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print(" Insert:", end=" ")
                        else:
                            print(" Discard:", end=" ")
                        self._trace_production(production, p, span, len(tokens))
                # Keep only the highest-probability tree for this (span, lhs).
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = True
|
| 215 |
+
|
| 216 |
+
def _find_instantiations(self, span, constituents):
|
| 217 |
+
"""
|
| 218 |
+
:return: a list of the production instantiations that cover a
|
| 219 |
+
given span of the text. A "production instantiation" is
|
| 220 |
+
a tuple containing a production and a list of children,
|
| 221 |
+
where the production's right hand side matches the list of
|
| 222 |
+
children; and the children cover ``span``. :rtype: list
|
| 223 |
+
of ``pair`` of ``Production``, (list of
|
| 224 |
+
(``ProbabilisticTree`` or token.
|
| 225 |
+
|
| 226 |
+
:type span: tuple(int, int)
|
| 227 |
+
:param span: The section of the text for which we are
|
| 228 |
+
trying to find production instantiations. The span is
|
| 229 |
+
specified as a pair of integers, where the first integer
|
| 230 |
+
is the index of the first token that should be covered by
|
| 231 |
+
the production instantiation; and the second integer is
|
| 232 |
+
the index of the first token that should not be covered by
|
| 233 |
+
the production instantiation.
|
| 234 |
+
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
| 235 |
+
:param constituents: The most likely constituents table. This
|
| 236 |
+
table records the most probable tree representation for
|
| 237 |
+
any given span and node value. See the module
|
| 238 |
+
documentation for more information.
|
| 239 |
+
"""
|
| 240 |
+
rv = []
|
| 241 |
+
for production in self._grammar.productions():
|
| 242 |
+
childlists = self._match_rhs(production.rhs(), span, constituents)
|
| 243 |
+
|
| 244 |
+
for childlist in childlists:
|
| 245 |
+
rv.append((production, childlist))
|
| 246 |
+
return rv
|
| 247 |
+
|
| 248 |
+
def _match_rhs(self, rhs, span, constituents):
    """
    :return: a set of all the lists of children that cover ``span``
        and that match ``rhs``.
    :rtype: list(list(ProbabilisticTree or token)

    :type rhs: list(Nonterminal or any)
    :param rhs: The list specifying what kinds of children need to
        cover ``span``.  Each nonterminal in ``rhs`` specifies
        that the corresponding child should be a tree whose node
        value is that nonterminal's symbol.  Each terminal in ``rhs``
        specifies that the corresponding child should be a token
        whose type is that terminal.
    :type span: tuple(int, int)
    :param span: The section of the text for which we are
        trying to find child lists.  The span is specified as a
        pair of integers, where the first integer is the index of
        the first token that should be covered by the child list;
        and the second integer is the index of the first token
        that should not be covered by the child list.
    :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
    :param constituents: The most likely constituents table.  This
        table records the most probable tree representation for
        any given span and node value.  See the module
        documentation for more information.
    """
    start, end = span

    # Base cases: an empty span matches exactly the empty rhs (yielding a
    # single empty child list); otherwise an empty span or empty rhs has
    # no matches.
    if start >= end:
        return [[]] if rhs == () else []
    if rhs == ():
        return []

    # Try every split point where a constituent for the first rhs symbol
    # ends, and recursively match the remaining symbols on the rest of
    # the span.
    first_sym, rest_syms = rhs[0], rhs[1:]
    matches = []
    for boundary in range(start, end + 1):
        candidate = constituents.get((start, boundary, first_sym))
        if candidate is None:
            continue
        for tail in self._match_rhs(rest_syms, (boundary, end), constituents):
            matches.append([candidate] + tail)

    return matches
def _trace_production(self, production, p, span, width):
    """
    Print trace output indicating that a given production has been
    applied at a given location.

    :param production: The production that has been applied
    :type production: Production
    :param p: The probability of the tree produced by the production.
    :type p: float
    :param span: The span of the production
    :type span: tuple
    :rtype: None
    """
    # Build a span diagram like "|..===..| PRODUCTION" (local renamed
    # from ``str`` to avoid shadowing the builtin).
    covered = "=" * (span[1] - span[0])
    trailing = "." * (width - span[1])
    line = "|" + "." * span[0] + covered + trailing + "| "
    line += "%s" % production
    if self._trace > 2:
        # At high trace levels, also show the tree's probability.
        line = f"{line:<40} {p:12.10f} "

    print(line)
def _trace_lexical_insertion(self, token, index, width):
    """Print trace output for inserting *token* at position *index*."""
    # Diagram marks the token's position with "=" among "." placeholders.
    before = "." * index
    after = "." * (width - index - 1)
    print(f"   Insert: |{before}={after}| {token}")
def __repr__(self):
    """Return an unambiguous string representation of this parser."""
    return f"<ViterbiParser for {self._grammar!r}>"
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
##//////////////////////////////////////////////////////
|
| 325 |
+
## Test Code
|
| 326 |
+
##//////////////////////////////////////////////////////
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not silently swallowed; only a malformed or out-of-range
        # selection is treated as "bad".
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from ``time`` to avoid rebinding (shadowing) the time module.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
|
| 452 |
+
# Run the interactive demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py
ADDED
|
@@ -0,0 +1,1605 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to Boxer
|
| 2 |
+
# <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
|
| 3 |
+
#
|
| 4 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
An interface to Boxer.
|
| 12 |
+
|
| 13 |
+
This interface relies on the latest version of the development (subversion) version of
|
| 14 |
+
C&C and Boxer.
|
| 15 |
+
|
| 16 |
+
Usage
|
| 17 |
+
=====
|
| 18 |
+
|
| 19 |
+
Set the environment variable CANDC to the bin directory of your CandC installation.
|
| 20 |
+
The models directory should be in the CandC root directory.
|
| 21 |
+
For example::
|
| 22 |
+
|
| 23 |
+
/path/to/candc/
|
| 24 |
+
bin/
|
| 25 |
+
candc
|
| 26 |
+
boxer
|
| 27 |
+
models/
|
| 28 |
+
boxer/
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import operator
|
| 32 |
+
import os
|
| 33 |
+
import re
|
| 34 |
+
import subprocess
|
| 35 |
+
import tempfile
|
| 36 |
+
from functools import reduce
|
| 37 |
+
from optparse import OptionParser
|
| 38 |
+
|
| 39 |
+
from nltk.internals import find_binary
|
| 40 |
+
from nltk.sem.drt import (
|
| 41 |
+
DRS,
|
| 42 |
+
DrtApplicationExpression,
|
| 43 |
+
DrtEqualityExpression,
|
| 44 |
+
DrtNegatedExpression,
|
| 45 |
+
DrtOrExpression,
|
| 46 |
+
DrtParser,
|
| 47 |
+
DrtProposition,
|
| 48 |
+
DrtTokens,
|
| 49 |
+
DrtVariableExpression,
|
| 50 |
+
)
|
| 51 |
+
from nltk.sem.logic import (
|
| 52 |
+
ExpectedMoreTokensException,
|
| 53 |
+
LogicalExpressionException,
|
| 54 |
+
UnexpectedTokenException,
|
| 55 |
+
Variable,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class Boxer:
    """
    This class is an interface to Johan Bos's program Boxer, a wide-coverage
    semantic parser that produces Discourse Representation Structures (DRSs).
    """

    def __init__(
        self,
        boxer_drs_interpreter=None,
        elimeq=False,
        bin_dir=None,
        verbose=False,
        resolve=True,
    ):
        """
        :param boxer_drs_interpreter: A class that converts from the
            ``AbstractBoxerDrs`` object hierarchy to a different object.  The
            default is ``NltkDrtBoxerDrsInterpreter``, which converts to the
            NLTK DRT hierarchy.
        :param elimeq: When set to true, Boxer removes all equalities from the
            DRSs and discourse referents standing in the equality relation are
            unified, but only if this can be done in a meaning-preserving manner.
        :param resolve: When set to true, Boxer will resolve all anaphoric DRSs
            and perform merge-reduction.  Resolution follows Van der Sandt's
            theory of binding and accommodation.
        """
        if boxer_drs_interpreter is None:
            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
        self._boxer_drs_interpreter = boxer_drs_interpreter

        self._resolve = resolve
        self._elimeq = elimeq

        self.set_bin_dir(bin_dir, verbose)

    def set_bin_dir(self, bin_dir, verbose=False):
        """Locate the ``candc`` and ``boxer`` binaries and the models path."""
        self._candc_bin = self._find_binary("candc", bin_dir, verbose)
        # The models directory is assumed to sit next to the "bin" directory
        # in the C&C installation root; ``self._candc_bin`` ends in "candc",
        # so stripping the last 5 characters yields its containing directory.
        self._candc_models_path = os.path.normpath(
            os.path.join(self._candc_bin[:-5], "../models")
        )
        self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)

    def interpret(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: str Input sentence to parse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: list of str Input sentences to parse as a single discourse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of str Input sentences to parse as individual discourses
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :return: list of ``drt.DrtExpression``
        """
        # Wrap each sentence as a one-sentence discourse and delegate.
        return self.interpret_multi_sents(
            [[input] for input in inputs], discourse_ids, question, verbose
        )

    def interpret_multi_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of list of str Input discourses to parse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        if discourse_ids is not None:
            assert len(inputs) == len(discourse_ids)
            assert reduce(operator.and_, (id is not None for id in discourse_ids))
            use_disc_id = True
        else:
            # Fall back to positional ids when the caller supplied none.
            discourse_ids = list(map(str, range(len(inputs))))
            use_disc_id = False

        candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
        boxer_out = self._call_boxer(candc_out, verbose=verbose)

        # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
        #    raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)

        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
        # Preserve input order; missing discourses yield None.
        return [drs_dict.get(id, None) for id in discourse_ids]

    def _call_candc(self, inputs, discourse_ids, question, verbose=False):
        """
        Call the ``candc`` binary with the given input.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :param filename: str A filename for the output file
        :return: stdout
        """
        args = [
            "--models",
            os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
            "--candc-printer",
            "boxer",
        ]
        # Each discourse is prefixed with a <META> line carrying its id.
        return self._call(
            "\n".join(
                sum(
                    ([f"<META>'{id}'"] + d for d, id in zip(inputs, discourse_ids)),
                    [],
                )
            ),
            self._candc_bin,
            args,
            verbose,
        )

    def _call_boxer(self, candc_out, verbose=False):
        """
        Call the ``boxer`` binary with the given input.

        :param candc_out: str output from C&C parser
        :return: stdout
        """
        # Boxer reads its input from a file, so stage the C&C output in a
        # temporary file that is removed once Boxer has run.
        f = None
        try:
            fd, temp_filename = tempfile.mkstemp(
                prefix="boxer-", suffix=".in", text=True
            )
            f = os.fdopen(fd, "w")
            f.write(candc_out.decode("utf-8"))
        finally:
            if f:
                f.close()

        args = [
            "--box",
            "false",
            "--semantics",
            "drs",
            #'--flat', 'false', # removed from boxer
            "--resolve",
            ["false", "true"][self._resolve],
            "--elimeq",
            ["false", "true"][self._elimeq],
            "--format",
            "prolog",
            "--instantiate",
            "true",
            "--input",
            temp_filename,
        ]
        stdout = self._call(None, self._boxer_bin, args, verbose)
        os.remove(temp_filename)
        return stdout

    def _find_binary(self, name, bin_dir, verbose=False):
        """Locate a C&C binary by name, honoring the CANDC env variable."""
        return find_binary(
            name,
            path_to_bin=bin_dir,
            env_vars=["CANDC"],
            url="http://svn.ask.it.usyd.edu.au/trac/candc/",
            binary_names=[name, name + ".exe"],
            verbose=verbose,
        )

    def _call(self, input_str, binary, args=None, verbose=False):
        """
        Call the binary with the given input.

        :param input_str: A string whose contents are used as stdin.
        :param binary: The location of the binary to call
        :param args: A list of command-line arguments.
        :return: stdout
        """
        # Avoid a mutable default argument; None means "no extra arguments".
        if args is None:
            args = []
        if verbose:
            print("Calling:", binary)
            print("Args:", args)
            print("Input:", input_str)
            print("Command:", binary + " " + " ".join(args))

        # Call via a subprocess.  The input (if any) is fed through stdin
        # rather than an ``echo "..." | ...`` shell pipeline: running with
        # shell=False avoids shell-quoting/injection problems when the
        # sentence text contains quotes, backticks, or other metacharacters.
        cmd = [binary] + args
        if input_str is None:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        else:
            p = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # ``echo`` appended a trailing newline; preserve that behavior.
            stdout, stderr = p.communicate(input=(input_str + "\n").encode("utf-8"))

        if verbose:
            print("Return code:", p.returncode)
            if stdout:
                print("stdout:\n", stdout, "\n")
            if stderr:
                print("stderr:\n", stderr, "\n")
        if p.returncode != 0:
            raise Exception(
                "ERROR CALLING: {} {}\nReturncode: {}\n{}".format(
                    binary, " ".join(args), p.returncode, stderr
                )
            )

        return stdout

    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
        """Parse Boxer's Prolog output into a {discourse_id: DRS} mapping."""
        lines = boxer_out.decode("utf-8").split("\n")
        drs_dict = {}
        i = 0
        while i < len(lines):
            line = lines[i]
            if line.startswith("id("):
                # Line looks like: id('<discourse_id>',<drs_id>).
                comma_idx = line.index(",")
                discourse_id = line[3:comma_idx]
                if discourse_id[0] == "'" and discourse_id[-1] == "'":
                    discourse_id = discourse_id[1:-1]
                drs_id = line[comma_idx + 1 : line.index(")")]
                i += 1
                line = lines[i]
                assert line.startswith(f"sem({drs_id},")
                if line[-4:] == "').'":
                    line = line[:-4] + ")."
                assert line.endswith(")."), f"can't parse line: {line}"

                # Skip past the word-tag list (balanced [...]) that precedes
                # the DRS term itself.
                search_start = len(f"sem({drs_id},[")
                brace_count = 1
                drs_start = -1
                for j, c in enumerate(line[search_start:]):
                    if c == "[":
                        brace_count += 1
                    if c == "]":
                        brace_count -= 1
                    if brace_count == 0:
                        drs_start = search_start + j + 1
                        if line[drs_start : drs_start + 3] == "','":
                            drs_start = drs_start + 3
                        else:
                            drs_start = drs_start + 1
                        break
                assert drs_start > -1

                drs_input = line[drs_start:-2].strip()
                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
            i += 1
        return drs_dict

    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
        """Parse a single Prolog DRS string, optionally tagged with its id."""
        return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
| 335 |
+
|
| 336 |
+
class BoxerOutputDrsParser(DrtParser):
    """Parse the Prolog DRS output from Boxer into a hierarchy of
    ``AbstractBoxerDrs`` objects.

    Boxer tags every referent/condition with packed indices of the form
    ``1000 * (sent + 1) + (word + 1)``; this parser unpacks them into
    0-based (sentence, word) positions.
    """

    def __init__(self, discourse_id=None):
        """
        :param discourse_id: identifier attached to every parsed condition,
            or ``None`` when discourse ids are not tracked
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id
        self.sentence_id_offset = None
        self.quote_chars = [("'", "'", "\\", False)]

    def parse(self, data, signature=None):
        return DrtParser.parse(self, data, signature)

    def get_all_symbols(self):
        return ["(", ")", ",", "[", "]", ":"]

    def handle(self, tok, context):
        return self.handle_drs(tok)

    def attempt_adjuncts(self, expression, context):
        # Boxer output carries no adjuncts; return the expression untouched.
        return expression

    def parse_condition(self, indices):
        """
        Parse a DRS condition

        :return: list of ``DrtExpression``
        """
        tok = self.token()
        accum = self.handle_condition(tok, indices)
        if accum is None:
            raise UnexpectedTokenException(tok)
        return accum

    def handle_drs(self, tok):
        # Top-level DRS constructors; returns None for unknown tokens.
        if tok == "drs":
            return self.parse_drs()
        elif tok in ["merge", "smerge"]:
            return self._handle_binary_expression(self._make_merge_expression)(None, [])
        elif tok in ["alfa"]:
            return self._handle_alfa(self._make_merge_expression)(None, [])

    def handle_condition(self, tok, indices):
        """
        Handle a DRS condition

        :param indices: list of int
        :return: list of ``DrtExpression``
        """
        if tok == "not":
            return [self._handle_not()]

        if tok == "or":
            conds = [self._handle_binary_expression(self._make_or_expression)]
        elif tok == "imp":
            conds = [self._handle_binary_expression(self._make_imp_expression)]
        elif tok == "eq":
            conds = [self._handle_eq()]
        elif tok == "prop":
            conds = [self._handle_prop()]

        elif tok == "pred":
            conds = [self._handle_pred()]
        elif tok == "named":
            conds = [self._handle_named()]
        elif tok == "rel":
            conds = [self._handle_rel()]
        elif tok == "timex":
            conds = self._handle_timex()
        elif tok == "card":
            conds = [self._handle_card()]

        elif tok == "whq":
            conds = [self._handle_whq()]
        elif tok == "duplex":
            conds = [self._handle_duplex()]

        else:
            conds = []

        # Instantiate each parsed condition once per (sentence, word-indices)
        # pair carried by ``indices``.
        return sum(
            (
                [cond(sent_index, word_indices) for cond in conds]
                for sent_index, word_indices in self._sent_and_word_indices(indices)
            ),
            [],
        )

    def _handle_not(self):
        self.assertToken(self.token(), "(")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return BoxerNot(drs)

    def _handle_pred(self):
        # pred(_G3943, dog, n, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        pos = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")

        def _handle_pred_f(sent_index, word_indices):
            return BoxerPred(
                self.discourse_id, sent_index, word_indices, variable, name, pos, sense
            )

        return _handle_pred_f

    def _handle_duplex(self):
        # duplex(whq, drs(...), var, drs(...))
        self.assertToken(self.token(), "(")
        # Answer types are not parsed from duplex conditions; the list stays
        # empty (the bracketed answer-type syntax only occurs in ``whq``).
        ans_types = []

        self.assertToken(self.token(), "whq")
        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _handle_named(self):
        # named(x0, john, per, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        type = self.token()
        self.assertToken(self.token(), ",")
        sense = self.token()  # as per boxer rev 2554
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerNamed(
            self.discourse_id, sent_index, word_indices, variable, name, type, sense
        )

    def _handle_rel(self):
        # rel(_G3993, _G3943, agent, 0)
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ",")
        rel = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerRel(
            self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
        )

    def _handle_timex(self):
        # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
        self.assertToken(self.token(), "(")
        arg = self.parse_variable()
        self.assertToken(self.token(), ",")
        new_conds = self._handle_time_expression(arg)
        self.assertToken(self.token(), ")")
        return new_conds

    def _handle_time_expression(self, arg):
        # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
        tok = self.token()
        self.assertToken(self.token(), "(")
        if tok == "date":
            conds = self._handle_date(arg)
        elif tok == "time":
            conds = self._handle_time(arg)
        else:
            return None
        self.assertToken(self.token(), ")")
        # BUGFIX: ``cond`` is bound as a default argument; a plain closure is
        # late-bound, so every lambda would have returned the *last* cond.
        return [
            lambda sent_index, word_indices: BoxerPred(
                self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
            )
        ] + [lambda sent_index, word_indices, cond=cond: cond for cond in conds]

    def _handle_date(self, arg):
        # []: (+), []:'XXXX', [1004]:'04', []:'XX'
        conds = []
        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        self.assertToken(self.token(), "(")
        pol = self.token()
        self.assertToken(self.token(), ")")
        conds.append(
            BoxerPred(
                self.discourse_id,
                sent_index,
                word_indices,
                arg,
                f"date_pol_{pol}",
                "a",
                0,
            )
        )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        year = self.token()
        if year != "XXXX":
            year = year.replace(":", "_")
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_year_{year}",
                    "a",
                    0,
                )
            )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        month = self.token()
        if month != "XX":
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_month_{month}",
                    "a",
                    0,
                )
            )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        day = self.token()
        if day != "XX":
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_day_{day}",
                    "a",
                    0,
                )
            )

        return conds

    def _handle_time(self, arg):
        # time([1018]:'18', []:'XX', []:'XX')
        conds = []
        self._parse_index_list()
        hour = self.token()
        if hour != "XX":
            conds.append(self._make_atom("r_hour_2", arg, hour))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        min = self.token()
        if min != "XX":
            conds.append(self._make_atom("r_min_2", arg, min))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        sec = self.token()
        if sec != "XX":
            conds.append(self._make_atom("r_sec_2", arg, sec))

        return conds

    def _handle_card(self):
        # card(_G18535, 28, ge)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        value = self.token()
        self.assertToken(self.token(), ",")
        type = self.token()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerCard(
            self.discourse_id, sent_index, word_indices, variable, value, type
        )

    def _handle_prop(self):
        # prop(_G15949, drs(...))
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerProp(
            self.discourse_id, sent_index, word_indices, variable, drs
        )

    def _parse_index_list(self):
        # [1001,1002]:
        indices = []
        self.assertToken(self.token(), "[")
        while self.token(0) != "]":
            indices.append(self.parse_index())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ":")
        return indices

    def parse_drs(self):
        # drs([[1001]:_G3943],
        #     [[1002]:pred(_G3943, dog, n, 0)]
        #    )
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        refs = set()
        while self.token(0) != "]":
            indices = self._parse_index_list()
            refs.add(self.parse_variable())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ",")
        self.assertToken(self.token(), "[")
        conds = []
        while self.token(0) != "]":
            indices = self._parse_index_list()
            conds.extend(self.parse_condition(indices))
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ")")
        return BoxerDrs(list(refs), conds)

    def _handle_binary_expression(self, make_callback):
        self.assertToken(self.token(), "(")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_alfa(self, make_callback):
        self.assertToken(self.token(), "(")
        type = self.token()
        self.assertToken(self.token(), ",")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_eq(self):
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerEq(
            self.discourse_id, sent_index, word_indices, var1, var2
        )

    def _handle_whq(self):
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        ans_types = []
        while self.token(0) != "]":
            cat = self.token()
            self.assertToken(self.token(), ":")
            if cat == "des":
                ans_types.append(self.token())
            elif cat == "num":
                ans_types.append("number")
                typ = self.token()
                if typ == "cou":
                    ans_types.append("count")
                else:
                    ans_types.append(typ)
            else:
                ans_types.append(self.token())
        self.token()  # swallow the ']'

        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)

    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)

    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
        # An implication is represented as a DRS with a consequent.
        return BoxerDrs(drs1.refs, drs1.conds, drs2)

    def parse_variable(self):
        var = self.token()
        assert re.match(r"^[exps]\d+$", var), var
        return var

    def parse_index(self):
        return int(self.token())

    def _sent_and_word_indices(self, indices):
        """
        Unpack Boxer indices (``1000 * sent + word``, both 1-based) into
        per-sentence groups of 0-based word indices.

        :return: list of (sent_index, word_indices) tuples
        """
        # BUGFIX: integer division. Under Python 3, ``/`` produced float
        # sentence indices (e.g. 1004 -> 0.004 instead of 0).
        sent_indices = {(i // 1000) - 1 for i in indices if i >= 0}
        if sent_indices:
            pairs = []
            for sent_index in sent_indices:
                word_indices = [
                    (i % 1000) - 1 for i in indices if sent_index == (i // 1000) - 1
                ]
                pairs.append((sent_index, word_indices))
            return pairs
        else:
            word_indices = [(i % 1000) - 1 for i in indices]
            return [(None, word_indices)]
|
| 798 |
+
class BoxerDrsParser(DrtParser):
    """
    Reparse the str form of subclasses of ``AbstractBoxerDrs``
    """

    def __init__(self, discourse_id=None):
        """
        :param discourse_id: if given, overrides the discourse id embedded in
            the input string for every condition
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id

    def get_all_symbols(self):
        return [
            DrtTokens.OPEN,
            DrtTokens.CLOSE,
            DrtTokens.COMMA,
            DrtTokens.OPEN_BRACKET,
            DrtTokens.CLOSE_BRACKET,
        ]

    def attempt_adjuncts(self, expression, context):
        return expression

    def _parse_discourse_id(self):
        # Prefer the parser-wide discourse id; otherwise consume one from the
        # input stream. (This expression was previously repeated in every
        # branch of ``handle``.)
        return self.discourse_id if self.discourse_id is not None else self.token()

    def handle(self, tok, context):
        """Dispatch on the condition name ``tok`` and rebuild the matching
        ``AbstractBoxerDrs`` subclass instance.

        :raises LogicalExpressionException: wrapping any failure while parsing
            the condition's arguments
        """
        try:
            if tok == "pred":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                pos = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
            elif tok == "named":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: was ``int(self.token())``, which could not round-trip
                # a ``None`` sentence index printed by BoxerNamed.__repr__.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map; a bare map iterator stored on
                # the condition breaks repeated __repr__/__eq__ under Python 3.
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                ntype = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNamed(
                    disc_id, sent_id, word_ids, variable, name, ntype, sense
                )
            elif tok == "rel":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                rel = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
            elif tok == "prop":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: was ``int(self.token())`` — see ``named`` above.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
            elif tok == "not":
                self.assertNextToken(DrtTokens.OPEN)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNot(drs)
            elif tok == "imp":
                self.assertNextToken(DrtTokens.OPEN)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerDrs(drs1.refs, drs1.conds, drs2)
            elif tok == "or":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map (see ``named`` above).
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
            elif tok == "eq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
            elif tok == "card":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map (see ``named`` above).
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                value = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                ctype = self.token()
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerCard(disc_id, sent_id, word_ids, var, value, ctype)
            elif tok == "whq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                ans_types = self.handle_refs()
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
        except Exception as e:
            raise LogicalExpressionException(self._currentIndex, str(e)) from e
        # Unknown condition name: no branch matched and nothing raised.
        assert False, repr(tok)

    def nullableIntToken(self):
        # The str form prints a missing sentence id as the literal "None".
        t = self.token()
        return int(t) if t != "None" else None

    def get_next_token_variable(self, description):
        try:
            return self.token()
        except ExpectedMoreTokensException as e:
            raise ExpectedMoreTokensException(e.index, "Variable expected.") from e
+
|
| 997 |
+
class AbstractBoxerDrs:
    """Common behaviour shared by all parsed Boxer DRS nodes.

    Subclasses override ``_variables``, ``atoms``, ``clean`` and
    ``renumber_sentences``; the defaults here describe a node with no
    variables, no atoms and nothing to rewrite.
    """

    def variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        all_vars, events, props = self._variables()
        # Events and propositions are carved out of the plain variables, and
        # anything that is an event cannot also count as a proposition.
        plain = all_vars - (events | props)
        return (plain, events, props - events)

    def variable_types(self):
        """Map every variable to a one-letter type code: z/e/p."""
        return {
            var: kind
            for kind, group in zip(("z", "e", "p"), self.variables())
            for var in group
        }

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        return (set(), set(), set())

    def atoms(self):
        return set()

    def clean(self):
        # Nothing to normalize at this level.
        return self

    def _clean_name(self, name):
        # Normalize characters that are unsafe in logic identifiers.
        return name.replace("-", "_").replace("'", "_")

    def renumber_sentences(self, f):
        # No sentence indices at this level.
        return self

    def __hash__(self):
        return hash(f"{self}")
| 1033 |
+
|
| 1034 |
+
class BoxerDrs(AbstractBoxerDrs):
    """A DRS box: a list of discourse referents plus a list of conditions.

    When ``consequent`` is set, the whole node represents an implication
    whose antecedent is this box and whose consequent is the given DRS.
    """

    def __init__(self, refs, conds, consequent=None):
        """
        :param refs: list of discourse referents
        :param conds: list of ``AbstractBoxerDrs`` conditions
        :param consequent: optional consequent DRS (makes this an ``imp``)
        """
        AbstractBoxerDrs.__init__(self)
        self.refs = refs
        self.conds = conds
        self.consequent = consequent

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        variables = (set(), set(), set())
        for cond in self.conds:
            for s, v in zip(variables, cond._variables()):
                s.update(v)
        if self.consequent is not None:
            for s, v in zip(variables, self.consequent._variables()):
                s.update(v)
        return variables

    def atoms(self):
        atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set())
        if self.consequent is not None:
            atoms.update(self.consequent.atoms())
        return atoms

    def clean(self):
        consequent = self.consequent.clean() if self.consequent else None
        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)

    def renumber_sentences(self, f):
        consequent = self.consequent.renumber_sentences(f) if self.consequent else None
        return BoxerDrs(
            self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
        )

    def __repr__(self):
        s = "drs([{}], [{}])".format(
            ", ".join("%s" % r for r in self.refs),
            ", ".join("%s" % c for c in self.conds),
        )
        if self.consequent is not None:
            s = f"imp({s}, {self.consequent})"
        return s

    def __eq__(self, other):
        # BUGFIX: ``all`` replaces ``reduce(operator.and_, ...)``, which
        # raised TypeError when both DRSs had empty condition lists
        # (reduce of an empty sequence with no initial value).
        return (
            self.__class__ == other.__class__
            and self.refs == other.refs
            and len(self.conds) == len(other.conds)
            and all(c1 == c2 for c1, c2 in zip(self.conds, other.conds))
            and self.consequent == other.consequent
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
+
|
| 1093 |
+
class BoxerNot(AbstractBoxerDrs):
    """Negation node: wraps exactly one sub-DRS and delegates everything
    except construction to it."""

    def __init__(self, drs):
        AbstractBoxerDrs.__init__(self)
        self.drs = drs

    def _variables(self):
        return self.drs._variables()

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        # Rebuild around the normalized inner DRS.
        return BoxerNot(self.drs.clean())

    def renumber_sentences(self, f):
        return BoxerNot(self.drs.renumber_sentences(f))

    def __repr__(self):
        return "not(%s)" % (self.drs)

    def __eq__(self, other):
        if self.__class__ != other.__class__:
            return False
        return self.drs == other.drs

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
|
| 1121 |
+
|
| 1122 |
+
class BoxerIndexed(AbstractBoxerDrs):
    """Base class for atomic Boxer conditions tagged with a source location:
    a discourse id, a sentence index and the word indices that produced them.

    Subclasses implement ``__iter__`` (yielding their own arguments, used by
    ``__eq__``/``__repr__``) and ``_pred`` (their condition name).
    """

    def __init__(self, discourse_id, sent_index, word_indices):
        AbstractBoxerDrs.__init__(self)
        self.discourse_id = discourse_id
        self.sent_index = sent_index
        self.word_indices = word_indices

    def atoms(self):
        # An indexed condition is itself an atom.
        return {self}

    def __eq__(self, other):
        # BUGFIX: ``all`` replaces ``reduce(operator.and_, ...)``, which would
        # raise TypeError if a subclass yielded no arguments (empty sequence);
        # ``all`` is correctly True in that case.
        return (
            self.__class__ == other.__class__
            and self.discourse_id == other.discourse_id
            and self.sent_index == other.sent_index
            and self.word_indices == other.word_indices
            and all(s == o for s, o in zip(self, other))
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__

    def __repr__(self):
        s = "{}({}, {}, [{}]".format(
            self._pred(),
            self.discourse_id,
            self.sent_index,
            ", ".join("%s" % wi for wi in self.word_indices),
        )
        for v in self:
            s += ", %s" % v
        return s + ")"
|
| 1157 |
+
|
| 1158 |
+
class BoxerPred(BoxerIndexed):
    """A ``pred`` condition: a unary predicate (name, POS, sense) applied to
    a single discourse variable."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.pos = pos
        self.sense = sense

    def _variables(self):
        # Only the predicate's argument counts as a plain variable.
        return ({self.var}, set(), set())

    def change_var(self, var):
        """Return a copy of this predicate applied to ``var`` instead."""
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.pos, self.sense,
        )

    def clean(self):
        """Return a copy whose name is normalized for logic output."""
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.pos, self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through ``f``."""
        return BoxerPred(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.pos, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.pos
        yield self.sense

    def _pred(self):
        return "pred"
|
| 1209 |
+
|
| 1210 |
+
class BoxerNamed(BoxerIndexed):
    """A ``named`` condition: a named entity (name, entity type, sense)
    attached to a single discourse variable."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.type = type
        self.sense = sense

    def _variables(self):
        # The named entity's referent is a plain variable.
        return ({self.var}, set(), set())

    def change_var(self, var):
        """Return a copy of this condition attached to ``var`` instead."""
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.type, self.sense,
        )

    def clean(self):
        """Return a copy whose name is normalized for logic output."""
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.type, self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through ``f``."""
        return BoxerNamed(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.type, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.type
        yield self.sense

    def _pred(self):
        return "named"
| 1260 |
+
|
| 1261 |
+
class BoxerRel(BoxerIndexed):
    """An indexed Boxer ``rel`` condition: binary relation ``rel`` (with
    word sense ``sense``) between referents ``var1`` and ``var2``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1, self.var2 = var1, var2
        self.rel, self.sense = rel, sense

    def _variables(self):
        # Both relation arguments count as referents.
        return ({self.var1, self.var2}, set(), set())

    def clean(self):
        """Return a copy with the relation name normalized."""
        return BoxerRel(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var1, self.var2, self._clean_name(self.rel), self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerRel(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2, self.rel, self.sense,
        )

    def __iter__(self):
        return iter((self.var1, self.var2, self.rel, self.sense))

    def _pred(self):
        return "rel"
|
| 1299 |
+
|
| 1300 |
+
|
| 1301 |
+
class BoxerProp(BoxerIndexed):
    """An indexed Boxer ``prop`` condition: proposition variable ``var``
    labelling the embedded DRS ``drs``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.drs = drs

    def _variables(self):
        # ``var`` goes into the third (proposition) slot; union in whatever
        # the embedded DRS contributes slot-by-slot.
        return tuple(
            map(operator.or_, (set(), set(), {self.var}), self.drs._variables())
        )

    def referenced_labels(self):
        """Return the set of DRSs this condition points at."""
        return {self.drs}

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        """Return a copy with the embedded DRS cleaned."""
        return BoxerProp(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self.drs.clean(),
        )

    def renumber_sentences(self, f):
        """Return a copy with the sentence index (recursively) remapped."""
        return BoxerProp(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.drs.renumber_sentences(f),
        )

    def __iter__(self):
        return iter((self.var, self.drs))

    def _pred(self):
        return "prop"
|
| 1341 |
+
|
| 1342 |
+
|
| 1343 |
+
class BoxerEq(BoxerIndexed):
    """An indexed Boxer ``eq`` condition asserting ``var1`` = ``var2``."""

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2

    def _variables(self):
        # Both sides of the equality are referents.
        return ({self.var1, self.var2}, set(), set())

    def atoms(self):
        # Equalities contribute no predicate atoms.
        return set()

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerEq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2,
        )

    def __iter__(self):
        return iter((self.var1, self.var2))

    def _pred(self):
        return "eq"
|
| 1369 |
+
|
| 1370 |
+
|
| 1371 |
+
class BoxerCard(BoxerIndexed):
    """An indexed Boxer ``card`` (cardinality) condition: referent ``var``
    has count ``value`` with comparison ``type``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.value, self.type = var, value, type

    def _variables(self):
        return ({self.var}, set(), set())

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerCard(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.value, self.type,
        )

    def __iter__(self):
        return iter((self.var, self.value, self.type))

    def _pred(self):
        return "card"
|
| 1396 |
+
|
| 1397 |
+
|
| 1398 |
+
class BoxerOr(BoxerIndexed):
    """An indexed Boxer ``or`` condition: disjunction of two DRSs."""

    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.drs1 = drs1
        self.drs2 = drs2

    def _variables(self):
        # Slot-wise union of the two disjuncts' variable triples.
        return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy with both disjunct DRSs cleaned."""
        return BoxerOr(
            self.discourse_id, self.sent_index, self.word_indices,
            self.drs1.clean(), self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        # NOTE(review): unlike BoxerProp, the nested DRSs are NOT renumbered
        # here -- behavior preserved as-is; confirm whether that is intended.
        return BoxerOr(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.drs1, self.drs2,
        )

    def __iter__(self):
        return iter((self.drs1, self.drs2))

    def _pred(self):
        return "or"
|
| 1433 |
+
|
| 1434 |
+
|
| 1435 |
+
class BoxerWhq(BoxerIndexed):
    """An indexed Boxer ``whq`` (wh-question) condition: answer types
    ``ans_types``, restrictor ``drs1``, queried ``variable``, body ``drs2``.
    """

    def __init__(
        self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
    ):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.ans_types = ans_types
        self.drs1 = drs1
        self.variable = variable
        self.drs2 = drs2

    def _variables(self):
        # Slot-wise union: the queried variable plus both sub-DRS triples.
        return tuple(
            map(
                operator.or_,
                ({self.variable}, set(), set()),
                self.drs1._variables(),
                self.drs2._variables(),
            )
        )

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy with both sub-DRSs cleaned."""
        return BoxerWhq(
            self.discourse_id, self.sent_index, self.word_indices,
            self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        # NOTE(review): nested DRSs are not renumbered here (cf. BoxerProp);
        # behavior preserved as-is.
        return BoxerWhq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.ans_types, self.drs1, self.variable, self.drs2,
        )

    def __iter__(self):
        # The answer types are rendered as a single "[t1,t2,...]" token.
        return iter(
            ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
        )

    def _pred(self):
        return "whq"
|
| 1487 |
+
|
| 1488 |
+
|
| 1489 |
+
class PassthroughBoxerDrsInterpreter:
    """Interpreter that leaves Boxer expressions untouched."""

    def interpret(self, ex):
        """Return *ex* unchanged."""
        return ex
|
| 1492 |
+
|
| 1493 |
+
|
| 1494 |
+
class NltkDrtBoxerDrsInterpreter:
    """Convert Boxer's abstract DRS expressions into NLTK DRT expressions."""

    def __init__(self, occur_index=False):
        # When True, predicate names are suffixed with discourse/sentence/word
        # occurrence information (see _add_occur_indexing).
        self._occur_index = occur_index

    def interpret(self, ex):
        """
        Recursively translate one Boxer expression into NLTK DRT.

        :param ex: ``AbstractBoxerDrs``
        :return: ``DrtExpression``
        """
        if isinstance(ex, BoxerDrs):
            # Plain box: referents become Variables, conditions recurse.
            drs = DRS(
                [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))
            )
            if ex.consequent is not None:
                drs.consequent = self.interpret(ex.consequent)
            return drs
        elif isinstance(ex, BoxerNot):
            return DrtNegatedExpression(self.interpret(ex.drs))
        elif isinstance(ex, BoxerPred):
            # Unary predicate: name is "<pos>_<name>", applied to var.
            pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerNamed):
            # Named entity: name is "ne_<type>_<name>".
            pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerRel):
            # Binary relation between var1 and var2.
            pred = self._add_occur_indexing("%s" % (ex.rel), ex)
            return self._make_atom(pred, ex.var1, ex.var2)
        elif isinstance(ex, BoxerProp):
            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
        elif isinstance(ex, BoxerEq):
            return DrtEqualityExpression(
                DrtVariableExpression(Variable(ex.var1)),
                DrtVariableExpression(Variable(ex.var2)),
            )
        elif isinstance(ex, BoxerCard):
            # Cardinality becomes a unary "card_<type>_<value>" predicate.
            pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerOr):
            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
        elif isinstance(ex, BoxerWhq):
            # A wh-question is flattened by merging restrictor and body boxes.
            drs1 = self.interpret(ex.drs1)
            drs2 = self.interpret(ex.drs2)
            return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
        # Any other expression type is a programming error.
        assert False, f"{ex.__class__.__name__}: {ex}"

    def _make_atom(self, pred, *args):
        """Build the curried application ``pred(arg1)(arg2)...``."""
        accum = DrtVariableExpression(Variable(pred))
        for arg in args:
            accum = DrtApplicationExpression(
                accum, DrtVariableExpression(Variable(arg))
            )
        return accum

    def _add_occur_indexing(self, base, ex):
        """Optionally append "_<discourse>_s<sent>_w<word>" to *base*.

        Only applies when occurrence indexing is on and *ex* carries a
        sentence index; the word index used is the smallest one.
        """
        if self._occur_index and ex.sent_index is not None:
            if ex.discourse_id:
                base += "_%s" % ex.discourse_id
            base += "_s%s" % ex.sent_index
            base += "_w%s" % sorted(ex.word_indices)[0]
        return base
|
| 1554 |
+
|
| 1555 |
+
|
| 1556 |
+
class UnparseableInputException(Exception):
    """Raised when Boxer's output cannot be parsed."""
|
| 1558 |
+
|
| 1559 |
+
|
| 1560 |
+
if __name__ == "__main__":
    # Command-line driver: run Boxer on the given text and print the DRS
    # (or its first-order translation with --fol).
    opts = OptionParser("usage: %prog TEXT [options]")
    opts.add_option(
        "--verbose",
        "-v",
        help="display verbose logs",
        action="store_true",
        default=False,
        dest="verbose",
    )
    opts.add_option(
        "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol"
    )
    opts.add_option(
        "--question",
        "-q",
        help="input is a question",
        action="store_true",
        default=False,
        dest="question",
    )
    opts.add_option(
        "--occur",
        "-o",
        help="occurrence index",
        action="store_true",
        default=False,
        dest="occur_index",
    )
    (options, args) = opts.parse_args()

    if len(args) != 1:
        opts.error("incorrect number of arguments")

    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
    # NOTE: r"\n" is a literal backslash-n, so sentences are separated by the
    # two characters '\' 'n' in the shell argument -- presumably intentional;
    # confirm against how callers pass TEXT.
    drs = Boxer(interpreter).interpret_multi(
        args[0].split(r"\n"), question=options.question, verbose=options.verbose
    )
    if drs is None:
        print(None)
    else:
        # Simplify and remove equalities before display.
        drs = drs.simplify().eliminate_equality()
        if options.fol:
            print(drs.fol().normalize())
        else:
            drs.pretty_print()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse
|
| 2 |
+
# Representation Theory (DRT) as meaning language
|
| 3 |
+
#
|
| 4 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
|
| 12 |
+
from tkinter.font import Font
|
| 13 |
+
|
| 14 |
+
from nltk.draw.util import CanvasFrame, ShowText
|
| 15 |
+
|
| 16 |
+
except ImportError:
|
| 17 |
+
"""Ignore ImportError because tkinter might not be available."""
|
| 18 |
+
|
| 19 |
+
from nltk.parse import MaltParser
|
| 20 |
+
from nltk.sem.drt import DrsDrawer, DrtVariableExpression
|
| 21 |
+
from nltk.sem.glue import DrtGlue
|
| 22 |
+
from nltk.sem.logic import Variable
|
| 23 |
+
from nltk.tag import RegexpTagger
|
| 24 |
+
from nltk.util import in_idle
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DrtGlueDemo:
|
| 28 |
+
def __init__(self, examples):
    """Build the demo GUI for the given list of example sentences."""
    # Set up the main window.
    self._top = Tk()
    self._top.title("DRT Glue Demo")

    # Set up key bindings.
    self._init_bindings()

    # Initialize the fonts.
    self._init_fonts(self._top)

    self._examples = examples
    # One cache slot per example: a list of readings, or an error expression.
    self._readingCache = [None for example in examples]

    # The user can hide the grammar.
    self._show_grammar = IntVar(self._top)
    self._show_grammar.set(1)

    # Set the data to None
    self._curExample = -1
    self._readings = []
    self._drs = None
    self._drsWidget = None
    self._error = None

    self._init_glue()

    # Create the basic frames.
    self._init_menubar(self._top)
    self._init_buttons(self._top)
    self._init_exampleListbox(self._top)
    self._init_readingListbox(self._top)
    self._init_canvas(self._top)

    # Resize callback
    self._canvas.bind("<Configure>", self._configure)
|
| 64 |
+
|
| 65 |
+
#########################################
|
| 66 |
+
## Initialization Helpers
|
| 67 |
+
#########################################
|
| 68 |
+
|
| 69 |
+
def _init_glue(self):
    """Create the DRT Glue parser backed by a MaltParser with a demo tagger."""
    # Regexp tagger covering only the demo's closed vocabulary.
    tagger = RegexpTagger(
        [
            ("^(David|Mary|John)$", "NNP"),
            (
                "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
                "VB",
            ),
            ("^(go|order|vanish|find|approach)$", "VB"),
            ("^(a)$", "ex_quant"),
            ("^(every)$", "univ_quant"),
            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
            ("^(big|gray|former)$", "JJ"),
            ("^(him|himself)$", "PRP"),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
|
| 88 |
+
|
| 89 |
+
def _init_fonts(self, root):
    """Create the fonts used by the demo, scaled from the system font."""
    # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
    self._sysfont = Font(font=Button()["font"])
    root.option_add("*Font", self._sysfont)

    # What's our font size (default=same as sysfont)
    self._size = IntVar(root)
    self._size.set(self._sysfont.cget("size"))

    self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
    self._font = Font(family="helvetica", size=self._size.get())
    # Negative Tk font sizes are in pixels, so grow away from zero either way.
    if self._size.get() < 0:
        big = self._size.get() - 2
    else:
        big = self._size.get() + 2
    self._bigfont = Font(family="helvetica", weight="bold", size=big)
|
| 105 |
+
|
| 106 |
+
def _init_exampleListbox(self, parent):
    """Build the left-hand listbox holding the example sentences."""
    self._exampleFrame = listframe = Frame(parent)
    self._exampleFrame.pack(fill="both", side="left", padx=2)
    self._exampleList_label = Label(
        self._exampleFrame, font=self._boldfont, text="Examples"
    )
    self._exampleList_label.pack()
    self._exampleList = Listbox(
        self._exampleFrame,
        selectmode="single",
        relief="groove",
        background="white",
        foreground="#909090",
        font=self._font,
        selectforeground="#004040",
        selectbackground="#c0f0c0",
    )

    self._exampleList.pack(side="right", fill="both", expand=1)

    for example in self._examples:
        self._exampleList.insert("end", ("  %s" % example))
    self._exampleList.config(height=min(len(self._examples), 25), width=40)

    # Add a scrollbar if there are more than 25 examples.
    if len(self._examples) > 25:
        listscroll = Scrollbar(self._exampleFrame, orient="vertical")
        self._exampleList.config(yscrollcommand=listscroll.set)
        listscroll.config(command=self._exampleList.yview)
        listscroll.pack(side="left", fill="y")

    # If they select a example, apply it.
    self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
|
| 139 |
+
|
| 140 |
+
def _init_readingListbox(self, parent):
    """Build the listbox holding the numbered readings of the current example."""
    self._readingFrame = listframe = Frame(parent)
    self._readingFrame.pack(fill="both", side="left", padx=2)
    self._readingList_label = Label(
        self._readingFrame, font=self._boldfont, text="Readings"
    )
    self._readingList_label.pack()
    self._readingList = Listbox(
        self._readingFrame,
        selectmode="single",
        relief="groove",
        background="white",
        foreground="#909090",
        font=self._font,
        selectforeground="#004040",
        selectbackground="#c0f0c0",
    )

    self._readingList.pack(side="right", fill="both", expand=1)

    # Add a scrollbar if there are more than 25 examples.
    listscroll = Scrollbar(self._readingFrame, orient="vertical")
    self._readingList.config(yscrollcommand=listscroll.set)
    listscroll.config(command=self._readingList.yview)
    listscroll.pack(side="right", fill="y")

    self._populate_readingListbox()
|
| 167 |
+
|
| 168 |
+
def _populate_readingListbox(self):
|
| 169 |
+
# Populate the listbox with integers
|
| 170 |
+
self._readingList.delete(0, "end")
|
| 171 |
+
for i in range(len(self._readings)):
|
| 172 |
+
self._readingList.insert("end", (" %s" % (i + 1)))
|
| 173 |
+
self._readingList.config(height=min(len(self._readings), 25), width=5)
|
| 174 |
+
|
| 175 |
+
# If they select a example, apply it.
|
| 176 |
+
self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
|
| 177 |
+
|
| 178 |
+
def _init_bindings(self):
    """Install the demo's keyboard shortcuts on the main window."""
    # Key bindings are a good thing.
    self._top.bind("<Control-q>", self.destroy)
    self._top.bind("<Control-x>", self.destroy)
    self._top.bind("<Escape>", self.destroy)
    self._top.bind("n", self.next)
    self._top.bind("<space>", self.next)
    self._top.bind("p", self.prev)
    self._top.bind("<BackSpace>", self.prev)
|
| 187 |
+
|
| 188 |
+
def _init_buttons(self, parent):
    """Build the Prev/Next navigation buttons along the bottom."""
    # Set up the frames.
    self._buttonframe = buttonframe = Frame(parent)
    buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
    Button(
        buttonframe,
        text="Prev",
        background="#90c0d0",
        foreground="black",
        command=self.prev,
    ).pack(side="left")
    Button(
        buttonframe,
        text="Next",
        background="#90c0d0",
        foreground="black",
        command=self.next,
    ).pack(side="left")
|
| 206 |
+
|
| 207 |
+
def _configure(self, event):
    """Canvas <Configure> callback: track the new height and redraw."""
    self._autostep = 0
    (x1, y1, x2, y2) = self._cframe.scrollregion()
    # Keep the scrollregion's bottom just inside the new window height.
    y2 = event.height - 6
    self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
    self._redraw()
|
| 213 |
+
|
| 214 |
+
def _init_canvas(self, parent):
    """Create the drawing canvas that displays the selected DRS."""
    self._cframe = CanvasFrame(
        parent,
        background="white",
        # width=525, height=250,
        closeenough=10,
        border=2,
        relief="sunken",
    )
    self._cframe.pack(expand=1, fill="both", side="top", pady=2)
    canvas = self._canvas = self._cframe.canvas()

    # Initially, there's no tree or text
    self._tree = None
    self._textwidgets = []
    self._textline = None
|
| 230 |
+
|
| 231 |
+
def _init_menubar(self, parent):
    """Build the File / Action / Options / View / Help menu bar."""
    menubar = Menu(parent)

    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(
        label="Exit", underline=1, command=self.destroy, accelerator="q"
    )
    menubar.add_cascade(label="File", underline=0, menu=filemenu)

    actionmenu = Menu(menubar, tearoff=0)
    actionmenu.add_command(
        label="Next", underline=0, command=self.next, accelerator="n, Space"
    )
    actionmenu.add_command(
        label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
    )
    menubar.add_cascade(label="Action", underline=0, menu=actionmenu)

    optionmenu = Menu(menubar, tearoff=0)
    optionmenu.add_checkbutton(
        label="Remove Duplicates",
        underline=0,
        variable=self._glue.remove_duplicates,
        command=self._toggle_remove_duplicates,
        accelerator="r",
    )
    menubar.add_cascade(label="Options", underline=0, menu=optionmenu)

    # Font-size radio buttons; values are point sizes fed to resize().
    viewmenu = Menu(menubar, tearoff=0)
    viewmenu.add_radiobutton(
        label="Tiny",
        variable=self._size,
        underline=0,
        value=10,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Small",
        variable=self._size,
        underline=0,
        value=12,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Medium",
        variable=self._size,
        underline=0,
        value=14,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Large",
        variable=self._size,
        underline=0,
        value=18,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Huge",
        variable=self._size,
        underline=0,
        value=24,
        command=self.resize,
    )
    menubar.add_cascade(label="View", underline=0, menu=viewmenu)

    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label="About", underline=0, command=self.about)
    menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

    parent.config(menu=menubar)
|
| 302 |
+
|
| 303 |
+
#########################################
|
| 304 |
+
## Main draw procedure
|
| 305 |
+
#########################################
|
| 306 |
+
|
| 307 |
+
def _redraw(self):
    """Clear the canvas and redraw the current DRS or error widget."""
    canvas = self._canvas

    # Delete the old DRS, widgets, etc.
    if self._drsWidget is not None:
        self._drsWidget.clear()

    if self._drs:
        self._drsWidget = DrsWidget(self._canvas, self._drs)
        self._drsWidget.draw()

    if self._error:
        # Errors are displayed the same way, as a (pseudo-)DRS widget.
        self._drsWidget = DrsWidget(self._canvas, self._error)
        self._drsWidget.draw()
|
| 321 |
+
|
| 322 |
+
#########################################
|
| 323 |
+
## Button Callbacks
|
| 324 |
+
#########################################
|
| 325 |
+
|
| 326 |
+
def destroy(self, *e):
|
| 327 |
+
self._autostep = 0
|
| 328 |
+
if self._top is None:
|
| 329 |
+
return
|
| 330 |
+
self._top.destroy()
|
| 331 |
+
self._top = None
|
| 332 |
+
|
| 333 |
+
def prev(self, *e):
|
| 334 |
+
selection = self._readingList.curselection()
|
| 335 |
+
readingListSize = self._readingList.size()
|
| 336 |
+
|
| 337 |
+
# there are readings
|
| 338 |
+
if readingListSize > 0:
|
| 339 |
+
# if one reading is currently selected
|
| 340 |
+
if len(selection) == 1:
|
| 341 |
+
index = int(selection[0])
|
| 342 |
+
|
| 343 |
+
# if it's on (or before) the first item
|
| 344 |
+
if index <= 0:
|
| 345 |
+
self._select_previous_example()
|
| 346 |
+
else:
|
| 347 |
+
self._readingList_store_selection(index - 1)
|
| 348 |
+
|
| 349 |
+
else:
|
| 350 |
+
# select its first reading
|
| 351 |
+
self._readingList_store_selection(readingListSize - 1)
|
| 352 |
+
|
| 353 |
+
else:
|
| 354 |
+
self._select_previous_example()
|
| 355 |
+
|
| 356 |
+
def _select_previous_example(self):
|
| 357 |
+
# if the current example is not the first example
|
| 358 |
+
if self._curExample > 0:
|
| 359 |
+
self._exampleList_store_selection(self._curExample - 1)
|
| 360 |
+
else:
|
| 361 |
+
# go to the last example
|
| 362 |
+
self._exampleList_store_selection(len(self._examples) - 1)
|
| 363 |
+
|
| 364 |
+
def next(self, *e):
|
| 365 |
+
selection = self._readingList.curselection()
|
| 366 |
+
readingListSize = self._readingList.size()
|
| 367 |
+
|
| 368 |
+
# if there are readings
|
| 369 |
+
if readingListSize > 0:
|
| 370 |
+
# if one reading is currently selected
|
| 371 |
+
if len(selection) == 1:
|
| 372 |
+
index = int(selection[0])
|
| 373 |
+
|
| 374 |
+
# if it's on (or past) the last item
|
| 375 |
+
if index >= (readingListSize - 1):
|
| 376 |
+
self._select_next_example()
|
| 377 |
+
else:
|
| 378 |
+
self._readingList_store_selection(index + 1)
|
| 379 |
+
|
| 380 |
+
else:
|
| 381 |
+
# select its first reading
|
| 382 |
+
self._readingList_store_selection(0)
|
| 383 |
+
|
| 384 |
+
else:
|
| 385 |
+
self._select_next_example()
|
| 386 |
+
|
| 387 |
+
def _select_next_example(self):
|
| 388 |
+
# if the current example is not the last example
|
| 389 |
+
if self._curExample < len(self._examples) - 1:
|
| 390 |
+
self._exampleList_store_selection(self._curExample + 1)
|
| 391 |
+
else:
|
| 392 |
+
# go to the first example
|
| 393 |
+
self._exampleList_store_selection(0)
|
| 394 |
+
|
| 395 |
+
def about(self, *e):
    """Display an "About" dialog, falling back to a plain ShowText window.

    The fallback covers environments where ``tkinter.messagebox`` is
    unavailable or fails to display.
    """
    ABOUT = (
        "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
        + "Written by Daniel H. Garrette"
    )
    TITLE = "About: NLTK DRT Glue Demo"
    try:
        from tkinter.messagebox import Message

        Message(message=ABOUT, title=TITLE).show()
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed; any display failure still falls back.
    except Exception:
        ShowText(self._top, TITLE, ABOUT)
|
| 407 |
+
|
| 408 |
+
def postscript(self, *e):
    """Export the current canvas to a PostScript file (via CanvasFrame)."""
    self._autostep = 0
    self._cframe.print_to_file()
|
| 411 |
+
|
| 412 |
+
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.
    """
    # Skip the mainloop under IDLE, which runs its own event loop.
    if in_idle():
        return
    self._top.mainloop(*args, **kwargs)
|
| 422 |
+
|
| 423 |
+
def resize(self, size=None):
    """Change the demo's font size and redraw.

    :param size: the new size; if None, re-apply the currently stored size.
    """
    if size is not None:
        self._size.set(size)
    current = self._size.get()
    # Apply the (negated absolute) size to every font, as the original does.
    for font in (self._font, self._boldfont, self._sysfont):
        font.configure(size=-(abs(current)))
    self._bigfont.configure(size=-(abs(current + 2)))
    self._redraw()
|
| 432 |
+
|
| 433 |
+
def _toggle_remove_duplicates(self):
    """Flip the glue engine's duplicate-removal flag and reset every piece
    of cached/derived state, since previously computed readings may no
    longer be valid under the new setting."""
    self._glue.remove_duplicates = not self._glue.remove_duplicates

    self._exampleList.selection_clear(0, "end")
    self._readings = []
    self._populate_readingListbox()
    self._readingCache = [None] * len(self._examples)
    self._curExample = -1
    self._error = None
    self._drs = None
    self._redraw()
|
| 445 |
+
|
| 446 |
+
def _exampleList_select(self, event):
    """Listbox callback: store the clicked example, ignoring the event
    unless exactly one entry is highlighted."""
    chosen = self._exampleList.curselection()
    if len(chosen) == 1:
        self._exampleList_store_selection(int(chosen[0]))
|
| 451 |
+
|
| 452 |
+
def _exampleList_store_selection(self, index):
    """Make the example at ``index`` the current one.

    Uses the per-example cache when available (a list means cached
    readings; anything else is a cached error expression).  Otherwise
    parses the sentence, caching either the readings or the error, and
    marks failed examples with a trailing '*' in the listbox.  Finally
    refreshes the reading listbox and redraws with no DRS selected.
    """
    self._curExample = index
    example = self._examples[index]

    self._exampleList.selection_clear(0, "end")
    if example:
        cache = self._readingCache[index]
        if cache:
            if isinstance(cache, list):
                # Cached success: the list of readings.
                self._readings = cache
                self._error = None
            else:
                # Cached failure: the stored error expression.
                self._readings = []
                self._error = cache
        else:
            try:
                self._readings = self._glue.parse_to_meaning(example)
                self._error = None
                self._readingCache[index] = self._readings
            except Exception as e:
                self._readings = []
                # Wrap the error message in a DRT variable so it can be shown
                # in place of a reading.
                self._error = DrtVariableExpression(Variable("Error: " + str(e)))
                self._readingCache[index] = self._error

                # add a star to the end of the example
                self._exampleList.delete(index)
                self._exampleList.insert(index, (" %s *" % example))
                self._exampleList.config(
                    height=min(len(self._examples), 25), width=40
                )

        self._populate_readingListbox()

        self._exampleList.selection_set(index)

        # Clear the displayed DRS until a reading is chosen.
        self._drs = None
        self._redraw()
|
| 489 |
+
|
| 490 |
+
def _readingList_select(self, event):
    """Listbox callback: store the clicked reading, ignoring the event
    unless exactly one entry is highlighted."""
    chosen = self._readingList.curselection()
    if len(chosen) == 1:
        self._readingList_store_selection(int(chosen[0]))
|
| 495 |
+
|
| 496 |
+
def _readingList_store_selection(self, index):
    """Select the reading at ``index`` and display its simplified,
    normalized, anaphora-resolved DRS."""
    chosen = self._readings[index]

    self._readingList.selection_clear(0, "end")
    if not chosen:
        return
    self._readingList.selection_set(index)
    self._drs = chosen.simplify().normalize().resolve_anaphora()
    self._redraw()
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
class DrsWidget:
    """Renders a single DRS onto a Tkinter canvas and can erase it again."""

    def __init__(self, canvas, drs, **attribs):
        self._drs = drs
        self._canvas = canvas
        # Capture the canvas's default text font (via a throwaway text item)
        # so the drawer can measure text.
        blank_item = canvas.create_text(0, 0, text="")
        canvas.font = Font(font=canvas.itemcget(blank_item, "font"))
        # Pixel padding; presumably consumed by DrsDrawer — TODO confirm.
        canvas._BUFFER = 3
        self.bbox = (0, 0, 0, 0)

    def draw(self):
        """Draw the DRS and remember the area it occupies for clear()."""
        drawer = DrsDrawer(self._drs, canvas=self._canvas)
        (right, bottom) = drawer.draw()
        self.bbox = (0, 0, right + 1, bottom + 1)

    def clear(self):
        """Blank out the region covered by the most recent draw()."""
        self._canvas.create_rectangle(self.bbox, fill="white", width="0")
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def demo():
    """Launch the DRT Glue demo GUI with a fixed set of example sentences.

    The commented-out sentences are additional examples kept for manual
    experimentation.
    """
    examples = [
        "John walks",
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        # 'every man believes a dog yawns',
        # 'John gives David a sandwich',
        "John chases himself",
        # 'John persuades David to order a pizza',
        # 'John tries to go',
        # 'John tries to find a unicorn',
        # 'John seems to vanish',
        # 'a unicorn seems to approach',
        # 'every big cat leaves',
        # 'every gray cat leaves',
        # 'every big gray cat leaves',
        # 'a former senator leaves',
        # 'John likes a cat',
        # 'John likes every cat',
        # 'he walks',
        # 'John walks and he leaves'
    ]
    DrtGlueDemo(examples).mainloop()


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py
ADDED
|
@@ -0,0 +1,835 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Glue Semantics
|
| 2 |
+
#
|
| 3 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 4 |
+
#
|
| 5 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
from itertools import chain
|
| 11 |
+
|
| 12 |
+
import nltk
|
| 13 |
+
from nltk.internals import Counter
|
| 14 |
+
from nltk.sem import drt, linearlogic
|
| 15 |
+
from nltk.sem.logic import (
|
| 16 |
+
AbstractVariableExpression,
|
| 17 |
+
Expression,
|
| 18 |
+
LambdaExpression,
|
| 19 |
+
Variable,
|
| 20 |
+
VariableExpression,
|
| 21 |
+
)
|
| 22 |
+
from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger
|
| 23 |
+
|
| 24 |
+
# Mapping from specifier (determiner) words to the semtype names used when
# looking up glue-dictionary entries; the 'default' entry is used for
# specifiers not listed here (see GlueDict.get_semtypes).
SPEC_SEMTYPES = {
    "a": "ex_quant",
    "an": "ex_quant",
    "every": "univ_quant",
    "the": "def_art",
    "no": "no_quant",
    "default": "ex_quant",
}

# Dependency relations ignored when matching a node's relationship set
# against a glue-dictionary entry (see GlueDict._lookup_semtype_option).
OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class GlueFormula:
    """A pairing of a meaning term (a logic ``Expression``) with a linear
    logic glue term, plus a set of integer indices used to ensure each
    compiled premise is consumed exactly once during derivation."""

    def __init__(self, meaning, glue, indices=None):
        """
        :param meaning: a logic expression or a string to parse into one
        :param glue: a linear logic expression or a string to parse into one
        :param indices: set of ints identifying the compiled premises this
            formula was built from; defaults to the empty set
        :raise RuntimeError: if meaning or glue is neither a string nor an
            expression of the expected type
        """
        if not indices:
            indices = set()

        if isinstance(meaning, str):
            self.meaning = Expression.fromstring(meaning)
        elif isinstance(meaning, Expression):
            self.meaning = meaning
        else:
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )

        if isinstance(glue, str):
            self.glue = linearlogic.LinearLogicParser().parse(glue)
        elif isinstance(glue, linearlogic.Expression):
            self.glue = glue
        else:
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )

        self.indices = indices

    def applyto(self, arg):
        """Apply this glue formula to ``arg``, combining both meaning and
        glue sides.

        Example::

            self = (\\x.(walk x), (subj -o f))
            arg  = (john, subj)
            returns ((walk john), f)

        :raise linearlogic.LinearLogicApplicationException: if the index
            sets overlap (a premise would be used twice) or the glue-side
            application fails
        """
        if self.indices & arg.indices:  # if the sets are NOT disjoint
            raise linearlogic.LinearLogicApplicationException(
                f"'{self}' applied to '{arg}'. Indices are not disjoint."
            )
        else:  # if the sets ARE disjoint
            return_indices = self.indices | arg.indices

        try:
            return_glue = linearlogic.ApplicationExpression(
                self.glue, arg.glue, arg.indices
            )
        except linearlogic.LinearLogicApplicationException as e:
            raise linearlogic.LinearLogicApplicationException(
                f"'{self.simplify()}' applied to '{arg.simplify()}'"
            ) from e

        # Lambda-abstract the argument's meaning over each dependency of the
        # antecedent so that dependent variables are bound in the result.
        arg_meaning_abstracted = arg.meaning
        if return_indices:
            for dep in self.glue.simplify().antecedent.dependencies[
                ::-1
            ]:  # if self.glue is (A -o B), dep is in A.dependencies
                arg_meaning_abstracted = self.make_LambdaExpression(
                    Variable("v%s" % dep), arg_meaning_abstracted
                )
        return_meaning = self.meaning.applyto(arg_meaning_abstracted)

        return self.__class__(return_meaning, return_glue, return_indices)

    def make_VariableExpression(self, name):
        # Factory hook; subclasses (e.g. DRT glue) override to build their
        # own expression types.
        return VariableExpression(name)

    def make_LambdaExpression(self, variable, term):
        # Factory hook; subclasses override to build their own lambda type.
        return LambdaExpression(variable, term)

    def lambda_abstract(self, other):
        """Abstract this formula over ``other`` (which must carry a simple
        variable meaning), producing a formula whose glue is
        ``other.glue -o self.glue``."""
        assert isinstance(other, GlueFormula)
        assert isinstance(other.meaning, AbstractVariableExpression)
        return self.__class__(
            self.make_LambdaExpression(other.meaning.variable, self.meaning),
            linearlogic.ImpExpression(other.glue, self.glue),
        )

    def compile(self, counter=None):
        """From Iddo Lev's PhD Dissertation p108-109

        Compile the glue term into indexed premises; returns the list of
        newly generated formulas plus this formula with its compiled glue
        and a fresh index from ``counter``.
        """
        if not counter:
            counter = Counter()
        (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
            counter, self.__class__
        )
        return new_forms + [
            self.__class__(self.meaning, compiled_glue, {counter.get()})
        ]

    def simplify(self):
        """Return a new formula with both meaning and glue simplified;
        indices are carried over unchanged."""
        return self.__class__(
            self.meaning.simplify(), self.glue.simplify(), self.indices
        )

    def __eq__(self, other):
        # Equality ignores indices: only class, meaning and glue matter.
        return (
            self.__class__ == other.__class__
            and self.meaning == other.meaning
            and self.glue == other.glue
        )

    def __ne__(self, other):
        return not self == other

    # sorting for use in doctests which must be deterministic
    def __lt__(self, other):
        return str(self) < str(other)

    def __str__(self):
        assert isinstance(self.indices, set)
        accum = f"{self.meaning} : {self.glue}"
        if self.indices:
            accum += (
                " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}"
            )
        return accum

    def __repr__(self):
        return "%s" % self
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class GlueDict(dict):
    """A dictionary of glue-formula templates loaded from a ``.semtype``
    file.

    Maps ``semtype name -> {relationship frozenset (or None) ->
    [[meaning_template, glue_template], ...]}``, and converts dependency
    graphs into lists of :class:`GlueFormula` objects.
    """

    def __init__(self, filename, encoding=None):
        """
        :param filename: path (or nltk.data resource name) of the semtype file
        :param encoding: text encoding passed through to ``nltk.data.load``
        """
        self.filename = filename
        self.file_encoding = encoding
        self.read_file()

    def read_file(self, empty_first=True):
        """Parse the semtype file and populate this dict.

        Each non-comment line has the form
        ``semtype[(supertype)] : (meaning, glue)[, (meaning, glue)...] : [rels]``
        where the relationship list is optional.  Entries inherit the glue
        formulas of their supertype.

        :param empty_first: if True, clear existing entries before loading
        :raise RuntimeError: on malformed formula syntax
        """
        if empty_first:
            self.clear()

        try:
            contents = nltk.data.load(
                self.filename, format="text", encoding=self.file_encoding
            )
            # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
        except LookupError as e:
            # Retry with an explicit file: URL before giving up.
            try:
                contents = nltk.data.load(
                    "file:" + self.filename, format="text", encoding=self.file_encoding
                )
            except LookupError:
                raise e
        lines = contents.splitlines()

        for line in lines:  # example: 'n : (\\x.(<word> x), (v-or))'
            #     lambdacalc -^  linear logic -^
            line = line.strip()  # remove trailing newline
            if not len(line):
                continue  # skip empty lines
            if line[0] == "#":
                continue  # skip commented out lines

            parts = line.split(
                " : ", 2
            )  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            paren_count = 0
            tuple_start = 0
            tuple_comma = 0

            relationships = None

            # Hand-rolled character scan over the '(meaning, glue)' tuples;
            # paren_count tracks nesting depth so only top-level parens and
            # commas delimit the tuple parts.
            if len(parts) > 1:
                for (i, c) in enumerate(parts[1]):
                    if c == "(":
                        if paren_count == 0:  # if it's the first '(' of a tuple
                            tuple_start = i + 1  # then save the index
                        paren_count += 1
                    elif c == ")":
                        paren_count -= 1
                        if paren_count == 0:  # if it's the last ')' of a tuple
                            meaning_term = parts[1][
                                tuple_start:tuple_comma
                            ]  # '\\x.(<word> x)'
                            glue_term = parts[1][tuple_comma + 1 : i]  # '(v-r)'
                            glue_formulas.append(
                                [meaning_term, glue_term]
                            )  # add the GlueFormula to the list
                    elif c == ",":
                        if (
                            paren_count == 1
                        ):  # if it's a comma separating the parts of the tuple
                            tuple_comma = i  # then save the index
                    elif c == "#":  # skip comments at the ends of lines
                        if (
                            paren_count != 0
                        ):  # if the line hasn't parsed correctly so far
                            raise RuntimeError(
                                "Formula syntax is incorrect for entry " + line
                            )
                        break  # break to the next line

            if len(parts) > 2:  # if there is a relationship entry at the end
                rel_start = parts[2].index("[") + 1
                rel_end = parts[2].index("]")
                if rel_start == rel_end:
                    relationships = frozenset()
                else:
                    relationships = frozenset(
                        r.strip() for r in parts[2][rel_start:rel_end].split(",")
                    )

            # A semtype may declare a supertype as 'name(super)'.
            try:
                start_inheritance = parts[0].index("(")
                end_inheritance = parts[0].index(")")
                sem = parts[0][:start_inheritance].strip()
                supertype = parts[0][start_inheritance + 1 : end_inheritance]
            except:
                sem = parts[0].strip()
                supertype = None

            if sem not in self:
                self[sem] = {}

            if (
                relationships is None
            ):  # if not specified for a specific relationship set
                # add all relationship entries for parents
                if supertype:
                    for rels in self[supertype]:
                        if rels not in self[sem]:
                            self[sem][rels] = []
                        glue = self[supertype][rels]
                        self[sem][rels].extend(glue)
                        self[sem][rels].extend(
                            glue_formulas
                        )  # add the glue formulas to every rel entry
                else:
                    if None not in self[sem]:
                        self[sem][None] = []
                    self[sem][None].extend(
                        glue_formulas
                    )  # add the glue formulas to every rel entry
            else:
                if relationships not in self[sem]:
                    self[sem][relationships] = []
                if supertype:
                    self[sem][relationships].extend(self[supertype][relationships])
                self[sem][relationships].extend(
                    glue_formulas
                )  # add the glue entry to the dictionary

    def __str__(self):
        """Render entries one formula per line, with the relationship set
        shown after the last formula of each group."""
        accum = ""
        for pos in self:
            str_pos = "%s" % pos
            for relset in self[pos]:
                i = 1
                for gf in self[pos][relset]:
                    if i == 1:
                        accum += str_pos + ": "
                    else:
                        accum += " " * (len(str_pos) + 2)
                    accum += "%s" % gf
                    if relset and i == len(self[pos][relset]):
                        accum += " : %s" % relset
                    accum += "\n"
                    i += 1
        return accum

    def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
        """Recursively collect glue formulas for ``node`` and all of its
        dependents.  When called without a node, starts from the graph's
        root (the first dependent of the artificial top node)."""
        if node is None:
            # TODO: should it be depgraph.root? Is this code tested?
            top = depgraph.nodes[0]
            depList = list(chain.from_iterable(top["deps"].values()))
            root = depgraph.nodes[depList[0]]

            return self.to_glueformula_list(depgraph, root, Counter(), verbose)

        glueformulas = self.lookup(node, depgraph, counter)
        for dep_idx in chain.from_iterable(node["deps"].values()):
            dep = depgraph.nodes[dep_idx]
            glueformulas.extend(
                self.to_glueformula_list(depgraph, dep, counter, verbose)
            )
        return glueformulas

    def lookup(self, node, depgraph, counter):
        """Return the glue formulas for ``node``, or [] if no semtype entry
        exists for it.

        :raise KeyError: if a semtype entry exists but has no option
            matching the node's relationships
        """
        semtype_names = self.get_semtypes(node)

        # Take the first plausible semtype that has an entry.
        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node["word"], node, depgraph, counter
        )

    def add_missing_dependencies(self, node, depgraph):
        """For a 'main' node, copy the head's 'subj' dependency onto this
        node so its glue entry can reference the subject."""
        rel = node["rel"].lower()

        if rel == "main":
            headnode = depgraph.nodes[node["head"]]
            subj = self.lookup_unique("subj", headnode, depgraph)
            relation = subj["rel"]
            node["deps"].setdefault(relation, [])
            node["deps"][relation].append(subj["address"])
            # node['deps'].append(subj['address'])

    def _lookup_semtype_option(self, semtype, node, depgraph):
        """Pick the glue-formula list within ``semtype`` whose relationship
        set best matches the node's (non-optional) dependency relations."""
        relationships = frozenset(
            depgraph.nodes[dep]["rel"].lower()
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
        )

        try:
            lookup = semtype[relationships]
        except KeyError:
            # An exact match is not found, so find the best match where
            # 'best' is defined as the glue entry whose relationship set has the
            # most relations of any possible relationship set that is a subset
            # of the actual depgraph
            best_match = frozenset()
            for relset_option in set(semtype) - {None}:
                if (
                    len(relset_option) > len(best_match)
                    and relset_option < relationships
                ):
                    best_match = relset_option
            if not best_match:
                if None in semtype:
                    best_match = None
                else:
                    return None
            lookup = semtype[best_match]

        return lookup

    def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node["rel"].lower()
        word = node["word"].lower()

        if rel == "spec":
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES["default"]]
        elif rel in ["nmod", "vmod"]:
            # Modifiers may match either by POS tag or by relation name.
            return [node["tag"], rel]
        else:
            return [node["tag"]]

    def get_glueformulas_from_semtype_entry(
        self, lookup, word, node, depgraph, counter
    ):
        """Instantiate each (meaning, glue) template in ``lookup`` for
        ``word``, labeling the glue side relative to the node's position in
        the dependency graph."""
        glueformulas = []

        glueFormulaFactory = self.get_GlueFormula_factory()
        for meaning, glue in lookup:
            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
            # First formula is named after the word; later ones get a
            # numeric suffix ('word2', 'word3', ...).
            if not len(glueformulas):
                gf.word = word
            else:
                gf.word = f"{word}{len(glueformulas) + 1}"

            gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())

            glueformulas.append(gf)
        return glueformulas

    def get_meaning_formula(self, generic, word):
        """
        :param generic: A meaning formula string containing the
            parameter "<word>"
        :param word: The actual word to be replace "<word>"
        """
        word = word.replace(".", "")
        return generic.replace("<word>", word)

    def initialize_labels(self, expr, node, depgraph, unique_index):
        """Recursively replace template label names in a glue expression
        with concrete labels derived from the dependency graph.  Uppercase
        names become linear-logic variables, lowercase become constants."""
        if isinstance(expr, linearlogic.AtomicExpression):
            name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
            if name[0].isupper():
                return linearlogic.VariableExpression(name)
            else:
                return linearlogic.ConstantExpression(name)
        else:
            return linearlogic.ImpExpression(
                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
                self.initialize_labels(expr.consequent, node, depgraph, unique_index),
            )

    def find_label_name(self, name, node, depgraph, unique_index):
        """Resolve a (possibly dotted) template label name relative to
        ``node``: 'super' moves to the head node, any other prefix moves to
        the unique dependent with that relation."""
        try:
            dot = name.index(".")

            before_dot = name[:dot]
            after_dot = name[dot + 1 :]
            if before_dot == "super":
                return self.find_label_name(
                    after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
                )
            else:
                return self.find_label_name(
                    after_dot,
                    self.lookup_unique(before_dot, node, depgraph),
                    depgraph,
                    unique_index,
                )
        except ValueError:
            # No dot: 'name' is a simple label keyword.
            lbl = self.get_label(node)
            if name == "f":
                return lbl
            elif name == "v":
                return "%sv" % lbl
            elif name == "r":
                return "%sr" % lbl
            elif name == "super":
                return self.get_label(depgraph.nodes[node["head"]])
            elif name == "var":
                return f"{lbl.upper()}{unique_index}"
            elif name == "a":
                return self.get_label(self.lookup_unique("conja", node, depgraph))
            elif name == "b":
                return self.get_label(self.lookup_unique("conjb", node, depgraph))
            else:
                return self.get_label(self.lookup_unique(name, node, depgraph))

    def get_label(self, node):
        """
        Pick an alphabetic character as identifier for an entity in the model.

        :param value: where to index into the list of characters
        :type value: int

        NOTE(review): the index ``value - 1`` is not taken modulo 26, so a
        node address greater than 26 raises IndexError — confirm intended
        input range.
        """
        value = node["address"]

        letter = [
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
            "v",
            "w",
            "x",
            "y",
            "z",
            "a",
            "b",
            "c",
            "d",
            "e",
        ][value - 1]
        num = int(value) // 26
        if num > 0:
            return letter + str(num)
        else:
            return letter

    def lookup_unique(self, rel, node, depgraph):
        """
        Lookup 'key'. There should be exactly one item in the associated relation.

        :raise KeyError: if the node has zero or more than one dependent
            with relation ``rel``
        """
        deps = [
            depgraph.nodes[dep]
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() == rel.lower()
        ]

        if len(deps) == 0:
            raise KeyError(
                "'{}' doesn't contain a feature '{}'".format(node["word"], rel)
            )
        elif len(deps) > 1:
            raise KeyError(
                "'{}' should only have one feature '{}'".format(node["word"], rel)
            )
        else:
            return deps[0]

    def get_GlueFormula_factory(self):
        # Factory hook; subclasses (e.g. DRT glue) return their own
        # GlueFormula subclass.
        return GlueFormula
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
class Glue:
|
| 541 |
+
def __init__(
|
| 542 |
+
self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
|
| 543 |
+
):
|
| 544 |
+
self.verbose = verbose
|
| 545 |
+
self.remove_duplicates = remove_duplicates
|
| 546 |
+
self.depparser = depparser
|
| 547 |
+
|
| 548 |
+
from nltk import Prover9
|
| 549 |
+
|
| 550 |
+
self.prover = Prover9()
|
| 551 |
+
|
| 552 |
+
if semtype_file:
|
| 553 |
+
self.semtype_file = semtype_file
|
| 554 |
+
else:
|
| 555 |
+
self.semtype_file = os.path.join(
|
| 556 |
+
"grammars", "sample_grammars", "glue.semtype"
|
| 557 |
+
)
|
| 558 |
+
|
| 559 |
+
def train_depparser(self, depgraphs=None):
|
| 560 |
+
if depgraphs:
|
| 561 |
+
self.depparser.train(depgraphs)
|
| 562 |
+
else:
|
| 563 |
+
self.depparser.train_from_file(
|
| 564 |
+
nltk.data.find(
|
| 565 |
+
os.path.join("grammars", "sample_grammars", "glue_train.conll")
|
| 566 |
+
)
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
def parse_to_meaning(self, sentence):
|
| 570 |
+
readings = []
|
| 571 |
+
for agenda in self.parse_to_compiled(sentence):
|
| 572 |
+
readings.extend(self.get_readings(agenda))
|
| 573 |
+
return readings
|
| 574 |
+
|
| 575 |
+
def get_readings(self, agenda):
|
| 576 |
+
readings = []
|
| 577 |
+
agenda_length = len(agenda)
|
| 578 |
+
atomics = dict()
|
| 579 |
+
nonatomics = dict()
|
| 580 |
+
while agenda: # is not empty
|
| 581 |
+
cur = agenda.pop()
|
| 582 |
+
glue_simp = cur.glue.simplify()
|
| 583 |
+
if isinstance(
|
| 584 |
+
glue_simp, linearlogic.ImpExpression
|
| 585 |
+
): # if cur.glue is non-atomic
|
| 586 |
+
for key in atomics:
|
| 587 |
+
try:
|
| 588 |
+
if isinstance(cur.glue, linearlogic.ApplicationExpression):
|
| 589 |
+
bindings = cur.glue.bindings
|
| 590 |
+
else:
|
| 591 |
+
bindings = linearlogic.BindingDict()
|
| 592 |
+
glue_simp.antecedent.unify(key, bindings)
|
| 593 |
+
for atomic in atomics[key]:
|
| 594 |
+
if not (
|
| 595 |
+
cur.indices & atomic.indices
|
| 596 |
+
): # if the sets of indices are disjoint
|
| 597 |
+
try:
|
| 598 |
+
agenda.append(cur.applyto(atomic))
|
| 599 |
+
except linearlogic.LinearLogicApplicationException:
|
| 600 |
+
pass
|
| 601 |
+
except linearlogic.UnificationException:
|
| 602 |
+
pass
|
| 603 |
+
try:
|
| 604 |
+
nonatomics[glue_simp.antecedent].append(cur)
|
| 605 |
+
except KeyError:
|
| 606 |
+
nonatomics[glue_simp.antecedent] = [cur]
|
| 607 |
+
|
| 608 |
+
else: # else cur.glue is atomic
|
| 609 |
+
for key in nonatomics:
|
| 610 |
+
for nonatomic in nonatomics[key]:
|
| 611 |
+
try:
|
| 612 |
+
if isinstance(
|
| 613 |
+
nonatomic.glue, linearlogic.ApplicationExpression
|
| 614 |
+
):
|
| 615 |
+
bindings = nonatomic.glue.bindings
|
| 616 |
+
else:
|
| 617 |
+
bindings = linearlogic.BindingDict()
|
| 618 |
+
glue_simp.unify(key, bindings)
|
| 619 |
+
if not (
|
| 620 |
+
cur.indices & nonatomic.indices
|
| 621 |
+
): # if the sets of indices are disjoint
|
| 622 |
+
try:
|
| 623 |
+
agenda.append(nonatomic.applyto(cur))
|
| 624 |
+
except linearlogic.LinearLogicApplicationException:
|
| 625 |
+
pass
|
| 626 |
+
except linearlogic.UnificationException:
|
| 627 |
+
pass
|
| 628 |
+
try:
|
| 629 |
+
atomics[glue_simp].append(cur)
|
| 630 |
+
except KeyError:
|
| 631 |
+
atomics[glue_simp] = [cur]
|
| 632 |
+
|
| 633 |
+
for entry in atomics:
|
| 634 |
+
for gf in atomics[entry]:
|
| 635 |
+
if len(gf.indices) == agenda_length:
|
| 636 |
+
self._add_to_reading_list(gf, readings)
|
| 637 |
+
for entry in nonatomics:
|
| 638 |
+
for gf in nonatomics[entry]:
|
| 639 |
+
if len(gf.indices) == agenda_length:
|
| 640 |
+
self._add_to_reading_list(gf, readings)
|
| 641 |
+
return readings
|
| 642 |
+
|
| 643 |
+
def _add_to_reading_list(self, glueformula, reading_list):
|
| 644 |
+
add_reading = True
|
| 645 |
+
if self.remove_duplicates:
|
| 646 |
+
for reading in reading_list:
|
| 647 |
+
try:
|
| 648 |
+
if reading.equiv(glueformula.meaning, self.prover):
|
| 649 |
+
add_reading = False
|
| 650 |
+
break
|
| 651 |
+
except Exception as e:
|
| 652 |
+
# if there is an exception, the syntax of the formula
|
| 653 |
+
# may not be understandable by the prover, so don't
|
| 654 |
+
# throw out the reading.
|
| 655 |
+
print("Error when checking logical equality of statements", e)
|
| 656 |
+
|
| 657 |
+
if add_reading:
|
| 658 |
+
reading_list.append(glueformula.meaning)
|
| 659 |
+
|
| 660 |
+
def parse_to_compiled(self, sentence):
|
| 661 |
+
gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
|
| 662 |
+
return [self.gfl_to_compiled(gfl) for gfl in gfls]
|
| 663 |
+
|
| 664 |
+
def dep_parse(self, sentence):
|
| 665 |
+
"""
|
| 666 |
+
Return a dependency graph for the sentence.
|
| 667 |
+
|
| 668 |
+
:param sentence: the sentence to be parsed
|
| 669 |
+
:type sentence: list(str)
|
| 670 |
+
:rtype: DependencyGraph
|
| 671 |
+
"""
|
| 672 |
+
|
| 673 |
+
# Lazy-initialize the depparser
|
| 674 |
+
if self.depparser is None:
|
| 675 |
+
from nltk.parse import MaltParser
|
| 676 |
+
|
| 677 |
+
self.depparser = MaltParser(tagger=self.get_pos_tagger())
|
| 678 |
+
if not self.depparser._trained:
|
| 679 |
+
self.train_depparser()
|
| 680 |
+
return self.depparser.parse(sentence, verbose=self.verbose)
|
| 681 |
+
|
| 682 |
+
def depgraph_to_glue(self, depgraph):
|
| 683 |
+
return self.get_glue_dict().to_glueformula_list(depgraph)
|
| 684 |
+
|
| 685 |
+
def get_glue_dict(self):
|
| 686 |
+
return GlueDict(self.semtype_file)
|
| 687 |
+
|
| 688 |
+
def gfl_to_compiled(self, gfl):
|
| 689 |
+
index_counter = Counter()
|
| 690 |
+
return_list = []
|
| 691 |
+
for gf in gfl:
|
| 692 |
+
return_list.extend(gf.compile(index_counter))
|
| 693 |
+
|
| 694 |
+
if self.verbose:
|
| 695 |
+
print("Compiled Glue Premises:")
|
| 696 |
+
for cgf in return_list:
|
| 697 |
+
print(cgf)
|
| 698 |
+
|
| 699 |
+
return return_list
|
| 700 |
+
|
| 701 |
+
def get_pos_tagger(self):
|
| 702 |
+
from nltk.corpus import brown
|
| 703 |
+
|
| 704 |
+
regexp_tagger = RegexpTagger(
|
| 705 |
+
[
|
| 706 |
+
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
|
| 707 |
+
(r"(The|the|A|a|An|an)$", "AT"), # articles
|
| 708 |
+
(r".*able$", "JJ"), # adjectives
|
| 709 |
+
(r".*ness$", "NN"), # nouns formed from adjectives
|
| 710 |
+
(r".*ly$", "RB"), # adverbs
|
| 711 |
+
(r".*s$", "NNS"), # plural nouns
|
| 712 |
+
(r".*ing$", "VBG"), # gerunds
|
| 713 |
+
(r".*ed$", "VBD"), # past tense verbs
|
| 714 |
+
(r".*", "NN"), # nouns (default)
|
| 715 |
+
]
|
| 716 |
+
)
|
| 717 |
+
brown_train = brown.tagged_sents(categories="news")
|
| 718 |
+
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
|
| 719 |
+
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
|
| 720 |
+
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
|
| 721 |
+
|
| 722 |
+
# Override particular words
|
| 723 |
+
main_tagger = RegexpTagger(
|
| 724 |
+
[(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
|
| 725 |
+
backoff=trigram_tagger,
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
return main_tagger
|
| 729 |
+
|
| 730 |
+
|
| 731 |
+
class DrtGlueFormula(GlueFormula):
|
| 732 |
+
def __init__(self, meaning, glue, indices=None):
|
| 733 |
+
if not indices:
|
| 734 |
+
indices = set()
|
| 735 |
+
|
| 736 |
+
if isinstance(meaning, str):
|
| 737 |
+
self.meaning = drt.DrtExpression.fromstring(meaning)
|
| 738 |
+
elif isinstance(meaning, drt.DrtExpression):
|
| 739 |
+
self.meaning = meaning
|
| 740 |
+
else:
|
| 741 |
+
raise RuntimeError(
|
| 742 |
+
"Meaning term neither string or expression: %s, %s"
|
| 743 |
+
% (meaning, meaning.__class__)
|
| 744 |
+
)
|
| 745 |
+
|
| 746 |
+
if isinstance(glue, str):
|
| 747 |
+
self.glue = linearlogic.LinearLogicParser().parse(glue)
|
| 748 |
+
elif isinstance(glue, linearlogic.Expression):
|
| 749 |
+
self.glue = glue
|
| 750 |
+
else:
|
| 751 |
+
raise RuntimeError(
|
| 752 |
+
"Glue term neither string or expression: %s, %s"
|
| 753 |
+
% (glue, glue.__class__)
|
| 754 |
+
)
|
| 755 |
+
|
| 756 |
+
self.indices = indices
|
| 757 |
+
|
| 758 |
+
def make_VariableExpression(self, name):
|
| 759 |
+
return drt.DrtVariableExpression(name)
|
| 760 |
+
|
| 761 |
+
def make_LambdaExpression(self, variable, term):
|
| 762 |
+
return drt.DrtLambdaExpression(variable, term)
|
| 763 |
+
|
| 764 |
+
|
| 765 |
+
class DrtGlueDict(GlueDict):
|
| 766 |
+
def get_GlueFormula_factory(self):
|
| 767 |
+
return DrtGlueFormula
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
class DrtGlue(Glue):
|
| 771 |
+
def __init__(
|
| 772 |
+
self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
|
| 773 |
+
):
|
| 774 |
+
if not semtype_file:
|
| 775 |
+
semtype_file = os.path.join(
|
| 776 |
+
"grammars", "sample_grammars", "drt_glue.semtype"
|
| 777 |
+
)
|
| 778 |
+
Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
|
| 779 |
+
|
| 780 |
+
def get_glue_dict(self):
|
| 781 |
+
return DrtGlueDict(self.semtype_file)
|
| 782 |
+
|
| 783 |
+
|
| 784 |
+
def demo(show_example=-1):
|
| 785 |
+
from nltk.parse import MaltParser
|
| 786 |
+
|
| 787 |
+
examples = [
|
| 788 |
+
"David sees Mary",
|
| 789 |
+
"David eats a sandwich",
|
| 790 |
+
"every man chases a dog",
|
| 791 |
+
"every man believes a dog sleeps",
|
| 792 |
+
"John gives David a sandwich",
|
| 793 |
+
"John chases himself",
|
| 794 |
+
]
|
| 795 |
+
# 'John persuades David to order a pizza',
|
| 796 |
+
# 'John tries to go',
|
| 797 |
+
# 'John tries to find a unicorn',
|
| 798 |
+
# 'John seems to vanish',
|
| 799 |
+
# 'a unicorn seems to approach',
|
| 800 |
+
# 'every big cat leaves',
|
| 801 |
+
# 'every gray cat leaves',
|
| 802 |
+
# 'every big gray cat leaves',
|
| 803 |
+
# 'a former senator leaves',
|
| 804 |
+
|
| 805 |
+
print("============== DEMO ==============")
|
| 806 |
+
|
| 807 |
+
tagger = RegexpTagger(
|
| 808 |
+
[
|
| 809 |
+
("^(David|Mary|John)$", "NNP"),
|
| 810 |
+
(
|
| 811 |
+
"^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
|
| 812 |
+
"VB",
|
| 813 |
+
),
|
| 814 |
+
("^(go|order|vanish|find|approach)$", "VB"),
|
| 815 |
+
("^(a)$", "ex_quant"),
|
| 816 |
+
("^(every)$", "univ_quant"),
|
| 817 |
+
("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
|
| 818 |
+
("^(big|gray|former)$", "JJ"),
|
| 819 |
+
("^(him|himself)$", "PRP"),
|
| 820 |
+
]
|
| 821 |
+
)
|
| 822 |
+
|
| 823 |
+
depparser = MaltParser(tagger=tagger)
|
| 824 |
+
glue = Glue(depparser=depparser, verbose=False)
|
| 825 |
+
|
| 826 |
+
for (i, sentence) in enumerate(examples):
|
| 827 |
+
if i == show_example or show_example == -1:
|
| 828 |
+
print(f"[[[Example {i}]]] {sentence}")
|
| 829 |
+
for reading in glue.parse_to_meaning(sentence.split()):
|
| 830 |
+
print(reading.simplify())
|
| 831 |
+
print("")
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
if __name__ == "__main__":
|
| 835 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Logic
|
| 2 |
+
#
|
| 3 |
+
# Author: Peter Wang
|
| 4 |
+
# Updated by: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
An implementation of the Hole Semantics model, following Blackburn and Bos,
|
| 12 |
+
Representation and Inference for Natural Language (CSLI, 2005).
|
| 13 |
+
|
| 14 |
+
The semantic representations are built by the grammar hole.fcfg.
|
| 15 |
+
This module contains driver code to read in sentences and parse them
|
| 16 |
+
according to a hole semantics grammar.
|
| 17 |
+
|
| 18 |
+
After parsing, the semantic representation is in the form of an underspecified
|
| 19 |
+
representation that is not easy to read. We use a "plugging" algorithm to
|
| 20 |
+
convert that representation into first-order logic formulas.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from functools import reduce
|
| 24 |
+
|
| 25 |
+
from nltk.parse import load_parser
|
| 26 |
+
from nltk.sem.logic import (
|
| 27 |
+
AllExpression,
|
| 28 |
+
AndExpression,
|
| 29 |
+
ApplicationExpression,
|
| 30 |
+
ExistsExpression,
|
| 31 |
+
IffExpression,
|
| 32 |
+
ImpExpression,
|
| 33 |
+
LambdaExpression,
|
| 34 |
+
NegatedExpression,
|
| 35 |
+
OrExpression,
|
| 36 |
+
)
|
| 37 |
+
from nltk.sem.skolemize import skolemize
|
| 38 |
+
|
| 39 |
+
# Note that in this code there may be multiple types of trees being referred to:
|
| 40 |
+
#
|
| 41 |
+
# 1. parse trees
|
| 42 |
+
# 2. the underspecified representation
|
| 43 |
+
# 3. first-order logic formula trees
|
| 44 |
+
# 4. the search space when plugging (search tree)
|
| 45 |
+
#
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Constants:
|
| 49 |
+
ALL = "ALL"
|
| 50 |
+
EXISTS = "EXISTS"
|
| 51 |
+
NOT = "NOT"
|
| 52 |
+
AND = "AND"
|
| 53 |
+
OR = "OR"
|
| 54 |
+
IMP = "IMP"
|
| 55 |
+
IFF = "IFF"
|
| 56 |
+
PRED = "PRED"
|
| 57 |
+
LEQ = "LEQ"
|
| 58 |
+
HOLE = "HOLE"
|
| 59 |
+
LABEL = "LABEL"
|
| 60 |
+
|
| 61 |
+
MAP = {
|
| 62 |
+
ALL: lambda v, e: AllExpression(v.variable, e),
|
| 63 |
+
EXISTS: lambda v, e: ExistsExpression(v.variable, e),
|
| 64 |
+
NOT: NegatedExpression,
|
| 65 |
+
AND: AndExpression,
|
| 66 |
+
OR: OrExpression,
|
| 67 |
+
IMP: ImpExpression,
|
| 68 |
+
IFF: IffExpression,
|
| 69 |
+
PRED: ApplicationExpression,
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class HoleSemantics:
|
| 74 |
+
"""
|
| 75 |
+
This class holds the broken-down components of a hole semantics, i.e. it
|
| 76 |
+
extracts the holes, labels, logic formula fragments and constraints out of
|
| 77 |
+
a big conjunction of such as produced by the hole semantics grammar. It
|
| 78 |
+
then provides some operations on the semantics dealing with holes, labels
|
| 79 |
+
and finding legal ways to plug holes with labels.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
def __init__(self, usr):
|
| 83 |
+
"""
|
| 84 |
+
Constructor. `usr' is a ``sem.Expression`` representing an
|
| 85 |
+
Underspecified Representation Structure (USR). A USR has the following
|
| 86 |
+
special predicates:
|
| 87 |
+
ALL(l,v,n),
|
| 88 |
+
EXISTS(l,v,n),
|
| 89 |
+
AND(l,n,n),
|
| 90 |
+
OR(l,n,n),
|
| 91 |
+
IMP(l,n,n),
|
| 92 |
+
IFF(l,n,n),
|
| 93 |
+
PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions,
|
| 94 |
+
LEQ(n,n),
|
| 95 |
+
HOLE(n),
|
| 96 |
+
LABEL(n)
|
| 97 |
+
where l is the label of the node described by the predicate, n is either
|
| 98 |
+
a label or a hole, and v is a variable.
|
| 99 |
+
"""
|
| 100 |
+
self.holes = set()
|
| 101 |
+
self.labels = set()
|
| 102 |
+
self.fragments = {} # mapping of label -> formula fragment
|
| 103 |
+
self.constraints = set() # set of Constraints
|
| 104 |
+
self._break_down(usr)
|
| 105 |
+
self.top_most_labels = self._find_top_most_labels()
|
| 106 |
+
self.top_hole = self._find_top_hole()
|
| 107 |
+
|
| 108 |
+
def is_node(self, x):
|
| 109 |
+
"""
|
| 110 |
+
Return true if x is a node (label or hole) in this semantic
|
| 111 |
+
representation.
|
| 112 |
+
"""
|
| 113 |
+
return x in (self.labels | self.holes)
|
| 114 |
+
|
| 115 |
+
def _break_down(self, usr):
|
| 116 |
+
"""
|
| 117 |
+
Extract holes, labels, formula fragments and constraints from the hole
|
| 118 |
+
semantics underspecified representation (USR).
|
| 119 |
+
"""
|
| 120 |
+
if isinstance(usr, AndExpression):
|
| 121 |
+
self._break_down(usr.first)
|
| 122 |
+
self._break_down(usr.second)
|
| 123 |
+
elif isinstance(usr, ApplicationExpression):
|
| 124 |
+
func, args = usr.uncurry()
|
| 125 |
+
if func.variable.name == Constants.LEQ:
|
| 126 |
+
self.constraints.add(Constraint(args[0], args[1]))
|
| 127 |
+
elif func.variable.name == Constants.HOLE:
|
| 128 |
+
self.holes.add(args[0])
|
| 129 |
+
elif func.variable.name == Constants.LABEL:
|
| 130 |
+
self.labels.add(args[0])
|
| 131 |
+
else:
|
| 132 |
+
label = args[0]
|
| 133 |
+
assert label not in self.fragments
|
| 134 |
+
self.fragments[label] = (func, args[1:])
|
| 135 |
+
else:
|
| 136 |
+
raise ValueError(usr.label())
|
| 137 |
+
|
| 138 |
+
def _find_top_nodes(self, node_list):
|
| 139 |
+
top_nodes = node_list.copy()
|
| 140 |
+
for f in self.fragments.values():
|
| 141 |
+
# the label is the first argument of the predicate
|
| 142 |
+
args = f[1]
|
| 143 |
+
for arg in args:
|
| 144 |
+
if arg in node_list:
|
| 145 |
+
top_nodes.discard(arg)
|
| 146 |
+
return top_nodes
|
| 147 |
+
|
| 148 |
+
def _find_top_most_labels(self):
|
| 149 |
+
"""
|
| 150 |
+
Return the set of labels which are not referenced directly as part of
|
| 151 |
+
another formula fragment. These will be the top-most labels for the
|
| 152 |
+
subtree that they are part of.
|
| 153 |
+
"""
|
| 154 |
+
return self._find_top_nodes(self.labels)
|
| 155 |
+
|
| 156 |
+
def _find_top_hole(self):
|
| 157 |
+
"""
|
| 158 |
+
Return the hole that will be the top of the formula tree.
|
| 159 |
+
"""
|
| 160 |
+
top_holes = self._find_top_nodes(self.holes)
|
| 161 |
+
assert len(top_holes) == 1 # it must be unique
|
| 162 |
+
return top_holes.pop()
|
| 163 |
+
|
| 164 |
+
def pluggings(self):
|
| 165 |
+
"""
|
| 166 |
+
Calculate and return all the legal pluggings (mappings of labels to
|
| 167 |
+
holes) of this semantics given the constraints.
|
| 168 |
+
"""
|
| 169 |
+
record = []
|
| 170 |
+
self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record)
|
| 171 |
+
return record
|
| 172 |
+
|
| 173 |
+
def _plug_nodes(self, queue, potential_labels, plug_acc, record):
|
| 174 |
+
"""
|
| 175 |
+
Plug the nodes in `queue' with the labels in `potential_labels'.
|
| 176 |
+
|
| 177 |
+
Each element of `queue' is a tuple of the node to plug and the list of
|
| 178 |
+
ancestor holes from the root of the graph to that node.
|
| 179 |
+
|
| 180 |
+
`potential_labels' is a set of the labels which are still available for
|
| 181 |
+
plugging.
|
| 182 |
+
|
| 183 |
+
`plug_acc' is the incomplete mapping of holes to labels made on the
|
| 184 |
+
current branch of the search tree so far.
|
| 185 |
+
|
| 186 |
+
`record' is a list of all the complete pluggings that we have found in
|
| 187 |
+
total so far. It is the only parameter that is destructively updated.
|
| 188 |
+
"""
|
| 189 |
+
if queue != []:
|
| 190 |
+
(node, ancestors) = queue[0]
|
| 191 |
+
if node in self.holes:
|
| 192 |
+
# The node is a hole, try to plug it.
|
| 193 |
+
self._plug_hole(
|
| 194 |
+
node, ancestors, queue[1:], potential_labels, plug_acc, record
|
| 195 |
+
)
|
| 196 |
+
else:
|
| 197 |
+
assert node in self.labels
|
| 198 |
+
# The node is a label. Replace it in the queue by the holes and
|
| 199 |
+
# labels in the formula fragment named by that label.
|
| 200 |
+
args = self.fragments[node][1]
|
| 201 |
+
head = [(a, ancestors) for a in args if self.is_node(a)]
|
| 202 |
+
self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
|
| 203 |
+
else:
|
| 204 |
+
raise Exception("queue empty")
|
| 205 |
+
|
| 206 |
+
def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
|
| 207 |
+
"""
|
| 208 |
+
Try all possible ways of plugging a single hole.
|
| 209 |
+
See _plug_nodes for the meanings of the parameters.
|
| 210 |
+
"""
|
| 211 |
+
# Add the current hole we're trying to plug into the list of ancestors.
|
| 212 |
+
assert hole not in ancestors0
|
| 213 |
+
ancestors = [hole] + ancestors0
|
| 214 |
+
|
| 215 |
+
# Try each potential label in this hole in turn.
|
| 216 |
+
for l in potential_labels0:
|
| 217 |
+
# Is the label valid in this hole?
|
| 218 |
+
if self._violates_constraints(l, ancestors):
|
| 219 |
+
continue
|
| 220 |
+
|
| 221 |
+
plug_acc = plug_acc0.copy()
|
| 222 |
+
plug_acc[hole] = l
|
| 223 |
+
potential_labels = potential_labels0.copy()
|
| 224 |
+
potential_labels.remove(l)
|
| 225 |
+
|
| 226 |
+
if len(potential_labels) == 0:
|
| 227 |
+
# No more potential labels. That must mean all the holes have
|
| 228 |
+
# been filled so we have found a legal plugging so remember it.
|
| 229 |
+
#
|
| 230 |
+
# Note that the queue might not be empty because there might
|
| 231 |
+
# be labels on there that point to formula fragments with
|
| 232 |
+
# no holes in them. _sanity_check_plugging will make sure
|
| 233 |
+
# all holes are filled.
|
| 234 |
+
self._sanity_check_plugging(plug_acc, self.top_hole, [])
|
| 235 |
+
record.append(plug_acc)
|
| 236 |
+
else:
|
| 237 |
+
# Recursively try to fill in the rest of the holes in the
|
| 238 |
+
# queue. The label we just plugged into the hole could have
|
| 239 |
+
# holes of its own so at the end of the queue. Putting it on
|
| 240 |
+
# the end of the queue gives us a breadth-first search, so that
|
| 241 |
+
# all the holes at level i of the formula tree are filled
|
| 242 |
+
# before filling level i+1.
|
| 243 |
+
# A depth-first search would work as well since the trees must
|
| 244 |
+
# be finite but the bookkeeping would be harder.
|
| 245 |
+
self._plug_nodes(
|
| 246 |
+
queue + [(l, ancestors)], potential_labels, plug_acc, record
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
def _violates_constraints(self, label, ancestors):
|
| 250 |
+
"""
|
| 251 |
+
Return True if the `label' cannot be placed underneath the holes given
|
| 252 |
+
by the set `ancestors' because it would violate the constraints imposed
|
| 253 |
+
on it.
|
| 254 |
+
"""
|
| 255 |
+
for c in self.constraints:
|
| 256 |
+
if c.lhs == label:
|
| 257 |
+
if c.rhs not in ancestors:
|
| 258 |
+
return True
|
| 259 |
+
return False
|
| 260 |
+
|
| 261 |
+
def _sanity_check_plugging(self, plugging, node, ancestors):
|
| 262 |
+
"""
|
| 263 |
+
Make sure that a given plugging is legal. We recursively go through
|
| 264 |
+
each node and make sure that no constraints are violated.
|
| 265 |
+
We also check that all holes have been filled.
|
| 266 |
+
"""
|
| 267 |
+
if node in self.holes:
|
| 268 |
+
ancestors = [node] + ancestors
|
| 269 |
+
label = plugging[node]
|
| 270 |
+
else:
|
| 271 |
+
label = node
|
| 272 |
+
assert label in self.labels
|
| 273 |
+
for c in self.constraints:
|
| 274 |
+
if c.lhs == label:
|
| 275 |
+
assert c.rhs in ancestors
|
| 276 |
+
args = self.fragments[label][1]
|
| 277 |
+
for arg in args:
|
| 278 |
+
if self.is_node(arg):
|
| 279 |
+
self._sanity_check_plugging(plugging, arg, [label] + ancestors)
|
| 280 |
+
|
| 281 |
+
def formula_tree(self, plugging):
|
| 282 |
+
"""
|
| 283 |
+
Return the first-order logic formula tree for this underspecified
|
| 284 |
+
representation using the plugging given.
|
| 285 |
+
"""
|
| 286 |
+
return self._formula_tree(plugging, self.top_hole)
|
| 287 |
+
|
| 288 |
+
def _formula_tree(self, plugging, node):
|
| 289 |
+
if node in plugging:
|
| 290 |
+
return self._formula_tree(plugging, plugging[node])
|
| 291 |
+
elif node in self.fragments:
|
| 292 |
+
pred, args = self.fragments[node]
|
| 293 |
+
children = [self._formula_tree(plugging, arg) for arg in args]
|
| 294 |
+
return reduce(Constants.MAP[pred.variable.name], children)
|
| 295 |
+
else:
|
| 296 |
+
return node
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class Constraint:
|
| 300 |
+
"""
|
| 301 |
+
This class represents a constraint of the form (L =< N),
|
| 302 |
+
where L is a label and N is a node (a label or a hole).
|
| 303 |
+
"""
|
| 304 |
+
|
| 305 |
+
def __init__(self, lhs, rhs):
|
| 306 |
+
self.lhs = lhs
|
| 307 |
+
self.rhs = rhs
|
| 308 |
+
|
| 309 |
+
def __eq__(self, other):
|
| 310 |
+
if self.__class__ == other.__class__:
|
| 311 |
+
return self.lhs == other.lhs and self.rhs == other.rhs
|
| 312 |
+
else:
|
| 313 |
+
return False
|
| 314 |
+
|
| 315 |
+
def __ne__(self, other):
|
| 316 |
+
return not (self == other)
|
| 317 |
+
|
| 318 |
+
def __hash__(self):
|
| 319 |
+
return hash(repr(self))
|
| 320 |
+
|
| 321 |
+
def __repr__(self):
|
| 322 |
+
return f"({self.lhs} < {self.rhs})"
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def hole_readings(sentence, grammar_filename=None, verbose=False):
|
| 326 |
+
if not grammar_filename:
|
| 327 |
+
grammar_filename = "grammars/sample_grammars/hole.fcfg"
|
| 328 |
+
|
| 329 |
+
if verbose:
|
| 330 |
+
print("Reading grammar file", grammar_filename)
|
| 331 |
+
|
| 332 |
+
parser = load_parser(grammar_filename)
|
| 333 |
+
|
| 334 |
+
# Parse the sentence.
|
| 335 |
+
tokens = sentence.split()
|
| 336 |
+
trees = list(parser.parse(tokens))
|
| 337 |
+
if verbose:
|
| 338 |
+
print("Got %d different parses" % len(trees))
|
| 339 |
+
|
| 340 |
+
all_readings = []
|
| 341 |
+
for tree in trees:
|
| 342 |
+
# Get the semantic feature from the top of the parse tree.
|
| 343 |
+
sem = tree.label()["SEM"].simplify()
|
| 344 |
+
|
| 345 |
+
# Print the raw semantic representation.
|
| 346 |
+
if verbose:
|
| 347 |
+
print("Raw: ", sem)
|
| 348 |
+
|
| 349 |
+
# Skolemize away all quantifiers. All variables become unique.
|
| 350 |
+
while isinstance(sem, LambdaExpression):
|
| 351 |
+
sem = sem.term
|
| 352 |
+
skolemized = skolemize(sem)
|
| 353 |
+
|
| 354 |
+
if verbose:
|
| 355 |
+
print("Skolemized:", skolemized)
|
| 356 |
+
|
| 357 |
+
# Break the hole semantics representation down into its components
|
| 358 |
+
# i.e. holes, labels, formula fragments and constraints.
|
| 359 |
+
hole_sem = HoleSemantics(skolemized)
|
| 360 |
+
|
| 361 |
+
# Maybe show the details of the semantic representation.
|
| 362 |
+
if verbose:
|
| 363 |
+
print("Holes: ", hole_sem.holes)
|
| 364 |
+
print("Labels: ", hole_sem.labels)
|
| 365 |
+
print("Constraints: ", hole_sem.constraints)
|
| 366 |
+
print("Top hole: ", hole_sem.top_hole)
|
| 367 |
+
print("Top labels: ", hole_sem.top_most_labels)
|
| 368 |
+
print("Fragments:")
|
| 369 |
+
for l, f in hole_sem.fragments.items():
|
| 370 |
+
print(f"\t{l}: {f}")
|
| 371 |
+
|
| 372 |
+
# Find all the possible ways to plug the formulas together.
|
| 373 |
+
pluggings = hole_sem.pluggings()
|
| 374 |
+
|
| 375 |
+
# Build FOL formula trees using the pluggings.
|
| 376 |
+
readings = list(map(hole_sem.formula_tree, pluggings))
|
| 377 |
+
|
| 378 |
+
# Print out the formulas in a textual format.
|
| 379 |
+
if verbose:
|
| 380 |
+
for i, r in enumerate(readings):
|
| 381 |
+
print()
|
| 382 |
+
print("%d. %s" % (i, r))
|
| 383 |
+
print()
|
| 384 |
+
|
| 385 |
+
all_readings.extend(readings)
|
| 386 |
+
|
| 387 |
+
return all_readings
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
if __name__ == "__main__":
|
| 391 |
+
for r in hole_readings("a dog barks"):
|
| 392 |
+
print(r)
|
| 393 |
+
print()
|
| 394 |
+
for r in hole_readings("every girl chases a dog"):
|
| 395 |
+
print(r)
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
NLTK Stemmers
|
| 12 |
+
|
| 13 |
+
Interfaces used to remove morphological affixes from words, leaving
|
| 14 |
+
only the word stem. Stemming algorithms aim to remove those affixes
|
| 15 |
+
required for eg. grammatical role, tense, derivational morphology
|
| 16 |
+
leaving only the stem of the word. This is a difficult problem due to
|
| 17 |
+
irregular words (eg. common verbs in English), complicated
|
| 18 |
+
morphological rules, and part-of-speech and sense ambiguities
|
| 19 |
+
(eg. ``ceil-`` is not the stem of ``ceiling``).
|
| 20 |
+
|
| 21 |
+
StemmerI defines a standard interface for stemmers.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from nltk.stem.api import StemmerI
|
| 25 |
+
from nltk.stem.arlstem import ARLSTem
|
| 26 |
+
from nltk.stem.arlstem2 import ARLSTem2
|
| 27 |
+
from nltk.stem.cistem import Cistem
|
| 28 |
+
from nltk.stem.isri import ISRIStemmer
|
| 29 |
+
from nltk.stem.lancaster import LancasterStemmer
|
| 30 |
+
from nltk.stem.porter import PorterStemmer
|
| 31 |
+
from nltk.stem.regexp import RegexpStemmer
|
| 32 |
+
from nltk.stem.rslp import RSLPStemmer
|
| 33 |
+
from nltk.stem.snowball import SnowballStemmer
|
| 34 |
+
from nltk.stem.wordnet import WordNetLemmatizer
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmer Interface
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
from abc import ABCMeta, abstractmethod
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class StemmerI(metaclass=ABCMeta):
|
| 14 |
+
"""
|
| 15 |
+
A processing interface for removing morphological affixes from
|
| 16 |
+
words. This process is known as stemming.
|
| 17 |
+
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
@abstractmethod
|
| 21 |
+
def stem(self, token):
|
| 22 |
+
"""
|
| 23 |
+
Strip affixes from the token and return the stem.
|
| 24 |
+
|
| 25 |
+
:param token: The token that should be stemmed.
|
| 26 |
+
:type token: str
|
| 27 |
+
"""
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
|
| 10 |
+
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
|
| 11 |
+
"""
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
+
from nltk.stem.api import StemmerI
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

    >>> from nltk.stem.lancaster import LancasterStemmer
    >>> st = LancasterStemmer()
    >>> st.stem('maximum')     # Remove "-um" when word is intact
    'maxim'
    >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
    'presum'
    >>> st.stem('multiply')    # No action taken if word ends with "-ply"
    'multiply'
    >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
    'provid'
    >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
    'ow'
    >>> st.stem('ear')         # ditto
    'ear'
    >>> st.stem('saying')      # Words starting with consonant must contain at least 3
    'say'
    >>> st.stem('crying')      # letters and one of those letters must be a vowel
    'cry'
    >>> st.stem('string')      # ditto
    'string'
    >>> st.stem('meant')       # ditto
    'meant'
    >>> st.stem('cement')      # ditto
    'cem'
    >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
    >>> st_pre.stem('kilometer') # Test Prefix
    'met'
    >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
    >>> st_custom.stem("ness") # Change s to t
    'nest'
    """

    # The rule list is static since it doesn't change between instances.
    # Each rule is encoded as: <reversed ending><optional '*'><digits to
    # remove><letters to append><'>' continue | '.' stop>.  '*' means the
    # rule only fires when the word is still intact (unstemmed).
    default_rule_tuple = (
        "ai*2.",  # -ia > - if intact
        "a*1.",  # -a > - if intact
        "bb1.",  # -bb > -b
        "city3s.",  # -ytic > -ys
        "ci2>",  # -ic > -
        "cn1t>",  # -nc > -nt
        "dd1.",  # -dd > -d
        "dei3y>",  # -ied > -y
        "deec2ss.",  # -ceed >", -cess
        "dee1.",  # -eed > -ee
        "de2>",  # -ed > -
        "dooh4>",  # -hood > -
        "e1>",  # -e > -
        "feil1v.",  # -lief > -liev
        "fi2>",  # -if > -
        "gni3>",  # -ing > -
        "gai3y.",  # -iag > -y
        "ga2>",  # -ag > -
        "gg1.",  # -gg > -g
        "ht*2.",  # -th > - if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",  # -ish > -
        "i*1.",  # -i > - if intact
        "i1y>",  # -i > -y
        "ji1d.",  # -ij > -id -- see nois4j> & vis3j>
        "juf1s.",  # -fuj > -fus
        "ju1d.",  # -uj > -ud
        "jo1d.",  # -oj > -od
        "jeh1r.",  # -hej > -her
        "jrev1t.",  # -verj > -vert
        "jsim2t.",  # -misj > -mit
        "jn1d.",  # -nj > -nd
        "j1s.",  # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",  # -iabl > -y
        "lba3>",  # -abl > -
        "lbi3.",  # -ibl > -
        "lib2l>",  # -bil > -bl
        "lc1.",  # -cl > c
        "lufi4y.",  # -iful > -y
        "luf3>",  # -ful > -
        "lu2.",  # -ul > -
        "lai3>",  # -ial > -
        "lau3>",  # -ual > -
        "la2>",  # -al > -
        "ll1.",  # -ll > -l
        "mui3.",  # -ium > -
        "mu*2.",  # -um > - if intact
        "msi3>",  # -ism > -
        "mm1.",  # -mm > -m
        "nois4j>",  # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",  # -ion > -
        "nai3>",  # -ian > -
        "na2>",  # -an > -
        "nee0.",  # protect -een
        "ne2>",  # -en > -
        "nn1.",  # -nn > -n
        "pihs4>",  # -ship > -
        "pp1.",  # -pp > -p
        "re2>",  # -er > -
        "rae0.",  # protect -ear
        "ra2.",  # -ar > -
        "ro2>",  # -or > -
        "ru2>",  # -ur > -
        "rr1.",  # -rr > -r
        "rt1>",  # -tr > -t
        "rei3y>",  # -ier > -y
        "sei3y>",  # -ies > -y
        "sis2.",  # -sis > -s
        "si2>",  # -is > -
        "ssen4>",  # -ness > -
        "ss0.",  # protect -ss
        "suo3>",  # -ous > -
        "su*2.",  # -us > - if intact
        "s*1>",  # -s > - if intact
        "s0.",  # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",  # -at > -
        "tnem4>",  # -ment > -
        "tne3>",  # -ent > -
        "tna3>",  # -ant > -
        "tpir2b.",  # -ript > -rib
        "tpro2b.",  # -orpt > -orb
        "tcud1.",  # -duct > -duc
        "tpmus2.",  # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",  # -olut > -olv
        "tsis0.",  # protect -sist
        "tsi3>",  # -ist > -
        "tt1.",  # -tt > -t
        "uqi3.",  # -iqu > -
        "ugo1.",  # -ogu > -og
        "vis3j>",  # -siv > -j
        "vie0.",  # protect -eiv
        "vi2>",  # -iv > -
        "ylb1>",  # -bly > -bl
        "yli3y>",  # -ily > -y
        "ylp0.",  # protect -ply
        "yl2>",  # -ly > -
        "ygo1.",  # -ogy > -og
        "yhp1.",  # -phy > -ph
        "ymo1.",  # -omy > -om
        "ypo1.",  # -opy > -op
        "yti3>",  # -ity > -
        "yte3>",  # -ety > -
        "ytl2.",  # -lty > -l
        "yrtsi5.",  # -istry > -
        "yra3>",  # -ary > -
        "yro3>",  # -ory > -
        "yfi3.",  # -ify > -
        "ycn2t>",  # -ncy > -nt
        "yca3>",  # -acy > -
        "zi2>",  # -iz > -
        "zy1s.",  # -yz > -ys
    )

    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
        """Create an instance of the Lancaster stemmer.

        :param rule_tuple: optional custom rule set; defaults to
            ``default_rule_tuple`` when omitted.
        :param strip_prefix_flag: when True, strip a known measurement
            prefix (kilo-, micro-, ...) before stemming.
        """
        # Setup an empty rule dictionary - this will be filled in lazily,
        # the first time stem() is called (or by calling parseRules()).
        self.rule_dictionary = {}
        # Check if a user wants to strip prefix
        self._strip_prefix = strip_prefix_flag
        # Check if a user wants to use their own rule tuple.
        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

    def parseRules(self, rule_tuple=None):
        """Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        :raises ValueError: if any rule does not match the expected encoding.
        """
        # If there is no argument for the function, use class' own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
        valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not valid_rule.match(rule):
                raise ValueError(f"The rule {rule} is invalid")
            # Index rules by their first character (the last letter of the
            # ending they match, since endings are stored reversed).
            self.rule_dictionary.setdefault(rule[0], []).append(rule)

    def stem(self, word):
        """Stem a word using the Lancaster stemmer."""
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word, needed by the '*' (intact) rules
        intact_word = word

        # If rule dictionary is empty, parse rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming.

        Repeatedly applies the first matching rule for the word's last
        letter until either a terminating ('.') rule fires or no rule
        applies.
        """
        valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching
            # that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                proceed = False
            else:
                rule_was_applied = False

                # Go through each rule that matches the word's final letter
                for rule in self.rule_dictionary[word[last_letter_position]]:
                    rule_match = valid_rule.match(rule)
                    if not rule_match:
                        continue
                    (
                        ending_string,
                        intact_flag,
                        remove_total,
                        append_string,
                        cont_flag,
                    ) = rule_match.groups()

                    # Convert the number of chars to remove when stemming
                    # from a string to an integer
                    remove_total = int(remove_total)

                    # Rule endings are stored reversed; skip unless the
                    # word's ending matches the rule's word ending.
                    if not word.endswith(ending_string[::-1]):
                        continue

                    # An intact ('*') rule may only fire while the word is
                    # still unmodified.
                    if intact_flag and word != intact_word:
                        continue

                    if self.__isAcceptable(word, remove_total):
                        word = self.__applyRule(word, remove_total, append_string)
                        rule_was_applied = True
                        if cont_flag == ".":
                            # Terminating rule: stop stemming entirely.
                            proceed = False
                        break

                # If no rules apply, the word doesn't need any more stemming
                if not rule_was_applied:
                    proceed = False
        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last alphabetic character in this string.

        Scans from the start and stops at the first non-alphabetic
        character; returns -1 when the word starts with a non-letter.
        """
        last_letter = -1
        for position in range(len(word)):
            if word[position].isalpha():
                last_letter = position
            else:
                break
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming."""
        word_is_acceptable = False
        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            if len(word) - remove_total >= 2:
                word_is_acceptable = True
        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        elif len(word) - remove_total >= 3:
            if word[1] in "aeiouy":
                word_is_acceptable = True
            elif word[2] in "aeiouy":
                word_is_acceptable = True
        return word_is_acceptable

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word."""
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove a known measurement prefix from a word.

        This function originally taken from Whoosh.
        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return "<LancasterStemmer>"
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: RSLP Stemmer
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Tiago Tresoldi <tresoldi@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
# This code is based on the algorithm presented in the paper "A Stemming
|
| 9 |
+
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
|
| 10 |
+
# Christian Huyck, which unfortunately I had no access to. The code is a
|
| 11 |
+
# Python version, with some minor modifications of mine, to the description
|
| 12 |
+
# presented at https://www.webcitation.org/5NnvdIzOb and to the C source code
|
| 13 |
+
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
|
| 14 |
+
# Please note that this stemmer is intended for demonstration and educational
|
| 15 |
+
# purposes only. Feel free to write me for any comments, including the
|
| 16 |
+
# development of a different and/or better stemmer for Portuguese. I also
|
| 17 |
+
# suggest using NLTK's mailing list for Portuguese for any discussion.
|
| 18 |
+
|
| 19 |
+
# Este código é baseado no algoritmo apresentado no artigo "A Stemming
|
| 20 |
+
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
|
| 21 |
+
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
|
| 22 |
+
# código é uma conversão para Python, com algumas pequenas modificações
|
| 23 |
+
# minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do
|
| 24 |
+
# código para linguagem C disponível em
|
| 25 |
+
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
|
| 26 |
+
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
|
| 27 |
+
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
|
| 28 |
+
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
|
| 29 |
+
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
|
| 30 |
+
# do NLTK para o português para qualquer debate.
|
| 31 |
+
|
| 32 |
+
from nltk.data import load
|
| 33 |
+
from nltk.stem.api import StemmerI
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    >>> from nltk.stem import RSLPStemmer
    >>> st = RSLPStemmer()
    >>> # opening lines of Erico Verissimo's "Música ao Longe"
    >>> text = '''
    ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
    ... devem copiar . Uma casinha de porta e janela , em cima duma
    ... coxilha .'''
    >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE
    ...     print(st.stem(token))
    clariss risc com giz no quadro-negr a pais que os alun dev copi .
    uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        # One rule table per algorithm step (0..6), loaded from the
        # packaged nltk data files.
        step_files = (
            "step0.pt",
            "step1.pt",
            "step2.pt",
            "step3.pt",
            "step4.pt",
            "step5.pt",
            "step6.pt",
        )
        self._model = [self.read_rule(name) for name in step_files]

    def read_rule(self, filename):
        """Parse one rule file into a list of
        ``[suffix, min_stem_size, replacement, exceptions]`` entries."""
        raw = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")

        parsed = []
        for line in raw.split("\n"):
            # Skip blank lines and comment lines.
            if line == "" or line[0] == "#":
                continue
            # NOTE: a simple but ugly hack to make this parser happy with
            # double '\t's in the data files.
            fields = line.replace("\t\t", "\t").split("\t")
            parsed.append(
                [
                    # suffix searched for at the end of the word (quotes stripped)
                    fields[0][1:-1],
                    # minimum stem size required to perform the replacement
                    int(fields[1]),
                    # replacement text (quotes stripped)
                    fields[2][1:-1],
                    # whole-word exceptions to this rule (quotes stripped)
                    [token[1:-1] for token in fields[3].split(",")],
                ]
            )
        return parsed

    def stem(self, word):
        """Return the RSLP stem of ``word`` (lower-cased first)."""
        word = word.lower()

        # Plural reduction: only for words ending in 's'.
        if word[-1] == "s":
            word = self.apply_rule(word, 0)

        # Feminine reduction: only for words ending in 'a'.
        if word[-1] == "a":
            word = self.apply_rule(word, 1)

        # Augmentative reduction (step 3 runs before step 2, matching the
        # reference implementation).
        word = self.apply_rule(word, 3)

        # Adverb reduction.
        word = self.apply_rule(word, 2)

        # Noun reduction; only fall through to verb reduction, and then to
        # vowel removal, while the previous step changed nothing.
        prev_word = word
        word = self.apply_rule(word, 4)
        if word == prev_word:
            prev_word = word
            word = self.apply_rule(word, 5)
            if word == prev_word:
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        """Apply the first matching rule of step ``rule_index`` to ``word``."""
        for suffix, min_stem_size, replacement, exceptions in self._model[rule_index]:
            suffix_length = len(suffix)
            # The suffix must match the end of the word...
            if word[-suffix_length:] != suffix:
                continue
            # ...the remaining stem must be long enough...
            if len(word) < suffix_length + min_stem_size:
                continue
            # ...and the word must not be listed as an exception.
            if word in exceptions:
                continue
            word = word[:-suffix_length] + replacement
            break

        return word
|