Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py +186 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py +154 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py +516 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py +218 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py +265 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py +237 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py +168 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py +158 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py +174 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py +95 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py +375 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py +227 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py +95 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py +520 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py +133 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py +331 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py +146 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py +296 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py +196 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py +136 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py +75 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py +56 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py +125 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py +354 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py +510 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py +76 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py +136 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py +75 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py +867 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py +629 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py +166 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py +2489 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py +397 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py +256 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py +393 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py +772 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py +684 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py +479 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py +470 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py +794 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py +234 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py +453 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py +1605 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py +553 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py +835 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py +395 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py +34 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py +27 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py +343 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py +137 -0
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
NLTK corpus readers. The modules in this package provide functions
|
| 11 |
+
that can be used to read corpus fileids in a variety of formats. These
|
| 12 |
+
functions can be used to read both the corpus fileids that are
|
| 13 |
+
distributed in the NLTK corpus package, and corpus fileids that are part
|
| 14 |
+
of external corpora.
|
| 15 |
+
|
| 16 |
+
Corpus Reader Functions
|
| 17 |
+
=======================
|
| 18 |
+
Each corpus module defines one or more "corpus reader functions",
|
| 19 |
+
which can be used to read documents from that corpus. These functions
|
| 20 |
+
take an argument, ``item``, which is used to indicate which document
|
| 21 |
+
should be read from the corpus:
|
| 22 |
+
|
| 23 |
+
- If ``item`` is one of the unique identifiers listed in the corpus
|
| 24 |
+
module's ``items`` variable, then the corresponding document will
|
| 25 |
+
be loaded from the NLTK corpus package.
|
| 26 |
+
- If ``item`` is a fileid, then that file will be read.
|
| 27 |
+
|
| 28 |
+
Additionally, corpus reader functions can be given lists of item
|
| 29 |
+
names; in which case, they will return a concatenation of the
|
| 30 |
+
corresponding documents.
|
| 31 |
+
|
| 32 |
+
Corpus reader functions are named based on the type of information
|
| 33 |
+
they return. Some common examples, and their return types, are:
|
| 34 |
+
|
| 35 |
+
- words(): list of str
|
| 36 |
+
- sents(): list of (list of str)
|
| 37 |
+
- paras(): list of (list of (list of str))
|
| 38 |
+
- tagged_words(): list of (str,str) tuple
|
| 39 |
+
- tagged_sents(): list of (list of (str,str))
|
| 40 |
+
- tagged_paras(): list of (list of (list of (str,str)))
|
| 41 |
+
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
| 42 |
+
- parsed_sents(): list of (Tree with str leaves)
|
| 43 |
+
- parsed_paras(): list of (list of (Tree with str leaves))
|
| 44 |
+
- xml(): A single xml ElementTree
|
| 45 |
+
- raw(): unprocessed corpus contents
|
| 46 |
+
|
| 47 |
+
For example, to read a list of the words in the Brown Corpus, use
|
| 48 |
+
``nltk.corpus.brown.words()``:
|
| 49 |
+
|
| 50 |
+
>>> from nltk.corpus import brown
|
| 51 |
+
>>> print(", ".join(brown.words()[:6])) # only first 6 words
|
| 52 |
+
The, Fulton, County, Grand, Jury, said
|
| 53 |
+
|
| 54 |
+
isort:skip_file
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
from nltk.corpus.reader.plaintext import *
|
| 58 |
+
from nltk.corpus.reader.util import *
|
| 59 |
+
from nltk.corpus.reader.api import *
|
| 60 |
+
from nltk.corpus.reader.tagged import *
|
| 61 |
+
from nltk.corpus.reader.cmudict import *
|
| 62 |
+
from nltk.corpus.reader.conll import *
|
| 63 |
+
from nltk.corpus.reader.chunked import *
|
| 64 |
+
from nltk.corpus.reader.wordlist import *
|
| 65 |
+
from nltk.corpus.reader.xmldocs import *
|
| 66 |
+
from nltk.corpus.reader.ppattach import *
|
| 67 |
+
from nltk.corpus.reader.senseval import *
|
| 68 |
+
from nltk.corpus.reader.ieer import *
|
| 69 |
+
from nltk.corpus.reader.sinica_treebank import *
|
| 70 |
+
from nltk.corpus.reader.bracket_parse import *
|
| 71 |
+
from nltk.corpus.reader.indian import *
|
| 72 |
+
from nltk.corpus.reader.toolbox import *
|
| 73 |
+
from nltk.corpus.reader.timit import *
|
| 74 |
+
from nltk.corpus.reader.ycoe import *
|
| 75 |
+
from nltk.corpus.reader.rte import *
|
| 76 |
+
from nltk.corpus.reader.string_category import *
|
| 77 |
+
from nltk.corpus.reader.propbank import *
|
| 78 |
+
from nltk.corpus.reader.verbnet import *
|
| 79 |
+
from nltk.corpus.reader.bnc import *
|
| 80 |
+
from nltk.corpus.reader.nps_chat import *
|
| 81 |
+
from nltk.corpus.reader.wordnet import *
|
| 82 |
+
from nltk.corpus.reader.switchboard import *
|
| 83 |
+
from nltk.corpus.reader.dependency import *
|
| 84 |
+
from nltk.corpus.reader.nombank import *
|
| 85 |
+
from nltk.corpus.reader.ipipan import *
|
| 86 |
+
from nltk.corpus.reader.pl196x import *
|
| 87 |
+
from nltk.corpus.reader.knbc import *
|
| 88 |
+
from nltk.corpus.reader.chasen import *
|
| 89 |
+
from nltk.corpus.reader.childes import *
|
| 90 |
+
from nltk.corpus.reader.aligned import *
|
| 91 |
+
from nltk.corpus.reader.lin import *
|
| 92 |
+
from nltk.corpus.reader.semcor import *
|
| 93 |
+
from nltk.corpus.reader.framenet import *
|
| 94 |
+
from nltk.corpus.reader.udhr import *
|
| 95 |
+
from nltk.corpus.reader.bnc import *
|
| 96 |
+
from nltk.corpus.reader.sentiwordnet import *
|
| 97 |
+
from nltk.corpus.reader.twitter import *
|
| 98 |
+
from nltk.corpus.reader.nkjp import *
|
| 99 |
+
from nltk.corpus.reader.crubadan import *
|
| 100 |
+
from nltk.corpus.reader.mte import *
|
| 101 |
+
from nltk.corpus.reader.reviews import *
|
| 102 |
+
from nltk.corpus.reader.opinion_lexicon import *
|
| 103 |
+
from nltk.corpus.reader.pros_cons import *
|
| 104 |
+
from nltk.corpus.reader.categorized_sents import *
|
| 105 |
+
from nltk.corpus.reader.comparative_sents import *
|
| 106 |
+
from nltk.corpus.reader.panlex_lite import *
|
| 107 |
+
from nltk.corpus.reader.panlex_swadesh import *
|
| 108 |
+
from nltk.corpus.reader.bcp47 import *
|
| 109 |
+
|
| 110 |
+
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
|
| 111 |
+
# the function bracket_parse() defined in nltk.tree:
|
| 112 |
+
from nltk.corpus.reader import bracket_parse
|
| 113 |
+
|
| 114 |
+
__all__ = [
|
| 115 |
+
"CorpusReader",
|
| 116 |
+
"CategorizedCorpusReader",
|
| 117 |
+
"PlaintextCorpusReader",
|
| 118 |
+
"find_corpus_fileids",
|
| 119 |
+
"TaggedCorpusReader",
|
| 120 |
+
"CMUDictCorpusReader",
|
| 121 |
+
"ConllChunkCorpusReader",
|
| 122 |
+
"WordListCorpusReader",
|
| 123 |
+
"PPAttachmentCorpusReader",
|
| 124 |
+
"SensevalCorpusReader",
|
| 125 |
+
"IEERCorpusReader",
|
| 126 |
+
"ChunkedCorpusReader",
|
| 127 |
+
"SinicaTreebankCorpusReader",
|
| 128 |
+
"BracketParseCorpusReader",
|
| 129 |
+
"IndianCorpusReader",
|
| 130 |
+
"ToolboxCorpusReader",
|
| 131 |
+
"TimitCorpusReader",
|
| 132 |
+
"YCOECorpusReader",
|
| 133 |
+
"MacMorphoCorpusReader",
|
| 134 |
+
"SyntaxCorpusReader",
|
| 135 |
+
"AlpinoCorpusReader",
|
| 136 |
+
"RTECorpusReader",
|
| 137 |
+
"StringCategoryCorpusReader",
|
| 138 |
+
"EuroparlCorpusReader",
|
| 139 |
+
"CategorizedBracketParseCorpusReader",
|
| 140 |
+
"CategorizedTaggedCorpusReader",
|
| 141 |
+
"CategorizedPlaintextCorpusReader",
|
| 142 |
+
"PortugueseCategorizedPlaintextCorpusReader",
|
| 143 |
+
"tagged_treebank_para_block_reader",
|
| 144 |
+
"PropbankCorpusReader",
|
| 145 |
+
"VerbnetCorpusReader",
|
| 146 |
+
"BNCCorpusReader",
|
| 147 |
+
"ConllCorpusReader",
|
| 148 |
+
"XMLCorpusReader",
|
| 149 |
+
"NPSChatCorpusReader",
|
| 150 |
+
"SwadeshCorpusReader",
|
| 151 |
+
"WordNetCorpusReader",
|
| 152 |
+
"WordNetICCorpusReader",
|
| 153 |
+
"SwitchboardCorpusReader",
|
| 154 |
+
"DependencyCorpusReader",
|
| 155 |
+
"NombankCorpusReader",
|
| 156 |
+
"IPIPANCorpusReader",
|
| 157 |
+
"Pl196xCorpusReader",
|
| 158 |
+
"TEICorpusView",
|
| 159 |
+
"KNBCorpusReader",
|
| 160 |
+
"ChasenCorpusReader",
|
| 161 |
+
"CHILDESCorpusReader",
|
| 162 |
+
"AlignedCorpusReader",
|
| 163 |
+
"TimitTaggedCorpusReader",
|
| 164 |
+
"LinThesaurusCorpusReader",
|
| 165 |
+
"SemcorCorpusReader",
|
| 166 |
+
"FramenetCorpusReader",
|
| 167 |
+
"UdhrCorpusReader",
|
| 168 |
+
"BNCCorpusReader",
|
| 169 |
+
"SentiWordNetCorpusReader",
|
| 170 |
+
"SentiSynset",
|
| 171 |
+
"TwitterCorpusReader",
|
| 172 |
+
"NKJPCorpusReader",
|
| 173 |
+
"CrubadanCorpusReader",
|
| 174 |
+
"MTECorpusReader",
|
| 175 |
+
"ReviewsCorpusReader",
|
| 176 |
+
"OpinionLexiconCorpusReader",
|
| 177 |
+
"ProsConsCorpusReader",
|
| 178 |
+
"CategorizedSentencesCorpusReader",
|
| 179 |
+
"ComparativeSentencesCorpusReader",
|
| 180 |
+
"PanLexLiteCorpusReader",
|
| 181 |
+
"NonbreakingPrefixesCorpusReader",
|
| 182 |
+
"UnicharsCorpusReader",
|
| 183 |
+
"MWAPPDBCorpusReader",
|
| 184 |
+
"PanlexSwadeshCorpusReader",
|
| 185 |
+
"BCP47CorpusReader",
|
| 186 |
+
]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Aligned Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# URL: <https://www.nltk.org/>
|
| 5 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 9 |
+
from nltk.corpus.reader.util import (
|
| 10 |
+
StreamBackedCorpusView,
|
| 11 |
+
concat,
|
| 12 |
+
read_alignedsent_block,
|
| 13 |
+
)
|
| 14 |
+
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
|
| 15 |
+
from nltk.translate import AlignedSent, Alignment
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AlignedCorpusReader(CorpusReader):
|
| 19 |
+
"""
|
| 20 |
+
Reader for corpora of word-aligned sentences. Tokens are assumed
|
| 21 |
+
to be separated by whitespace. Sentences begin on separate lines.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
root,
|
| 27 |
+
fileids,
|
| 28 |
+
sep="/",
|
| 29 |
+
word_tokenizer=WhitespaceTokenizer(),
|
| 30 |
+
sent_tokenizer=RegexpTokenizer("\n", gaps=True),
|
| 31 |
+
alignedsent_block_reader=read_alignedsent_block,
|
| 32 |
+
encoding="latin1",
|
| 33 |
+
):
|
| 34 |
+
"""
|
| 35 |
+
Construct a new Aligned Corpus reader for a set of documents
|
| 36 |
+
located at the given root directory. Example usage:
|
| 37 |
+
|
| 38 |
+
>>> root = '/...path to corpus.../'
|
| 39 |
+
>>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP
|
| 40 |
+
|
| 41 |
+
:param root: The root directory for this corpus.
|
| 42 |
+
:param fileids: A list or regexp specifying the fileids in this corpus.
|
| 43 |
+
"""
|
| 44 |
+
CorpusReader.__init__(self, root, fileids, encoding)
|
| 45 |
+
self._sep = sep
|
| 46 |
+
self._word_tokenizer = word_tokenizer
|
| 47 |
+
self._sent_tokenizer = sent_tokenizer
|
| 48 |
+
self._alignedsent_block_reader = alignedsent_block_reader
|
| 49 |
+
|
| 50 |
+
def words(self, fileids=None):
|
| 51 |
+
"""
|
| 52 |
+
:return: the given file(s) as a list of words
|
| 53 |
+
and punctuation symbols.
|
| 54 |
+
:rtype: list(str)
|
| 55 |
+
"""
|
| 56 |
+
return concat(
|
| 57 |
+
[
|
| 58 |
+
AlignedSentCorpusView(
|
| 59 |
+
fileid,
|
| 60 |
+
enc,
|
| 61 |
+
False,
|
| 62 |
+
False,
|
| 63 |
+
self._word_tokenizer,
|
| 64 |
+
self._sent_tokenizer,
|
| 65 |
+
self._alignedsent_block_reader,
|
| 66 |
+
)
|
| 67 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 68 |
+
]
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
def sents(self, fileids=None):
|
| 72 |
+
"""
|
| 73 |
+
:return: the given file(s) as a list of
|
| 74 |
+
sentences or utterances, each encoded as a list of word
|
| 75 |
+
strings.
|
| 76 |
+
:rtype: list(list(str))
|
| 77 |
+
"""
|
| 78 |
+
return concat(
|
| 79 |
+
[
|
| 80 |
+
AlignedSentCorpusView(
|
| 81 |
+
fileid,
|
| 82 |
+
enc,
|
| 83 |
+
False,
|
| 84 |
+
True,
|
| 85 |
+
self._word_tokenizer,
|
| 86 |
+
self._sent_tokenizer,
|
| 87 |
+
self._alignedsent_block_reader,
|
| 88 |
+
)
|
| 89 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
def aligned_sents(self, fileids=None):
|
| 94 |
+
"""
|
| 95 |
+
:return: the given file(s) as a list of AlignedSent objects.
|
| 96 |
+
:rtype: list(AlignedSent)
|
| 97 |
+
"""
|
| 98 |
+
return concat(
|
| 99 |
+
[
|
| 100 |
+
AlignedSentCorpusView(
|
| 101 |
+
fileid,
|
| 102 |
+
enc,
|
| 103 |
+
True,
|
| 104 |
+
True,
|
| 105 |
+
self._word_tokenizer,
|
| 106 |
+
self._sent_tokenizer,
|
| 107 |
+
self._alignedsent_block_reader,
|
| 108 |
+
)
|
| 109 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 110 |
+
]
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class AlignedSentCorpusView(StreamBackedCorpusView):
|
| 115 |
+
"""
|
| 116 |
+
A specialized corpus view for aligned sentences.
|
| 117 |
+
``AlignedSentCorpusView`` objects are typically created by
|
| 118 |
+
``AlignedCorpusReader`` (not directly by nltk users).
|
| 119 |
+
"""
|
| 120 |
+
|
| 121 |
+
def __init__(
|
| 122 |
+
self,
|
| 123 |
+
corpus_file,
|
| 124 |
+
encoding,
|
| 125 |
+
aligned,
|
| 126 |
+
group_by_sent,
|
| 127 |
+
word_tokenizer,
|
| 128 |
+
sent_tokenizer,
|
| 129 |
+
alignedsent_block_reader,
|
| 130 |
+
):
|
| 131 |
+
self._aligned = aligned
|
| 132 |
+
self._group_by_sent = group_by_sent
|
| 133 |
+
self._word_tokenizer = word_tokenizer
|
| 134 |
+
self._sent_tokenizer = sent_tokenizer
|
| 135 |
+
self._alignedsent_block_reader = alignedsent_block_reader
|
| 136 |
+
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
|
| 137 |
+
|
| 138 |
+
def read_block(self, stream):
|
| 139 |
+
block = [
|
| 140 |
+
self._word_tokenizer.tokenize(sent_str)
|
| 141 |
+
for alignedsent_str in self._alignedsent_block_reader(stream)
|
| 142 |
+
for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
|
| 143 |
+
]
|
| 144 |
+
if self._aligned:
|
| 145 |
+
block[2] = Alignment.fromstring(
|
| 146 |
+
" ".join(block[2])
|
| 147 |
+
) # kludge; we shouldn't have tokenized the alignment string
|
| 148 |
+
block = [AlignedSent(*block)]
|
| 149 |
+
elif self._group_by_sent:
|
| 150 |
+
block = [block[0]]
|
| 151 |
+
else:
|
| 152 |
+
block = block[0]
|
| 153 |
+
|
| 154 |
+
return block
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py
ADDED
|
@@ -0,0 +1,516 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: API for Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
API for corpus readers.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import re
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from itertools import chain
|
| 17 |
+
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class CorpusReader:
|
| 23 |
+
"""
|
| 24 |
+
A base class for "corpus reader" classes, each of which can be
|
| 25 |
+
used to read a specific corpus format. Each individual corpus
|
| 26 |
+
reader instance is used to read a specific corpus, consisting of
|
| 27 |
+
one or more files under a common root directory. Each file is
|
| 28 |
+
identified by its ``file identifier``, which is the relative path
|
| 29 |
+
to the file from the root directory.
|
| 30 |
+
|
| 31 |
+
A separate subclass is defined for each corpus format. These
|
| 32 |
+
subclasses define one or more methods that provide 'views' on the
|
| 33 |
+
corpus contents, such as ``words()`` (for a list of words) and
|
| 34 |
+
``parsed_sents()`` (for a list of parsed sentences). Called with
|
| 35 |
+
no arguments, these methods will return the contents of the entire
|
| 36 |
+
corpus. For most corpora, these methods define one or more
|
| 37 |
+
selection arguments, such as ``fileids`` or ``categories``, which can
|
| 38 |
+
be used to select which portion of the corpus should be returned.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
def __init__(self, root, fileids, encoding="utf8", tagset=None):
|
| 42 |
+
"""
|
| 43 |
+
:type root: PathPointer or str
|
| 44 |
+
:param root: A path pointer identifying the root directory for
|
| 45 |
+
this corpus. If a string is specified, then it will be
|
| 46 |
+
converted to a ``PathPointer`` automatically.
|
| 47 |
+
:param fileids: A list of the files that make up this corpus.
|
| 48 |
+
This list can either be specified explicitly, as a list of
|
| 49 |
+
strings; or implicitly, as a regular expression over file
|
| 50 |
+
paths. The absolute path for each file will be constructed
|
| 51 |
+
by joining the reader's root to each file name.
|
| 52 |
+
:param encoding: The default unicode encoding for the files
|
| 53 |
+
that make up the corpus. The value of ``encoding`` can be any
|
| 54 |
+
of the following:
|
| 55 |
+
|
| 56 |
+
- A string: ``encoding`` is the encoding name for all files.
|
| 57 |
+
- A dictionary: ``encoding[file_id]`` is the encoding
|
| 58 |
+
name for the file whose identifier is ``file_id``. If
|
| 59 |
+
``file_id`` is not in ``encoding``, then the file
|
| 60 |
+
contents will be processed using non-unicode byte strings.
|
| 61 |
+
- A list: ``encoding`` should be a list of ``(regexp, encoding)``
|
| 62 |
+
tuples. The encoding for a file whose identifier is ``file_id``
|
| 63 |
+
will be the ``encoding`` value for the first tuple whose
|
| 64 |
+
``regexp`` matches the ``file_id``. If no tuple's ``regexp``
|
| 65 |
+
matches the ``file_id``, the file contents will be processed
|
| 66 |
+
using non-unicode byte strings.
|
| 67 |
+
- None: the file contents of all files will be
|
| 68 |
+
processed using non-unicode byte strings.
|
| 69 |
+
:param tagset: The name of the tagset used by this corpus, to be used
|
| 70 |
+
for normalizing or converting the POS tags returned by the
|
| 71 |
+
``tagged_...()`` methods.
|
| 72 |
+
"""
|
| 73 |
+
# Convert the root to a path pointer, if necessary.
|
| 74 |
+
if isinstance(root, str) and not isinstance(root, PathPointer):
|
| 75 |
+
m = re.match(r"(.*\.zip)/?(.*)$|", root)
|
| 76 |
+
zipfile, zipentry = m.groups()
|
| 77 |
+
if zipfile:
|
| 78 |
+
root = ZipFilePathPointer(zipfile, zipentry)
|
| 79 |
+
else:
|
| 80 |
+
root = FileSystemPathPointer(root)
|
| 81 |
+
elif not isinstance(root, PathPointer):
|
| 82 |
+
raise TypeError("CorpusReader: expected a string or a PathPointer")
|
| 83 |
+
|
| 84 |
+
# If `fileids` is a regexp, then expand it.
|
| 85 |
+
if isinstance(fileids, str):
|
| 86 |
+
fileids = find_corpus_fileids(root, fileids)
|
| 87 |
+
|
| 88 |
+
self._fileids = fileids
|
| 89 |
+
"""A list of the relative paths for the fileids that make up
|
| 90 |
+
this corpus."""
|
| 91 |
+
|
| 92 |
+
self._root = root
|
| 93 |
+
"""The root directory for this corpus."""
|
| 94 |
+
|
| 95 |
+
self._readme = "README"
|
| 96 |
+
self._license = "LICENSE"
|
| 97 |
+
self._citation = "citation.bib"
|
| 98 |
+
|
| 99 |
+
# If encoding was specified as a list of regexps, then convert
|
| 100 |
+
# it to a dictionary.
|
| 101 |
+
if isinstance(encoding, list):
|
| 102 |
+
encoding_dict = {}
|
| 103 |
+
for fileid in self._fileids:
|
| 104 |
+
for x in encoding:
|
| 105 |
+
(regexp, enc) = x
|
| 106 |
+
if re.match(regexp, fileid):
|
| 107 |
+
encoding_dict[fileid] = enc
|
| 108 |
+
break
|
| 109 |
+
encoding = encoding_dict
|
| 110 |
+
|
| 111 |
+
self._encoding = encoding
|
| 112 |
+
"""The default unicode encoding for the fileids that make up
|
| 113 |
+
this corpus. If ``encoding`` is None, then the file
|
| 114 |
+
contents are processed using byte strings."""
|
| 115 |
+
self._tagset = tagset
|
| 116 |
+
|
| 117 |
+
def __repr__(self):
|
| 118 |
+
if isinstance(self._root, ZipFilePathPointer):
|
| 119 |
+
path = f"{self._root.zipfile.filename}/{self._root.entry}"
|
| 120 |
+
else:
|
| 121 |
+
path = "%s" % self._root.path
|
| 122 |
+
return f"<{self.__class__.__name__} in {path!r}>"
|
| 123 |
+
|
| 124 |
+
def ensure_loaded(self):
|
| 125 |
+
"""
|
| 126 |
+
Load this corpus (if it has not already been loaded). This is
|
| 127 |
+
used by LazyCorpusLoader as a simple method that can be used to
|
| 128 |
+
make sure a corpus is loaded -- e.g., in case a user wants to
|
| 129 |
+
do help(some_corpus).
|
| 130 |
+
"""
|
| 131 |
+
pass # no need to actually do anything.
|
| 132 |
+
|
| 133 |
+
def readme(self):
|
| 134 |
+
"""
|
| 135 |
+
Return the contents of the corpus README file, if it exists.
|
| 136 |
+
"""
|
| 137 |
+
with self.open(self._readme) as f:
|
| 138 |
+
return f.read()
|
| 139 |
+
|
| 140 |
+
def license(self):
|
| 141 |
+
"""
|
| 142 |
+
Return the contents of the corpus LICENSE file, if it exists.
|
| 143 |
+
"""
|
| 144 |
+
with self.open(self._license) as f:
|
| 145 |
+
return f.read()
|
| 146 |
+
|
| 147 |
+
def citation(self):
|
| 148 |
+
"""
|
| 149 |
+
Return the contents of the corpus citation.bib file, if it exists.
|
| 150 |
+
"""
|
| 151 |
+
with self.open(self._citation) as f:
|
| 152 |
+
return f.read()
|
| 153 |
+
|
| 154 |
+
def fileids(self):
|
| 155 |
+
"""
|
| 156 |
+
Return a list of file identifiers for the fileids that make up
|
| 157 |
+
this corpus.
|
| 158 |
+
"""
|
| 159 |
+
return self._fileids
|
| 160 |
+
|
| 161 |
+
def abspath(self, fileid):
|
| 162 |
+
"""
|
| 163 |
+
Return the absolute path for the given file.
|
| 164 |
+
|
| 165 |
+
:type fileid: str
|
| 166 |
+
:param fileid: The file identifier for the file whose path
|
| 167 |
+
should be returned.
|
| 168 |
+
:rtype: PathPointer
|
| 169 |
+
"""
|
| 170 |
+
return self._root.join(fileid)
|
| 171 |
+
|
| 172 |
+
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
|
| 173 |
+
"""
|
| 174 |
+
Return a list of the absolute paths for all fileids in this corpus;
|
| 175 |
+
or for the given list of fileids, if specified.
|
| 176 |
+
|
| 177 |
+
:type fileids: None or str or list
|
| 178 |
+
:param fileids: Specifies the set of fileids for which paths should
|
| 179 |
+
be returned. Can be None, for all fileids; a list of
|
| 180 |
+
file identifiers, for a specified set of fileids; or a single
|
| 181 |
+
file identifier, for a single file. Note that the return
|
| 182 |
+
value is always a list of paths, even if ``fileids`` is a
|
| 183 |
+
single file identifier.
|
| 184 |
+
|
| 185 |
+
:param include_encoding: If true, then return a list of
|
| 186 |
+
``(path_pointer, encoding)`` tuples.
|
| 187 |
+
|
| 188 |
+
:rtype: list(PathPointer)
|
| 189 |
+
"""
|
| 190 |
+
if fileids is None:
|
| 191 |
+
fileids = self._fileids
|
| 192 |
+
elif isinstance(fileids, str):
|
| 193 |
+
fileids = [fileids]
|
| 194 |
+
|
| 195 |
+
paths = [self._root.join(f) for f in fileids]
|
| 196 |
+
|
| 197 |
+
if include_encoding and include_fileid:
|
| 198 |
+
return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
|
| 199 |
+
elif include_fileid:
|
| 200 |
+
return list(zip(paths, fileids))
|
| 201 |
+
elif include_encoding:
|
| 202 |
+
return list(zip(paths, [self.encoding(f) for f in fileids]))
|
| 203 |
+
else:
|
| 204 |
+
return paths
|
| 205 |
+
|
| 206 |
+
def raw(self, fileids=None):
|
| 207 |
+
"""
|
| 208 |
+
:param fileids: A list specifying the fileids that should be used.
|
| 209 |
+
:return: the given file(s) as a single string.
|
| 210 |
+
:rtype: str
|
| 211 |
+
"""
|
| 212 |
+
if fileids is None:
|
| 213 |
+
fileids = self._fileids
|
| 214 |
+
elif isinstance(fileids, str):
|
| 215 |
+
fileids = [fileids]
|
| 216 |
+
contents = []
|
| 217 |
+
for f in fileids:
|
| 218 |
+
with self.open(f) as fp:
|
| 219 |
+
contents.append(fp.read())
|
| 220 |
+
return concat(contents)
|
| 221 |
+
|
| 222 |
+
def open(self, file):
|
| 223 |
+
"""
|
| 224 |
+
Return an open stream that can be used to read the given file.
|
| 225 |
+
If the file's encoding is not None, then the stream will
|
| 226 |
+
automatically decode the file's contents into unicode.
|
| 227 |
+
|
| 228 |
+
:param file: The file identifier of the file to read.
|
| 229 |
+
"""
|
| 230 |
+
encoding = self.encoding(file)
|
| 231 |
+
stream = self._root.join(file).open(encoding)
|
| 232 |
+
return stream
|
| 233 |
+
|
| 234 |
+
def encoding(self, file):
|
| 235 |
+
"""
|
| 236 |
+
Return the unicode encoding for the given corpus file, if known.
|
| 237 |
+
If the encoding is unknown, or if the given file should be
|
| 238 |
+
processed using byte strings (str), then return None.
|
| 239 |
+
"""
|
| 240 |
+
if isinstance(self._encoding, dict):
|
| 241 |
+
return self._encoding.get(file)
|
| 242 |
+
else:
|
| 243 |
+
return self._encoding
|
| 244 |
+
|
| 245 |
+
def _get_root(self):
|
| 246 |
+
return self._root
|
| 247 |
+
|
| 248 |
+
root = property(
|
| 249 |
+
_get_root,
|
| 250 |
+
doc="""
|
| 251 |
+
The directory where this corpus is stored.
|
| 252 |
+
|
| 253 |
+
:type: PathPointer""",
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
######################################################################
|
| 258 |
+
# { Corpora containing categorized items
|
| 259 |
+
######################################################################
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class CategorizedCorpusReader:
|
| 263 |
+
"""
|
| 264 |
+
A mixin class used to aid in the implementation of corpus readers
|
| 265 |
+
for categorized corpora. This class defines the method
|
| 266 |
+
``categories()``, which returns a list of the categories for the
|
| 267 |
+
corpus or for a specified set of fileids; and overrides ``fileids()``
|
| 268 |
+
to take a ``categories`` argument, restricting the set of fileids to
|
| 269 |
+
be returned.
|
| 270 |
+
|
| 271 |
+
Subclasses are expected to:
|
| 272 |
+
|
| 273 |
+
- Call ``__init__()`` to set up the mapping.
|
| 274 |
+
|
| 275 |
+
- Override all view methods to accept a ``categories`` parameter,
|
| 276 |
+
which can be used *instead* of the ``fileids`` parameter, to
|
| 277 |
+
select which fileids should be included in the returned view.
|
| 278 |
+
"""
|
| 279 |
+
|
| 280 |
+
def __init__(self, kwargs):
|
| 281 |
+
"""
|
| 282 |
+
Initialize this mapping based on keyword arguments, as
|
| 283 |
+
follows:
|
| 284 |
+
|
| 285 |
+
- cat_pattern: A regular expression pattern used to find the
|
| 286 |
+
category for each file identifier. The pattern will be
|
| 287 |
+
applied to each file identifier, and the first matching
|
| 288 |
+
group will be used as the category label for that file.
|
| 289 |
+
|
| 290 |
+
- cat_map: A dictionary, mapping from file identifiers to
|
| 291 |
+
category labels.
|
| 292 |
+
|
| 293 |
+
- cat_file: The name of a file that contains the mapping
|
| 294 |
+
from file identifiers to categories. The argument
|
| 295 |
+
``cat_delimiter`` can be used to specify a delimiter.
|
| 296 |
+
|
| 297 |
+
The corresponding argument will be deleted from ``kwargs``. If
|
| 298 |
+
more than one argument is specified, an exception will be
|
| 299 |
+
raised.
|
| 300 |
+
"""
|
| 301 |
+
self._f2c = None #: file-to-category mapping
|
| 302 |
+
self._c2f = None #: category-to-file mapping
|
| 303 |
+
|
| 304 |
+
self._pattern = None #: regexp specifying the mapping
|
| 305 |
+
self._map = None #: dict specifying the mapping
|
| 306 |
+
self._file = None #: fileid of file containing the mapping
|
| 307 |
+
self._delimiter = None #: delimiter for ``self._file``
|
| 308 |
+
|
| 309 |
+
if "cat_pattern" in kwargs:
|
| 310 |
+
self._pattern = kwargs["cat_pattern"]
|
| 311 |
+
del kwargs["cat_pattern"]
|
| 312 |
+
elif "cat_map" in kwargs:
|
| 313 |
+
self._map = kwargs["cat_map"]
|
| 314 |
+
del kwargs["cat_map"]
|
| 315 |
+
elif "cat_file" in kwargs:
|
| 316 |
+
self._file = kwargs["cat_file"]
|
| 317 |
+
del kwargs["cat_file"]
|
| 318 |
+
if "cat_delimiter" in kwargs:
|
| 319 |
+
self._delimiter = kwargs["cat_delimiter"]
|
| 320 |
+
del kwargs["cat_delimiter"]
|
| 321 |
+
else:
|
| 322 |
+
raise ValueError(
|
| 323 |
+
"Expected keyword argument cat_pattern or " "cat_map or cat_file."
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
|
| 327 |
+
raise ValueError(
|
| 328 |
+
"Specify exactly one of: cat_pattern, " "cat_map, cat_file."
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
def _init(self):
|
| 332 |
+
self._f2c = defaultdict(set)
|
| 333 |
+
self._c2f = defaultdict(set)
|
| 334 |
+
|
| 335 |
+
if self._pattern is not None:
|
| 336 |
+
for file_id in self._fileids:
|
| 337 |
+
category = re.match(self._pattern, file_id).group(1)
|
| 338 |
+
self._add(file_id, category)
|
| 339 |
+
|
| 340 |
+
elif self._map is not None:
|
| 341 |
+
for (file_id, categories) in self._map.items():
|
| 342 |
+
for category in categories:
|
| 343 |
+
self._add(file_id, category)
|
| 344 |
+
|
| 345 |
+
elif self._file is not None:
|
| 346 |
+
with self.open(self._file) as f:
|
| 347 |
+
for line in f.readlines():
|
| 348 |
+
line = line.strip()
|
| 349 |
+
file_id, categories = line.split(self._delimiter, 1)
|
| 350 |
+
if file_id not in self.fileids():
|
| 351 |
+
raise ValueError(
|
| 352 |
+
"In category mapping file %s: %s "
|
| 353 |
+
"not found" % (self._file, file_id)
|
| 354 |
+
)
|
| 355 |
+
for category in categories.split(self._delimiter):
|
| 356 |
+
self._add(file_id, category)
|
| 357 |
+
|
| 358 |
+
def _add(self, file_id, category):
|
| 359 |
+
self._f2c[file_id].add(category)
|
| 360 |
+
self._c2f[category].add(file_id)
|
| 361 |
+
|
| 362 |
+
def categories(self, fileids=None):
|
| 363 |
+
"""
|
| 364 |
+
Return a list of the categories that are defined for this corpus,
|
| 365 |
+
or for the file(s) if it is given.
|
| 366 |
+
"""
|
| 367 |
+
if self._f2c is None:
|
| 368 |
+
self._init()
|
| 369 |
+
if fileids is None:
|
| 370 |
+
return sorted(self._c2f)
|
| 371 |
+
if isinstance(fileids, str):
|
| 372 |
+
fileids = [fileids]
|
| 373 |
+
return sorted(set.union(*(self._f2c[d] for d in fileids)))
|
| 374 |
+
|
| 375 |
+
def fileids(self, categories=None):
|
| 376 |
+
"""
|
| 377 |
+
Return a list of file identifiers for the files that make up
|
| 378 |
+
this corpus, or that make up the given category(s) if specified.
|
| 379 |
+
"""
|
| 380 |
+
if categories is None:
|
| 381 |
+
return super().fileids()
|
| 382 |
+
elif isinstance(categories, str):
|
| 383 |
+
if self._f2c is None:
|
| 384 |
+
self._init()
|
| 385 |
+
if categories in self._c2f:
|
| 386 |
+
return sorted(self._c2f[categories])
|
| 387 |
+
else:
|
| 388 |
+
raise ValueError("Category %s not found" % categories)
|
| 389 |
+
else:
|
| 390 |
+
if self._f2c is None:
|
| 391 |
+
self._init()
|
| 392 |
+
return sorted(set.union(*(self._c2f[c] for c in categories)))
|
| 393 |
+
|
| 394 |
+
def _resolve(self, fileids, categories):
|
| 395 |
+
if fileids is not None and categories is not None:
|
| 396 |
+
raise ValueError("Specify fileids or categories, not both")
|
| 397 |
+
if categories is not None:
|
| 398 |
+
return self.fileids(categories)
|
| 399 |
+
else:
|
| 400 |
+
return fileids
|
| 401 |
+
|
| 402 |
+
def raw(self, fileids=None, categories=None):
|
| 403 |
+
return super().raw(self._resolve(fileids, categories))
|
| 404 |
+
|
| 405 |
+
def words(self, fileids=None, categories=None):
|
| 406 |
+
return super().words(self._resolve(fileids, categories))
|
| 407 |
+
|
| 408 |
+
def sents(self, fileids=None, categories=None):
|
| 409 |
+
return super().sents(self._resolve(fileids, categories))
|
| 410 |
+
|
| 411 |
+
def paras(self, fileids=None, categories=None):
|
| 412 |
+
return super().paras(self._resolve(fileids, categories))
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
######################################################################
|
| 416 |
+
# { Treebank readers
|
| 417 |
+
######################################################################
|
| 418 |
+
|
| 419 |
+
# [xx] is it worth it to factor this out?
|
| 420 |
+
class SyntaxCorpusReader(CorpusReader):
|
| 421 |
+
"""
|
| 422 |
+
An abstract base class for reading corpora consisting of
|
| 423 |
+
syntactically parsed text. Subclasses should define:
|
| 424 |
+
|
| 425 |
+
- ``__init__``, which specifies the location of the corpus
|
| 426 |
+
and a method for detecting the sentence blocks in corpus files.
|
| 427 |
+
- ``_read_block``, which reads a block from the input stream.
|
| 428 |
+
- ``_word``, which takes a block and returns a list of list of words.
|
| 429 |
+
- ``_tag``, which takes a block and returns a list of list of tagged
|
| 430 |
+
words.
|
| 431 |
+
- ``_parse``, which takes a block and returns a list of parsed
|
| 432 |
+
sentences.
|
| 433 |
+
"""
|
| 434 |
+
|
| 435 |
+
def _parse(self, s):
|
| 436 |
+
raise NotImplementedError()
|
| 437 |
+
|
| 438 |
+
def _word(self, s):
|
| 439 |
+
raise NotImplementedError()
|
| 440 |
+
|
| 441 |
+
def _tag(self, s):
|
| 442 |
+
raise NotImplementedError()
|
| 443 |
+
|
| 444 |
+
def _read_block(self, stream):
|
| 445 |
+
raise NotImplementedError()
|
| 446 |
+
|
| 447 |
+
def parsed_sents(self, fileids=None):
|
| 448 |
+
reader = self._read_parsed_sent_block
|
| 449 |
+
return concat(
|
| 450 |
+
[
|
| 451 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 452 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 453 |
+
]
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
def tagged_sents(self, fileids=None, tagset=None):
|
| 457 |
+
def reader(stream):
|
| 458 |
+
return self._read_tagged_sent_block(stream, tagset)
|
| 459 |
+
|
| 460 |
+
return concat(
|
| 461 |
+
[
|
| 462 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 463 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 464 |
+
]
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
def sents(self, fileids=None):
|
| 468 |
+
reader = self._read_sent_block
|
| 469 |
+
return concat(
|
| 470 |
+
[
|
| 471 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 472 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 473 |
+
]
|
| 474 |
+
)
|
| 475 |
+
|
| 476 |
+
def tagged_words(self, fileids=None, tagset=None):
|
| 477 |
+
def reader(stream):
|
| 478 |
+
return self._read_tagged_word_block(stream, tagset)
|
| 479 |
+
|
| 480 |
+
return concat(
|
| 481 |
+
[
|
| 482 |
+
StreamBackedCorpusView(fileid, reader, encoding=enc)
|
| 483 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 484 |
+
]
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
def words(self, fileids=None):
|
| 488 |
+
return concat(
|
| 489 |
+
[
|
| 490 |
+
StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
|
| 491 |
+
for fileid, enc in self.abspaths(fileids, True)
|
| 492 |
+
]
|
| 493 |
+
)
|
| 494 |
+
|
| 495 |
+
# ------------------------------------------------------------
|
| 496 |
+
# { Block Readers
|
| 497 |
+
|
| 498 |
+
def _read_word_block(self, stream):
|
| 499 |
+
return list(chain.from_iterable(self._read_sent_block(stream)))
|
| 500 |
+
|
| 501 |
+
def _read_tagged_word_block(self, stream, tagset=None):
|
| 502 |
+
return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))
|
| 503 |
+
|
| 504 |
+
def _read_sent_block(self, stream):
|
| 505 |
+
return list(filter(None, [self._word(t) for t in self._read_block(stream)]))
|
| 506 |
+
|
| 507 |
+
def _read_tagged_sent_block(self, stream, tagset=None):
|
| 508 |
+
return list(
|
| 509 |
+
filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
def _read_parsed_sent_block(self, stream):
|
| 513 |
+
return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))
|
| 514 |
+
|
| 515 |
+
# } End of Block Readers
|
| 516 |
+
# ------------------------------------------------------------
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: BCP-47 language tags
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2022 NLTK Project
|
| 4 |
+
# Author: Eric Kafe <kafe.eric@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from warnings import warn
|
| 10 |
+
from xml.etree import ElementTree as et
|
| 11 |
+
|
| 12 |
+
from nltk.corpus.reader import CorpusReader
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BCP47CorpusReader(CorpusReader):
|
| 16 |
+
"""
|
| 17 |
+
Parse BCP-47 composite language tags
|
| 18 |
+
|
| 19 |
+
Supports all the main subtags, and the 'u-sd' extension:
|
| 20 |
+
|
| 21 |
+
>>> from nltk.corpus import bcp47
|
| 22 |
+
>>> bcp47.name('oc-gascon-u-sd-fr64')
|
| 23 |
+
'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'
|
| 24 |
+
|
| 25 |
+
Can load a conversion table to Wikidata Q-codes:
|
| 26 |
+
>>> bcp47.load_wiki_q()
|
| 27 |
+
>>> bcp47.wiki_q['en-GI-spanglis']
|
| 28 |
+
'Q79388'
|
| 29 |
+
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(self, root, fileids):
|
| 33 |
+
"""Read the BCP-47 database"""
|
| 34 |
+
super().__init__(root, fileids)
|
| 35 |
+
self.langcode = {}
|
| 36 |
+
with self.open("iana/language-subtag-registry.txt") as fp:
|
| 37 |
+
self.db = self.data_dict(fp.read().split("%%\n"))
|
| 38 |
+
with self.open("cldr/common-subdivisions-en.xml") as fp:
|
| 39 |
+
self.subdiv = self.subdiv_dict(
|
| 40 |
+
et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
|
| 41 |
+
)
|
| 42 |
+
self.morphology()
|
| 43 |
+
|
| 44 |
+
def load_wiki_q(self):
|
| 45 |
+
"""Load conversion table to Wikidata Q-codes (only if needed)"""
|
| 46 |
+
with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
|
| 47 |
+
self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
|
| 48 |
+
|
| 49 |
+
def wiki_dict(self, lines):
|
| 50 |
+
"""Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
|
| 51 |
+
return {
|
| 52 |
+
pair[1]: pair[0].split("/")[-1]
|
| 53 |
+
for pair in [line.strip().split("\t") for line in lines]
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
def subdiv_dict(self, subdivs):
|
| 57 |
+
"""Convert the CLDR subdivisions list to a dictionary"""
|
| 58 |
+
return {sub.attrib["type"]: sub.text for sub in subdivs}
|
| 59 |
+
|
| 60 |
+
def morphology(self):
|
| 61 |
+
self.casing = {
|
| 62 |
+
"language": str.lower,
|
| 63 |
+
"extlang": str.lower,
|
| 64 |
+
"script": str.title,
|
| 65 |
+
"region": str.upper,
|
| 66 |
+
"variant": str.lower,
|
| 67 |
+
}
|
| 68 |
+
dig = "[0-9]"
|
| 69 |
+
low = "[a-z]"
|
| 70 |
+
up = "[A-Z]"
|
| 71 |
+
alnum = "[a-zA-Z0-9]"
|
| 72 |
+
self.format = {
|
| 73 |
+
"language": re.compile(f"{low*3}?"),
|
| 74 |
+
"extlang": re.compile(f"{low*3}"),
|
| 75 |
+
"script": re.compile(f"{up}{low*3}"),
|
| 76 |
+
"region": re.compile(f"({up*2})|({dig*3})"),
|
| 77 |
+
"variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
|
| 78 |
+
"singleton": re.compile(f"{low}"),
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
def data_dict(self, records):
|
| 82 |
+
"""Convert the BCP-47 language subtag registry to a dictionary"""
|
| 83 |
+
self.version = records[0].replace("File-Date:", "").strip()
|
| 84 |
+
dic = {}
|
| 85 |
+
dic["deprecated"] = {}
|
| 86 |
+
for label in [
|
| 87 |
+
"language",
|
| 88 |
+
"extlang",
|
| 89 |
+
"script",
|
| 90 |
+
"region",
|
| 91 |
+
"variant",
|
| 92 |
+
"redundant",
|
| 93 |
+
"grandfathered",
|
| 94 |
+
]:
|
| 95 |
+
dic["deprecated"][label] = {}
|
| 96 |
+
for record in records[1:]:
|
| 97 |
+
fields = [field.split(": ") for field in record.strip().split("\n")]
|
| 98 |
+
typ = fields[0][1]
|
| 99 |
+
tag = fields[1][1]
|
| 100 |
+
if typ not in dic:
|
| 101 |
+
dic[typ] = {}
|
| 102 |
+
subfields = {}
|
| 103 |
+
for field in fields[2:]:
|
| 104 |
+
if len(field) == 2:
|
| 105 |
+
[key, val] = field
|
| 106 |
+
if key not in subfields:
|
| 107 |
+
subfields[key] = [val]
|
| 108 |
+
else: # multiple value
|
| 109 |
+
subfields[key].append(val)
|
| 110 |
+
else: # multiline field
|
| 111 |
+
subfields[key][-1] += " " + field[0].strip()
|
| 112 |
+
if (
|
| 113 |
+
"Deprecated" not in record
|
| 114 |
+
and typ == "language"
|
| 115 |
+
and key == "Description"
|
| 116 |
+
):
|
| 117 |
+
self.langcode[subfields[key][-1]] = tag
|
| 118 |
+
for key in subfields:
|
| 119 |
+
if len(subfields[key]) == 1: # single value
|
| 120 |
+
subfields[key] = subfields[key][0]
|
| 121 |
+
if "Deprecated" in record:
|
| 122 |
+
dic["deprecated"][typ][tag] = subfields
|
| 123 |
+
else:
|
| 124 |
+
dic[typ][tag] = subfields
|
| 125 |
+
return dic
|
| 126 |
+
|
| 127 |
+
def val2str(self, val):
|
| 128 |
+
"""Return only first value"""
|
| 129 |
+
if type(val) == list:
|
| 130 |
+
# val = "/".join(val) # Concatenate all values
|
| 131 |
+
val = val[0]
|
| 132 |
+
return val
|
| 133 |
+
|
| 134 |
+
def lang2str(self, lg_record):
|
| 135 |
+
"""Concatenate subtag values"""
|
| 136 |
+
name = f"{lg_record['language']}"
|
| 137 |
+
for label in ["extlang", "script", "region", "variant", "extension"]:
|
| 138 |
+
if label in lg_record:
|
| 139 |
+
name += f": {lg_record[label]}"
|
| 140 |
+
return name
|
| 141 |
+
|
| 142 |
+
def parse_tag(self, tag):
|
| 143 |
+
"""Convert a BCP-47 tag to a dictionary of labelled subtags"""
|
| 144 |
+
subtags = tag.split("-")
|
| 145 |
+
lang = {}
|
| 146 |
+
labels = ["language", "extlang", "script", "region", "variant", "variant"]
|
| 147 |
+
while subtags and labels:
|
| 148 |
+
subtag = subtags.pop(0)
|
| 149 |
+
found = False
|
| 150 |
+
while labels:
|
| 151 |
+
label = labels.pop(0)
|
| 152 |
+
subtag = self.casing[label](subtag)
|
| 153 |
+
if self.format[label].fullmatch(subtag):
|
| 154 |
+
if subtag in self.db[label]:
|
| 155 |
+
found = True
|
| 156 |
+
valstr = self.val2str(self.db[label][subtag]["Description"])
|
| 157 |
+
if label == "variant" and label in lang:
|
| 158 |
+
lang[label] += ": " + valstr
|
| 159 |
+
else:
|
| 160 |
+
lang[label] = valstr
|
| 161 |
+
break
|
| 162 |
+
elif subtag in self.db["deprecated"][label]:
|
| 163 |
+
found = True
|
| 164 |
+
note = f"The {subtag!r} {label} code is deprecated"
|
| 165 |
+
if "Preferred-Value" in self.db["deprecated"][label][subtag]:
|
| 166 |
+
prefer = self.db["deprecated"][label][subtag][
|
| 167 |
+
"Preferred-Value"
|
| 168 |
+
]
|
| 169 |
+
note += f"', prefer '{self.val2str(prefer)}'"
|
| 170 |
+
lang[label] = self.val2str(
|
| 171 |
+
self.db["deprecated"][label][subtag]["Description"]
|
| 172 |
+
)
|
| 173 |
+
warn(note)
|
| 174 |
+
break
|
| 175 |
+
if not found:
|
| 176 |
+
if subtag == "u" and subtags[0] == "sd": # CLDR regional subdivisions
|
| 177 |
+
sd = subtags[1]
|
| 178 |
+
if sd in self.subdiv:
|
| 179 |
+
ext = self.subdiv[sd]
|
| 180 |
+
else:
|
| 181 |
+
ext = f"<Unknown subdivision: {ext}>"
|
| 182 |
+
else: # other extension subtags are not supported yet
|
| 183 |
+
ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
|
| 184 |
+
if not self.format["singleton"].fullmatch(subtag):
|
| 185 |
+
ext = f"<Invalid extension: {ext}>"
|
| 186 |
+
warn(ext)
|
| 187 |
+
lang["extension"] = ext
|
| 188 |
+
subtags = []
|
| 189 |
+
return lang
|
| 190 |
+
|
| 191 |
+
def name(self, tag):
|
| 192 |
+
"""
|
| 193 |
+
Convert a BCP-47 tag to a colon-separated string of subtag names
|
| 194 |
+
|
| 195 |
+
>>> from nltk.corpus import bcp47
|
| 196 |
+
>>> bcp47.name('ca-Latn-ES-valencia')
|
| 197 |
+
'Catalan: Latin: Spain: Valencian'
|
| 198 |
+
|
| 199 |
+
"""
|
| 200 |
+
for label in ["redundant", "grandfathered"]:
|
| 201 |
+
val = None
|
| 202 |
+
if tag in self.db[label]:
|
| 203 |
+
val = f"{self.db[label][tag]['Description']}"
|
| 204 |
+
note = f"The {tag!r} code is {label}"
|
| 205 |
+
elif tag in self.db["deprecated"][label]:
|
| 206 |
+
val = f"{self.db['deprecated'][label][tag]['Description']}"
|
| 207 |
+
note = f"The {tag!r} code is {label} and deprecated"
|
| 208 |
+
if "Preferred-Value" in self.db["deprecated"][label][tag]:
|
| 209 |
+
prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
|
| 210 |
+
note += f", prefer {self.val2str(prefer)!r}"
|
| 211 |
+
if val:
|
| 212 |
+
warn(note)
|
| 213 |
+
return val
|
| 214 |
+
try:
|
| 215 |
+
return self.lang2str(self.parse_tag(tag))
|
| 216 |
+
except:
|
| 217 |
+
warn(f"Tag {tag!r} was not recognized")
|
| 218 |
+
return None
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Plaintext Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""Corpus reader for the XML version of the British National Corpus."""
|
| 9 |
+
|
| 10 |
+
from nltk.corpus.reader.util import concat
|
| 11 |
+
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BNCCorpusReader(XMLCorpusReader):
|
| 15 |
+
r"""Corpus reader for the XML version of the British National Corpus.
|
| 16 |
+
|
| 17 |
+
For access to the complete XML data structure, use the ``xml()``
|
| 18 |
+
method. For access to simple word lists and tagged word lists, use
|
| 19 |
+
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
|
| 20 |
+
|
| 21 |
+
You can obtain the full version of the BNC corpus at
|
| 22 |
+
https://www.ota.ox.ac.uk/desc/2554
|
| 23 |
+
|
| 24 |
+
If you extracted the archive to a directory called `BNC`, then you can
|
| 25 |
+
instantiate the reader as::
|
| 26 |
+
|
| 27 |
+
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
|
| 28 |
+
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
def __init__(self, root, fileids, lazy=True):
|
| 32 |
+
XMLCorpusReader.__init__(self, root, fileids)
|
| 33 |
+
self._lazy = lazy
|
| 34 |
+
|
| 35 |
+
def words(self, fileids=None, strip_space=True, stem=False):
|
| 36 |
+
"""
|
| 37 |
+
:return: the given file(s) as a list of words
|
| 38 |
+
and punctuation symbols.
|
| 39 |
+
:rtype: list(str)
|
| 40 |
+
|
| 41 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 42 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 43 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 44 |
+
"""
|
| 45 |
+
return self._views(fileids, False, None, strip_space, stem)
|
| 46 |
+
|
| 47 |
+
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
|
| 48 |
+
"""
|
| 49 |
+
:return: the given file(s) as a list of tagged
|
| 50 |
+
words and punctuation symbols, encoded as tuples
|
| 51 |
+
``(word,tag)``.
|
| 52 |
+
:rtype: list(tuple(str,str))
|
| 53 |
+
|
| 54 |
+
:param c5: If true, then the tags used will be the more detailed
|
| 55 |
+
c5 tags. Otherwise, the simplified tags will be used.
|
| 56 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 57 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 58 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 59 |
+
"""
|
| 60 |
+
tag = "c5" if c5 else "pos"
|
| 61 |
+
return self._views(fileids, False, tag, strip_space, stem)
|
| 62 |
+
|
| 63 |
+
def sents(self, fileids=None, strip_space=True, stem=False):
|
| 64 |
+
"""
|
| 65 |
+
:return: the given file(s) as a list of
|
| 66 |
+
sentences or utterances, each encoded as a list of word
|
| 67 |
+
strings.
|
| 68 |
+
:rtype: list(list(str))
|
| 69 |
+
|
| 70 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 71 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 72 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 73 |
+
"""
|
| 74 |
+
return self._views(fileids, True, None, strip_space, stem)
|
| 75 |
+
|
| 76 |
+
def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
|
| 77 |
+
"""
|
| 78 |
+
:return: the given file(s) as a list of
|
| 79 |
+
sentences, each encoded as a list of ``(word,tag)`` tuples.
|
| 80 |
+
:rtype: list(list(tuple(str,str)))
|
| 81 |
+
|
| 82 |
+
:param c5: If true, then the tags used will be the more detailed
|
| 83 |
+
c5 tags. Otherwise, the simplified tags will be used.
|
| 84 |
+
:param strip_space: If true, then strip trailing spaces from
|
| 85 |
+
word tokens. Otherwise, leave the spaces on the tokens.
|
| 86 |
+
:param stem: If true, then use word stems instead of word strings.
|
| 87 |
+
"""
|
| 88 |
+
tag = "c5" if c5 else "pos"
|
| 89 |
+
return self._views(
|
| 90 |
+
fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
|
| 94 |
+
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
|
| 95 |
+
f = BNCWordView if self._lazy else self._words
|
| 96 |
+
return concat(
|
| 97 |
+
[
|
| 98 |
+
f(fileid, sent, tag, strip_space, stem)
|
| 99 |
+
for fileid in self.abspaths(fileids)
|
| 100 |
+
]
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
|
| 104 |
+
"""
|
| 105 |
+
Helper used to implement the view methods -- returns a list of
|
| 106 |
+
words or a list of sentences, optionally tagged.
|
| 107 |
+
|
| 108 |
+
:param fileid: The name of the underlying file.
|
| 109 |
+
:param bracket_sent: If true, include sentence bracketing.
|
| 110 |
+
:param tag: The name of the tagset to use, or None for no tags.
|
| 111 |
+
:param strip_space: If true, strip spaces from word tokens.
|
| 112 |
+
:param stem: If true, then substitute stems for words.
|
| 113 |
+
"""
|
| 114 |
+
result = []
|
| 115 |
+
|
| 116 |
+
xmldoc = ElementTree.parse(fileid).getroot()
|
| 117 |
+
for xmlsent in xmldoc.findall(".//s"):
|
| 118 |
+
sent = []
|
| 119 |
+
for xmlword in _all_xmlwords_in(xmlsent):
|
| 120 |
+
word = xmlword.text
|
| 121 |
+
if not word:
|
| 122 |
+
word = "" # fixes issue 337?
|
| 123 |
+
if strip_space or stem:
|
| 124 |
+
word = word.strip()
|
| 125 |
+
if stem:
|
| 126 |
+
word = xmlword.get("hw", word)
|
| 127 |
+
if tag == "c5":
|
| 128 |
+
word = (word, xmlword.get("c5"))
|
| 129 |
+
elif tag == "pos":
|
| 130 |
+
word = (word, xmlword.get("pos", xmlword.get("c5")))
|
| 131 |
+
sent.append(word)
|
| 132 |
+
if bracket_sent:
|
| 133 |
+
result.append(BNCSentence(xmlsent.attrib["n"], sent))
|
| 134 |
+
else:
|
| 135 |
+
result.extend(sent)
|
| 136 |
+
|
| 137 |
+
assert None not in result
|
| 138 |
+
return result
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _all_xmlwords_in(elt, result=None):
|
| 142 |
+
if result is None:
|
| 143 |
+
result = []
|
| 144 |
+
for child in elt:
|
| 145 |
+
if child.tag in ("c", "w"):
|
| 146 |
+
result.append(child)
|
| 147 |
+
else:
|
| 148 |
+
_all_xmlwords_in(child, result)
|
| 149 |
+
return result
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class BNCSentence(list):
|
| 153 |
+
"""
|
| 154 |
+
A list of words, augmented by an attribute ``num`` used to record
|
| 155 |
+
the sentence identifier (the ``n`` attribute from the XML).
|
| 156 |
+
"""
|
| 157 |
+
|
| 158 |
+
def __init__(self, num, items):
|
| 159 |
+
self.num = num
|
| 160 |
+
list.__init__(self, items)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Besides streaming words/sentences, the constructor eagerly reads the
    TEI header of the file and exposes document metadata via the
    ``title``, ``author``, ``editor`` and ``resps`` attributes.
    """

    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Select whole <s> elements when sentence bracketing is requested;
        # otherwise select individual word (<w>) / punctuation (<c>)
        # elements, which may be nested one level inside other elements.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  This consumes the stream up to the end
        # of the <teiHeader> element and fills in the metadata attributes.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context, since the header read above advanced it.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Set up some metadata!  Multiple elements of the same kind are
        # joined with newlines.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(title.text.strip() for title in titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(author.text.strip() for author in authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(editor.text.strip() for editor in editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch on the mode chosen at construction time: whole
        # sentences, or individual word/punctuation tokens.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # elt.text may be None for empty elements; normalize to "".
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # The "hw" attribute carries the headword (stem); fall back to
            # the surface form when it is absent.
            word = elt.get("hw", word)
        if self._tag == "c5":
            word = (word, elt.get("c5"))
        elif self._tag == "pos":
            # Simplified POS tag; fall back to the C5 tag when missing.
            word = (word, elt.get("pos", elt.get("c5")))
        return word

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            # These wrapper elements (multiword, highlight, correction,
            # truncation) contain word elements one level down.
            if child.tag in ("mw", "hi", "corr", "trunc"):
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ("w", "c"):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        # "n" is the BNC sentence number.
        return BNCSentence(elt.attrib["n"], sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Penn Treebank Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
from nltk.corpus.reader.api import *
|
| 15 |
+
from nltk.corpus.reader.util import *
|
| 16 |
+
from nltk.tag import map_tag
|
| 17 |
+
from nltk.tree import Tree
|
| 18 |
+
|
| 19 |
+
# we use [^\s()]+ instead of \S+? to avoid matching ()
|
| 20 |
+
# (position tag word) leaves — used by AlpinoCorpusReader to restore order.
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# (tag word) leaves.
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# word part of a (tag word) leaf.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# a tree whose top-level node label is empty: "( (".
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block of parse trees, using the configured strategy."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # Raise instead of `assert 0` so the check survives `python -O`.
            raise ValueError("bad block type %r" % (self._detect_blocks,))

    def _normalize(self, t):
        """Canonicalize a raw tree string before parsing."""
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one tree string, attempting recovery on malformed input."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: the original called ``Tree(...)`` with a
                        # single string argument, which raises TypeError
                        # (uncaught here), so this recovery path could never
                        # succeed; ``Tree.fromstring`` is the parser.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the (word, tag) pairs of one tree string, optionally
        mapped into another tagset."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the words of one tree string."""
        return WORD.findall(self._normalize(t))
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    # Each accessor below first maps the (fileids, categories) pair onto a
    # plain fileid list, then delegates to the BracketParseCorpusReader
    # implementation of the same name.

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_words(selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_sents(selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_paras(selected, tagset)

    def parsed_words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_words(selected)

    def parsed_sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_sents(selected)

    def parsed_paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return super().parsed_paras(selected)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding of the corpus file (Latin-1 by default).
        :param tagset: Tagset name used for normalizing POS tags; see
            ``BracketParseCorpusReader``.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUGFIX: the original pattern read ``<node. *?begin`` (dot
            # followed by a space), which only matched when ``begin`` was
            # effectively the first attribute of the node; this mirrors
            # the ``<node .*?`` anchoring of the unordered branch below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return (word, tag) pairs restored to true sentence order."""
        # Collect (position, word, tag) triples, then sort on position so
        # punctuation displaced in the xml tree comes back in order.
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Categorized Sentences Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader structured for corpora that contain one instance on each row.
|
| 10 |
+
This CorpusReader is specifically used for the Subjectivity Dataset and the
|
| 11 |
+
Sentence Polarity Dataset.
|
| 12 |
+
|
| 13 |
+
- Subjectivity Dataset information -
|
| 14 |
+
|
| 15 |
+
Authors: Bo Pang and Lillian Lee.
|
| 16 |
+
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
| 17 |
+
|
| 18 |
+
Distributed with permission.
|
| 19 |
+
|
| 20 |
+
Related papers:
|
| 21 |
+
|
| 22 |
+
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
|
| 23 |
+
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
|
| 24 |
+
2004.
|
| 25 |
+
|
| 26 |
+
- Sentence Polarity Dataset information -
|
| 27 |
+
|
| 28 |
+
Authors: Bo Pang and Lillian Lee.
|
| 29 |
+
Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
|
| 30 |
+
|
| 31 |
+
Related papers:
|
| 32 |
+
|
| 33 |
+
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
|
| 34 |
+
sentiment categorization with respect to rating scales". Proceedings of the
|
| 35 |
+
ACL, 2005.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
from nltk.corpus.reader.api import *
|
| 39 |
+
from nltk.tokenize import *
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _selected_fileids(self, fileids, categories):
        """Map a (fileids, categories) pair onto a plain list of fileids.

        Shared by ``sents()`` and ``words()`` (the logic was duplicated)."""
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._selected_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._selected_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # BUGFIX(perf): the original `continue`d here; after EOF,
                # readline() keeps returning "", so the loop just spun
                # through its remaining iterations.  Stop the block instead
                # (output is identical).
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 3 |
+
# Author: Masato Hagiwara <hagisan@gmail.com>
|
| 4 |
+
# URL: <https://www.nltk.org/>
|
| 5 |
+
# For license information, see LICENSE.TXT
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
from nltk.corpus.reader import util
|
| 10 |
+
from nltk.corpus.reader.api import *
|
| 11 |
+
from nltk.corpus.reader.util import *
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ChasenCorpusReader(CorpusReader):
    """Reader for ChaSen-format morphologically annotated corpora
    (e.g. the JEITA corpus)."""

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: The root directory of the corpus.
        :param fileids: A list or regexp specifying the fileids.
        :param encoding: The file encoding.
        :param sent_splitter: optional predicate on a (word, annotation)
            pair marking an extra sentence boundary; passed through to
            ``ChasenCorpusView``.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Build one ChasenCorpusView per file and concatenate them lazily.

        The six public accessors below differ only in these three flags, so
        the construction is factored out here."""
        return concat(
            [
                ChasenCorpusView(
                    fileid, enc, tagged, group_by_sent, group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Flat list of words."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Flat list of (word, annotation) pairs."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """List of sentences, each a list of words."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """List of sentences, each a list of (word, annotation) pairs."""
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """List of paragraphs, each a list of sentences of words."""
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """List of paragraphs, each a list of tagged sentences."""
        return self._views(fileids, True, True, True)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # tagged: keep (word, annotation) pairs; otherwise yield bare words.
        self._tagged = tagged
        # group_by_sent / group_by_para: control nesting of the result.
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate on a (word, annotation) pair that marks a
        # sentence boundary in addition to the "EOS" lines.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # Paragraphs are delimited by lines consisting of "EOS".
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):

            para = []

            sent = []
            for line in para_str.splitlines():

                _eos = line.strip() == "EOS"
                _cells = line.split("\t")
                # First cell is the surface form; the remaining tab-separated
                # ChaSen fields are kept joined as a single annotation string.
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)

                # Close the current sentence on an EOS line, or when the
                # user-supplied splitter fires on this token.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []

            # Flush a trailing sentence that was not terminated by EOS.
            if len(sent) > 0:
                if not self._tagged:
                    sent = [w for (w, t) in sent]

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def demo():
    """Print a small sample of words and tagged sentences from the JEITA corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus = LazyCorpusLoader(
        "jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8"
    )

    # A slash-separated run of surface forms.
    print("/".join(corpus.words()[22100:22140]))

    # A few sentences rendered as word/POS lines, separated by EOS markers.
    rendered = []
    for sent in corpus.tagged_sents()[2170:2173]:
        lines = ["{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent]
        rendered.append("\n".join(lines))
    print("\nEOS\n".join(rendered))
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test():
    """Sanity check: annotations read from the JEITA corpus decode to ``str``."""
    from nltk.corpus.util import LazyCorpusLoader

    corpus = LazyCorpusLoader(
        "jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8"
    )

    first_word, first_annotation = corpus.tagged_words()[0]
    assert isinstance(first_annotation, str)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# Run the demo and the sanity test when executed as a script.
if __name__ == "__main__":
    demo()
    test()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PanLex Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: David Kamholz <kamholz@panlex.org>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
|
| 10 |
+
as an SQLite database. See the README.txt in the panlex_lite corpus directory
|
| 11 |
+
for more information on PanLex Lite.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sqlite3
|
| 16 |
+
|
| 17 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PanLexLiteCorpusReader(CorpusReader):
|
| 21 |
+
MEANING_Q = """
|
| 22 |
+
SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
|
| 23 |
+
FROM dnx
|
| 24 |
+
JOIN ex ON (ex.ex = dnx.ex)
|
| 25 |
+
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
|
| 26 |
+
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
|
| 27 |
+
WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
|
| 28 |
+
ORDER BY dnx2.uq DESC
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
TRANSLATION_Q = """
|
| 32 |
+
SELECT s.tt, sum(s.uq) AS trq FROM (
|
| 33 |
+
SELECT ex2.tt, max(dnx.uq) AS uq
|
| 34 |
+
FROM dnx
|
| 35 |
+
JOIN ex ON (ex.ex = dnx.ex)
|
| 36 |
+
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
|
| 37 |
+
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
|
| 38 |
+
WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
|
| 39 |
+
GROUP BY ex2.tt, dnx.ui
|
| 40 |
+
) s
|
| 41 |
+
GROUP BY s.tt
|
| 42 |
+
ORDER BY trq DESC, s.tt
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(self, root):
|
| 46 |
+
self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
|
| 47 |
+
|
| 48 |
+
self._uid_lv = {}
|
| 49 |
+
self._lv_uid = {}
|
| 50 |
+
|
| 51 |
+
for row in self._c.execute("SELECT uid, lv FROM lv"):
|
| 52 |
+
self._uid_lv[row[0]] = row[1]
|
| 53 |
+
self._lv_uid[row[1]] = row[0]
|
| 54 |
+
|
| 55 |
+
def language_varieties(self, lc=None):
|
| 56 |
+
"""
|
| 57 |
+
Return a list of PanLex language varieties.
|
| 58 |
+
|
| 59 |
+
:param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
|
| 60 |
+
by this code. If unspecified, all varieties are returned.
|
| 61 |
+
:return: the specified language varieties as a list of tuples. The first
|
| 62 |
+
element is the language variety's seven-character uniform identifier,
|
| 63 |
+
and the second element is its default name.
|
| 64 |
+
:rtype: list(tuple)
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
if lc is None:
|
| 68 |
+
return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
|
| 69 |
+
else:
|
| 70 |
+
return self._c.execute(
|
| 71 |
+
"SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
|
| 72 |
+
).fetchall()
|
| 73 |
+
|
| 74 |
+
def meanings(self, expr_uid, expr_tt):
|
| 75 |
+
"""
|
| 76 |
+
Return a list of meanings for an expression.
|
| 77 |
+
|
| 78 |
+
:param expr_uid: the expression's language variety, as a seven-character
|
| 79 |
+
uniform identifier.
|
| 80 |
+
:param expr_tt: the expression's text.
|
| 81 |
+
:return: a list of Meaning objects.
|
| 82 |
+
:rtype: list(Meaning)
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
expr_lv = self._uid_lv[expr_uid]
|
| 86 |
+
|
| 87 |
+
mn_info = {}
|
| 88 |
+
|
| 89 |
+
for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
|
| 90 |
+
mn = i[0]
|
| 91 |
+
uid = self._lv_uid[i[5]]
|
| 92 |
+
|
| 93 |
+
if not mn in mn_info:
|
| 94 |
+
mn_info[mn] = {
|
| 95 |
+
"uq": i[1],
|
| 96 |
+
"ap": i[2],
|
| 97 |
+
"ui": i[3],
|
| 98 |
+
"ex": {expr_uid: [expr_tt]},
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
if not uid in mn_info[mn]["ex"]:
|
| 102 |
+
mn_info[mn]["ex"][uid] = []
|
| 103 |
+
|
| 104 |
+
mn_info[mn]["ex"][uid].append(i[4])
|
| 105 |
+
|
| 106 |
+
return [Meaning(mn, mn_info[mn]) for mn in mn_info]
|
| 107 |
+
|
| 108 |
+
def translations(self, from_uid, from_tt, to_uid):
|
| 109 |
+
"""
|
| 110 |
+
Return a list of translations for an expression into a single language
|
| 111 |
+
variety.
|
| 112 |
+
|
| 113 |
+
:param from_uid: the source expression's language variety, as a
|
| 114 |
+
seven-character uniform identifier.
|
| 115 |
+
:param from_tt: the source expression's text.
|
| 116 |
+
:param to_uid: the target language variety, as a seven-character
|
| 117 |
+
uniform identifier.
|
| 118 |
+
:return: a list of translation tuples. The first element is the expression
|
| 119 |
+
text and the second element is the translation quality.
|
| 120 |
+
:rtype: list(tuple)
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
from_lv = self._uid_lv[from_uid]
|
| 124 |
+
to_lv = self._uid_lv[to_uid]
|
| 125 |
+
|
| 126 |
+
return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class Meaning(dict):
|
| 130 |
+
"""
|
| 131 |
+
Represents a single PanLex meaning. A meaning is a translation set derived
|
| 132 |
+
from a single source.
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
+
def __init__(self, mn, attr):
|
| 136 |
+
super().__init__(**attr)
|
| 137 |
+
self["mn"] = mn
|
| 138 |
+
|
| 139 |
+
def id(self):
|
| 140 |
+
"""
|
| 141 |
+
:return: the meaning's id.
|
| 142 |
+
:rtype: int
|
| 143 |
+
"""
|
| 144 |
+
return self["mn"]
|
| 145 |
+
|
| 146 |
+
def quality(self):
|
| 147 |
+
"""
|
| 148 |
+
:return: the meaning's source's quality (0=worst, 9=best).
|
| 149 |
+
:rtype: int
|
| 150 |
+
"""
|
| 151 |
+
return self["uq"]
|
| 152 |
+
|
| 153 |
+
def source(self):
|
| 154 |
+
"""
|
| 155 |
+
:return: the meaning's source id.
|
| 156 |
+
:rtype: int
|
| 157 |
+
"""
|
| 158 |
+
return self["ap"]
|
| 159 |
+
|
| 160 |
+
def source_group(self):
|
| 161 |
+
"""
|
| 162 |
+
:return: the meaning's source group id.
|
| 163 |
+
:rtype: int
|
| 164 |
+
"""
|
| 165 |
+
return self["ui"]
|
| 166 |
+
|
| 167 |
+
def expressions(self):
|
| 168 |
+
"""
|
| 169 |
+
:return: the meaning's expressions as a dictionary whose keys are language
|
| 170 |
+
variety uniform identifiers and whose values are lists of expression
|
| 171 |
+
texts.
|
| 172 |
+
:rtype: dict
|
| 173 |
+
"""
|
| 174 |
+
return self["ex"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Word List Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from collections import defaultdict, namedtuple
|
| 12 |
+
|
| 13 |
+
from nltk.corpus.reader.api import *
|
| 14 |
+
from nltk.corpus.reader.util import *
|
| 15 |
+
from nltk.corpus.reader.wordlist import WordListCorpusReader
|
| 16 |
+
from nltk.tokenize import line_tokenize
|
| 17 |
+
|
| 18 |
+
PanlexLanguage = namedtuple(
|
| 19 |
+
"PanlexLanguage",
|
| 20 |
+
[
|
| 21 |
+
"panlex_uid", # (1) PanLex UID
|
| 22 |
+
"iso639", # (2) ISO 639 language code
|
| 23 |
+
"iso639_type", # (3) ISO 639 language type, see README
|
| 24 |
+
"script", # (4) normal scripts of expressions
|
| 25 |
+
"name", # (5) PanLex default name
|
| 26 |
+
"langvar_uid", # (6) UID of the language variety in which the default name is an expression
|
| 27 |
+
],
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class PanlexSwadeshCorpusReader(WordListCorpusReader):
|
| 32 |
+
"""
|
| 33 |
+
This is a class to read the PanLex Swadesh list from
|
| 34 |
+
|
| 35 |
+
David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
|
| 36 |
+
PanLex: Building a Resource for Panlingual Lexical Translation.
|
| 37 |
+
In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
|
| 38 |
+
|
| 39 |
+
License: CC0 1.0 Universal
|
| 40 |
+
https://creativecommons.org/publicdomain/zero/1.0/legalcode
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, *args, **kwargs):
|
| 44 |
+
super().__init__(*args, **kwargs)
|
| 45 |
+
# Find the swadesh size using the fileids' path.
|
| 46 |
+
self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
|
| 47 |
+
self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
|
| 48 |
+
self._macro_langauges = self.get_macrolanguages()
|
| 49 |
+
|
| 50 |
+
def license(self):
|
| 51 |
+
return "CC0 1.0 Universal"
|
| 52 |
+
|
| 53 |
+
def language_codes(self):
|
| 54 |
+
return self._languages.keys()
|
| 55 |
+
|
| 56 |
+
def get_languages(self):
|
| 57 |
+
for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
|
| 58 |
+
if not line.strip(): # Skip empty lines.
|
| 59 |
+
continue
|
| 60 |
+
yield PanlexLanguage(*line.strip().split("\t"))
|
| 61 |
+
|
| 62 |
+
def get_macrolanguages(self):
|
| 63 |
+
macro_langauges = defaultdict(list)
|
| 64 |
+
for lang in self._languages.values():
|
| 65 |
+
macro_langauges[lang.iso639].append(lang.panlex_uid)
|
| 66 |
+
return macro_langauges
|
| 67 |
+
|
| 68 |
+
def words_by_lang(self, lang_code):
|
| 69 |
+
"""
|
| 70 |
+
:return: a list of list(str)
|
| 71 |
+
"""
|
| 72 |
+
fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
|
| 73 |
+
return [concept.split("\t") for concept in self.words(fileid)]
|
| 74 |
+
|
| 75 |
+
def words_by_iso639(self, iso63_code):
|
| 76 |
+
"""
|
| 77 |
+
:return: a list of list(str)
|
| 78 |
+
"""
|
| 79 |
+
fileids = [
|
| 80 |
+
f"swadesh{self.swadesh_size}/{lang_code}.txt"
|
| 81 |
+
for lang_code in self._macro_langauges[iso63_code]
|
| 82 |
+
]
|
| 83 |
+
return [
|
| 84 |
+
concept.split("\t") for fileid in fileids for concept in self.words(fileid)
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
def entries(self, fileids=None):
|
| 88 |
+
"""
|
| 89 |
+
:return: a tuple of words for the specified fileids.
|
| 90 |
+
"""
|
| 91 |
+
if not fileids:
|
| 92 |
+
fileids = self.fileids()
|
| 93 |
+
|
| 94 |
+
wordlists = [self.words(f) for f in fileids]
|
| 95 |
+
return list(zip(*wordlists))
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit:
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from nltk.corpus.reader.api import *
|
| 9 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
| 10 |
+
|
| 11 |
+
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
|
| 12 |
+
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
|
| 13 |
+
|
| 14 |
+
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
|
| 15 |
+
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
|
| 16 |
+
|
| 17 |
+
TYPE = re.compile(r'type="(.*?)"')
|
| 18 |
+
ANA = re.compile(r'ana="(.*?)"')
|
| 19 |
+
|
| 20 |
+
TEXTID = re.compile(r'text id="(.*?)"')
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TEICorpusView(StreamBackedCorpusView):
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
corpus_file,
|
| 27 |
+
tagged,
|
| 28 |
+
group_by_sent,
|
| 29 |
+
group_by_para,
|
| 30 |
+
tagset=None,
|
| 31 |
+
head_len=0,
|
| 32 |
+
textids=None,
|
| 33 |
+
):
|
| 34 |
+
|
| 35 |
+
self._tagged = tagged
|
| 36 |
+
self._textids = textids
|
| 37 |
+
|
| 38 |
+
self._group_by_sent = group_by_sent
|
| 39 |
+
self._group_by_para = group_by_para
|
| 40 |
+
# WARNING -- skip header
|
| 41 |
+
StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)
|
| 42 |
+
|
| 43 |
+
_pagesize = 4096
|
| 44 |
+
|
| 45 |
+
def read_block(self, stream):
|
| 46 |
+
block = stream.readlines(self._pagesize)
|
| 47 |
+
block = concat(block)
|
| 48 |
+
while (block.count("<text id") > block.count("</text>")) or block.count(
|
| 49 |
+
"<text id"
|
| 50 |
+
) == 0:
|
| 51 |
+
tmp = stream.readline()
|
| 52 |
+
if len(tmp) <= 0:
|
| 53 |
+
break
|
| 54 |
+
block += tmp
|
| 55 |
+
|
| 56 |
+
block = block.replace("\n", "")
|
| 57 |
+
|
| 58 |
+
textids = TEXTID.findall(block)
|
| 59 |
+
if self._textids:
|
| 60 |
+
for tid in textids:
|
| 61 |
+
if tid not in self._textids:
|
| 62 |
+
beg = block.find(tid) - 1
|
| 63 |
+
end = block[beg:].find("</text>") + len("</text>")
|
| 64 |
+
block = block[:beg] + block[beg + end :]
|
| 65 |
+
|
| 66 |
+
output = []
|
| 67 |
+
for para_str in PARA.findall(block):
|
| 68 |
+
para = []
|
| 69 |
+
for sent_str in SENT.findall(para_str):
|
| 70 |
+
if not self._tagged:
|
| 71 |
+
sent = WORD.findall(sent_str)
|
| 72 |
+
else:
|
| 73 |
+
sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
|
| 74 |
+
if self._group_by_sent:
|
| 75 |
+
para.append(sent)
|
| 76 |
+
else:
|
| 77 |
+
para.extend(sent)
|
| 78 |
+
if self._group_by_para:
|
| 79 |
+
output.append(para)
|
| 80 |
+
else:
|
| 81 |
+
output.extend(para)
|
| 82 |
+
return output
|
| 83 |
+
|
| 84 |
+
def _parse_tag(self, tag_word_tuple):
|
| 85 |
+
(tag, word) = tag_word_tuple
|
| 86 |
+
if tag.startswith("w"):
|
| 87 |
+
tag = ANA.search(tag).group(1)
|
| 88 |
+
else: # tag.startswith('c')
|
| 89 |
+
tag = TYPE.search(tag).group(1)
|
| 90 |
+
return word, tag
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
|
| 94 |
+
head_len = 2770
|
| 95 |
+
|
| 96 |
+
def __init__(self, *args, **kwargs):
|
| 97 |
+
if "textid_file" in kwargs:
|
| 98 |
+
self._textids = kwargs["textid_file"]
|
| 99 |
+
else:
|
| 100 |
+
self._textids = None
|
| 101 |
+
|
| 102 |
+
XMLCorpusReader.__init__(self, *args)
|
| 103 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 104 |
+
|
| 105 |
+
self._init_textids()
|
| 106 |
+
|
| 107 |
+
def _init_textids(self):
|
| 108 |
+
self._f2t = defaultdict(list)
|
| 109 |
+
self._t2f = defaultdict(list)
|
| 110 |
+
if self._textids is not None:
|
| 111 |
+
with open(self._textids) as fp:
|
| 112 |
+
for line in fp:
|
| 113 |
+
line = line.strip()
|
| 114 |
+
file_id, text_ids = line.split(" ", 1)
|
| 115 |
+
if file_id not in self.fileids():
|
| 116 |
+
raise ValueError(
|
| 117 |
+
"In text_id mapping file %s: %s not found"
|
| 118 |
+
% (self._textids, file_id)
|
| 119 |
+
)
|
| 120 |
+
for text_id in text_ids.split(self._delimiter):
|
| 121 |
+
self._add_textids(file_id, text_id)
|
| 122 |
+
|
| 123 |
+
def _add_textids(self, file_id, text_id):
|
| 124 |
+
self._f2t[file_id].append(text_id)
|
| 125 |
+
self._t2f[text_id].append(file_id)
|
| 126 |
+
|
| 127 |
+
def _resolve(self, fileids, categories, textids=None):
|
| 128 |
+
tmp = None
|
| 129 |
+
if (
|
| 130 |
+
len(
|
| 131 |
+
list(
|
| 132 |
+
filter(
|
| 133 |
+
lambda accessor: accessor is None,
|
| 134 |
+
(fileids, categories, textids),
|
| 135 |
+
)
|
| 136 |
+
)
|
| 137 |
+
)
|
| 138 |
+
!= 1
|
| 139 |
+
):
|
| 140 |
+
|
| 141 |
+
raise ValueError(
|
| 142 |
+
"Specify exactly one of: fileids, " "categories or textids"
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
if fileids is not None:
|
| 146 |
+
return fileids, None
|
| 147 |
+
|
| 148 |
+
if categories is not None:
|
| 149 |
+
return self.fileids(categories), None
|
| 150 |
+
|
| 151 |
+
if textids is not None:
|
| 152 |
+
if isinstance(textids, str):
|
| 153 |
+
textids = [textids]
|
| 154 |
+
files = sum((self._t2f[t] for t in textids), [])
|
| 155 |
+
tdict = dict()
|
| 156 |
+
for f in files:
|
| 157 |
+
tdict[f] = set(self._f2t[f]) & set(textids)
|
| 158 |
+
return files, tdict
|
| 159 |
+
|
| 160 |
+
def decode_tag(self, tag):
|
| 161 |
+
# to be implemented
|
| 162 |
+
return tag
|
| 163 |
+
|
| 164 |
+
def textids(self, fileids=None, categories=None):
|
| 165 |
+
"""
|
| 166 |
+
In the pl196x corpus each category is stored in single
|
| 167 |
+
file and thus both methods provide identical functionality. In order
|
| 168 |
+
to accommodate finer granularity, a non-standard textids() method was
|
| 169 |
+
implemented. All the main functions can be supplied with a list
|
| 170 |
+
of required chunks---giving much more control to the user.
|
| 171 |
+
"""
|
| 172 |
+
fileids, _ = self._resolve(fileids, categories)
|
| 173 |
+
if fileids is None:
|
| 174 |
+
return sorted(self._t2f)
|
| 175 |
+
|
| 176 |
+
if isinstance(fileids, str):
|
| 177 |
+
fileids = [fileids]
|
| 178 |
+
return sorted(sum((self._f2t[d] for d in fileids), []))
|
| 179 |
+
|
| 180 |
+
def words(self, fileids=None, categories=None, textids=None):
|
| 181 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 182 |
+
if fileids is None:
|
| 183 |
+
fileids = self._fileids
|
| 184 |
+
elif isinstance(fileids, str):
|
| 185 |
+
fileids = [fileids]
|
| 186 |
+
|
| 187 |
+
if textids:
|
| 188 |
+
return concat(
|
| 189 |
+
[
|
| 190 |
+
TEICorpusView(
|
| 191 |
+
self.abspath(fileid),
|
| 192 |
+
False,
|
| 193 |
+
False,
|
| 194 |
+
False,
|
| 195 |
+
head_len=self.head_len,
|
| 196 |
+
textids=textids[fileid],
|
| 197 |
+
)
|
| 198 |
+
for fileid in fileids
|
| 199 |
+
]
|
| 200 |
+
)
|
| 201 |
+
else:
|
| 202 |
+
return concat(
|
| 203 |
+
[
|
| 204 |
+
TEICorpusView(
|
| 205 |
+
self.abspath(fileid),
|
| 206 |
+
False,
|
| 207 |
+
False,
|
| 208 |
+
False,
|
| 209 |
+
head_len=self.head_len,
|
| 210 |
+
)
|
| 211 |
+
for fileid in fileids
|
| 212 |
+
]
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
def sents(self, fileids=None, categories=None, textids=None):
|
| 216 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 217 |
+
if fileids is None:
|
| 218 |
+
fileids = self._fileids
|
| 219 |
+
elif isinstance(fileids, str):
|
| 220 |
+
fileids = [fileids]
|
| 221 |
+
|
| 222 |
+
if textids:
|
| 223 |
+
return concat(
|
| 224 |
+
[
|
| 225 |
+
TEICorpusView(
|
| 226 |
+
self.abspath(fileid),
|
| 227 |
+
False,
|
| 228 |
+
True,
|
| 229 |
+
False,
|
| 230 |
+
head_len=self.head_len,
|
| 231 |
+
textids=textids[fileid],
|
| 232 |
+
)
|
| 233 |
+
for fileid in fileids
|
| 234 |
+
]
|
| 235 |
+
)
|
| 236 |
+
else:
|
| 237 |
+
return concat(
|
| 238 |
+
[
|
| 239 |
+
TEICorpusView(
|
| 240 |
+
self.abspath(fileid), False, True, False, head_len=self.head_len
|
| 241 |
+
)
|
| 242 |
+
for fileid in fileids
|
| 243 |
+
]
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
def paras(self, fileids=None, categories=None, textids=None):
|
| 247 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 248 |
+
if fileids is None:
|
| 249 |
+
fileids = self._fileids
|
| 250 |
+
elif isinstance(fileids, str):
|
| 251 |
+
fileids = [fileids]
|
| 252 |
+
|
| 253 |
+
if textids:
|
| 254 |
+
return concat(
|
| 255 |
+
[
|
| 256 |
+
TEICorpusView(
|
| 257 |
+
self.abspath(fileid),
|
| 258 |
+
False,
|
| 259 |
+
True,
|
| 260 |
+
True,
|
| 261 |
+
head_len=self.head_len,
|
| 262 |
+
textids=textids[fileid],
|
| 263 |
+
)
|
| 264 |
+
for fileid in fileids
|
| 265 |
+
]
|
| 266 |
+
)
|
| 267 |
+
else:
|
| 268 |
+
return concat(
|
| 269 |
+
[
|
| 270 |
+
TEICorpusView(
|
| 271 |
+
self.abspath(fileid), False, True, True, head_len=self.head_len
|
| 272 |
+
)
|
| 273 |
+
for fileid in fileids
|
| 274 |
+
]
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
def tagged_words(self, fileids=None, categories=None, textids=None):
|
| 278 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 279 |
+
if fileids is None:
|
| 280 |
+
fileids = self._fileids
|
| 281 |
+
elif isinstance(fileids, str):
|
| 282 |
+
fileids = [fileids]
|
| 283 |
+
|
| 284 |
+
if textids:
|
| 285 |
+
return concat(
|
| 286 |
+
[
|
| 287 |
+
TEICorpusView(
|
| 288 |
+
self.abspath(fileid),
|
| 289 |
+
True,
|
| 290 |
+
False,
|
| 291 |
+
False,
|
| 292 |
+
head_len=self.head_len,
|
| 293 |
+
textids=textids[fileid],
|
| 294 |
+
)
|
| 295 |
+
for fileid in fileids
|
| 296 |
+
]
|
| 297 |
+
)
|
| 298 |
+
else:
|
| 299 |
+
return concat(
|
| 300 |
+
[
|
| 301 |
+
TEICorpusView(
|
| 302 |
+
self.abspath(fileid), True, False, False, head_len=self.head_len
|
| 303 |
+
)
|
| 304 |
+
for fileid in fileids
|
| 305 |
+
]
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
def tagged_sents(self, fileids=None, categories=None, textids=None):
|
| 309 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 310 |
+
if fileids is None:
|
| 311 |
+
fileids = self._fileids
|
| 312 |
+
elif isinstance(fileids, str):
|
| 313 |
+
fileids = [fileids]
|
| 314 |
+
|
| 315 |
+
if textids:
|
| 316 |
+
return concat(
|
| 317 |
+
[
|
| 318 |
+
TEICorpusView(
|
| 319 |
+
self.abspath(fileid),
|
| 320 |
+
True,
|
| 321 |
+
True,
|
| 322 |
+
False,
|
| 323 |
+
head_len=self.head_len,
|
| 324 |
+
textids=textids[fileid],
|
| 325 |
+
)
|
| 326 |
+
for fileid in fileids
|
| 327 |
+
]
|
| 328 |
+
)
|
| 329 |
+
else:
|
| 330 |
+
return concat(
|
| 331 |
+
[
|
| 332 |
+
TEICorpusView(
|
| 333 |
+
self.abspath(fileid), True, True, False, head_len=self.head_len
|
| 334 |
+
)
|
| 335 |
+
for fileid in fileids
|
| 336 |
+
]
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
def tagged_paras(self, fileids=None, categories=None, textids=None):
|
| 340 |
+
fileids, textids = self._resolve(fileids, categories, textids)
|
| 341 |
+
if fileids is None:
|
| 342 |
+
fileids = self._fileids
|
| 343 |
+
elif isinstance(fileids, str):
|
| 344 |
+
fileids = [fileids]
|
| 345 |
+
|
| 346 |
+
if textids:
|
| 347 |
+
return concat(
|
| 348 |
+
[
|
| 349 |
+
TEICorpusView(
|
| 350 |
+
self.abspath(fileid),
|
| 351 |
+
True,
|
| 352 |
+
True,
|
| 353 |
+
True,
|
| 354 |
+
head_len=self.head_len,
|
| 355 |
+
textids=textids[fileid],
|
| 356 |
+
)
|
| 357 |
+
for fileid in fileids
|
| 358 |
+
]
|
| 359 |
+
)
|
| 360 |
+
else:
|
| 361 |
+
return concat(
|
| 362 |
+
[
|
| 363 |
+
TEICorpusView(
|
| 364 |
+
self.abspath(fileid), True, True, True, head_len=self.head_len
|
| 365 |
+
)
|
| 366 |
+
for fileid in fileids
|
| 367 |
+
]
|
| 368 |
+
)
|
| 369 |
+
|
| 370 |
+
def xml(self, fileids=None, categories=None):
|
| 371 |
+
fileids, _ = self._resolve(fileids, categories)
|
| 372 |
+
if len(fileids) == 1:
|
| 373 |
+
return XMLCorpusReader.xml(self, fileids[0])
|
| 374 |
+
else:
|
| 375 |
+
raise TypeError("Expected a single file")
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Plaintext Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Nitin Madnani <nmadnani@umiacs.umd.edu>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
A reader for corpora that consist of plaintext documents.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import nltk.data
|
| 15 |
+
from nltk.corpus.reader.api import *
|
| 16 |
+
from nltk.corpus.reader.util import *
|
| 17 |
+
from nltk.tokenize import *
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PlaintextCorpusReader(CorpusReader):
|
| 21 |
+
"""
|
| 22 |
+
Reader for corpora that consist of plaintext documents. Paragraphs
|
| 23 |
+
are assumed to be split using blank lines. Sentences and words can
|
| 24 |
+
be tokenized using the default tokenizers, or by custom tokenizers
|
| 25 |
+
specified as parameters to the constructor.
|
| 26 |
+
|
| 27 |
+
This corpus reader can be customized (e.g., to skip preface
|
| 28 |
+
sections of specific document formats) by creating a subclass and
|
| 29 |
+
overriding the ``CorpusView`` class variable.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
CorpusView = StreamBackedCorpusView
|
| 33 |
+
"""The corpus view class used by this reader. Subclasses of
|
| 34 |
+
``PlaintextCorpusReader`` may specify alternative corpus view
|
| 35 |
+
classes (e.g., to skip the preface sections of documents.)"""
|
| 36 |
+
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
root,
|
| 40 |
+
fileids,
|
| 41 |
+
word_tokenizer=WordPunctTokenizer(),
|
| 42 |
+
sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
|
| 43 |
+
para_block_reader=read_blankline_block,
|
| 44 |
+
encoding="utf8",
|
| 45 |
+
):
|
| 46 |
+
r"""
|
| 47 |
+
Construct a new plaintext corpus reader for a set of documents
|
| 48 |
+
located at the given root directory. Example usage:
|
| 49 |
+
|
| 50 |
+
>>> root = '/usr/local/share/nltk_data/corpora/webtext/'
|
| 51 |
+
>>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
|
| 52 |
+
|
| 53 |
+
:param root: The root directory for this corpus.
|
| 54 |
+
:param fileids: A list or regexp specifying the fileids in this corpus.
|
| 55 |
+
:param word_tokenizer: Tokenizer for breaking sentences or
|
| 56 |
+
paragraphs into words.
|
| 57 |
+
:param sent_tokenizer: Tokenizer for breaking paragraphs
|
| 58 |
+
into words.
|
| 59 |
+
:param para_block_reader: The block reader used to divide the
|
| 60 |
+
corpus into paragraph blocks.
|
| 61 |
+
"""
|
| 62 |
+
CorpusReader.__init__(self, root, fileids, encoding)
|
| 63 |
+
self._word_tokenizer = word_tokenizer
|
| 64 |
+
self._sent_tokenizer = sent_tokenizer
|
| 65 |
+
self._para_block_reader = para_block_reader
|
| 66 |
+
|
| 67 |
+
def words(self, fileids=None):
|
| 68 |
+
"""
|
| 69 |
+
:return: the given file(s) as a list of words
|
| 70 |
+
and punctuation symbols.
|
| 71 |
+
:rtype: list(str)
|
| 72 |
+
"""
|
| 73 |
+
return concat(
|
| 74 |
+
[
|
| 75 |
+
self.CorpusView(path, self._read_word_block, encoding=enc)
|
| 76 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 77 |
+
]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
def sents(self, fileids=None):
|
| 81 |
+
"""
|
| 82 |
+
:return: the given file(s) as a list of
|
| 83 |
+
sentences or utterances, each encoded as a list of word
|
| 84 |
+
strings.
|
| 85 |
+
:rtype: list(list(str))
|
| 86 |
+
"""
|
| 87 |
+
if self._sent_tokenizer is None:
|
| 88 |
+
raise ValueError("No sentence tokenizer for this corpus")
|
| 89 |
+
|
| 90 |
+
return concat(
|
| 91 |
+
[
|
| 92 |
+
self.CorpusView(path, self._read_sent_block, encoding=enc)
|
| 93 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 94 |
+
]
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
def paras(self, fileids=None):
|
| 98 |
+
"""
|
| 99 |
+
:return: the given file(s) as a list of
|
| 100 |
+
paragraphs, each encoded as a list of sentences, which are
|
| 101 |
+
in turn encoded as lists of word strings.
|
| 102 |
+
:rtype: list(list(list(str)))
|
| 103 |
+
"""
|
| 104 |
+
if self._sent_tokenizer is None:
|
| 105 |
+
raise ValueError("No sentence tokenizer for this corpus")
|
| 106 |
+
|
| 107 |
+
return concat(
|
| 108 |
+
[
|
| 109 |
+
self.CorpusView(path, self._read_para_block, encoding=enc)
|
| 110 |
+
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
| 111 |
+
]
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
def _read_word_block(self, stream):
|
| 115 |
+
words = []
|
| 116 |
+
for i in range(20): # Read 20 lines at a time.
|
| 117 |
+
words.extend(self._word_tokenizer.tokenize(stream.readline()))
|
| 118 |
+
return words
|
| 119 |
+
|
| 120 |
+
def _read_sent_block(self, stream):
|
| 121 |
+
sents = []
|
| 122 |
+
for para in self._para_block_reader(stream):
|
| 123 |
+
sents.extend(
|
| 124 |
+
[
|
| 125 |
+
self._word_tokenizer.tokenize(sent)
|
| 126 |
+
for sent in self._sent_tokenizer.tokenize(para)
|
| 127 |
+
]
|
| 128 |
+
)
|
| 129 |
+
return sents
|
| 130 |
+
|
| 131 |
+
def _read_para_block(self, stream):
|
| 132 |
+
paras = []
|
| 133 |
+
for para in self._para_block_reader(stream):
|
| 134 |
+
paras.append(
|
| 135 |
+
[
|
| 136 |
+
self._word_tokenizer.tokenize(sent)
|
| 137 |
+
for sent in self._sent_tokenizer.tokenize(para)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
return paras
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
|
| 144 |
+
"""
|
| 145 |
+
A reader for plaintext corpora whose documents are divided into
|
| 146 |
+
categories based on their file identifiers.
|
| 147 |
+
"""
|
| 148 |
+
|
| 149 |
+
def __init__(self, *args, **kwargs):
|
| 150 |
+
"""
|
| 151 |
+
Initialize the corpus reader. Categorization arguments
|
| 152 |
+
(``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
|
| 153 |
+
the ``CategorizedCorpusReader`` constructor. The remaining arguments
|
| 154 |
+
are passed to the ``PlaintextCorpusReader`` constructor.
|
| 155 |
+
"""
|
| 156 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 157 |
+
PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# FIXME: Is there a better way? How to not hardcode this?
|
| 161 |
+
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
|
| 162 |
+
# override the `sent_tokenizer`.
|
| 163 |
+
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
|
| 164 |
+
def __init__(self, *args, **kwargs):
|
| 165 |
+
CategorizedCorpusReader.__init__(self, kwargs)
|
| 166 |
+
kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
|
| 167 |
+
"tokenizers/punkt/portuguese.pickle"
|
| 168 |
+
)
|
| 169 |
+
PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class EuroparlCorpusReader(PlaintextCorpusReader):
|
| 173 |
+
|
| 174 |
+
"""
|
| 175 |
+
Reader for Europarl corpora that consist of plaintext documents.
|
| 176 |
+
Documents are divided into chapters instead of paragraphs as
|
| 177 |
+
for regular plaintext documents. Chapters are separated using blank
|
| 178 |
+
lines. Everything is inherited from ``PlaintextCorpusReader`` except
|
| 179 |
+
that:
|
| 180 |
+
|
| 181 |
+
- Since the corpus is pre-processed and pre-tokenized, the
|
| 182 |
+
word tokenizer should just split the line at whitespaces.
|
| 183 |
+
- For the same reason, the sentence tokenizer should just
|
| 184 |
+
split the paragraph at line breaks.
|
| 185 |
+
- There is a new 'chapters()' method that returns chapters instead
|
| 186 |
+
instead of paragraphs.
|
| 187 |
+
- The 'paras()' method inherited from PlaintextCorpusReader is
|
| 188 |
+
made non-functional to remove any confusion between chapters
|
| 189 |
+
and paragraphs for Europarl.
|
| 190 |
+
"""
|
| 191 |
+
|
| 192 |
+
def _read_word_block(self, stream):
|
| 193 |
+
words = []
|
| 194 |
+
for i in range(20): # Read 20 lines at a time.
|
| 195 |
+
words.extend(stream.readline().split())
|
| 196 |
+
return words
|
| 197 |
+
|
| 198 |
+
def _read_sent_block(self, stream):
|
| 199 |
+
sents = []
|
| 200 |
+
for para in self._para_block_reader(stream):
|
| 201 |
+
sents.extend([sent.split() for sent in para.splitlines()])
|
| 202 |
+
return sents
|
| 203 |
+
|
| 204 |
+
def _read_para_block(self, stream):
|
| 205 |
+
paras = []
|
| 206 |
+
for para in self._para_block_reader(stream):
|
| 207 |
+
paras.append([sent.split() for sent in para.splitlines()])
|
| 208 |
+
return paras
|
| 209 |
+
|
| 210 |
+
def chapters(self, fileids=None):
|
| 211 |
+
"""
|
| 212 |
+
:return: the given file(s) as a list of
|
| 213 |
+
chapters, each encoded as a list of sentences, which are
|
| 214 |
+
in turn encoded as lists of word strings.
|
| 215 |
+
:rtype: list(list(list(str)))
|
| 216 |
+
"""
|
| 217 |
+
return concat(
|
| 218 |
+
[
|
| 219 |
+
self.CorpusView(fileid, self._read_para_block, encoding=enc)
|
| 220 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 221 |
+
]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
def paras(self, fileids=None):
|
| 225 |
+
raise NotImplementedError(
|
| 226 |
+
"The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
|
| 227 |
+
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PP Attachment Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read lines from the Prepositional Phrase Attachment Corpus.
|
| 11 |
+
|
| 12 |
+
The PP Attachment Corpus contains several files having the format:
|
| 13 |
+
|
| 14 |
+
sentence_id verb noun1 preposition noun2 attachment
|
| 15 |
+
|
| 16 |
+
For example:
|
| 17 |
+
|
| 18 |
+
42960 gives authority to administration V
|
| 19 |
+
46742 gives inventors of microchip N
|
| 20 |
+
|
| 21 |
+
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
|
| 22 |
+
|
| 23 |
+
(VP gives (NP authority) (PP to administration))
|
| 24 |
+
(VP gives (NP inventors (PP of microchip)))
|
| 25 |
+
|
| 26 |
+
The corpus contains the following files:
|
| 27 |
+
|
| 28 |
+
training: training set
|
| 29 |
+
devset: development test set, used for algorithm development.
|
| 30 |
+
test: test set, used to report results
|
| 31 |
+
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
|
| 32 |
+
|
| 33 |
+
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
|
| 34 |
+
Phrase Attachment. Proceedings of the ARPA Human Language Technology
|
| 35 |
+
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
|
| 36 |
+
|
| 37 |
+
The PP Attachment Corpus is distributed with NLTK with the permission
|
| 38 |
+
of the author.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
from nltk.corpus.reader.api import *
|
| 42 |
+
from nltk.corpus.reader.util import *
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class PPAttachment:
|
| 46 |
+
def __init__(self, sent, verb, noun1, prep, noun2, attachment):
|
| 47 |
+
self.sent = sent
|
| 48 |
+
self.verb = verb
|
| 49 |
+
self.noun1 = noun1
|
| 50 |
+
self.prep = prep
|
| 51 |
+
self.noun2 = noun2
|
| 52 |
+
self.attachment = attachment
|
| 53 |
+
|
| 54 |
+
def __repr__(self):
|
| 55 |
+
return (
|
| 56 |
+
"PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
|
| 57 |
+
"noun2=%r, attachment=%r)"
|
| 58 |
+
% (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class PPAttachmentCorpusReader(CorpusReader):
|
| 63 |
+
"""
|
| 64 |
+
sentence_id verb noun1 preposition noun2 attachment
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
def attachments(self, fileids):
|
| 68 |
+
return concat(
|
| 69 |
+
[
|
| 70 |
+
StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
|
| 71 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 72 |
+
]
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
def tuples(self, fileids):
|
| 76 |
+
return concat(
|
| 77 |
+
[
|
| 78 |
+
StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
|
| 79 |
+
for (fileid, enc) in self.abspaths(fileids, True)
|
| 80 |
+
]
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
def _read_tuple_block(self, stream):
|
| 84 |
+
line = stream.readline()
|
| 85 |
+
if line:
|
| 86 |
+
return [tuple(line.split())]
|
| 87 |
+
else:
|
| 88 |
+
return []
|
| 89 |
+
|
| 90 |
+
def _read_obj_block(self, stream):
|
| 91 |
+
line = stream.readline()
|
| 92 |
+
if line:
|
| 93 |
+
return [PPAttachment(*line.split())]
|
| 94 |
+
else:
|
| 95 |
+
return []
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: PropBank Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from functools import total_ordering
|
| 10 |
+
from xml.etree import ElementTree
|
| 11 |
+
|
| 12 |
+
from nltk.corpus.reader.api import *
|
| 13 |
+
from nltk.corpus.reader.util import *
|
| 14 |
+
from nltk.internals import raise_unorderable_types
|
| 15 |
+
from nltk.tree import Tree
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PropbankCorpusReader(CorpusReader):
|
| 19 |
+
"""
|
| 20 |
+
Corpus reader for the propbank corpus, which augments the Penn
|
| 21 |
+
Treebank with information about the predicate argument structure
|
| 22 |
+
of every verb instance. The corpus consists of two parts: the
|
| 23 |
+
predicate-argument annotations themselves, and a set of "frameset
|
| 24 |
+
files" which define the argument labels used by the annotations,
|
| 25 |
+
on a per-verb basis. Each "frameset file" contains one or more
|
| 26 |
+
predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
|
| 27 |
+
divided into coarse-grained word senses called "rolesets". For
|
| 28 |
+
each "roleset", the frameset file provides descriptions of the
|
| 29 |
+
argument roles, along with examples.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self,
|
| 34 |
+
root,
|
| 35 |
+
propfile,
|
| 36 |
+
framefiles="",
|
| 37 |
+
verbsfile=None,
|
| 38 |
+
parse_fileid_xform=None,
|
| 39 |
+
parse_corpus=None,
|
| 40 |
+
encoding="utf8",
|
| 41 |
+
):
|
| 42 |
+
"""
|
| 43 |
+
:param root: The root directory for this corpus.
|
| 44 |
+
:param propfile: The name of the file containing the predicate-
|
| 45 |
+
argument annotations (relative to ``root``).
|
| 46 |
+
:param framefiles: A list or regexp specifying the frameset
|
| 47 |
+
fileids for this corpus.
|
| 48 |
+
:param parse_fileid_xform: A transform that should be applied
|
| 49 |
+
to the fileids in this corpus. This should be a function
|
| 50 |
+
of one argument (a fileid) that returns a string (the new
|
| 51 |
+
fileid).
|
| 52 |
+
:param parse_corpus: The corpus containing the parse trees
|
| 53 |
+
corresponding to this corpus. These parse trees are
|
| 54 |
+
necessary to resolve the tree pointers used by propbank.
|
| 55 |
+
"""
|
| 56 |
+
# If framefiles is specified as a regexp, expand it.
|
| 57 |
+
if isinstance(framefiles, str):
|
| 58 |
+
framefiles = find_corpus_fileids(root, framefiles)
|
| 59 |
+
framefiles = list(framefiles)
|
| 60 |
+
# Initialize the corpus reader.
|
| 61 |
+
CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
|
| 62 |
+
|
| 63 |
+
# Record our frame fileids & prop file.
|
| 64 |
+
self._propfile = propfile
|
| 65 |
+
self._framefiles = framefiles
|
| 66 |
+
self._verbsfile = verbsfile
|
| 67 |
+
self._parse_fileid_xform = parse_fileid_xform
|
| 68 |
+
self._parse_corpus = parse_corpus
|
| 69 |
+
|
| 70 |
+
def instances(self, baseform=None):
|
| 71 |
+
"""
|
| 72 |
+
:return: a corpus view that acts as a list of
|
| 73 |
+
``PropBankInstance`` objects, one for each noun in the corpus.
|
| 74 |
+
"""
|
| 75 |
+
kwargs = {}
|
| 76 |
+
if baseform is not None:
|
| 77 |
+
kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
|
| 78 |
+
return StreamBackedCorpusView(
|
| 79 |
+
self.abspath(self._propfile),
|
| 80 |
+
lambda stream: self._read_instance_block(stream, **kwargs),
|
| 81 |
+
encoding=self.encoding(self._propfile),
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
def lines(self):
|
| 85 |
+
"""
|
| 86 |
+
:return: a corpus view that acts as a list of strings, one for
|
| 87 |
+
each line in the predicate-argument annotation file.
|
| 88 |
+
"""
|
| 89 |
+
return StreamBackedCorpusView(
|
| 90 |
+
self.abspath(self._propfile),
|
| 91 |
+
read_line_block,
|
| 92 |
+
encoding=self.encoding(self._propfile),
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
def roleset(self, roleset_id):
|
| 96 |
+
"""
|
| 97 |
+
:return: the xml description for the given roleset.
|
| 98 |
+
"""
|
| 99 |
+
baseform = roleset_id.split(".")[0]
|
| 100 |
+
framefile = "frames/%s.xml" % baseform
|
| 101 |
+
if framefile not in self._framefiles:
|
| 102 |
+
raise ValueError("Frameset file for %s not found" % roleset_id)
|
| 103 |
+
|
| 104 |
+
# n.b.: The encoding for XML fileids is specified by the file
|
| 105 |
+
# itself; so we ignore self._encoding here.
|
| 106 |
+
with self.abspath(framefile).open() as fp:
|
| 107 |
+
etree = ElementTree.parse(fp).getroot()
|
| 108 |
+
for roleset in etree.findall("predicate/roleset"):
|
| 109 |
+
if roleset.attrib["id"] == roleset_id:
|
| 110 |
+
return roleset
|
| 111 |
+
raise ValueError(f"Roleset {roleset_id} not found in {framefile}")
|
| 112 |
+
|
| 113 |
+
def rolesets(self, baseform=None):
|
| 114 |
+
"""
|
| 115 |
+
:return: list of xml descriptions for rolesets.
|
| 116 |
+
"""
|
| 117 |
+
if baseform is not None:
|
| 118 |
+
framefile = "frames/%s.xml" % baseform
|
| 119 |
+
if framefile not in self._framefiles:
|
| 120 |
+
raise ValueError("Frameset file for %s not found" % baseform)
|
| 121 |
+
framefiles = [framefile]
|
| 122 |
+
else:
|
| 123 |
+
framefiles = self._framefiles
|
| 124 |
+
|
| 125 |
+
rsets = []
|
| 126 |
+
for framefile in framefiles:
|
| 127 |
+
# n.b.: The encoding for XML fileids is specified by the file
|
| 128 |
+
# itself; so we ignore self._encoding here.
|
| 129 |
+
with self.abspath(framefile).open() as fp:
|
| 130 |
+
etree = ElementTree.parse(fp).getroot()
|
| 131 |
+
rsets.append(etree.findall("predicate/roleset"))
|
| 132 |
+
return LazyConcatenation(rsets)
|
| 133 |
+
|
| 134 |
+
def verbs(self):
|
| 135 |
+
"""
|
| 136 |
+
:return: a corpus view that acts as a list of all verb lemmas
|
| 137 |
+
in this corpus (from the verbs.txt file).
|
| 138 |
+
"""
|
| 139 |
+
return StreamBackedCorpusView(
|
| 140 |
+
self.abspath(self._verbsfile),
|
| 141 |
+
read_line_block,
|
| 142 |
+
encoding=self.encoding(self._verbsfile),
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
def _read_instance_block(self, stream, instance_filter=lambda inst: True):
|
| 146 |
+
block = []
|
| 147 |
+
|
| 148 |
+
# Read 100 at a time.
|
| 149 |
+
for i in range(100):
|
| 150 |
+
line = stream.readline().strip()
|
| 151 |
+
if line:
|
| 152 |
+
inst = PropbankInstance.parse(
|
| 153 |
+
line, self._parse_fileid_xform, self._parse_corpus
|
| 154 |
+
)
|
| 155 |
+
if instance_filter(inst):
|
| 156 |
+
block.append(inst)
|
| 157 |
+
|
| 158 |
+
return block
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
######################################################################
|
| 162 |
+
# { Propbank Instance & related datatypes
|
| 163 |
+
######################################################################
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class PropbankInstance:
|
| 167 |
+
def __init__(
|
| 168 |
+
self,
|
| 169 |
+
fileid,
|
| 170 |
+
sentnum,
|
| 171 |
+
wordnum,
|
| 172 |
+
tagger,
|
| 173 |
+
roleset,
|
| 174 |
+
inflection,
|
| 175 |
+
predicate,
|
| 176 |
+
arguments,
|
| 177 |
+
parse_corpus=None,
|
| 178 |
+
):
|
| 179 |
+
|
| 180 |
+
self.fileid = fileid
|
| 181 |
+
"""The name of the file containing the parse tree for this
|
| 182 |
+
instance's sentence."""
|
| 183 |
+
|
| 184 |
+
self.sentnum = sentnum
|
| 185 |
+
"""The sentence number of this sentence within ``fileid``.
|
| 186 |
+
Indexing starts from zero."""
|
| 187 |
+
|
| 188 |
+
self.wordnum = wordnum
|
| 189 |
+
"""The word number of this instance's predicate within its
|
| 190 |
+
containing sentence. Word numbers are indexed starting from
|
| 191 |
+
zero, and include traces and other empty parse elements."""
|
| 192 |
+
|
| 193 |
+
self.tagger = tagger
|
| 194 |
+
"""An identifier for the tagger who tagged this instance; or
|
| 195 |
+
``'gold'`` if this is an adjuticated instance."""
|
| 196 |
+
|
| 197 |
+
self.roleset = roleset
|
| 198 |
+
"""The name of the roleset used by this instance's predicate.
|
| 199 |
+
Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
|
| 200 |
+
look up information about the roleset."""
|
| 201 |
+
|
| 202 |
+
self.inflection = inflection
|
| 203 |
+
"""A ``PropbankInflection`` object describing the inflection of
|
| 204 |
+
this instance's predicate."""
|
| 205 |
+
|
| 206 |
+
self.predicate = predicate
|
| 207 |
+
"""A ``PropbankTreePointer`` indicating the position of this
|
| 208 |
+
instance's predicate within its containing sentence."""
|
| 209 |
+
|
| 210 |
+
self.arguments = tuple(arguments)
|
| 211 |
+
"""A list of tuples (argloc, argid), specifying the location
|
| 212 |
+
and identifier for each of the predicate's argument in the
|
| 213 |
+
containing sentence. Argument identifiers are strings such as
|
| 214 |
+
``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
|
| 215 |
+
the predicate."""
|
| 216 |
+
|
| 217 |
+
self.parse_corpus = parse_corpus
|
| 218 |
+
"""A corpus reader for the parse trees corresponding to the
|
| 219 |
+
instances in this propbank corpus."""
|
| 220 |
+
|
| 221 |
+
@property
|
| 222 |
+
def baseform(self):
|
| 223 |
+
"""The baseform of the predicate."""
|
| 224 |
+
return self.roleset.split(".")[0]
|
| 225 |
+
|
| 226 |
+
@property
|
| 227 |
+
def sensenumber(self):
|
| 228 |
+
"""The sense number of the predicate."""
|
| 229 |
+
return self.roleset.split(".")[1]
|
| 230 |
+
|
| 231 |
+
@property
|
| 232 |
+
def predid(self):
|
| 233 |
+
"""Identifier of the predicate."""
|
| 234 |
+
return "rel"
|
| 235 |
+
|
| 236 |
+
def __repr__(self):
|
| 237 |
+
return "<PropbankInstance: {}, sent {}, word {}>".format(
|
| 238 |
+
self.fileid,
|
| 239 |
+
self.sentnum,
|
| 240 |
+
self.wordnum,
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
def __str__(self):
|
| 244 |
+
s = "{} {} {} {} {} {}".format(
|
| 245 |
+
self.fileid,
|
| 246 |
+
self.sentnum,
|
| 247 |
+
self.wordnum,
|
| 248 |
+
self.tagger,
|
| 249 |
+
self.roleset,
|
| 250 |
+
self.inflection,
|
| 251 |
+
)
|
| 252 |
+
items = self.arguments + ((self.predicate, "rel"),)
|
| 253 |
+
for (argloc, argid) in sorted(items):
|
| 254 |
+
s += f" {argloc}-{argid}"
|
| 255 |
+
return s
|
| 256 |
+
|
| 257 |
+
def _get_tree(self):
|
| 258 |
+
if self.parse_corpus is None:
|
| 259 |
+
return None
|
| 260 |
+
if self.fileid not in self.parse_corpus.fileids():
|
| 261 |
+
return None
|
| 262 |
+
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
|
| 263 |
+
|
| 264 |
+
tree = property(
|
| 265 |
+
_get_tree,
|
| 266 |
+
doc="""
|
| 267 |
+
The parse tree corresponding to this instance, or None if
|
| 268 |
+
the corresponding tree is not available.""",
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
@staticmethod
|
| 272 |
+
def parse(s, parse_fileid_xform=None, parse_corpus=None):
|
| 273 |
+
pieces = s.split()
|
| 274 |
+
if len(pieces) < 7:
|
| 275 |
+
raise ValueError("Badly formatted propbank line: %r" % s)
|
| 276 |
+
|
| 277 |
+
# Divide the line into its basic pieces.
|
| 278 |
+
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
|
| 279 |
+
rel = [p for p in pieces[6:] if p.endswith("-rel")]
|
| 280 |
+
args = [p for p in pieces[6:] if not p.endswith("-rel")]
|
| 281 |
+
if len(rel) != 1:
|
| 282 |
+
raise ValueError("Badly formatted propbank line: %r" % s)
|
| 283 |
+
|
| 284 |
+
# Apply the fileid selector, if any.
|
| 285 |
+
if parse_fileid_xform is not None:
|
| 286 |
+
fileid = parse_fileid_xform(fileid)
|
| 287 |
+
|
| 288 |
+
# Convert sentence & word numbers to ints.
|
| 289 |
+
sentnum = int(sentnum)
|
| 290 |
+
wordnum = int(wordnum)
|
| 291 |
+
|
| 292 |
+
# Parse the inflection
|
| 293 |
+
inflection = PropbankInflection.parse(inflection)
|
| 294 |
+
|
| 295 |
+
# Parse the predicate location.
|
| 296 |
+
predicate = PropbankTreePointer.parse(rel[0][:-4])
|
| 297 |
+
|
| 298 |
+
# Parse the arguments.
|
| 299 |
+
arguments = []
|
| 300 |
+
for arg in args:
|
| 301 |
+
argloc, argid = arg.split("-", 1)
|
| 302 |
+
arguments.append((PropbankTreePointer.parse(argloc), argid))
|
| 303 |
+
|
| 304 |
+
# Put it all together.
|
| 305 |
+
return PropbankInstance(
|
| 306 |
+
fileid,
|
| 307 |
+
sentnum,
|
| 308 |
+
wordnum,
|
| 309 |
+
tagger,
|
| 310 |
+
roleset,
|
| 311 |
+
inflection,
|
| 312 |
+
predicate,
|
| 313 |
+
arguments,
|
| 314 |
+
parse_corpus,
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class PropbankPointer:
|
| 319 |
+
"""
|
| 320 |
+
A pointer used by propbank to identify one or more constituents in
|
| 321 |
+
a parse tree. ``PropbankPointer`` is an abstract base class with
|
| 322 |
+
three concrete subclasses:
|
| 323 |
+
|
| 324 |
+
- ``PropbankTreePointer`` is used to point to single constituents.
|
| 325 |
+
- ``PropbankSplitTreePointer`` is used to point to 'split'
|
| 326 |
+
constituents, which consist of a sequence of two or more
|
| 327 |
+
``PropbankTreePointer`` pointers.
|
| 328 |
+
- ``PropbankChainTreePointer`` is used to point to entire trace
|
| 329 |
+
chains in a tree. It consists of a sequence of pieces, which
|
| 330 |
+
can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
|
| 331 |
+
"""
|
| 332 |
+
|
| 333 |
+
def __init__(self):
|
| 334 |
+
if self.__class__ == PropbankPointer:
|
| 335 |
+
raise NotImplementedError()
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class PropbankChainTreePointer(PropbankPointer):
|
| 339 |
+
def __init__(self, pieces):
|
| 340 |
+
self.pieces = pieces
|
| 341 |
+
"""A list of the pieces that make up this chain. Elements may
|
| 342 |
+
be either ``PropbankSplitTreePointer`` or
|
| 343 |
+
``PropbankTreePointer`` pointers."""
|
| 344 |
+
|
| 345 |
+
def __str__(self):
|
| 346 |
+
return "*".join("%s" % p for p in self.pieces)
|
| 347 |
+
|
| 348 |
+
def __repr__(self):
|
| 349 |
+
return "<PropbankChainTreePointer: %s>" % self
|
| 350 |
+
|
| 351 |
+
def select(self, tree):
|
| 352 |
+
if tree is None:
|
| 353 |
+
raise ValueError("Parse tree not available")
|
| 354 |
+
return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
class PropbankSplitTreePointer(PropbankPointer):
|
| 358 |
+
def __init__(self, pieces):
|
| 359 |
+
self.pieces = pieces
|
| 360 |
+
"""A list of the pieces that make up this chain. Elements are
|
| 361 |
+
all ``PropbankTreePointer`` pointers."""
|
| 362 |
+
|
| 363 |
+
def __str__(self):
|
| 364 |
+
return ",".join("%s" % p for p in self.pieces)
|
| 365 |
+
|
| 366 |
+
def __repr__(self):
|
| 367 |
+
return "<PropbankSplitTreePointer: %s>" % self
|
| 368 |
+
|
| 369 |
+
def select(self, tree):
|
| 370 |
+
if tree is None:
|
| 371 |
+
raise ValueError("Parse tree not available")
|
| 372 |
+
return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
@total_ordering
|
| 376 |
+
class PropbankTreePointer(PropbankPointer):
|
| 377 |
+
"""
|
| 378 |
+
wordnum:height*wordnum:height*...
|
| 379 |
+
wordnum:height,
|
| 380 |
+
|
| 381 |
+
"""
|
| 382 |
+
|
| 383 |
+
def __init__(self, wordnum, height):
|
| 384 |
+
self.wordnum = wordnum
|
| 385 |
+
self.height = height
|
| 386 |
+
|
| 387 |
+
@staticmethod
|
| 388 |
+
def parse(s):
|
| 389 |
+
# Deal with chains (xx*yy*zz)
|
| 390 |
+
pieces = s.split("*")
|
| 391 |
+
if len(pieces) > 1:
|
| 392 |
+
return PropbankChainTreePointer(
|
| 393 |
+
[PropbankTreePointer.parse(elt) for elt in pieces]
|
| 394 |
+
)
|
| 395 |
+
|
| 396 |
+
# Deal with split args (xx,yy,zz)
|
| 397 |
+
pieces = s.split(",")
|
| 398 |
+
if len(pieces) > 1:
|
| 399 |
+
return PropbankSplitTreePointer(
|
| 400 |
+
[PropbankTreePointer.parse(elt) for elt in pieces]
|
| 401 |
+
)
|
| 402 |
+
|
| 403 |
+
# Deal with normal pointers.
|
| 404 |
+
pieces = s.split(":")
|
| 405 |
+
if len(pieces) != 2:
|
| 406 |
+
raise ValueError("bad propbank pointer %r" % s)
|
| 407 |
+
return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
|
| 408 |
+
|
| 409 |
+
def __str__(self):
|
| 410 |
+
return f"{self.wordnum}:{self.height}"
|
| 411 |
+
|
| 412 |
+
def __repr__(self):
|
| 413 |
+
return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
|
| 414 |
+
|
| 415 |
+
def __eq__(self, other):
|
| 416 |
+
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
|
| 417 |
+
other = other.pieces[0]
|
| 418 |
+
|
| 419 |
+
if not isinstance(other, PropbankTreePointer):
|
| 420 |
+
return self is other
|
| 421 |
+
|
| 422 |
+
return self.wordnum == other.wordnum and self.height == other.height
|
| 423 |
+
|
| 424 |
+
def __ne__(self, other):
|
| 425 |
+
return not self == other
|
| 426 |
+
|
| 427 |
+
def __lt__(self, other):
|
| 428 |
+
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
|
| 429 |
+
other = other.pieces[0]
|
| 430 |
+
|
| 431 |
+
if not isinstance(other, PropbankTreePointer):
|
| 432 |
+
return id(self) < id(other)
|
| 433 |
+
|
| 434 |
+
return (self.wordnum, -self.height) < (other.wordnum, -other.height)
|
| 435 |
+
|
| 436 |
+
def select(self, tree):
|
| 437 |
+
if tree is None:
|
| 438 |
+
raise ValueError("Parse tree not available")
|
| 439 |
+
return tree[self.treepos(tree)]
|
| 440 |
+
|
| 441 |
+
def treepos(self, tree):
|
| 442 |
+
"""
|
| 443 |
+
Convert this pointer to a standard 'tree position' pointer,
|
| 444 |
+
given that it points to the given tree.
|
| 445 |
+
"""
|
| 446 |
+
if tree is None:
|
| 447 |
+
raise ValueError("Parse tree not available")
|
| 448 |
+
stack = [tree]
|
| 449 |
+
treepos = []
|
| 450 |
+
|
| 451 |
+
wordnum = 0
|
| 452 |
+
while True:
|
| 453 |
+
# tree node:
|
| 454 |
+
if isinstance(stack[-1], Tree):
|
| 455 |
+
# Select the next child.
|
| 456 |
+
if len(treepos) < len(stack):
|
| 457 |
+
treepos.append(0)
|
| 458 |
+
else:
|
| 459 |
+
treepos[-1] += 1
|
| 460 |
+
# Update the stack.
|
| 461 |
+
if treepos[-1] < len(stack[-1]):
|
| 462 |
+
stack.append(stack[-1][treepos[-1]])
|
| 463 |
+
else:
|
| 464 |
+
# End of node's child list: pop up a level.
|
| 465 |
+
stack.pop()
|
| 466 |
+
treepos.pop()
|
| 467 |
+
# word node:
|
| 468 |
+
else:
|
| 469 |
+
if wordnum == self.wordnum:
|
| 470 |
+
return tuple(treepos[: len(treepos) - self.height - 1])
|
| 471 |
+
else:
|
| 472 |
+
wordnum += 1
|
| 473 |
+
stack.pop()
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
class PropbankInflection:
|
| 477 |
+
# { Inflection Form
|
| 478 |
+
INFINITIVE = "i"
|
| 479 |
+
GERUND = "g"
|
| 480 |
+
PARTICIPLE = "p"
|
| 481 |
+
FINITE = "v"
|
| 482 |
+
# { Inflection Tense
|
| 483 |
+
FUTURE = "f"
|
| 484 |
+
PAST = "p"
|
| 485 |
+
PRESENT = "n"
|
| 486 |
+
# { Inflection Aspect
|
| 487 |
+
PERFECT = "p"
|
| 488 |
+
PROGRESSIVE = "o"
|
| 489 |
+
PERFECT_AND_PROGRESSIVE = "b"
|
| 490 |
+
# { Inflection Person
|
| 491 |
+
THIRD_PERSON = "3"
|
| 492 |
+
# { Inflection Voice
|
| 493 |
+
ACTIVE = "a"
|
| 494 |
+
PASSIVE = "p"
|
| 495 |
+
# { Inflection
|
| 496 |
+
NONE = "-"
|
| 497 |
+
# }
|
| 498 |
+
|
| 499 |
+
def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
|
| 500 |
+
self.form = form
|
| 501 |
+
self.tense = tense
|
| 502 |
+
self.aspect = aspect
|
| 503 |
+
self.person = person
|
| 504 |
+
self.voice = voice
|
| 505 |
+
|
| 506 |
+
def __str__(self):
|
| 507 |
+
return self.form + self.tense + self.aspect + self.person + self.voice
|
| 508 |
+
|
| 509 |
+
def __repr__(self):
|
| 510 |
+
return "<PropbankInflection: %s>" % self
|
| 511 |
+
|
| 512 |
+
_VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
|
| 513 |
+
|
| 514 |
+
@staticmethod
|
| 515 |
+
def parse(s):
|
| 516 |
+
if not isinstance(s, str):
|
| 517 |
+
raise TypeError("expected a string")
|
| 518 |
+
if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
|
| 519 |
+
raise ValueError("Bad propbank inflection string %r" % s)
|
| 520 |
+
return PropbankInflection(*s)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Pros and Cons Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for the Pros and Cons dataset.
|
| 10 |
+
|
| 11 |
+
- Pros and Cons dataset information -
|
| 12 |
+
|
| 13 |
+
Contact: Bing Liu, liub@cs.uic.edu
|
| 14 |
+
https://www.cs.uic.edu/~liub
|
| 15 |
+
|
| 16 |
+
Distributed with permission.
|
| 17 |
+
|
| 18 |
+
Related papers:
|
| 19 |
+
|
| 20 |
+
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
| 21 |
+
Proceedings of the 22nd International Conference on Computational Linguistics
|
| 22 |
+
(Coling-2008), Manchester, 18-22 August, 2008.
|
| 23 |
+
|
| 24 |
+
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
|
| 25 |
+
Opinions on the Web". Proceedings of the 14th international World Wide Web
|
| 26 |
+
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
|
| 27 |
+
"""
|
| 28 |
+
import re
|
| 29 |
+
|
| 30 |
+
from nltk.corpus.reader.api import *
|
| 31 |
+
from nltk.tokenize import *
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def _resolve_fileids(self, fileids, categories):
        """
        Map *categories* onto fileids (via ``CategorizedCorpusReader._resolve``)
        and normalize the result to a list of fileids.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, str):
            return [fileids]
        return fileids

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve_fileids(fileids, categories)
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        # Parse up to 20 lines of "<Pros>...</Pros>" / "<Cons>...</Cons>"
        # mark-up into word-tokenized sentences.
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # EOF: readline() will keep returning '' so remaining
                # iterations would be wasted work.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        # Flatten one block of tokenized sentences into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Product Reviews Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
|
| 10 |
+
|
| 11 |
+
Customer Review Corpus information
|
| 12 |
+
==================================
|
| 13 |
+
|
| 14 |
+
Annotated by: Minqing Hu and Bing Liu, 2004.
|
| 15 |
+
Department of Computer Science
|
| 16 |
+
University of Illinois at Chicago
|
| 17 |
+
|
| 18 |
+
Contact: Bing Liu, liub@cs.uic.edu
|
| 19 |
+
https://www.cs.uic.edu/~liub
|
| 20 |
+
|
| 21 |
+
Distributed with permission.
|
| 22 |
+
|
| 23 |
+
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
|
| 24 |
+
annotated customer reviews of 5 and 9 products from amazon.com.
|
| 25 |
+
|
| 26 |
+
Related papers:
|
| 27 |
+
|
| 28 |
+
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
| 29 |
+
Proceedings of the ACM SIGKDD International Conference on Knowledge
|
| 30 |
+
Discovery & Data Mining (KDD-04), 2004.
|
| 31 |
+
|
| 32 |
+
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
|
| 33 |
+
Proceedings of the Nineteenth National Conference on Artificial Intelligence
|
| 34 |
+
(AAAI-2004), 2004.
|
| 35 |
+
|
| 36 |
+
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
|
| 37 |
+
Opinion Mining." Proceedings of First ACM International Conference on Web
|
| 38 |
+
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
|
| 39 |
+
Stanford, California, USA.
|
| 40 |
+
|
| 41 |
+
Symbols used in the annotated reviews:
|
| 42 |
+
|
| 43 |
+
:[t]: the title of the review: Each [t] tag starts a review.
|
| 44 |
+
:xxxx[+|-n]: xxxx is a product feature.
|
| 45 |
+
:[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
|
| 46 |
+
Note that the strength is quite subjective.
|
| 47 |
+
You may want ignore it, but only considering + and -
|
| 48 |
+
:[-n]: Negative opinion
|
| 49 |
+
:##: start of each sentence. Each line is a sentence.
|
| 50 |
+
:[u]: feature not appeared in the sentence.
|
| 51 |
+
:[p]: feature not appeared in the sentence. Pronoun resolution is needed.
|
| 52 |
+
:[s]: suggestion or recommendation.
|
| 53 |
+
:[cc]: comparison with a competing product from a different brand.
|
| 54 |
+
:[cs]: comparison with a competing product from the same brand.
|
| 55 |
+
|
| 56 |
+
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
|
| 57 |
+
provide separation between different reviews. This is due to the fact that
|
| 58 |
+
the dataset was specifically designed for aspect/feature-based sentiment
|
| 59 |
+
analysis, for which sentence-level annotation is sufficient. For document-
|
| 60 |
+
level classification and analysis, this peculiarity should be taken into
|
| 61 |
+
consideration.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
import re
|
| 65 |
+
|
| 66 |
+
from nltk.corpus.reader.api import *
|
| 67 |
+
from nltk.tokenize import *
|
| 68 |
+
|
| 69 |
+
# Regexps for the Customer Review Corpus annotation mark-up (see module
# docstring for the full tag legend).
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for line in self.review_lines
            for feature in line.features
        ]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return list(line.sent for line in self.review_lines)

    def __repr__(self):
        return f'Review(title="{self.title}", review_lines={self.review_lines})'
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        # Tokenized sentence, plus (feature, score) tuples and annotation notes.
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return (
            f"ReviewLine(features={self.features}, "
            f"notes={self.notes}, sent={self.sent})"
        )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # Lazily reads blocks of the file on demand instead of loading it whole.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        # Conventional README filename for this corpus distribution.
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        # Block reader: collect every (feature, score) pair found in up to
        # 20 lines of the stream.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        # Block reader: returns a one-element list holding the next Review,
        # or [] at end of file. A review spans from one "[t]" title line to
        # the next.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            # Remember the position before reading so we can rewind if the
            # line turns out to start the NEXT review.
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                # SENT captures the raw text after "##"; tokenize it.
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        # Block reader: flatten the next review into its list of sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        # Block reader: tokens of every "##" sentence in up to 20 lines.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: RTE Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
|
| 10 |
+
|
| 11 |
+
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
|
| 12 |
+
were regularized.
|
| 13 |
+
|
| 14 |
+
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
|
| 15 |
+
gold standard annotated files.
|
| 16 |
+
|
| 17 |
+
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
|
| 18 |
+
example is taken from RTE3::
|
| 19 |
+
|
| 20 |
+
<pair id="1" entailment="YES" task="IE" length="short" >
|
| 21 |
+
|
| 22 |
+
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
|
| 23 |
+
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
|
| 24 |
+
company Baikalfinansgroup which was later bought by the Russian
|
| 25 |
+
state-owned oil company Rosneft .</t>
|
| 26 |
+
|
| 27 |
+
<h>Baikalfinansgroup was sold to Rosneft.</h>
|
| 28 |
+
</pair>
|
| 29 |
+
|
| 30 |
+
In order to provide globally unique IDs for each pair, a new attribute
|
| 31 |
+
``challenge`` has been added to the root element ``entailment-corpus`` of each
|
| 32 |
+
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
|
| 33 |
+
challenge number and 'n' is the pair ID.
|
| 34 |
+
"""
|
| 35 |
+
from nltk.corpus.reader.api import *
|
| 36 |
+
from nltk.corpus.reader.util import *
|
| 37 |
+
from nltk.corpus.reader.xmldocs import *
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raises KeyError: if the label is not one of TRUE/FALSE/YES/NO
        (case-insensitive).
    """
    return {"TRUE": 1, "YES": 1, "FALSE": 0, "NO": 0}[value_string.upper()]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        attrib = pair.attrib
        self.challenge = challenge
        # id/text/hyp always come from the XML element itself; the first
        # child is <t> (text) and the second is <h> (hypothesis).
        self.id = attrib["id"]
        self.gid = f"{self.challenge}-{self.id}"
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 uses "value", RTE2/3 use "entailment"; fall back to the
        # explicit `value` argument when neither attribute is present.
        if "value" in attrib:
            self.value = norm(attrib["value"])
        elif "entailment" in attrib:
            self.value = norm(attrib["entailment"])
        else:
            self.value = value
        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        if not self.challenge:
            return "<RTEPair: id=%s>" % self.id
        return f"<RTEPair: gid={self.challenge}-{self.id}>"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into an RTEPair.

        This uses the ``getiterator()`` method from the ElementTree package to
        find all the ``<pair>`` elements.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # The optional "challenge" attribute on <entailment-corpus> gives the
        # RTE challenge number; absent on some files.
        challenge = doc.attrib.get("challenge")
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        fileid_list = [fileids] if isinstance(fileids, str) else fileids
        return concat([self._read_etree(self.xml(fid)) for fid in fileid_list])
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: SemCor Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the SemCor Corpus.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
__docformat__ = "epytext en"
|
| 13 |
+
|
| 14 |
+
from nltk.corpus.reader.api import *
|
| 15 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
| 16 |
+
from nltk.tree import Tree
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class SemcorCorpusReader(XMLCorpusReader):
|
| 20 |
+
"""
|
| 21 |
+
Corpus reader for the SemCor Corpus.
|
| 22 |
+
For access to the complete XML data structure, use the ``xml()``
|
| 23 |
+
method. For access to simple word lists and tagged word lists, use
|
| 24 |
+
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(self, root, fileids, wordnet, lazy=True):
|
| 28 |
+
XMLCorpusReader.__init__(self, root, fileids)
|
| 29 |
+
self._lazy = lazy
|
| 30 |
+
self._wordnet = wordnet
|
| 31 |
+
|
| 32 |
+
def words(self, fileids=None):
|
| 33 |
+
"""
|
| 34 |
+
:return: the given file(s) as a list of words and punctuation symbols.
|
| 35 |
+
:rtype: list(str)
|
| 36 |
+
"""
|
| 37 |
+
return self._items(fileids, "word", False, False, False)
|
| 38 |
+
|
| 39 |
+
def chunks(self, fileids=None):
|
| 40 |
+
"""
|
| 41 |
+
:return: the given file(s) as a list of chunks,
|
| 42 |
+
each of which is a list of words and punctuation symbols
|
| 43 |
+
that form a unit.
|
| 44 |
+
:rtype: list(list(str))
|
| 45 |
+
"""
|
| 46 |
+
return self._items(fileids, "chunk", False, False, False)
|
| 47 |
+
|
| 48 |
+
def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
|
| 49 |
+
"""
|
| 50 |
+
:return: the given file(s) as a list of tagged chunks, represented
|
| 51 |
+
in tree form.
|
| 52 |
+
:rtype: list(Tree)
|
| 53 |
+
|
| 54 |
+
:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
|
| 55 |
+
to indicate the kind of tags to include. Semantic tags consist of
|
| 56 |
+
WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
|
| 57 |
+
without a specific entry in WordNet. (Named entities of type 'other'
|
| 58 |
+
have no lemma. Other chunks not in WordNet have no semantic tag.
|
| 59 |
+
Punctuation tokens have `None` for their part of speech tag.)
|
| 60 |
+
"""
|
| 61 |
+
return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
|
| 62 |
+
|
| 63 |
+
def sents(self, fileids=None):
|
| 64 |
+
"""
|
| 65 |
+
:return: the given file(s) as a list of sentences, each encoded
|
| 66 |
+
as a list of word strings.
|
| 67 |
+
:rtype: list(list(str))
|
| 68 |
+
"""
|
| 69 |
+
return self._items(fileids, "word", True, False, False)
|
| 70 |
+
|
| 71 |
+
def chunk_sents(self, fileids=None):
|
| 72 |
+
"""
|
| 73 |
+
:return: the given file(s) as a list of sentences, each encoded
|
| 74 |
+
as a list of chunks.
|
| 75 |
+
:rtype: list(list(list(str)))
|
| 76 |
+
"""
|
| 77 |
+
return self._items(fileids, "chunk", True, False, False)
|
| 78 |
+
|
| 79 |
+
def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
|
| 80 |
+
"""
|
| 81 |
+
:return: the given file(s) as a list of sentences. Each sentence
|
| 82 |
+
is represented as a list of tagged chunks (in tree form).
|
| 83 |
+
:rtype: list(list(Tree))
|
| 84 |
+
|
| 85 |
+
:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
|
| 86 |
+
to indicate the kind of tags to include. Semantic tags consist of
|
| 87 |
+
WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
|
| 88 |
+
without a specific entry in WordNet. (Named entities of type 'other'
|
| 89 |
+
have no lemma. Other chunks not in WordNet have no semantic tag.
|
| 90 |
+
Punctuation tokens have `None` for their part of speech tag.)
|
| 91 |
+
"""
|
| 92 |
+
return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
|
| 93 |
+
|
| 94 |
+
def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
|
| 95 |
+
if unit == "word" and not bracket_sent:
|
| 96 |
+
# the result of the SemcorWordView may be a multiword unit, so the
|
| 97 |
+
# LazyConcatenation will make sure the sentence is flattened
|
| 98 |
+
_ = lambda *args: LazyConcatenation(
|
| 99 |
+
(SemcorWordView if self._lazy else self._words)(*args)
|
| 100 |
+
)
|
| 101 |
+
else:
|
| 102 |
+
_ = SemcorWordView if self._lazy else self._words
|
| 103 |
+
return concat(
|
| 104 |
+
[
|
| 105 |
+
_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
|
| 106 |
+
for fileid in self.abspaths(fileids)
|
| 107 |
+
]
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        # Eager path: parse the whole XML file at once (see _items for the
        # lazy alternative, SemcorWordView).
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # _word returns a list of word strings in this mode
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                # keep the sentence number from the XML "snum" attribute
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
|
| 146 |
+
|
| 147 |
+
    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one <wf>/<punc> XML element into a token, a list of word
        strings, or a (possibly tagged) chunk, depending on ``unit``.

        :param xmlword: the XML element for a single word form.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :param wordnet: the wordnet corpus module, used to resolve sense keys.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            # first field of lexsn is the synset type (1..5 -> n,v,a,r,s)
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        # NOTE(review): 'redef' is computed but never used below.
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            # A token is either the bare string or a tuple of string + tags.
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            # Multiword expressions are joined with underscores in the XML.
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                # The chunk proper: a POS-labelled subtree, or the bare words.
                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _all_xmlwords_in(elt, result=None):
|
| 226 |
+
if result is None:
|
| 227 |
+
result = []
|
| 228 |
+
for child in elt:
|
| 229 |
+
if child.tag in ("wf", "punc"):
|
| 230 |
+
result.append(child)
|
| 231 |
+
else:
|
| 232 |
+
_all_xmlwords_in(child, result)
|
| 233 |
+
return result
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
class SemcorSentence(list):
    """A sentence, stored as a plain list of its items, that additionally
    remembers its identifier from the source XML in the ``num`` attribute."""

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    (The original docstring said "BNC corpus", apparently a copy-paste slip;
    this view is constructed only by SemcorCorpusReader in this module.)
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        # Iterate over whole sentences, or over the individual word/punct
        # elements, depending on whether sentence bracketing is requested.
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        # Dispatch based on the tagspec chosen in __init__.
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        # Delegate to the reader's static parser so that the lazy (view) and
        # eager (_words) code paths produce identical items.
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    # word mode yields lists of strings; flatten them
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        # Keep the sentence number from the XML "snum" attribute.
        return SemcorSentence(elt.attrib["snum"], sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Senseval 2 Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read from the Senseval 2 Corpus.
|
| 11 |
+
|
| 12 |
+
SENSEVAL [http://www.senseval.org/]
|
| 13 |
+
Evaluation exercises for Word Sense Disambiguation.
|
| 14 |
+
Organized by ACL-SIGLEX [https://www.siglex.org/]
|
| 15 |
+
|
| 16 |
+
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
|
| 17 |
+
https://www.d.umn.edu/~tpederse/data.html
|
| 18 |
+
Distributed with permission.
|
| 19 |
+
|
| 20 |
+
The NLTK version of the Senseval 2 files uses well-formed XML.
|
| 21 |
+
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
|
| 22 |
+
is tagged with a sense identifier, and supplied with context.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import re
|
| 26 |
+
from xml.etree import ElementTree
|
| 27 |
+
|
| 28 |
+
from nltk.corpus.reader.api import *
|
| 29 |
+
from nltk.corpus.reader.util import *
|
| 30 |
+
from nltk.tokenize import *
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SensevalInstance:
    """One sense-tagged occurrence of an ambiguous word.

    Attributes: ``word`` (the lexical item), ``position`` (index of the head
    word within ``context``), ``context`` (the surrounding tokens), and
    ``senses`` (a tuple of sense identifiers).
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        # normalize to a tuple regardless of the sequence type passed in
        self.senses = tuple(senses)

    def __repr__(self):
        return (
            f"SensevalInstance(word={self.word!r}, position={self.position!r}, "
            f"context={self.context!r}, senses={self.senses!r})"
        )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the NLTK version of the Senseval 2 data."""

    def instances(self, fileids=None):
        """
        :return: a corpus view of SensevalInstance objects drawn from the
            given file(s); defaults to all files.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _entry(self, tree):
        # NOTE(review): not referenced anywhere in this module's visible code;
        # looks like a leftover helper -- confirm before removing.
        elts = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                elts.append((sense, context))
        return elts
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class SensevalCorpusView(StreamBackedCorpusView):
    """Stream-backed view that yields one SensevalInstance per <instance>
    element found in the (cleaned-up) pseudo-XML data file."""

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists mapping lexelt index -> stream position / name,
        # grown lazily as read_block encounters new <lexelt> headers.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF: we must not be in the middle of an instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # re-reading a region we've already indexed: sanity-check
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # repair the pseudo-XML before handing it to ElementTree
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert one parsed <instance> element into a SensevalInstance."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        # unwrap the compound and treat its first child below
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some santiy checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    # text between this element and the next is plain tokens
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML so that the result can
    be parsed by ElementTree. Each substitution below handles one known
    malformation in the source data.
    """
    # <~> or <^> => ~ or ^
    text = re.sub(r"<([~\^])>", r"\1", text)
    # fix lone &
    # NOTE(review): as written the replacement equals the matched text, so
    # this substitution is a no-op; upstream NLTK escapes the ampersand here
    # (replacement r"\1&amp;\2") -- confirm against the original source.
    text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
    # fix """
    text = re.sub(r'"""', "'\"'", text)
    # fix <s snum=dd> => <s snum="dd"/>  (quote the attribute, self-close)
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # fix foreign word tag
    text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
    # remove <&I .>
    text = re.sub(r"<\&I[^>]*>", "", text)
    # fix <{word}>
    text = re.sub(r"<{([^}]+)}>", r"\1", text)
    # remove <@>, <p>, </p>
    text = re.sub(r"<(@|/?p)>", r"", text)
    # remove <&M .> and <&T .> and <&Ms .>
    text = re.sub(r"<&\w+ \.>", r"", text)
    # remove <!DOCTYPE... > lines
    text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
    # remove <[hi]> and <[/p]> etc
    text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
    # take the entity-like thing out of the angle brackets: <...>
    text = re.sub(r"<(\&\w+;)>", r"\1", text)
    # and remove the & for those patterns that aren't regular XML
    text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
    text = re.sub(
        r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
    )
    # same fix for a quoted double-quote token
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
    return text
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: SentiWordNet
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Christopher Potts <cgpotts@stanford.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
An NLTK interface for SentiWordNet
|
| 10 |
+
|
| 11 |
+
SentiWordNet is a lexical resource for opinion mining.
|
| 12 |
+
SentiWordNet assigns to each synset of WordNet three
|
| 13 |
+
sentiment scores: positivity, negativity, and objectivity.
|
| 14 |
+
|
| 15 |
+
For details about SentiWordNet see:
|
| 16 |
+
http://sentiwordnet.isti.cnr.it/
|
| 17 |
+
|
| 18 |
+
>>> from nltk.corpus import sentiwordnet as swn
|
| 19 |
+
>>> print(swn.senti_synset('breakdown.n.03'))
|
| 20 |
+
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
| 21 |
+
>>> list(swn.senti_synsets('slow'))
|
| 22 |
+
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
|
| 23 |
+
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
|
| 24 |
+
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
|
| 25 |
+
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
|
| 26 |
+
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
|
| 27 |
+
SentiSynset('behind.r.03')]
|
| 28 |
+
>>> happy = swn.senti_synsets('happy', 'a')
|
| 29 |
+
>>> happy0 = list(happy)[0]
|
| 30 |
+
>>> happy0.pos_score()
|
| 31 |
+
0.875
|
| 32 |
+
>>> happy0.neg_score()
|
| 33 |
+
0.0
|
| 34 |
+
>>> happy0.obj_score()
|
| 35 |
+
0.125
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
import re
|
| 39 |
+
|
| 40 |
+
from nltk.corpus.reader import CorpusReader
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class SentiWordNetCorpusReader(CorpusReader):
    """Corpus reader for the SentiWordNet data file.

    Parses the single source file into ``self._db``, a dict mapping
    ``(pos, offset)`` synset keys to ``(pos_score, neg_score)`` pairs;
    the objectivity score is derived as ``1 - (pos + neg)`` by SentiSynset.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Load the source file into self._db, skipping '#' comment lines.

        :raises ValueError: if a non-comment line does not have exactly six
            tab-separated fields.
        """
        # Close the stream explicitly; the original left it to the GC.
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            fp.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # wrong number of fields -> sequence unpacking raises ValueError
                # (narrowed from the original overly-broad `except BaseException`)
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """Look up a SentiSynset.

        Accepts either a ``(pos, offset)`` pair or a single synset-name
        string such as ``'breakdown.n.03'``. Returns None if the synset is
        not in the SentiWordNet database.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == "s":
                # satellite adjectives are looked up under 'a'
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """Return an iterator of SentiSynsets for all WordNet synsets of
        *string* (optionally restricted to part of speech *pos*) that have
        SentiWordNet entries."""
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # drop the None results for synsets without SentiWordNet entries
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class SentiSynset:
    """Sentiment scores attached to a single WordNet synset.

    Holds a positivity and a negativity score; the objectivity score is
    derived as the remaining probability mass, 1 - (pos + neg).
    """

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (pos_score + neg_score)
        self.synset = synset

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the derived objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Sinica Treebank Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Sinica Treebank Corpus Sample
|
| 10 |
+
|
| 11 |
+
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
|
| 12 |
+
|
| 13 |
+
10,000 parsed sentences, drawn from the Academia Sinica Balanced
|
| 14 |
+
Corpus of Modern Chinese. Parse tree notation is based on
|
| 15 |
+
Information-based Case Grammar. Tagset documentation is available
|
| 16 |
+
at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
|
| 17 |
+
|
| 18 |
+
Language and Knowledge Processing Group, Institute of Information
|
| 19 |
+
Science, Academia Sinica
|
| 20 |
+
|
| 21 |
+
The data is distributed with the Natural Language Toolkit under the terms of
|
| 22 |
+
the Creative Commons Attribution-NonCommercial-ShareAlike License
|
| 23 |
+
[https://creativecommons.org/licenses/by-nc-sa/2.5/].
|
| 24 |
+
|
| 25 |
+
References:
|
| 26 |
+
|
| 27 |
+
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
|
| 28 |
+
The Construction of Sinica Treebank. Computational Linguistics and
|
| 29 |
+
Chinese Language Processing, 4, pp 87-104.
|
| 30 |
+
|
| 31 |
+
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
|
| 32 |
+
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
|
| 33 |
+
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
|
| 34 |
+
Chinese Language Processing Workshop, Association for Computational
|
| 35 |
+
Linguistics.
|
| 36 |
+
|
| 37 |
+
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
|
| 38 |
+
Extraction, Proceedings of IJCNLP-04, pp560-565.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
from nltk.corpus.reader.api import *
|
| 42 |
+
from nltk.corpus.reader.util import *
|
| 43 |
+
from nltk.tag import map_tag
|
| 44 |
+
from nltk.tree import sinica_parse
|
| 45 |
+
|
| 46 |
+
IDENTIFIER = re.compile(r"^#\S+\s")
|
| 47 |
+
APPENDIX = re.compile(r"(?<=\))#.*$")
|
| 48 |
+
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
|
| 49 |
+
WORD = re.compile(r":[^:()|]+:([^:()|]+)")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One sentence per line; strip the leading "#<id> " marker and any
        # trailing ")#..." appendix before parsing.
        sent = stream.readline()
        sent = IDENTIFIER.sub("", sent)
        sent = APPENDIX.sub("", sent)
        return [sent]

    def _parse(self, sent):
        # Convert the bracketed Sinica notation into an nltk Tree.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD captures (tag, word) pairs; flip them to (word, tag).
        tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
        # Optionally map the native tags into the requested tagset.
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
            ]
        return tagged_sent

    def _word(self, sent):
        # WORD captures only the word part of each :tag:word: unit.
        return WORD.findall(sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: String Category Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read tuples from a corpus consisting of categorized strings.
|
| 11 |
+
For example, from the question classification corpus:
|
| 12 |
+
|
| 13 |
+
NUM:dist How far is it from Denver to Aspen ?
|
| 14 |
+
LOC:city What county is Modesto , California in ?
|
| 15 |
+
HUM:desc Who was Galileo ?
|
| 16 |
+
DESC:def What is an atom ?
|
| 17 |
+
NUM:date When did Hawaii become a state ?
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from nltk.corpus.reader.api import *
|
| 21 |
+
|
| 22 |
+
# based on PPAttachmentCorpusReader
|
| 23 |
+
from nltk.corpus.reader.util import *
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# [xx] Should the order of the tuple be reversed -- in most other places
|
| 27 |
+
# in nltk, we use the form (data, tag) -- e.g., tagged words and
|
| 28 |
+
# labeled texts for classifiers.
|
| 29 |
+
class StringCategoryCorpusReader(CorpusReader):
    """Reader for corpora of categorized strings, one record per line in the
    form ``<category><delimiter><string>``."""

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return a view of (category, string) tuples from the given files
        (all files by default)."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_tuple_block(self, stream):
        line = stream.readline().strip()
        if line:
            # split at the first delimiter only, so the string part may
            # itself contain the delimiter
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Switchboard Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
from nltk.corpus.reader.api import *
|
| 10 |
+
from nltk.corpus.reader.util import *
|
| 11 |
+
from nltk.tag import map_tag, str2tuple
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id. Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        super().__init__(words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        # Elements are either plain words or (word, tag) pairs.
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            text = " ".join("{}/{}".format(*pair) for pair in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.

    The corpus consists of a single "tagged" file.  That file is used
    even for the non-tagged access methods, since its text is already
    tokenized.  Data can be retrieved as individual words, as
    utterances ("turns"), or as whole discourses (lists of turns).
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The name of the tagset used by this corpus;
            tags are mapped out of this tagset when a ``tagged_*``
            method is given a different target tagset.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """Return a corpus view of all words, untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """Return a corpus view of all ``(word, tag)`` pairs."""
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """Return a corpus view of utterances (``SwitchboardTurn``), untagged."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """Return a corpus view of utterances whose words are tagged."""
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """Return a corpus view of discourses (lists of untagged turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """Return a corpus view of discourses (lists of tagged turns)."""
        # NOTE: the default was ``tagset=False``; changed to ``None`` for
        # consistency with the other tagged_* methods.  Both values are
        # falsy, so behavior for callers relying on the default is
        # unchanged (no tag mapping is performed).
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse's turns into one word list.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # An utterance line looks like "A.42: word/TAG word/TAG ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """
        Parse one ``speaker.id: text`` line into a ``SwitchboardTurn``.

        :param include_tag: If false, strip the POS tags from the words.
        :param tagset: If given and different from the corpus tagset,
            map each tag into this target tagset.
        :raises ValueError: If the line does not match the expected format.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Tagged Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Jacob Perkins <japerk@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
A reader for corpora whose documents contain part-of-speech-tagged words.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from nltk.corpus.reader.api import *
|
| 17 |
+
from nltk.corpus.reader.timit import read_timit_block
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.tag import map_tag, str2tuple
|
| 20 |
+
from nltk.tokenize import *
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

        word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: The string separating a word from its tag.
        :param word_tokenizer: Tokenizer that splits a sentence string
            into word/tag strings.
        :param sent_tokenizer: Tokenizer that splits a paragraph string
            into sentence strings.
        :param para_block_reader: Function reading one paragraph block
            at a time from a stream.
        :param tagset: The name of the tagset used by this corpus, used
            for tag mapping in the ``tagged_*`` methods.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def _views(self, fileids, tagged, group_by_sent, group_by_para, tagset=None):
        """
        Build one ``TaggedCorpusView`` per file and concatenate them.
        Consolidates the construction logic previously duplicated
        across all six public accessor methods.
        """
        # Map tags only for tagged views, and only when a different
        # target tagset was requested.
        if tagged and tagset and tagset != self._tagset:
            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
        else:
            tag_mapping_function = None
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapping_function,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(
            fileids, tagged=False, group_by_sent=False, group_by_para=False
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(
            fileids, tagged=False, group_by_sent=True, group_by_para=False
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(
            fileids, tagged=False, group_by_sent=True, group_by_para=True
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=False, group_by_para=False,
            tagset=tagset,
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=False,
            tagset=tagset,
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(
            fileids, tagged=True, group_by_sent=True, group_by_para=True,
            tagset=tagset,
        )
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed
        by the ``CategorizedCorpusReader`` constructor; all remaining
        arguments are forwarded to ``TaggedCorpusReader``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        # Resolve categories down to a concrete fileid list first.
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  Flags supplied at
    construction time control whether the documents are divided up by
    sentence and/or paragraph, and whether part-of-speech tags are
    included or omitted.  ``TaggedCorpusView`` objects are typically
    created by ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                # Split each token into a (word, tag) pair.
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(tok, self._sep) for tok in tokens]
                # Optionally map tags into a different tagset.
                mapper = self._tag_mapping_function
                if mapper:
                    sent = [(word, mapper(tag)) for (word, tag) in sent]
                # Drop the tags entirely for untagged views.
                if not self._tagged:
                    sent = [word for (word, tag) in sent]
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # One word per line, so words are split by lines and each
        # "sentence" token is a full line (".*\n").
        TaggedCorpusReader.__init__(
            self,
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # A block runs up to a line carrying the end-of-sentence tag "_.".
        return read_regexp_block(stream, r".*", r".*_\.")
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        # Force the TIMIT-specific paragraph block reader; all other
        # arguments pass straight through to TaggedCorpusReader.
        TaggedCorpusReader.__init__(
            self, *args, para_block_reader=read_timit_block, **kwargs
        )

    def paras(self):
        # Paragraph structure is not meaningful for TIMIT data.
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        raise NotImplementedError("use tagged_sents() instead")
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: TIMIT Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2007 NLTK Project
|
| 4 |
+
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Jacob Perkins <japerk@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
# [xx] this docstring is out-of-date:
|
| 11 |
+
"""
|
| 12 |
+
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
|
| 13 |
+
|
| 14 |
+
This corpus contains selected portion of the TIMIT corpus.
|
| 15 |
+
|
| 16 |
+
- 16 speakers from 8 dialect regions
|
| 17 |
+
- 1 male and 1 female from each dialect region
|
| 18 |
+
- total 130 sentences (10 sentences per speaker. Note that some
|
| 19 |
+
sentences are shared among other speakers, especially sa1 and sa2
|
| 20 |
+
are spoken by all speakers.)
|
| 21 |
+
- total 160 recording of sentences (10 recordings per speaker)
|
| 22 |
+
- audio format: NIST Sphere, single channel, 16kHz sampling,
|
| 23 |
+
16 bit sample, PCM encoding
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
Module contents
|
| 27 |
+
===============
|
| 28 |
+
|
| 29 |
+
The timit corpus reader provides 4 functions and 4 data items.
|
| 30 |
+
|
| 31 |
+
- utterances
|
| 32 |
+
|
| 33 |
+
List of utterances in the corpus. There are total 160 utterances,
|
| 34 |
+
each of which corresponds to a unique utterance of a speaker.
|
| 35 |
+
Here's an example of an utterance identifier in the list::
|
| 36 |
+
|
| 37 |
+
dr1-fvmh0/sx206
|
| 38 |
+
- _---- _---
|
| 39 |
+
| | | | |
|
| 40 |
+
| | | | |
|
| 41 |
+
| | | | `--- sentence number
|
| 42 |
+
| | | `----- sentence type (a:all, i:shared, x:exclusive)
|
| 43 |
+
| | `--------- speaker ID
|
| 44 |
+
| `------------ sex (m:male, f:female)
|
| 45 |
+
`-------------- dialect region (1..8)
|
| 46 |
+
|
| 47 |
+
- speakers
|
| 48 |
+
|
| 49 |
+
List of speaker IDs. An example of speaker ID::
|
| 50 |
+
|
| 51 |
+
dr1-fvmh0
|
| 52 |
+
|
| 53 |
+
Note that if you split an item ID with colon and take the first element of
|
| 54 |
+
the result, you will get a speaker ID.
|
| 55 |
+
|
| 56 |
+
>>> itemid = 'dr1-fvmh0/sx206'
|
| 57 |
+
>>> spkrid , sentid = itemid.split('/')
|
| 58 |
+
>>> spkrid
|
| 59 |
+
'dr1-fvmh0'
|
| 60 |
+
|
| 61 |
+
The second element of the result is a sentence ID.
|
| 62 |
+
|
| 63 |
+
- dictionary()
|
| 64 |
+
|
| 65 |
+
Phonetic dictionary of words contained in this corpus. This is a Python
|
| 66 |
+
dictionary from words to phoneme lists.
|
| 67 |
+
|
| 68 |
+
- spkrinfo()
|
| 69 |
+
|
| 70 |
+
Speaker information table. It's a Python dictionary from speaker IDs to
|
| 71 |
+
records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
|
| 72 |
+
Each record is a dictionary from field names to values, and the fields are
|
| 73 |
+
as follows::
|
| 74 |
+
|
| 75 |
+
id speaker ID as defined in the original TIMIT speaker info table
|
| 76 |
+
sex speaker gender (M:male, F:female)
|
| 77 |
+
dr speaker dialect region (1:new england, 2:northern,
|
| 78 |
+
3:north midland, 4:south midland, 5:southern, 6:new york city,
|
| 79 |
+
7:western, 8:army brat (moved around))
|
| 80 |
+
use corpus type (TRN:training, TST:test)
|
| 81 |
+
in this sample corpus only TRN is available
|
| 82 |
+
recdate recording date
|
| 83 |
+
birthdate speaker birth date
|
| 84 |
+
ht speaker height
|
| 85 |
+
race speaker race (WHT:white, BLK:black, AMR:american indian,
|
| 86 |
+
SPN:spanish-american, ORN:oriental,???:unknown)
|
| 87 |
+
edu speaker education level (HS:high school, AS:associate degree,
|
| 88 |
+
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
|
| 89 |
+
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
|
| 90 |
+
comments comments by the recorder
|
| 91 |
+
|
| 92 |
+
The 4 functions are as follows.
|
| 93 |
+
|
| 94 |
+
- tokenized(sentences=items, offset=False)
|
| 95 |
+
|
| 96 |
+
Given a list of items, returns an iterator of a list of word lists,
|
| 97 |
+
each of which corresponds to an item (sentence). If offset is set to True,
|
| 98 |
+
each element of the word list is a tuple of word(string), start offset and
|
| 99 |
+
end offset, where offset is represented as a number of 16kHz samples.
|
| 100 |
+
|
| 101 |
+
- phonetic(sentences=items, offset=False)
|
| 102 |
+
|
| 103 |
+
Given a list of items, returns an iterator of a list of phoneme lists,
|
| 104 |
+
each of which corresponds to an item (sentence). If offset is set to True,
|
| 105 |
+
each element of the phoneme list is a tuple of word(string), start offset
|
| 106 |
+
and end offset, where offset is represented as a number of 16kHz samples.
|
| 107 |
+
|
| 108 |
+
- audiodata(item, start=0, end=None)
|
| 109 |
+
|
| 110 |
+
Given an item, returns a chunk of audio samples formatted into a string.
|
| 111 |
+
When the function is called, if start and end are omitted, the entire
|
| 112 |
+
samples of the recording will be returned. If only end is omitted,
|
| 113 |
+
samples from the start offset to the end of the recording will be returned.
|
| 114 |
+
|
| 115 |
+
- play(data)
|
| 116 |
+
|
| 117 |
+
Play the given audio samples. The audio samples can be obtained from the
|
| 118 |
+
timit.audiodata function.
|
| 119 |
+
|
| 120 |
+
"""
|
| 121 |
+
import sys
|
| 122 |
+
import time
|
| 123 |
+
|
| 124 |
+
from nltk.corpus.reader.api import *
|
| 125 |
+
from nltk.internals import import_from_stdlib
|
| 126 |
+
from nltk.tree import Tree
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class TimitCorpusReader(CorpusReader):
|
| 130 |
+
"""
|
| 131 |
+
Reader for the TIMIT corpus (or any other corpus with the same
|
| 132 |
+
file layout and use of file formats). The corpus root directory
|
| 133 |
+
should contain the following files:
|
| 134 |
+
|
| 135 |
+
- timitdic.txt: dictionary of standard transcriptions
|
| 136 |
+
- spkrinfo.txt: table of speaker information
|
| 137 |
+
|
| 138 |
+
In addition, the root directory should contain one subdirectory
|
| 139 |
+
for each speaker, containing three files for each utterance:
|
| 140 |
+
|
| 141 |
+
- <utterance-id>.txt: text content of utterances
|
| 142 |
+
- <utterance-id>.wrd: tokenized text content of utterances
|
| 143 |
+
- <utterance-id>.phn: phonetic transcription of utterances
|
| 144 |
+
- <utterance-id>.wav: utterance sound file
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
_FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
|
| 148 |
+
"""A regexp matching fileids that are used by this corpus reader."""
|
| 149 |
+
_UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
|
| 150 |
+
|
| 151 |
+
    def __init__(self, root, encoding="utf8"):
        """
        Construct a new TIMIT corpus reader in the given directory.
        :param root: The root directory for this corpus.
        :param encoding: Text encoding for the corpus files; ``.wav``
            files are always read as raw bytes (see below).
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, str):
            encoding = [(r".*\.wav", None), (".*", encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        # Strip the ".txt" extension to obtain bare utterance ids.
        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]
        """A list of the utterance identifiers for all utterances in
        this corpus."""

        # Speaker info table is parsed lazily on first use (see spkrinfo()).
        self._speakerinfo = None
        self._root = root
        # Sorted list of distinct speaker ids (the part before "/").
        self.speakers = sorted({u.split("/")[0] for u in self._utterances})
|
| 173 |
+
|
| 174 |
+
def fileids(self, filetype=None):
|
| 175 |
+
"""
|
| 176 |
+
Return a list of file identifiers for the files that make up
|
| 177 |
+
this corpus.
|
| 178 |
+
|
| 179 |
+
:param filetype: If specified, then ``filetype`` indicates that
|
| 180 |
+
only the files that have the given type should be
|
| 181 |
+
returned. Accepted values are: ``txt``, ``wrd``, ``phn``,
|
| 182 |
+
``wav``, or ``metadata``,
|
| 183 |
+
"""
|
| 184 |
+
if filetype is None:
|
| 185 |
+
return CorpusReader.fileids(self)
|
| 186 |
+
elif filetype in ("txt", "wrd", "phn", "wav"):
|
| 187 |
+
return [f"{u}.{filetype}" for u in self._utterances]
|
| 188 |
+
elif filetype == "metadata":
|
| 189 |
+
return ["timitdic.txt", "spkrinfo.txt"]
|
| 190 |
+
else:
|
| 191 |
+
raise ValueError("Bad value for filetype: %r" % filetype)
|
| 192 |
+
|
| 193 |
+
def utteranceids(
|
| 194 |
+
self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
|
| 195 |
+
):
|
| 196 |
+
"""
|
| 197 |
+
:return: A list of the utterance identifiers for all
|
| 198 |
+
utterances in this corpus, or for the given speaker, dialect
|
| 199 |
+
region, gender, sentence type, or sentence number, if
|
| 200 |
+
specified.
|
| 201 |
+
"""
|
| 202 |
+
if isinstance(dialect, str):
|
| 203 |
+
dialect = [dialect]
|
| 204 |
+
if isinstance(sex, str):
|
| 205 |
+
sex = [sex]
|
| 206 |
+
if isinstance(spkrid, str):
|
| 207 |
+
spkrid = [spkrid]
|
| 208 |
+
if isinstance(sent_type, str):
|
| 209 |
+
sent_type = [sent_type]
|
| 210 |
+
if isinstance(sentid, str):
|
| 211 |
+
sentid = [sentid]
|
| 212 |
+
|
| 213 |
+
utterances = self._utterances[:]
|
| 214 |
+
if dialect is not None:
|
| 215 |
+
utterances = [u for u in utterances if u[2] in dialect]
|
| 216 |
+
if sex is not None:
|
| 217 |
+
utterances = [u for u in utterances if u[4] in sex]
|
| 218 |
+
if spkrid is not None:
|
| 219 |
+
utterances = [u for u in utterances if u[:9] in spkrid]
|
| 220 |
+
if sent_type is not None:
|
| 221 |
+
utterances = [u for u in utterances if u[11] in sent_type]
|
| 222 |
+
if sentid is not None:
|
| 223 |
+
utterances = [u for u in utterances if u[10:] in spkrid]
|
| 224 |
+
return utterances
|
| 225 |
+
|
| 226 |
+
def transcription_dict(self):
|
| 227 |
+
"""
|
| 228 |
+
:return: A dictionary giving the 'standard' transcription for
|
| 229 |
+
each word.
|
| 230 |
+
"""
|
| 231 |
+
_transcriptions = {}
|
| 232 |
+
with self.open("timitdic.txt") as fp:
|
| 233 |
+
for line in fp:
|
| 234 |
+
if not line.strip() or line[0] == ";":
|
| 235 |
+
continue
|
| 236 |
+
m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
|
| 237 |
+
if not m:
|
| 238 |
+
raise ValueError("Bad line: %r" % line)
|
| 239 |
+
_transcriptions[m.group(1)] = m.group(2).split()
|
| 240 |
+
return _transcriptions
|
| 241 |
+
|
| 242 |
+
def spkrid(self, utterance):
|
| 243 |
+
return utterance.split("/")[0]
|
| 244 |
+
|
| 245 |
+
def sentid(self, utterance):
|
| 246 |
+
return utterance.split("/")[1]
|
| 247 |
+
|
| 248 |
+
def utterance(self, spkrid, sentid):
|
| 249 |
+
return f"{spkrid}/{sentid}"
|
| 250 |
+
|
| 251 |
+
def spkrutteranceids(self, speaker):
|
| 252 |
+
"""
|
| 253 |
+
:return: A list of all utterances associated with a given
|
| 254 |
+
speaker.
|
| 255 |
+
"""
|
| 256 |
+
return [
|
| 257 |
+
utterance
|
| 258 |
+
for utterance in self._utterances
|
| 259 |
+
if utterance.startswith(speaker + "/")
|
| 260 |
+
]
|
| 261 |
+
|
| 262 |
+
    def spkrinfo(self, speaker):
        """
        :return: The ``SpeakerInfo`` record for the given speaker
            (an utterance id may also be passed; its speaker portion
            is used).
        """
        # Accept a full utterance id as well as a bare speaker id.
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        # Lazily parse spkrinfo.txt on first lookup, caching one
        # SpeakerInfo record per speaker.
        if self._speakerinfo is None:
            self._speakerinfo = {}
            with self.open("spkrinfo.txt") as fp:
                for line in fp:
                    # Skip blank lines and ";"-prefixed comment lines.
                    if not line.strip() or line[0] == ";":
                        continue
                    # At most 10 fields; the last (comments) may itself
                    # contain whitespace, hence the maxsplit of 9.
                    rec = line.strip().split(None, 9)
                    # Build the "dr<region>-<sex><id>" speaker-id key.
                    key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}"
                    self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]
|
| 280 |
+
|
| 281 |
+
def phones(self, utterances=None):
    """Return a flat list of phone labels read from the ``.phn`` files."""
    phone_list = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            phone_list.extend(ln.split()[-1] for ln in fp if ln.strip())
    return phone_list
|
| 289 |
+
|
| 290 |
+
def phone_times(self, utterances=None):
    """Return ``(phone, start, end)`` triples from the ``.phn`` files.

    offset is represented as a number of 16kHz samples!
    """
    results = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once per line instead of three times.
                fields = line.split()
                results.append((fields[2], int(fields[0]), int(fields[1])))
    return results
|
| 307 |
+
|
| 308 |
+
def words(self, utterances=None):
    """Return a flat list of words read from the ``.wrd`` files."""
    word_list = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            word_list.extend(ln.split()[-1] for ln in fp if ln.strip())
    return word_list
|
| 316 |
+
|
| 317 |
+
def word_times(self, utterances=None):
    """Return ``(word, start, end)`` triples from the ``.wrd`` files.

    Offsets are expressed as numbers of 16kHz samples.
    """
    results = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once per line instead of three times.
                fields = line.split()
                results.append((fields[2], int(fields[0]), int(fields[1])))
    return results
|
| 331 |
+
|
| 332 |
+
def sents(self, utterances=None):
    """Return one word-list per utterance, read from the ``.wrd`` files."""
    sentences = []
    for fileid in self._utterance_fileids(utterances, ".wrd"):
        with self.open(fileid) as fp:
            words = [ln.split()[-1] for ln in fp if ln.strip()]
        sentences.append(words)
    return sentences
|
| 338 |
+
|
| 339 |
+
def sent_times(self, utterances=None):
    """Return ``(sentence-text, start, end)`` triples from the ``.txt`` files.

    Offsets are expressed as numbers of 16kHz samples.
    """
    # TODO: Check this
    results = []
    for fileid in self._utterance_fileids(utterances, ".txt"):
        # Use a context manager so each stream is closed promptly;
        # the original comprehension leaked open file handles.
        with self.open(fileid) as fp:
            for line in fp:
                if not line.strip():
                    continue
                # Split once: first two fields are sample offsets,
                # the remainder is the sentence text.
                start, end, text = line.split(None, 2)
                results.append((text.strip(), int(start), int(end)))
    return results
|
| 351 |
+
|
| 352 |
+
def phone_trees(self, utterances=None):
    # Build, for each sentence, a Tree("S", ...) whose children are
    # word subtrees (Tree(word, [phones...])) plus any phones that
    # fall outside every word (e.g. leading/trailing silence).
    # Alignment is purely time-based: a phone belongs to whichever
    # word/sentence interval its end time falls inside.
    if utterances is None:
        utterances = self._utterances
    if isinstance(utterances, str):
        utterances = [utterances]

    trees = []
    for utterance in utterances:
        word_times = self.word_times(utterance)
        phone_times = self.phone_times(utterance)
        sent_times = self.sent_times(utterance)

        # The three lists are consumed front-to-front as parallel
        # time-ordered queues; pop(0) order is essential here.
        while sent_times:
            (sent, sent_start, sent_end) = sent_times.pop(0)
            trees.append(Tree("S", []))
            # Phones ending before the first word starts attach
            # directly to the sentence node (sentence-initial silence).
            while (
                word_times and phone_times and phone_times[0][2] <= word_times[0][1]
            ):
                trees[-1].append(phone_times.pop(0)[0])
            # Consume each word of this sentence, attaching the phones
            # whose end time falls within the word's span.
            while word_times and word_times[0][2] <= sent_end:
                (word, word_start, word_end) = word_times.pop(0)
                trees[-1].append(Tree(word, []))
                while phone_times and phone_times[0][2] <= word_end:
                    trees[-1][-1].append(phone_times.pop(0)[0])
            # Remaining phones inside this sentence (sentence-final silence).
            while phone_times and phone_times[0][2] <= sent_end:
                trees[-1].append(phone_times.pop(0)[0])
    return trees
|
| 379 |
+
|
| 380 |
+
# [xx] NOTE: This is currently broken -- we're assuming that the
# fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
# fileids.
def wav(self, utterance, start=0, end=None):
    # Return RIFF-encoded audio bytes for frames [start:end) of the
    # utterance.  Offsets are in frames; end=None means "to the end".
    # nltk.chunk conflicts with the stdlib module 'chunk'
    wave = import_from_stdlib("wave")

    w = wave.open(self.open(utterance + ".wav"), "rb")

    if end is None:
        end = w.getnframes()

    # Skip past frames before start, then read the frames we want
    w.readframes(start)
    frames = w.readframes(end - start)

    # Open a new temporary file -- the wave module requires
    # an actual file, and won't work w/ stringio. :(
    tf = tempfile.TemporaryFile()
    out = wave.open(tf, "w")

    # Write the parameters & data to the new file.
    out.setparams(w.getparams())
    out.writeframes(frames)
    out.close()

    # Read the data back from the file, and return it.  The
    # file will automatically be deleted when we return.
    tf.seek(0)
    return tf.read()
|
| 410 |
+
|
| 411 |
+
def audiodata(self, utterance, start=0, end=None):
    """Return raw sample bytes for the utterance, sliced by frame offsets.

    ``start`` and ``end`` are frame indices (2 bytes per frame); the
    44-byte RIFF header is skipped.  ``end=None`` reads to the end.
    """
    assert end is None or end > start
    header_size = 44
    with self.open(utterance + ".wav") as fp:
        if end is None:
            data = fp.read()
        else:
            # Read only as much as needed: header + end frames.
            data = fp.read(header_size + end * 2)
    return data[header_size + start * 2 :]
|
| 420 |
+
|
| 421 |
+
def _utterance_fileids(self, utterances, extension):
|
| 422 |
+
if utterances is None:
|
| 423 |
+
utterances = self._utterances
|
| 424 |
+
if isinstance(utterances, str):
|
| 425 |
+
utterances = [utterances]
|
| 426 |
+
return [f"{u}{extension}" for u in utterances]
|
| 427 |
+
|
| 428 |
+
def play(self, utterance, start=0, end=None):
    """
    Play the given audio sample.

    Tries ``ossaudiodev`` first, then ``pygame``; prints an error to
    stderr if neither backend is available.

    :param utterance: The utterance id of the sample to play
    :param start: Frame offset at which playback begins.
    :param end: Frame offset at which playback stops, or None for
        the end of the sample.
    """
    # Method 1: os audio dev.
    try:
        import ossaudiodev

        try:
            dsp = ossaudiodev.open("w")
            # 16-bit little-endian mono at 16kHz, matching TIMIT audio.
            dsp.setfmt(ossaudiodev.AFMT_S16_LE)
            dsp.channels(1)
            dsp.speed(16000)
            dsp.write(self.audiodata(utterance, start, end))
            dsp.close()
        except OSError as e:
            print(
                (
                    "can't acquire the audio device; please "
                    "activate your audio device."
                ),
                file=sys.stderr,
            )
            print("system error message:", str(e), file=sys.stderr)
            return
    except ImportError:
        pass

    # Method 2: pygame
    try:
        # FIXME: this won't work under python 3
        import pygame.mixer
        import StringIO

        pygame.mixer.init(16000)
        f = StringIO.StringIO(self.wav(utterance, start, end))
        pygame.mixer.Sound(f).play()
        # Busy-wait until playback completes.
        while pygame.mixer.get_busy():
            time.sleep(0.01)
        return
    except ImportError:
        pass

    # Method 3: complain. :)
    print(
        ("you must install pygame or ossaudiodev " "for audio playback."),
        file=sys.stderr,
    )
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
class SpeakerInfo:
    """Per-speaker metadata record parsed from ``spkrinfo.txt``."""

    # Attribute names, in the order they appear in spkrinfo.txt records.
    _FIELDS = (
        "id", "sex", "dr", "use", "recdate",
        "birthdate", "ht", "race", "edu", "comments",
    )

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        values = (id, sex, dr, use, recdate, birthdate, ht, race, edu, comments)
        for name, value in zip(self._FIELDS, values):
            setattr(self, name, value)

    def __repr__(self):
        args = ", ".join(f"{name}={getattr(self, name)!r}" for name in self._FIELDS)
        return "SpeakerInfo(%s)" % args
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.
    """
    line = stream.readline()
    if not line:
        return []
    _number, sentence = line.split(" ", 1)
    return [sentence]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Toolbox Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Greg Aumann <greg_aumann@sil.org>
|
| 5 |
+
# Stuart Robinson <Stuart.Robinson@mpi.nl>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
Module for reading, writing and manipulating
|
| 12 |
+
Toolbox databases and settings fileids.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from nltk.corpus.reader.api import *
|
| 16 |
+
from nltk.corpus.reader.util import *
|
| 17 |
+
from nltk.toolbox import ToolboxData
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox (Shoebox) databases and settings files."""

    def xml(self, fileids, key=None):
        """Return the XML parse of the given files, concatenated."""
        parsed = [
            ToolboxData(path, enc).parse(key=key)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(parsed)

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding="utf8",
        errors="strict",
        unicode_fields=None,
    ):
        """Return the (marker, contents) fields of the given files."""
        per_file = []
        for (fileid, enc) in self.abspaths(fileids, include_encoding=True):
            field_iter = ToolboxData(fileid, enc).fields(
                strip, unwrap, encoding, errors, unicode_fields
            )
            per_file.append(list(field_iter))
        return concat(per_file)

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """Group fields into (headword, fields) entries.

        The record marker may be overridden with a ``key`` keyword
        argument; fields preceding the first record marker are discarded.
        """
        key = kwargs.pop("key", "lx")  # 'lx' is the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                entries[-1][-1].append((marker, contents))
        return entries

    def words(self, fileids, key="lx"):
        """Return the contents of every field whose marker equals ``key``."""
        return [contents for marker, contents in self.fields(fileids) if marker == key]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def demo():
    """Placeholder demo; the toolbox reader has no standalone demo yet."""
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Run the (currently empty) demo when executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Twitter Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A reader for corpora that consist of Tweets. It is assumed that the Tweets
|
| 10 |
+
have been serialised into line-delimited JSON.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 17 |
+
from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
|
| 18 |
+
from nltk.tokenize import TweetTokenizer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))

    """

    # The corpus view class used by this reader.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        # (Zip members cannot be size-checked this way and are skipped.)
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError(f"File {path} is empty")

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        views = [
            self.CorpusView(path, self._read_tweets, encoding=enc)
            for (path, enc, fileid) in self.abspaths(fileids, True, True)
        ]
        return concat(views)

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        texts = []
        for tweet in self.docs(fileids):
            try:
                text = tweet["text"]
            except KeyError:
                # Tweets without a 'text' field are silently skipped.
                continue
            if isinstance(text, bytes):
                text = text.decode(self.encoding)
            texts.append(text)
        return texts

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        """
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(text) for text in self.strings(fileids)]

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
        Reads at most 10 Tweets per block.
        """
        tweets = []
        while len(tweets) < 10:
            line = stream.readline()
            if not line:
                break
            tweets.append(json.loads(line))
        return tweets
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UDHR corpus reader. It mostly deals with encodings.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
| 6 |
+
from nltk.corpus.reader.util import find_corpus_fileids
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class UdhrCorpusReader(PlaintextCorpusReader):
    # Plaintext reader for the UDHR corpus; the bulk of the work is
    # assigning the correct character encoding to each file.

    # (fileid regexp, encoding) pairs; used as the reader's encoding map
    # so each fileid is decoded with the first matching entry.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        ("Czech_Cesky-UTF8", "cp1250"),  # yeah
        ("Polish-Latin2", "cp1250"),
        ("Polish_Polski-Latin2", "cp1250"),
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids excluded from the corpus view entirely.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        # Everything except README and dotfiles; SKIP entries are dropped
        # before handing the fileid list to PlaintextCorpusReader.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        super().__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py
ADDED
|
@@ -0,0 +1,867 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Corpus Reader Utilities
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import bisect
|
| 10 |
+
import os
|
| 11 |
+
import pickle
|
| 12 |
+
import re
|
| 13 |
+
import tempfile
|
| 14 |
+
from functools import reduce
|
| 15 |
+
from xml.etree import ElementTree
|
| 16 |
+
|
| 17 |
+
from nltk.data import (
|
| 18 |
+
FileSystemPathPointer,
|
| 19 |
+
PathPointer,
|
| 20 |
+
SeekableUnicodeStreamReader,
|
| 21 |
+
ZipFilePathPointer,
|
| 22 |
+
)
|
| 23 |
+
from nltk.internals import slice_bounds
|
| 24 |
+
from nltk.tokenize import wordpunct_tokenize
|
| 25 |
+
from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
|
| 26 |
+
|
| 27 |
+
######################################################################
|
| 28 |
+
# { Corpus View
|
| 29 |
+
######################################################################
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class StreamBackedCorpusView(AbstractLazySequence):
|
| 33 |
+
"""
|
| 34 |
+
A 'view' of a corpus file, which acts like a sequence of tokens:
|
| 35 |
+
it can be accessed by index, iterated over, etc. However, the
|
| 36 |
+
tokens are only constructed as-needed -- the entire corpus is
|
| 37 |
+
never stored in memory at once.
|
| 38 |
+
|
| 39 |
+
The constructor to ``StreamBackedCorpusView`` takes two arguments:
|
| 40 |
+
a corpus fileid (specified as a string or as a ``PathPointer``);
|
| 41 |
+
and a block reader. A "block reader" is a function that reads
|
| 42 |
+
zero or more tokens from a stream, and returns them as a list. A
|
| 43 |
+
very simple example of a block reader is:
|
| 44 |
+
|
| 45 |
+
>>> def simple_block_reader(stream):
|
| 46 |
+
... return stream.readline().split()
|
| 47 |
+
|
| 48 |
+
This simple block reader reads a single line at a time, and
|
| 49 |
+
returns a single token (consisting of a string) for each
|
| 50 |
+
whitespace-separated substring on the line.
|
| 51 |
+
|
| 52 |
+
When deciding how to define the block reader for a given
|
| 53 |
+
corpus, careful consideration should be given to the size of
|
| 54 |
+
blocks handled by the block reader. Smaller block sizes will
|
| 55 |
+
increase the memory requirements of the corpus view's internal
|
| 56 |
+
data structures (by 2 integers per block). On the other hand,
|
| 57 |
+
larger block sizes may decrease performance for random access to
|
| 58 |
+
the corpus. (But note that larger block sizes will *not*
|
| 59 |
+
decrease performance for iteration.)
|
| 60 |
+
|
| 61 |
+
Internally, ``CorpusView`` maintains a partial mapping from token
|
| 62 |
+
index to file position, with one entry per block. When a token
|
| 63 |
+
with a given index *i* is requested, the ``CorpusView`` constructs
|
| 64 |
+
it as follows:
|
| 65 |
+
|
| 66 |
+
1. First, it searches the toknum/filepos mapping for the token
|
| 67 |
+
index closest to (but less than or equal to) *i*.
|
| 68 |
+
|
| 69 |
+
2. Then, starting at the file position corresponding to that
|
| 70 |
+
index, it reads one block at a time using the block reader
|
| 71 |
+
until it reaches the requested token.
|
| 72 |
+
|
| 73 |
+
The toknum/filepos mapping is created lazily: it is initially
|
| 74 |
+
empty, but every time a new block is read, the block's
|
| 75 |
+
initial token is added to the mapping. (Thus, the toknum/filepos
|
| 76 |
+
map has one entry per block.)
|
| 77 |
+
|
| 78 |
+
In order to increase efficiency for random access patterns that
|
| 79 |
+
have high degrees of locality, the corpus view may cache one or
|
| 80 |
+
more blocks.
|
| 81 |
+
|
| 82 |
+
:note: Each ``CorpusView`` object internally maintains an open file
|
| 83 |
+
object for its underlying corpus file. This file should be
|
| 84 |
+
automatically closed when the ``CorpusView`` is garbage collected,
|
| 85 |
+
but if you wish to close it manually, use the ``close()``
|
| 86 |
+
method. If you access a ``CorpusView``'s items after it has been
|
| 87 |
+
closed, the file object will be automatically re-opened.
|
| 88 |
+
|
| 89 |
+
:warning: If the contents of the file are modified during the
|
| 90 |
+
lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
|
| 91 |
+
is undefined.
|
| 92 |
+
|
| 93 |
+
:warning: If a unicode encoding is specified when constructing a
|
| 94 |
+
``CorpusView``, then the block reader may only call
|
| 95 |
+
``stream.seek()`` with offsets that have been returned by
|
| 96 |
+
``stream.tell()``; in particular, calling ``stream.seek()`` with
|
| 97 |
+
relative offsets, or with offsets based on string lengths, may
|
| 98 |
+
lead to incorrect behavior.
|
| 99 |
+
|
| 100 |
+
:ivar _block_reader: The function used to read
|
| 101 |
+
a single block from the underlying file stream.
|
| 102 |
+
:ivar _toknum: A list containing the token index of each block
|
| 103 |
+
that has been processed. In particular, ``_toknum[i]`` is the
|
| 104 |
+
token index of the first token in block ``i``. Together
|
| 105 |
+
with ``_filepos``, this forms a partial mapping between token
|
| 106 |
+
indices and file positions.
|
| 107 |
+
:ivar _filepos: A list containing the file position of each block
|
| 108 |
+
that has been processed. In particular, ``_toknum[i]`` is the
|
| 109 |
+
file position of the first character in block ``i``. Together
|
| 110 |
+
with ``_toknum``, this forms a partial mapping between token
|
| 111 |
+
indices and file positions.
|
| 112 |
+
:ivar _stream: The stream used to access the underlying corpus file.
|
| 113 |
+
:ivar _len: The total number of tokens in the corpus, if known;
|
| 114 |
+
or None, if the number of tokens is not yet known.
|
| 115 |
+
:ivar _eofpos: The character position of the last character in the
|
| 116 |
+
file. This is calculated when the corpus view is initialized,
|
| 117 |
+
and is used to decide when the end of file has been reached.
|
| 118 |
+
:ivar _cache: A cache of the most recently read block. It
|
| 119 |
+
is encoded as a tuple (start_toknum, end_toknum, tokens), where
|
| 120 |
+
start_toknum is the token index of the first token in the block;
|
| 121 |
+
end_toknum is the token index of the first token not in the
|
| 122 |
+
block; and tokens is a list of the tokens in the block.
|
| 123 |
+
"""
|
| 124 |
+
|
| 125 |
+
def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
    """
    Create a new corpus view, based on the file ``fileid``, and
    read with ``block_reader``.  See the class documentation
    for more information.

    :param fileid: The path to the file that is read by this
        corpus view.  ``fileid`` can either be a string or a
        ``PathPointer``.

    :param block_reader: Optional callable used to read a single
        block of tokens from the stream; if given, it replaces
        ``self.read_block`` for this instance.

    :param startpos: The file position at which the view will
        start reading.  This can be used to skip over preface
        sections.

    :param encoding: The unicode encoding that should be used to
        read the file's contents.  If no encoding is specified,
        then the file's contents will be read as a non-unicode
        string (i.e., a str).

    :raises ValueError: if the file cannot be stat'ed/accessed.
    """
    if block_reader:
        # Per-instance override of the class-level read_block method.
        self.read_block = block_reader
    # Initialize our toknum/filepos mapping.
    self._toknum = [0]
    self._filepos = [startpos]
    self._encoding = encoding
    # We don't know our length (number of tokens) yet.
    self._len = None

    self._fileid = fileid
    self._stream = None  # opened lazily by _open()

    self._current_toknum = None
    """This variable is set to the index of the next token that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current token number."""

    self._current_blocknum = None
    """This variable is set to the index of the next block that
    will be read, immediately before ``self.read_block()`` is
    called.  This is provided for the benefit of the block
    reader, which under rare circumstances may need to know
    the current block number."""

    # Find the length of the file (in bytes); used by iterate_from()
    # to detect end-of-file.
    try:
        if isinstance(self._fileid, PathPointer):
            self._eofpos = self._fileid.file_size()
        else:
            self._eofpos = os.stat(self._fileid).st_size
    except Exception as exc:
        raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc

    # Maintain a cache of the most recently read block, to
    # increase efficiency of random access.
    self._cache = (-1, -1, None)
|
| 182 |
+
|
| 183 |
+
# Read-only accessor for the underlying file identifier.
fileid = property(
    lambda self: self._fileid,
    doc="""
    The fileid of the file that is accessed by this view.

    :type: str or PathPointer""",
)
|
| 190 |
+
|
| 191 |
+
def read_block(self, stream):
    """
    Read a block from the input stream.  Abstract: subclasses must
    override this, or a ``block_reader`` callable must be supplied
    to the constructor (which replaces this method per-instance).

    :return: a block of tokens from the input stream
    :rtype: list(any)
    :param stream: an input stream
    :type stream: stream
    """
    raise NotImplementedError("Abstract Method")
|
| 201 |
+
|
| 202 |
+
def _open(self):
    """
    Open the file stream associated with this corpus view.  This
    will be called if any value is read from the view while its
    file stream is closed.
    """
    if isinstance(self._fileid, PathPointer):
        self._stream = self._fileid.open(self._encoding)
    elif self._encoding:
        # Wrap the raw byte stream so tell()/seek() offsets stay
        # meaningful even though characters are decoded on the fly.
        self._stream = SeekableUnicodeStreamReader(
            open(self._fileid, "rb"), self._encoding
        )
    else:
        # No encoding: hand back raw bytes.
        self._stream = open(self._fileid, "rb")
|
| 216 |
+
|
| 217 |
+
def close(self):
    """
    Close the file stream associated with this corpus view.  This
    can be useful if you are worried about running out of file
    handles (although the stream should automatically be closed
    upon garbage collection of the corpus view).  If the corpus
    view is accessed after it is closed, it will be automatically
    re-opened.
    """
    # Detach the stream first, then close the detached handle;
    # self._stream always ends up None either way.
    stream, self._stream = self._stream, None
    if stream is not None:
        stream.close()
|
| 229 |
+
|
| 230 |
+
def __enter__(self):
    """Context-manager entry: return the view itself."""
    return self
|
| 232 |
+
|
| 233 |
+
def __exit__(self, exc_type, exc_value, exc_traceback):
    """
    Context-manager exit: close the underlying stream.

    Parameters are the standard ``__exit__`` exception triple; they
    were previously named ``type``/``value``/``traceback``, shadowing
    the ``type`` builtin (renaming is safe -- the ``with`` machinery
    passes them positionally).  Returns None (falsy), so exceptions
    raised inside the ``with`` block are never suppressed.
    """
    self.close()
|
| 235 |
+
|
| 236 |
+
def __len__(self):
    """Return the number of tokens, reading to EOF on first use."""
    if self._len is None:
        # iterate_from() sets self._len when it reaches the end
        # of the file:
        for tok in self.iterate_from(self._toknum[-1]):
            pass
    return self._len
|
| 243 |
+
|
| 244 |
+
def __getitem__(self, i):
    """
    Return the token at index ``i`` (or a ``LazySubsequence`` for a
    slice), serving from the most-recently-read block cache when
    possible.

    :raises IndexError: if ``i`` is out of range.
    """
    if isinstance(i, slice):
        start, stop = slice_bounds(self, i)
        # Check if it's in the cache.
        offset = self._cache[0]
        if offset <= start and stop <= self._cache[1]:
            return self._cache[2][start - offset : stop - offset]
        # Construct & return the result lazily.
        return LazySubsequence(self, start, stop)
    else:
        # Handle negative indices (may force a full length scan).
        if i < 0:
            i += len(self)
        if i < 0:
            raise IndexError("index out of range")
        # Check if it's in the cache.
        offset = self._cache[0]
        if offset <= i < self._cache[1]:
            return self._cache[2][i - offset]
        # Use iterate_from to extract it.
        try:
            return next(self.iterate_from(i))
        except StopIteration as e:
            raise IndexError("index out of range") from e
|
| 268 |
+
|
| 269 |
+
# If we wanted to be thread-safe, then this method would need to
|
| 270 |
+
# do some locking.
|
| 271 |
+
def iterate_from(self, start_tok):
    """
    Generate tokens starting at token index ``start_tok``, reading
    blocks on demand and extending the token-index/file-position
    mapping as new blocks are seen.  Closes the stream at EOF.
    """
    # Start by feeding from the cache, if possible.
    if self._cache[0] <= start_tok < self._cache[1]:
        for tok in self._cache[2][start_tok - self._cache[0] :]:
            yield tok
            # Advance so the file-reading phase below resumes after
            # the cached tokens we already produced.
            start_tok += 1

    # Decide where in the file we should start.  If `start` is in
    # our mapping, then we can jump straight to the correct block;
    # otherwise, start at the last block we've processed.
    if start_tok < self._toknum[-1]:
        block_index = bisect.bisect_right(self._toknum, start_tok) - 1
        toknum = self._toknum[block_index]
        filepos = self._filepos[block_index]
    else:
        block_index = len(self._toknum) - 1
        toknum = self._toknum[-1]
        filepos = self._filepos[-1]

    # Open the stream, if it's not open already.
    if self._stream is None:
        self._open()

    # If the file is empty, the while loop will never run.
    # This *seems* to be all the state we need to set:
    if self._eofpos == 0:
        self._len = 0

    # Each iteration through this loop, we read a single block
    # from the stream.
    while filepos < self._eofpos:
        # Read the next block.
        self._stream.seek(filepos)
        self._current_toknum = toknum
        self._current_blocknum = block_index
        tokens = self.read_block(self._stream)
        assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
            "block reader %s() should return list or tuple."
            % self.read_block.__name__
        )
        num_toks = len(tokens)
        new_filepos = self._stream.tell()
        assert (
            new_filepos > filepos
        ), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
            self.read_block.__name__,
            filepos,
        )

        # Update our cache.
        self._cache = (toknum, toknum + num_toks, list(tokens))

        # Update our mapping.
        assert toknum <= self._toknum[-1]
        if num_toks > 0:
            block_index += 1
            if toknum == self._toknum[-1]:
                # First time we've seen this block: extend the map.
                assert new_filepos > self._filepos[-1]  # monotonic!
                self._filepos.append(new_filepos)
                self._toknum.append(toknum + num_toks)
            else:
                # Re-reading a known block: check for consistency.
                assert (
                    new_filepos == self._filepos[block_index]
                ), "inconsistent block reader (num chars read)"
                assert (
                    toknum + num_toks == self._toknum[block_index]
                ), "inconsistent block reader (num tokens returned)"

        # If we reached the end of the file, then update self._len
        if new_filepos == self._eofpos:
            self._len = toknum + num_toks
        # Generate the tokens in this block (but skip any tokens
        # before start_tok).  Note that between yields, our state
        # may be modified.
        for tok in tokens[max(0, start_tok - toknum) :]:
            yield tok
        # If we're at the end of the file, then we're done.
        assert new_filepos <= self._eofpos
        if new_filepos == self._eofpos:
            break
        # Update our indices
        toknum += num_toks
        filepos = new_filepos

    # If we reach this point, then we should know our length.
    assert self._len is not None
    # Enforce closing of stream once we reached end of file
    # We should have reached EOF once we're out of the while loop.
    self.close()
|
| 361 |
+
|
| 362 |
+
# Use concat for these, so we can use a ConcatenatedCorpusView
|
| 363 |
+
# when possible.
|
| 364 |
+
def __add__(self, other):
    """Return ``concat([self, other])`` (lazy concatenation)."""
    return concat([self, other])
|
| 366 |
+
|
| 367 |
+
def __radd__(self, other):
    """Return ``concat([other, self])`` (lazy concatenation)."""
    return concat([other, self])
|
| 369 |
+
|
| 370 |
+
def __mul__(self, count):
    """Return this view lazily repeated ``count`` times."""
    return concat([self] * count)
|
| 372 |
+
|
| 373 |
+
def __rmul__(self, count):
    """Return this view lazily repeated ``count`` times."""
    return concat([self] * count)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        """Return the total token count, iterating to EOF on first use."""
        if len(self._offsets) <= len(self._pieces):
            # Offsets table is incomplete: iterate to the end of the
            # corpus, which fills it in as a side effect.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        """Close every underlying subview's stream."""
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        """Generate tokens from global index ``start_tok`` onward,
        walking the subviews in order and keeping at most one open."""
        # Locate the subview that contains start_tok.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table (we now know this piece's length).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.

    :raises ValueError: if ``docs`` is empty, or if no concatenation
        strategy is known for the document types.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    doc_classes = {doc.__class__ for doc in docs}

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    if all(
        issubclass(cls, (StreamBackedCorpusView, ConcatenatedCorpusView))
        for cls in doc_classes
    ):
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation.
    if all(issubclass(cls, AbstractLazySequence) for cls in doc_classes):
        return LazyConcatenation(docs)

    # Otherwise, see what we can do with a single homogeneous type:
    if len(doc_classes) == 1:
        (doc_class,) = doc_classes

        if issubclass(doc_class, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(doc_class, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        # NOTE(review): ``iselement`` expects an element *instance*,
        # but ``doc_class`` is a class object -- this branch may be
        # unreachable with stdlib ElementTree; confirm before relying
        # on it.
        if ElementTree.iselement(doc_class):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % doc_classes)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
######################################################################
|
| 487 |
+
# { Corpus View for Pickled Sequences
|
| 488 |
+
######################################################################
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

    >>> from nltk.corpus.reader.util import PickleCorpusView
    >>> from nltk.util import LazyMap
    >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
    >>> PickleCorpusView.write(feature_corpus, some_fileid)  # doctest: +SKIP
    >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
    """

    # Number of pickled objects returned per call to read_block().
    BLOCK_SIZE = 100
    # Pickle protocol; -1 selects the highest protocol available.
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        """Unpickle and return up to ``BLOCK_SIZE`` objects from ``stream``."""
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        ``PickledCorpusView`` is garbage-collected.)
        """
        # Use a default so a partially-constructed instance (e.g. if
        # __init__ raised before setting the flag) cannot trigger an
        # AttributeError from inside the finalizer.
        if getattr(self, "_delete_on_gc", False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except OSError:
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        """
        Pickle every item of ``sequence`` to ``output_file``.

        :param output_file: either an open binary file object, or a
            filename.  A file opened here from a filename is closed
            before returning (previously it was leaked); a file object
            supplied by the caller is left open for the caller to manage.
        """
        opened_here = False
        if isinstance(output_file, str):
            output_file = open(output_file, "wb")
            opened_here = True
        try:
            for item in sequence:
                pickle.dump(item, output_file, cls.PROTOCOL)
        finally:
            if opened_here:
                output_file.close()

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        :raises ValueError: if the temporary file cannot be created.
        """
        try:
            fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
            output_file = os.fdopen(fd, "wb")
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except OSError as e:
            raise ValueError("Error while creating temp file: %s" % e) from e
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
######################################################################
|
| 574 |
+
# { Block Readers
|
| 575 |
+
######################################################################
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-separated tokens as a flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens += stream.readline().split()
    return tokens
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    ``wordpunct_tokenize`` tokens as a flat list."""
    tokens = []
    for _ in range(20):  # Read 20 lines at a time.
        tokens += wordpunct_tokenize(stream.readline())
    return tokens
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
def read_line_block(stream):
    """Read up to 20 lines from ``stream``, returning them with their
    trailing newlines stripped; stops early at end of file."""
    lines = []
    count = 0
    while count < 20:
        line = stream.readline()
        if not line:  # end of file
            break
        lines.append(line.rstrip("\n"))
        count += 1
    return lines
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream``.
    Returns ``[paragraph]`` (newlines preserved), or ``[]`` at EOF."""
    paragraph = ""
    while True:
        line = stream.readline()
        if not line:
            # End of file: emit what we collected, if anything.
            return [paragraph] if paragraph else []
        if not line.strip():
            # Blank line: terminates a non-empty paragraph; leading
            # blank lines are simply skipped.
            if paragraph:
                return [paragraph]
        else:
            # Content line: accumulate.
            paragraph += line
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
def read_alignedsent_block(stream):
    """
    Read one aligned-sentence record from ``stream``: content lines are
    accumulated until a line starting with an alignment pattern
    (``\\d+-\\d+``) ends the record.  Comment lines (starting with
    ``=``) and blank lines are skipped.  Returns ``[record]``, or
    ``[]`` at end of file.

    Bug fix: the end-of-file check now runs *before* ``line[0]`` is
    inspected; previously an empty readline() result at EOF raised
    ``IndexError`` instead of returning.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file (must be checked first: line may be ""):
        if not line:
            if s:
                return [s]
            else:
                return []
        # Separator / blank lines are skipped:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        # An alignment line terminates the record.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Skip ahead until a line matches the start pattern.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Collect lines until the token ends (next start match, end match,
    # or EOF).
    collected = [line]
    while True:
        prev_pos = stream.tell()
        line = stream.readline()
        if not line:
            # End of file terminates the token.
            return ["".join(collected)]
        if end_re is not None and re.match(end_re, line):
            # Explicit end marker terminates the token (marker consumed).
            return ["".join(collected)]
        if end_re is None and re.match(start_re, line):
            # A new token starts here: rewind so the next call sees it.
            stream.seek(prev_pos)
            return ["".join(collected)]
        # Anything else is part of the token.
        collected.append(line)
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start + offset)
            else:
                # For unicode streams, seek by the *encoded* byte length
                # of the consumed prefix, not by character count.
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                # The parser ran off the end of the block: read more
                # data and retry.
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
|
| 743 |
+
|
| 744 |
+
|
| 745 |
+
def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return len(m.group()) * " "
|
| 749 |
+
|
| 750 |
+
|
| 751 |
+
def _parse_sexpr_block(block):
    """
    Partition ``block`` into complete s-expression strings.

    :return: a tuple ``(tokens, end)``, where ``tokens`` is the list of
        s-expression strings found, and ``end`` is the offset just past
        the last complete s-expression.
    :raises ValueError: "Block too small" when ``block`` ends in the
        middle of its *first* s-expression (callers catch this, read
        more data, and retry).
    """
    tokens = []
    start = end = 0

    while end < len(block):
        # Find the start of the next token (first non-whitespace char).
        m = re.compile(r"\S").search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != "(":
            # Token runs until the next whitespace or open paren.
            m2 = re.compile(r"[\s(]").search(block, start)
            if m2:
                end = m2.start()
            else:
                # Token may continue past the end of this block.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        # Case 2: parenthesized sexpr.
        else:
            # Scan parens, tracking nesting depth until it closes.
            nesting = 0
            for m in re.compile(r"[()]").finditer(block, start):
                if m.group() == "(":
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                # Parens never balanced: sexpr continues past the block.
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        tokens.append(block[start:end])

    return tokens, end
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
######################################################################
|
| 794 |
+
# { Finding Corpus Items
|
| 795 |
+
######################################################################
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
def find_corpus_fileids(root, regexp):
    """
    Return a sorted list of the fileids (relative to ``root``) whose
    full relative paths match ``regexp`` (anchored at the end).
    ``root`` must be a ``ZipFilePathPointer`` or a
    ``FileSystemPathPointer``.
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    # Anchor the pattern so it must match the entire relative path.
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        for dirname, subdirs, fileids in os.walk(root.path):
            # Relative-path prefix ("sub/dir/") for files in dirname.
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
def _path_from(parent, child):
    """Return the list of path components leading from directory
    ``parent`` down to directory ``child``."""
    # Drop a trailing separator from parent, if present.
    head, tail = os.path.split(parent)
    if tail == "":
        parent = head
    components = []
    while child != parent:
        child, component = os.path.split(child)
        components.append(component)
        # Guard against walking past the filesystem root forever.
        assert os.path.split(child)[0] != child
    components.reverse()
    return components
|
| 843 |
+
|
| 844 |
+
|
| 845 |
+
######################################################################
|
| 846 |
+
# { Paragraph structure in Treebank files
|
| 847 |
+
######################################################################
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
def tagged_treebank_para_block_reader(stream):
    """
    Read the next paragraph from a tagged-Treebank-format stream.
    Paragraphs are delimited by separator lines of ``=`` characters.
    Returns ``[paragraph]``, or ``[]`` at end of file.
    """
    paragraph = ""
    while True:
        line = stream.readline()
        if re.match(r"======+\s*$", line):
            # Separator line ends a (non-blank) paragraph.
            if paragraph.strip():
                return [paragraph]
        elif line == "":
            # End of file: emit whatever was collected.
            return [paragraph] if paragraph.strip() else []
        else:
            # Content line: accumulate.
            paragraph += line
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Verbnet Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
An NLTK interface to the VerbNet verb lexicon
|
| 10 |
+
|
| 11 |
+
For details about VerbNet see:
|
| 12 |
+
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import re
|
| 16 |
+
import textwrap
|
| 17 |
+
from collections import defaultdict
|
| 18 |
+
|
| 19 |
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class VerbnetCorpusReader(XMLCorpusReader):
    """
    An NLTK interface to the VerbNet verb lexicon.

    From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
    on-line verb lexicon currently available for English. It is a hierarchical
    domain-independent, broad-coverage verb lexicon with mappings to other
    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
    (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."

    For details about VerbNet see:
    https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
    """

    # No unicode encoding param, since the data files are all XML.
    def __init__(self, root, fileids, wrap_etree=False):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

        self._lemma_to_class = defaultdict(list)
        """A dictionary mapping from verb lemma strings to lists of
        VerbNet class identifiers."""

        self._wordnet_to_class = defaultdict(list)
        """A dictionary mapping from wordnet identifier strings to
        lists of VerbNet class identifiers."""

        self._class_to_fileid = {}
        """A dictionary mapping from class identifiers to
        corresponding file identifiers.  The keys of this dictionary
        provide a complete list of all classes and subclasses."""

        self._shortid_to_longid = {}

        # Initialize the dictionaries.  Use the quick (regexp-based)
        # method instead of the slow (xml-based) method, because it
        # runs 2-30 times faster.
        self._quick_index()

    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
    """Regular expression that matches (and decomposes) longids"""

    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
    """Regular expression that matches shortids"""

    _INDEX_RE = re.compile(
        r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
    )
    """Regular expression used by ``_index()`` to quickly scan the corpus
    for basic information."""

    def lemmas(self, vnclass=None):
        """
        Return a list of all verb lemmas that appear in any class, or
        in the ``classid`` if specified.
        """
        if vnclass is None:
            return sorted(self._lemma_to_class.keys())
        else:
            # [xx] should this include subclass members?
            if isinstance(vnclass, str):
                vnclass = self.vnclass(vnclass)
            return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]

    def wordnetids(self, vnclass=None):
        """
        Return a list of all wordnet identifiers that appear in any
        class, or in ``classid`` if specified.
        """
        if vnclass is None:
            return sorted(self._wordnet_to_class.keys())
        else:
            # [xx] should this include subclass members?
            if isinstance(vnclass, str):
                vnclass = self.vnclass(vnclass)
            # Each MEMBER's "wn" attribute holds zero or more
            # space-separated wordnet sense keys; flatten them all.
            return sum(
                (
                    member.get("wn", "").split()
                    for member in vnclass.findall("MEMBERS/MEMBER")
                ),
                [],
            )

    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
        """
        Return a list of the VerbNet class identifiers.  If a file
        identifier is specified, then return only the VerbNet class
        identifiers for classes (and subclasses) defined by that file.
        If a lemma is specified, then return only VerbNet class
        identifiers for classes that contain that lemma as a member.
        If a wordnetid is specified, then return only identifiers for
        classes that contain that wordnetid as a member.  If a classid
        is specified, then return only identifiers for subclasses of
        the specified VerbNet class.
        If nothing is specified, return all classids within VerbNet
        """
        if fileid is not None:
            return [c for (c, f) in self._class_to_fileid.items() if f == fileid]
        elif lemma is not None:
            return self._lemma_to_class[lemma]
        elif wordnetid is not None:
            return self._wordnet_to_class[wordnetid]
        elif classid is not None:
            # Only direct subclasses are returned, not the full
            # transitive closure of sub-subclasses.
            xmltree = self.vnclass(classid)
            return [
                subclass.get("ID")
                for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
            ]
        else:
            return sorted(self._class_to_fileid.keys())

    def vnclass(self, fileid_or_classid):
        """Returns VerbNet class ElementTree

        Return an ElementTree containing the xml for the specified
        VerbNet class.

        :param fileid_or_classid: An identifier specifying which class
            should be returned.  Can be a file identifier (such as
            ``'put-9.1.xml'``), or a VerbNet class identifier (such as
            ``'put-9.1'``) or a short VerbNet class identifier (such as
            ``'9.1'``).
        :raises ValueError: If the identifier is not a known fileid,
            longid, or shortid.
        """
        # File identifier: just return the xml.
        if fileid_or_classid in self._fileids:
            return self.xml(fileid_or_classid)

        # Class identifier: get the xml, and find the right elt.
        classid = self.longid(fileid_or_classid)
        if classid in self._class_to_fileid:
            # ``classid`` is already a longid, so look it up directly.
            fileid = self._class_to_fileid[classid]
            tree = self.xml(fileid)
            if classid == tree.get("ID"):
                return tree
            else:
                for subclass in tree.findall(".//VNSUBCLASS"):
                    if classid == subclass.get("ID"):
                        return subclass
                else:
                    assert False  # we saw it during _index()!

        else:
            raise ValueError(f"Unknown identifier {fileid_or_classid}")

    def fileids(self, vnclass_ids=None):
        """
        Return a list of fileids that make up this corpus.  If
        ``vnclass_ids`` is specified, then return the fileids that make
        up the specified VerbNet class(es).
        """
        if vnclass_ids is None:
            return self._fileids
        elif isinstance(vnclass_ids, str):
            return [self._class_to_fileid[self.longid(vnclass_ids)]]
        else:
            return [
                self._class_to_fileid[self.longid(vnclass_id)]
                for vnclass_id in vnclass_ids
            ]

    def frames(self, vnclass):
        """Given a VerbNet class, this method returns VerbNet frames

        The members returned are:
        1) Example
        2) Description
        3) Syntax
        4) Semantics

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: frames - a list of frame dictionaries
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)
        frames = []
        vnframes = vnclass.findall("FRAMES/FRAME")
        for vnframe in vnframes:
            frames.append(
                {
                    "example": self._get_example_within_frame(vnframe),
                    "description": self._get_description_within_frame(vnframe),
                    "syntax": self._get_syntactic_list_within_frame(vnframe),
                    "semantics": self._get_semantics_within_frame(vnframe),
                }
            )
        return frames

    def subclasses(self, vnclass):
        """Returns subclass ids, if any exist

        Given a VerbNet class, this method returns subclass ids (if they exist)
        in a list of strings.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: list of subclasses
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        subclasses = [
            subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
        ]
        return subclasses

    def themroles(self, vnclass):
        """Returns thematic roles participating in a VerbNet class

        Members returned as part of roles are-
        1) Type
        2) Modifiers

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        :return: themroles: A list of thematic roles in the VerbNet class
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        themroles = []
        for trole in vnclass.findall("THEMROLES/THEMROLE"):
            themroles.append(
                {
                    "type": trole.get("type"),
                    "modifiers": [
                        {"value": restr.get("Value"), "type": restr.get("type")}
                        for restr in trole.findall("SELRESTRS/SELRESTR")
                    ],
                }
            )
        return themroles

    ######################################################################
    # { Index Initialization
    ######################################################################

    def _index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This is fast if ElementTree
        uses the C implementation (<0.1 secs), but quite slow (>10 secs)
        if only the python implementation is available.
        """
        for fileid in self._fileids:
            self._index_helper(self.xml(fileid), fileid)

    def _index_helper(self, xmltree, fileid):
        """Helper for ``_index()``: index one class (recursively
        including its subclasses) against the file that defines it."""
        vnclass = xmltree.get("ID")
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        for member in xmltree.findall("MEMBERS/MEMBER"):
            self._lemma_to_class[member.get("name")].append(vnclass)
            for wn in member.get("wn", "").split():
                self._wordnet_to_class[wn].append(vnclass)
        for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
            self._index_helper(subclass, fileid)

    def _quick_index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This doesn't do proper xml parsing,
        but is good enough to find everything in the standard VerbNet
        corpus -- and it runs about 30 times faster than xml parsing
        (with the python ElementTree; only 2-3 times faster
        if ElementTree uses the C implementation).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._fileids:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass
            with self.open(fileid) as fp:
                for m in self._INDEX_RE.finditer(fp.read()):
                    groups = m.groups()
                    if groups[0] is not None:
                        # A <MEMBER> element: record lemma and wn keys
                        # against the class currently being scanned.
                        self._lemma_to_class[groups[0]].append(vnclass)
                        for wn in groups[1].split():
                            self._wordnet_to_class[wn].append(vnclass)
                    elif groups[2] is not None:
                        self._class_to_fileid[groups[2]] = fileid
                        vnclass = groups[2]  # for <MEMBER> elts.
                        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                    else:
                        assert False, "unexpected match condition"

    ######################################################################
    # { Identifier conversion
    ######################################################################

    def longid(self, shortid):
        """Returns longid of a VerbNet class

        Given a short VerbNet class identifier (eg '37.10'), map it
        to a long id (eg 'confess-37.10').  If ``shortid`` is already a
        long id, then return it as-is"""
        if self._LONGID_RE.match(shortid):
            return shortid  # it's already a longid.
        elif not self._SHORTID_RE.match(shortid):
            raise ValueError("vnclass identifier %r not found" % shortid)
        try:
            return self._shortid_to_longid[shortid]
        except KeyError as e:
            raise ValueError("vnclass identifier %r not found" % shortid) from e

    def shortid(self, longid):
        """Returns shortid of a VerbNet class

        Given a long VerbNet class identifier (eg 'confess-37.10'),
        map it to a short id (eg '37.10').  If ``longid`` is already a
        short id, then return it as-is."""
        if self._SHORTID_RE.match(longid):
            return longid  # it's already a shortid.
        m = self._LONGID_RE.match(longid)
        if m:
            return m.group(2)
        else:
            raise ValueError("vnclass identifier %r not found" % longid)

    ######################################################################
    # { Frame access utility functions
    ######################################################################

    def _get_semantics_within_frame(self, vnframe):
        """Returns semantics within a single frame

        A utility function to retrieve semantics within a frame in VerbNet
        Members of the semantics dictionary:
        1) Predicate value
        2) Arguments

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: semantics: semantics dictionary
        """
        semantics_within_single_frame = []
        for pred in vnframe.findall("SEMANTICS/PRED"):
            arguments = [
                {"type": arg.get("type"), "value": arg.get("value")}
                for arg in pred.findall("ARGS/ARG")
            ]
            semantics_within_single_frame.append(
                {
                    "predicate_value": pred.get("value"),
                    "arguments": arguments,
                    # A bool="!" attribute marks a negated predicate.
                    "negated": pred.get("bool") == "!",
                }
            )
        return semantics_within_single_frame

    def _get_example_within_frame(self, vnframe):
        """Returns example within a frame

        A utility function to retrieve an example within a frame in VerbNet.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: example_text: The example sentence for this particular frame,
            or the empty string if the frame has no example.
        """
        example_element = vnframe.find("EXAMPLES/EXAMPLE")
        if example_element is not None:
            # An empty <EXAMPLE/> element has text=None; normalize to "".
            example_text = example_element.text or ""
        else:
            example_text = ""
        return example_text

    def _get_description_within_frame(self, vnframe):
        """Returns member description within frame

        A utility function to retrieve a description of participating members
        within a frame in VerbNet.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: description: a description dictionary with members - primary and secondary
        """
        description_element = vnframe.find("DESCRIPTION")
        return {
            "primary": description_element.attrib["primary"],
            "secondary": description_element.get("secondary", ""),
        }

    def _get_syntactic_list_within_frame(self, vnframe):
        """Returns semantics within a frame

        A utility function to retrieve semantics within a frame in VerbNet.
        Members of the syntactic dictionary:
        1) POS Tag
        2) Modifiers

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        :return: syntax_within_single_frame
        """
        syntax_within_single_frame = []
        for elt in vnframe.find("SYNTAX"):
            pos_tag = elt.tag
            modifiers = dict()
            modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
            modifiers["selrestrs"] = [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SELRESTRS/SELRESTR")
            ]
            modifiers["synrestrs"] = [
                {"value": restr.get("Value"), "type": restr.get("type")}
                for restr in elt.findall("SYNRESTRS/SYNRESTR")
            ]
            syntax_within_single_frame.append(
                {"pos_tag": pos_tag, "modifiers": modifiers}
            )
        return syntax_within_single_frame

    ######################################################################
    # { Pretty Printing
    ######################################################################

    def pprint(self, vnclass):
        """Returns pretty printed version of a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        s = vnclass.get("ID") + "\n"
        s += self.pprint_subclasses(vnclass, indent="  ") + "\n"
        s += self.pprint_members(vnclass, indent="  ") + "\n"
        s += "  Thematic roles:\n"
        s += self.pprint_themroles(vnclass, indent="    ") + "\n"
        s += "  Frames:\n"
        s += self.pprint_frames(vnclass, indent="    ")
        return s

    def pprint_subclasses(self, vnclass, indent=""):
        """Returns pretty printed version of subclasses of VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's subclasses.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        subclasses = self.subclasses(vnclass)
        if not subclasses:
            subclasses = ["(none)"]
        s = "Subclasses: " + " ".join(subclasses)
        return textwrap.fill(
            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
        )

    def pprint_members(self, vnclass, indent=""):
        """Returns pretty printed version of members in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's member verbs.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        members = self.lemmas(vnclass)
        if not members:
            members = ["(none)"]
        s = "Members: " + " ".join(members)
        return textwrap.fill(
            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
        )

    def pprint_themroles(self, vnclass, indent=""):
        """Returns pretty printed version of thematic roles in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet class's thematic roles.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)

        pieces = []
        for themrole in self.themroles(vnclass):
            piece = indent + "* " + themrole.get("type")
            modifiers = [
                modifier["value"] + modifier["type"]
                for modifier in themrole["modifiers"]
            ]
            if modifiers:
                piece += "[{}]".format(" ".join(modifiers))
            pieces.append(piece)
        return "\n".join(pieces)

    def pprint_frames(self, vnclass, indent=""):
        """Returns pretty version of all frames in a VerbNet class

        Return a string containing a pretty-printed representation of
        the list of frames within the VerbNet class.

        :param vnclass: A VerbNet class identifier; or an ElementTree
            containing the xml contents of a VerbNet class.
        """
        if isinstance(vnclass, str):
            vnclass = self.vnclass(vnclass)
        pieces = []
        for vnframe in self.frames(vnclass):
            pieces.append(self._pprint_single_frame(vnframe, indent))
        return "\n".join(pieces)

    def _pprint_single_frame(self, vnframe, indent=""):
        """Returns pretty printed version of a single frame in a VerbNet class

        Returns a string containing a pretty-printed representation of
        the given frame.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
        frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
        frame_string += (
            self._pprint_syntax_within_frame(vnframe, indent + "  Syntax: ") + "\n"
        )
        frame_string += indent + "  Semantics:\n"
        frame_string += self._pprint_semantics_within_frame(vnframe, indent + "    ")
        return frame_string

    def _pprint_example_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of example within frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame example, or the empty string if the
        frame has no example.

        :param vnframe: An ElementTree containing the xml contents of
            a Verbnet frame.
        """
        if vnframe["example"]:
            return indent + " Example: " + vnframe["example"]
        # Previously this method fell off the end and returned None,
        # which made _pprint_single_frame raise TypeError on
        # ``... + "\n"`` for frames without an example.
        return ""

    def _pprint_description_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of a VerbNet frame description

        Return a string containing a pretty-printed representation of
        the given VerbNet frame description.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        description = indent + vnframe["description"]["primary"]
        if vnframe["description"]["secondary"]:
            description += " ({})".format(vnframe["description"]["secondary"])
        return description

    def _pprint_syntax_within_frame(self, vnframe, indent=""):
        """Returns pretty printed version of syntax within a frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame syntax.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        pieces = []
        for element in vnframe["syntax"]:
            piece = element["pos_tag"]
            modifier_list = []
            if "value" in element["modifiers"] and element["modifiers"]["value"]:
                modifier_list.append(element["modifiers"]["value"])
            modifier_list += [
                "{}{}".format(restr["value"], restr["type"])
                for restr in (
                    element["modifiers"]["selrestrs"]
                    + element["modifiers"]["synrestrs"]
                )
            ]
            if modifier_list:
                piece += "[{}]".format(" ".join(modifier_list))
            pieces.append(piece)

        return indent + " ".join(pieces)

    def _pprint_semantics_within_frame(self, vnframe, indent=""):
        """Returns a pretty printed version of semantics within frame in a VerbNet class

        Return a string containing a pretty-printed representation of
        the given VerbNet frame semantics.

        :param vnframe: An ElementTree containing the xml contents of
            a VerbNet frame.
        """
        pieces = []
        for predicate in vnframe["semantics"]:
            arguments = [argument["value"] for argument in predicate["arguments"]]
            pieces.append(
                f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
            )
        return "\n".join(f"{indent}* {piece}" for piece in pieces)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Word List Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
from nltk.corpus.reader.api import *
|
| 9 |
+
from nltk.corpus.reader.util import *
|
| 10 |
+
from nltk.tokenize import line_tokenize
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """Return the word list, skipping any line that begins with
        ``ignore_lines_startswith`` (blank lines by default)."""
        all_lines = line_tokenize(self.raw(fileids))
        return [ln for ln in all_lines if not ln.startswith(ignore_lines_startswith)]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SwadeshCorpusReader(WordListCorpusReader):
    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        selected = fileids if fileids else self.fileids()
        per_file = (self.words(f) for f in selected)
        # Align the word lists position-by-position across languages.
        return list(zip(*per_file))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    """

    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also, add the lang IDs as the keys.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list words for the specified language(s).
        """
        # If *lang* in list of languages available, allocate apt fileid.
        # Otherwise, the function returns non-breaking prefixes for
        # all languages when fileids==None.
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        prefixes = []
        for line in line_tokenize(self.raw(fileids)):
            # Lines starting with the marker (comments by default) are skipped.
            if not line.startswith(ignore_lines_startswith):
                prefixes.append(line)
        return prefixes
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class UnicharsCorpusReader(WordListCorpusReader):
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see https://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # These are categories similar to the Perl Unicode Properties
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        This module returns a list of characters from the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters given the specific unicode character category
        """
        # A recognized category maps to its single data file; otherwise
        # fall through with whatever fileids the caller supplied.
        if category in self.available_categories:
            fileids = [category + ".txt"]
        raw_text = self.raw(fileids).strip()
        return list(raw_text)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class MWAPPDBCorpusReader(WordListCorpusReader):
|
| 145 |
+
"""
|
| 146 |
+
This class is used to read the list of word pairs from the subset of lexical
|
| 147 |
+
pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
|
| 148 |
+
Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):
|
| 149 |
+
|
| 150 |
+
- http://acl2014.org/acl2014/Q14/pdf/Q14-1017
|
| 151 |
+
- https://www.aclweb.org/anthology/S14-2039
|
| 152 |
+
- https://www.aclweb.org/anthology/S15-2027
|
| 153 |
+
|
| 154 |
+
The original source of the full PPDB corpus can be found on
|
| 155 |
+
https://www.cis.upenn.edu/~ccb/ppdb/
|
| 156 |
+
|
| 157 |
+
:return: a list of tuples of similar lexical terms.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
|
| 161 |
+
|
| 162 |
+
def entries(self, fileids=mwa_ppdb_xxxl_file):
|
| 163 |
+
"""
|
| 164 |
+
:return: a tuple of synonym word pairs.
|
| 165 |
+
"""
|
| 166 |
+
return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py
ADDED
|
@@ -0,0 +1,2489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: WordNet
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# Edward Loper <edloper@gmail.com>
|
| 7 |
+
# Nitin Madnani <nmadnani@ets.org>
|
| 8 |
+
# Nasruddin A’aidil Shari
|
| 9 |
+
# Sim Wei Ying Geraldine
|
| 10 |
+
# Soe Lynn
|
| 11 |
+
# Francis Bond <bond@ieee.org>
|
| 12 |
+
# Eric Kafe <kafe.eric@gmail.com>
|
| 13 |
+
|
| 14 |
+
# URL: <https://www.nltk.org/>
|
| 15 |
+
# For license information, see LICENSE.TXT
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
An NLTK interface for WordNet
|
| 19 |
+
|
| 20 |
+
WordNet is a lexical database of English.
|
| 21 |
+
Using synsets, helps find conceptual relationships between words
|
| 22 |
+
such as hypernyms, hyponyms, synonyms, antonyms etc.
|
| 23 |
+
|
| 24 |
+
For details about WordNet see:
|
| 25 |
+
https://wordnet.princeton.edu/
|
| 26 |
+
|
| 27 |
+
This module also allows you to find lemmas in languages
|
| 28 |
+
other than English from the Open Multilingual Wordnet
|
| 29 |
+
http://compling.hss.ntu.edu.sg/omw/
|
| 30 |
+
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
import math
|
| 34 |
+
import os
|
| 35 |
+
import re
|
| 36 |
+
import warnings
|
| 37 |
+
from collections import defaultdict, deque
|
| 38 |
+
from functools import total_ordering
|
| 39 |
+
from itertools import chain, islice
|
| 40 |
+
from operator import itemgetter
|
| 41 |
+
|
| 42 |
+
from nltk.corpus.reader import CorpusReader
|
| 43 |
+
from nltk.internals import deprecated
|
| 44 |
+
from nltk.probability import FreqDist
|
| 45 |
+
from nltk.util import binary_search_file as _binary_search_file
|
| 46 |
+
|
| 47 |
+
######################################################################
|
| 48 |
+
# Table of Contents
|
| 49 |
+
######################################################################
|
| 50 |
+
# - Constants
|
| 51 |
+
# - Data Classes
|
| 52 |
+
# - WordNetError
|
| 53 |
+
# - Lemma
|
| 54 |
+
# - Synset
|
| 55 |
+
# - WordNet Corpus Reader
|
| 56 |
+
# - WordNet Information Content Corpus Reader
|
| 57 |
+
# - Similarity Metrics
|
| 58 |
+
# - Demo
|
| 59 |
+
|
| 60 |
+
######################################################################
|
| 61 |
+
# Constants
|
| 62 |
+
######################################################################
|
| 63 |
+
|
| 64 |
+
#: Positive infinity (for similarity functions)
|
| 65 |
+
_INF = 1e300
|
| 66 |
+
|
| 67 |
+
# { Part-of-speech constants
|
| 68 |
+
ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
|
| 69 |
+
# }
|
| 70 |
+
|
| 71 |
+
POS_LIST = [NOUN, VERB, ADJ, ADV]
|
| 72 |
+
|
| 73 |
+
# A table of strings that are used to express verb frames.
|
| 74 |
+
VERB_FRAME_STRINGS = (
|
| 75 |
+
None,
|
| 76 |
+
"Something %s",
|
| 77 |
+
"Somebody %s",
|
| 78 |
+
"It is %sing",
|
| 79 |
+
"Something is %sing PP",
|
| 80 |
+
"Something %s something Adjective/Noun",
|
| 81 |
+
"Something %s Adjective/Noun",
|
| 82 |
+
"Somebody %s Adjective",
|
| 83 |
+
"Somebody %s something",
|
| 84 |
+
"Somebody %s somebody",
|
| 85 |
+
"Something %s somebody",
|
| 86 |
+
"Something %s something",
|
| 87 |
+
"Something %s to somebody",
|
| 88 |
+
"Somebody %s on something",
|
| 89 |
+
"Somebody %s somebody something",
|
| 90 |
+
"Somebody %s something to somebody",
|
| 91 |
+
"Somebody %s something from somebody",
|
| 92 |
+
"Somebody %s somebody with something",
|
| 93 |
+
"Somebody %s somebody of something",
|
| 94 |
+
"Somebody %s something on somebody",
|
| 95 |
+
"Somebody %s somebody PP",
|
| 96 |
+
"Somebody %s something PP",
|
| 97 |
+
"Somebody %s PP",
|
| 98 |
+
"Somebody's (body part) %s",
|
| 99 |
+
"Somebody %s somebody to INFINITIVE",
|
| 100 |
+
"Somebody %s somebody INFINITIVE",
|
| 101 |
+
"Somebody %s that CLAUSE",
|
| 102 |
+
"Somebody %s to somebody",
|
| 103 |
+
"Somebody %s to INFINITIVE",
|
| 104 |
+
"Somebody %s whether INFINITIVE",
|
| 105 |
+
"Somebody %s somebody into V-ing something",
|
| 106 |
+
"Somebody %s something with something",
|
| 107 |
+
"Somebody %s INFINITIVE",
|
| 108 |
+
"Somebody %s VERB-ing",
|
| 109 |
+
"It %s that CLAUSE",
|
| 110 |
+
"Something %s INFINITIVE",
|
| 111 |
+
# OEWN additions:
|
| 112 |
+
"Somebody %s at something",
|
| 113 |
+
"Somebody %s for something",
|
| 114 |
+
"Somebody %s on somebody",
|
| 115 |
+
"Somebody %s out of somebody",
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
SENSENUM_RE = re.compile(r"\.[\d]+\.")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
######################################################################
|
| 122 |
+
# Data Classes
|
| 123 |
+
######################################################################
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class WordNetError(Exception):
|
| 127 |
+
"""An exception class for wordnet-related errors."""
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@total_ordering
|
| 131 |
+
class _WordNetObject:
|
| 132 |
+
"""A common base class for lemmas and synsets."""
|
| 133 |
+
|
| 134 |
+
def hypernyms(self):
|
| 135 |
+
return self._related("@")
|
| 136 |
+
|
| 137 |
+
def _hypernyms(self):
|
| 138 |
+
return self._related("@")
|
| 139 |
+
|
| 140 |
+
def instance_hypernyms(self):
|
| 141 |
+
return self._related("@i")
|
| 142 |
+
|
| 143 |
+
def _instance_hypernyms(self):
|
| 144 |
+
return self._related("@i")
|
| 145 |
+
|
| 146 |
+
def hyponyms(self):
|
| 147 |
+
return self._related("~")
|
| 148 |
+
|
| 149 |
+
def instance_hyponyms(self):
|
| 150 |
+
return self._related("~i")
|
| 151 |
+
|
| 152 |
+
def member_holonyms(self):
|
| 153 |
+
return self._related("#m")
|
| 154 |
+
|
| 155 |
+
def substance_holonyms(self):
|
| 156 |
+
return self._related("#s")
|
| 157 |
+
|
| 158 |
+
def part_holonyms(self):
|
| 159 |
+
return self._related("#p")
|
| 160 |
+
|
| 161 |
+
def member_meronyms(self):
|
| 162 |
+
return self._related("%m")
|
| 163 |
+
|
| 164 |
+
def substance_meronyms(self):
|
| 165 |
+
return self._related("%s")
|
| 166 |
+
|
| 167 |
+
def part_meronyms(self):
|
| 168 |
+
return self._related("%p")
|
| 169 |
+
|
| 170 |
+
def topic_domains(self):
|
| 171 |
+
return self._related(";c")
|
| 172 |
+
|
| 173 |
+
def in_topic_domains(self):
|
| 174 |
+
return self._related("-c")
|
| 175 |
+
|
| 176 |
+
def region_domains(self):
|
| 177 |
+
return self._related(";r")
|
| 178 |
+
|
| 179 |
+
def in_region_domains(self):
|
| 180 |
+
return self._related("-r")
|
| 181 |
+
|
| 182 |
+
def usage_domains(self):
|
| 183 |
+
return self._related(";u")
|
| 184 |
+
|
| 185 |
+
def in_usage_domains(self):
|
| 186 |
+
return self._related("-u")
|
| 187 |
+
|
| 188 |
+
def attributes(self):
|
| 189 |
+
return self._related("=")
|
| 190 |
+
|
| 191 |
+
def entailments(self):
|
| 192 |
+
return self._related("*")
|
| 193 |
+
|
| 194 |
+
def causes(self):
|
| 195 |
+
return self._related(">")
|
| 196 |
+
|
| 197 |
+
def also_sees(self):
|
| 198 |
+
return self._related("^")
|
| 199 |
+
|
| 200 |
+
def verb_groups(self):
|
| 201 |
+
return self._related("$")
|
| 202 |
+
|
| 203 |
+
def similar_tos(self):
|
| 204 |
+
return self._related("&")
|
| 205 |
+
|
| 206 |
+
def __hash__(self):
|
| 207 |
+
return hash(self._name)
|
| 208 |
+
|
| 209 |
+
def __eq__(self, other):
|
| 210 |
+
return self._name == other._name
|
| 211 |
+
|
| 212 |
+
def __ne__(self, other):
|
| 213 |
+
return self._name != other._name
|
| 214 |
+
|
| 215 |
+
def __lt__(self, other):
|
| 216 |
+
return self._name < other._name
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
class Lemma(_WordNetObject):
|
| 220 |
+
"""
|
| 221 |
+
The lexical entry for a single morphological form of a
|
| 222 |
+
sense-disambiguated word.
|
| 223 |
+
|
| 224 |
+
Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
|
| 225 |
+
<word> is the morphological stem identifying the synset
|
| 226 |
+
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
|
| 227 |
+
<number> is the sense number, counting from 0.
|
| 228 |
+
<lemma> is the morphological form of interest
|
| 229 |
+
|
| 230 |
+
Note that <word> and <lemma> can be different, e.g. the Synset
|
| 231 |
+
'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
|
| 232 |
+
'salt.n.03.salinity'.
|
| 233 |
+
|
| 234 |
+
Lemma attributes, accessible via methods with the same name:
|
| 235 |
+
|
| 236 |
+
- name: The canonical name of this lemma.
|
| 237 |
+
- synset: The synset that this lemma belongs to.
|
| 238 |
+
- syntactic_marker: For adjectives, the WordNet string identifying the
|
| 239 |
+
syntactic position relative modified noun. See:
|
| 240 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 241 |
+
For all other parts of speech, this attribute is None.
|
| 242 |
+
- count: The frequency of this lemma in wordnet.
|
| 243 |
+
|
| 244 |
+
Lemma methods:
|
| 245 |
+
|
| 246 |
+
Lemmas have the following methods for retrieving related Lemmas. They
|
| 247 |
+
correspond to the names for the pointer symbols defined here:
|
| 248 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 249 |
+
These methods all return lists of Lemmas:
|
| 250 |
+
|
| 251 |
+
- antonyms
|
| 252 |
+
- hypernyms, instance_hypernyms
|
| 253 |
+
- hyponyms, instance_hyponyms
|
| 254 |
+
- member_holonyms, substance_holonyms, part_holonyms
|
| 255 |
+
- member_meronyms, substance_meronyms, part_meronyms
|
| 256 |
+
- topic_domains, region_domains, usage_domains
|
| 257 |
+
- attributes
|
| 258 |
+
- derivationally_related_forms
|
| 259 |
+
- entailments
|
| 260 |
+
- causes
|
| 261 |
+
- also_sees
|
| 262 |
+
- verb_groups
|
| 263 |
+
- similar_tos
|
| 264 |
+
- pertainyms
|
| 265 |
+
"""
|
| 266 |
+
|
| 267 |
+
__slots__ = [
|
| 268 |
+
"_wordnet_corpus_reader",
|
| 269 |
+
"_name",
|
| 270 |
+
"_syntactic_marker",
|
| 271 |
+
"_synset",
|
| 272 |
+
"_frame_strings",
|
| 273 |
+
"_frame_ids",
|
| 274 |
+
"_lexname_index",
|
| 275 |
+
"_lex_id",
|
| 276 |
+
"_lang",
|
| 277 |
+
"_key",
|
| 278 |
+
]
|
| 279 |
+
|
| 280 |
+
def __init__(
|
| 281 |
+
self,
|
| 282 |
+
wordnet_corpus_reader,
|
| 283 |
+
synset,
|
| 284 |
+
name,
|
| 285 |
+
lexname_index,
|
| 286 |
+
lex_id,
|
| 287 |
+
syntactic_marker,
|
| 288 |
+
):
|
| 289 |
+
self._wordnet_corpus_reader = wordnet_corpus_reader
|
| 290 |
+
self._name = name
|
| 291 |
+
self._syntactic_marker = syntactic_marker
|
| 292 |
+
self._synset = synset
|
| 293 |
+
self._frame_strings = []
|
| 294 |
+
self._frame_ids = []
|
| 295 |
+
self._lexname_index = lexname_index
|
| 296 |
+
self._lex_id = lex_id
|
| 297 |
+
self._lang = "eng"
|
| 298 |
+
|
| 299 |
+
self._key = None # gets set later.
|
| 300 |
+
|
| 301 |
+
def name(self):
|
| 302 |
+
return self._name
|
| 303 |
+
|
| 304 |
+
def syntactic_marker(self):
|
| 305 |
+
return self._syntactic_marker
|
| 306 |
+
|
| 307 |
+
def synset(self):
|
| 308 |
+
return self._synset
|
| 309 |
+
|
| 310 |
+
def frame_strings(self):
|
| 311 |
+
return self._frame_strings
|
| 312 |
+
|
| 313 |
+
def frame_ids(self):
|
| 314 |
+
return self._frame_ids
|
| 315 |
+
|
| 316 |
+
def lang(self):
|
| 317 |
+
return self._lang
|
| 318 |
+
|
| 319 |
+
def key(self):
|
| 320 |
+
return self._key
|
| 321 |
+
|
| 322 |
+
def __repr__(self):
|
| 323 |
+
tup = type(self).__name__, self._synset._name, self._name
|
| 324 |
+
return "%s('%s.%s')" % tup
|
| 325 |
+
|
| 326 |
+
def _related(self, relation_symbol):
|
| 327 |
+
get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
|
| 328 |
+
if (self._name, relation_symbol) not in self._synset._lemma_pointers:
|
| 329 |
+
return []
|
| 330 |
+
return [
|
| 331 |
+
get_synset(pos, offset)._lemmas[lemma_index]
|
| 332 |
+
for pos, offset, lemma_index in self._synset._lemma_pointers[
|
| 333 |
+
self._name, relation_symbol
|
| 334 |
+
]
|
| 335 |
+
]
|
| 336 |
+
|
| 337 |
+
def count(self):
|
| 338 |
+
"""Return the frequency count for this Lemma"""
|
| 339 |
+
return self._wordnet_corpus_reader.lemma_count(self)
|
| 340 |
+
|
| 341 |
+
def antonyms(self):
|
| 342 |
+
return self._related("!")
|
| 343 |
+
|
| 344 |
+
def derivationally_related_forms(self):
|
| 345 |
+
return self._related("+")
|
| 346 |
+
|
| 347 |
+
def pertainyms(self):
|
| 348 |
+
return self._related("\\")
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
class Synset(_WordNetObject):
|
| 352 |
+
"""Create a Synset from a "<lemma>.<pos>.<number>" string where:
|
| 353 |
+
<lemma> is the word's morphological stem
|
| 354 |
+
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
|
| 355 |
+
<number> is the sense number, counting from 0.
|
| 356 |
+
|
| 357 |
+
Synset attributes, accessible via methods with the same name:
|
| 358 |
+
|
| 359 |
+
- name: The canonical name of this synset, formed using the first lemma
|
| 360 |
+
of this synset. Note that this may be different from the name
|
| 361 |
+
passed to the constructor if that string used a different lemma to
|
| 362 |
+
identify the synset.
|
| 363 |
+
- pos: The synset's part of speech, matching one of the module level
|
| 364 |
+
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
|
| 365 |
+
- lemmas: A list of the Lemma objects for this synset.
|
| 366 |
+
- definition: The definition for this synset.
|
| 367 |
+
- examples: A list of example strings for this synset.
|
| 368 |
+
- offset: The offset in the WordNet dict file of this synset.
|
| 369 |
+
- lexname: The name of the lexicographer file containing this synset.
|
| 370 |
+
|
| 371 |
+
Synset methods:
|
| 372 |
+
|
| 373 |
+
Synsets have the following methods for retrieving related Synsets.
|
| 374 |
+
They correspond to the names for the pointer symbols defined here:
|
| 375 |
+
https://wordnet.princeton.edu/documentation/wninput5wn
|
| 376 |
+
These methods all return lists of Synsets.
|
| 377 |
+
|
| 378 |
+
- hypernyms, instance_hypernyms
|
| 379 |
+
- hyponyms, instance_hyponyms
|
| 380 |
+
- member_holonyms, substance_holonyms, part_holonyms
|
| 381 |
+
- member_meronyms, substance_meronyms, part_meronyms
|
| 382 |
+
- attributes
|
| 383 |
+
- entailments
|
| 384 |
+
- causes
|
| 385 |
+
- also_sees
|
| 386 |
+
- verb_groups
|
| 387 |
+
- similar_tos
|
| 388 |
+
|
| 389 |
+
Additionally, Synsets support the following methods specific to the
|
| 390 |
+
hypernym relation:
|
| 391 |
+
|
| 392 |
+
- root_hypernyms
|
| 393 |
+
- common_hypernyms
|
| 394 |
+
- lowest_common_hypernyms
|
| 395 |
+
|
| 396 |
+
Note that Synsets do not support the following relations because
|
| 397 |
+
these are defined by WordNet as lexical relations:
|
| 398 |
+
|
| 399 |
+
- antonyms
|
| 400 |
+
- derivationally_related_forms
|
| 401 |
+
- pertainyms
|
| 402 |
+
"""
|
| 403 |
+
|
| 404 |
+
__slots__ = [
|
| 405 |
+
"_pos",
|
| 406 |
+
"_offset",
|
| 407 |
+
"_name",
|
| 408 |
+
"_frame_ids",
|
| 409 |
+
"_lemmas",
|
| 410 |
+
"_lemma_names",
|
| 411 |
+
"_definition",
|
| 412 |
+
"_examples",
|
| 413 |
+
"_lexname",
|
| 414 |
+
"_pointers",
|
| 415 |
+
"_lemma_pointers",
|
| 416 |
+
"_max_depth",
|
| 417 |
+
"_min_depth",
|
| 418 |
+
]
|
| 419 |
+
|
| 420 |
+
def __init__(self, wordnet_corpus_reader):
    """Create an empty Synset bound to *wordnet_corpus_reader*.

    All data attributes are filled in later by
    ``WordNetCorpusReader._synset_from_pos_and_line()``.
    """
    self._wordnet_corpus_reader = wordnet_corpus_reader

    # Scalar fields (set during parsing).
    self._pos = None
    self._offset = None
    self._name = None
    self._definition = None
    self._lexname = None  # lexicographer file name
    self._all_hypernyms = None  # lazily-filled cache, see common_hypernyms()

    # List-valued fields (set during parsing).
    self._frame_ids = []
    self._lemmas = []
    self._lemma_names = []
    self._examples = []

    # Relation maps: pointer symbol -> set/list of targets.
    self._pointers = defaultdict(set)
    self._lemma_pointers = defaultdict(list)
|
| 438 |
+
|
| 439 |
+
def pos(self):
    """Return the part-of-speech tag of this synset."""
    return self._pos
|
| 441 |
+
|
| 442 |
+
def offset(self):
    """Return the offset of this synset in the WordNet data file."""
    return self._offset
|
| 444 |
+
|
| 445 |
+
def name(self):
    """Return the canonical name of this synset, e.g. ``'dog.n.01'``."""
    return self._name
|
| 447 |
+
|
| 448 |
+
def frame_ids(self):
    """Return the verb frame id list associated with this synset."""
    return self._frame_ids
|
| 450 |
+
|
| 451 |
+
def _doc(self, doc_type, default, lang="eng"):
    """Helper for ``definition()`` and ``examples()``.

    Returns *default* for English, the language-specific entry for a
    known non-English language, or None when the language is unknown or
    has no entry for this synset.
    """
    corpus = self._wordnet_corpus_reader
    if lang not in corpus.langs():
        return None
    if lang == "eng":
        return default
    corpus._load_lang_data(lang)
    key = corpus.ss2of(self)
    table = corpus._lang_data[lang][corpus.lg_attrs.index(doc_type)]
    return table[key] if key in table else None
|
| 466 |
+
|
| 467 |
+
def definition(self, lang="eng"):
    """Return this synset's definition in the requested language."""
    return self._doc("def", self._definition, lang=lang)
|
| 470 |
+
|
| 471 |
+
def examples(self, lang="eng"):
    """Return this synset's example sentences in the requested language."""
    return self._doc("exe", self._examples, lang=lang)
|
| 474 |
+
|
| 475 |
+
def lexname(self):
    """Return the name of the lexicographer file containing this synset."""
    return self._lexname
|
| 477 |
+
|
| 478 |
+
def _needs_root(self):
    """Return True when path metrics need a simulated root for this synset.

    Nouns in modern WordNet versions share a single root; everything
    else (all non-nouns, and nouns in WordNet 1.6) needs a fake root.
    """
    has_real_root = (
        self._pos == NOUN
        and self._wordnet_corpus_reader.get_version() != "1.6"
    )
    return not has_real_root
|
| 483 |
+
|
| 484 |
+
def lemma_names(self, lang="eng"):
    """Return all the lemma names associated with the synset for *lang*."""
    if lang == "eng":
        return self._lemma_names
    reader = self._wordnet_corpus_reader
    reader._load_lang_data(lang)
    key = reader.ss2of(self)
    index = reader._lang_data[lang][0]
    return index[key] if key in index else []
|
| 496 |
+
|
| 497 |
+
def lemmas(self, lang="eng"):
    """Return all the lemma objects associated with the synset.

    For ``lang != "eng"`` the lemmas are constructed on the fly from the
    language-specific lemma names.  Returns an empty list for an unnamed
    synset (previously this path fell off the end of the function and
    implicitly returned ``None``, breaking callers that iterate the
    result).
    """
    if lang == "eng":
        return self._lemmas
    result = []
    if self._name:
        reader = self._wordnet_corpus_reader
        reader._load_lang_data(lang)
        # Loop-invariant: the lexname index is the same for every lemma.
        lex_index = reader._lexnames.index(self.lexname())
        for lemma_name in self.lemma_names(lang):
            lemma = Lemma(reader, self, lemma_name, lex_index, 0, None)
            lemma._lang = lang
            result.append(lemma)
    return result
|
| 517 |
+
|
| 518 |
+
def root_hypernyms(self):
    """Get the topmost hypernyms of this synset in WordNet.

    Walks the (possibly cyclic) hypernym graph and collects every
    reachable synset that has no hypernyms of its own.
    """
    roots = []
    visited = set()
    stack = [self]
    while stack:
        synset = stack.pop()
        if synset in visited:
            continue
        visited.add(synset)
        parents = synset.hypernyms() + synset.instance_hypernyms()
        if parents:
            stack.extend(parents)
        else:
            roots.append(synset)
    return roots
|
| 536 |
+
|
| 537 |
+
# Simpler implementation which makes incorrect assumption that
|
| 538 |
+
# hypernym hierarchy is acyclic:
|
| 539 |
+
#
|
| 540 |
+
# if not self.hypernyms():
|
| 541 |
+
# return [self]
|
| 542 |
+
# else:
|
| 543 |
+
# return list(set(root for h in self.hypernyms()
|
| 544 |
+
# for root in h.root_hypernyms()))
|
| 545 |
+
def max_depth(self):
    """
    :return: The length of the longest hypernym path from this
        synset to the root.  Cached on the instance after first use.
    """
    if "_max_depth" not in self.__dict__:
        parents = self.hypernyms() + self.instance_hypernyms()
        self._max_depth = (
            1 + max(p.max_depth() for p in parents) if parents else 0
        )
    return self._max_depth
|
| 558 |
+
|
| 559 |
+
def min_depth(self):
    """
    :return: The length of the shortest hypernym path from this
        synset to the root.  Cached on the instance after first use.
    """
    if "_min_depth" not in self.__dict__:
        parents = self.hypernyms() + self.instance_hypernyms()
        self._min_depth = (
            1 + min(p.min_depth() for p in parents) if parents else 0
        )
    return self._min_depth
|
| 572 |
+
|
| 573 |
+
def closure(self, rel, depth=-1):
    """
    Yield the transitive closure of this synset under the *rel*
    relationship, breadth-first, discarding cycles.  *depth* bounds the
    traversal (-1 means unlimited).  The starting synset itself is not
    yielded.

    Example::

        >>> from nltk.corpus import wordnet as wn  # doctest: +SKIP
        >>> list(wn.synset('computer.n.01').closure(
        ...     lambda s: s.topic_domains()))  # doctest: +SKIP
        [Synset('computer_science.n.01')]
    """
    from nltk.util import acyclic_breadth_first

    for reached in acyclic_breadth_first(self, rel, depth):
        if reached != self:
            yield reached
|
| 607 |
+
|
| 608 |
+
from nltk.util import acyclic_depth_first as acyclic_tree
|
| 609 |
+
from nltk.util import unweighted_minimum_spanning_tree as mst
|
| 610 |
+
|
| 611 |
+
# Also add this shortcut?
|
| 612 |
+
# from nltk.util import unweighted_minimum_spanning_digraph as umsd
|
| 613 |
+
|
| 614 |
+
def tree(self, rel, depth=-1, cut_mark=None):
    """
    Return the full *rel* relation tree rooted at this synset, including
    self, discarding cycles but keeping duplicate branches.

    :param rel: a callable mapping a synset to its related synsets.
    :param depth: recursion bound; -1 means unlimited.
    :param cut_mark: when given, marks where branches were truncated.
    :return: a nested list of the form ``[node, [child-tree], ...]``.
    """
    from nltk.util import acyclic_branches_depth_first

    return acyclic_branches_depth_first(self, rel, depth, cut_mark)
|
| 660 |
+
|
| 661 |
+
def hypernym_paths(self):
    """
    Get the path(s) from this synset to the root, where each path is a
    list of the synset nodes traversed on the way to the root.

    :return: A list of lists, where each list gives the node sequence
        connecting the initial ``Synset`` node and a root node.
    """
    parents = self.hypernyms() + self.instance_hypernyms()
    if not parents:
        return [[self]]
    paths = []
    for parent in parents:
        for ancestors in parent.hypernym_paths():
            ancestors.append(self)
            paths.append(ancestors)
    return paths
|
| 680 |
+
|
| 681 |
+
def common_hypernyms(self, other):
    """
    Find all synsets that are hypernyms of this synset and the
    other synset.  The full hypernym set is cached on each synset.

    :type other: Synset
    :param other: other input synset.
    :return: The synsets that are hypernyms of both synsets.
    """
    for synset in (self, other):
        if not synset._all_hypernyms:
            synset._all_hypernyms = {
                hyp
                for level in synset._iter_hypernym_lists()
                for hyp in level
            }
    return list(self._all_hypernyms.intersection(other._all_hypernyms))
|
| 703 |
+
|
| 704 |
+
def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
    """
    Get a list of the lowest synset(s) that both synsets have as a
    hypernym.  With ``use_min_depth=False`` (the default) "lowest" means
    greatest ``max_depth``; with ``use_min_depth=True`` the NLTK2
    behavior based on ``min_depth`` is reproduced.

    This is an implementation of Ted Pedersen's "Lowest Common
    Subsumer" method from the Perl Wordnet module; it can return
    ``self`` or ``other`` if one subsumes the other.

    :type other: Synset
    :param other: other input synset
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting the otherwise
        disjoint taxonomies (needed for verbs, and for nouns only in
        WordNet 1.6).  False by default.
    :type use_min_depth: bool
    :param use_min_depth: mimic NLTK2 by ranking candidates with
        ``min_depth`` instead of ``max_depth``; retained for backwards
        compatibility.
    :return: The synsets that are the lowest common hypernyms of both
        synsets.
    """
    candidates = self.common_hypernyms(other)
    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        fake_root.hypernyms = lambda: []
        fake_root.instance_hypernyms = lambda: []
        candidates.append(fake_root)

    if use_min_depth:
        depths = [s.min_depth() for s in candidates]
    else:
        depths = [s.max_depth() for s in candidates]
    if not depths:  # no common hypernyms at all
        return []
    deepest = max(depths)
    return sorted(s for s, d in zip(candidates, depths) if d == deepest)
|
| 762 |
+
|
| 763 |
+
def hypernym_distances(self, distance=0, simulate_root=False):
    """
    Get the path(s) from this synset to the root, counting the distance
    of each node from the initial node on the way.  A set of
    (synset, distance) tuples is returned.

    :type distance: int
    :param distance: the distance (number of edges) from this hypernym
        to the original ``Synset`` on which this method was called.
    :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
        a hypernym of the first ``Synset``.
    """
    reached = {(self, distance)}
    for parent in self._hypernyms() + self._instance_hypernyms():
        reached |= parent.hypernym_distances(distance + 1, simulate_root=False)
    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        deepest = max(reached, key=itemgetter(1))[1]
        reached.add((fake_root, deepest + 1))
    return reached
|
| 784 |
+
|
| 785 |
+
def _shortest_hypernym_paths(self, simulate_root):
    """Map every reachable hypernym to its shortest distance from self (BFS)."""
    if self._name == "*ROOT*":
        return {self: 0}

    distances = {}
    frontier = deque([(self, 0)])
    while frontier:
        synset, depth = frontier.popleft()
        if synset in distances:  # already reached by a shorter path
            continue
        distances[synset] = depth
        next_depth = depth + 1
        for parent in synset._hypernyms() + synset._instance_hypernyms():
            frontier.append((parent, next_depth))

    if simulate_root:
        fake_root = Synset(None)
        fake_root._name = "*ROOT*"
        distances[fake_root] = max(distances.values()) + 1

    return distances
|
| 808 |
+
|
| 809 |
+
def shortest_path_distance(self, other, simulate_root=False):
    """
    Returns the distance of the shortest path linking the two synsets
    (if one exists).  The ancestor node common to both synsets that can
    be reached with the minimum number of traversals is used.  If no
    ancestor nodes are common, None is returned.  If a node is compared
    with itself 0 is returned.

    :type other: Synset
    :param other: The Synset to which the shortest path will be found.
    :return: The number of edges in the shortest path connecting the
        two nodes, or None if no path exists.
    """
    if self == other:
        return 0

    ancestors_self = self._shortest_hypernym_paths(simulate_root)
    ancestors_other = other._shortest_hypernym_paths(simulate_root)

    # Shortest connecting path through any shared ancestor; None when
    # the synsets share no ancestor at all.
    return min(
        (
            d + ancestors_other[ancestor]
            for ancestor, d in ancestors_self.items()
            if ancestor in ancestors_other
        ),
        default=None,
    )
|
| 840 |
+
|
| 841 |
+
# interface to similarity methods
|
| 842 |
+
def path_similarity(self, other, verbose=False, simulate_root=True):
    """
    Path Distance Similarity:
    Score in (0, 1] based on the shortest is-a path between the senses;
    1 means identity.  None is returned when no path exists (possible
    for verbs, whose taxonomies are disjoint, unless *simulate_root*
    adds a fake shared root -- it does by default).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when either synset needs one (True by default).
    :return: similarity score in (0, 1], or None if no connecting path
        could be found.
    """
    need_fake_root = simulate_root and (
        self._needs_root() or other._needs_root()
    )
    distance = self.shortest_path_distance(other, simulate_root=need_fake_root)
    if distance is None or distance < 0:
        return None
    return 1.0 / (distance + 1)
|
| 876 |
+
|
| 877 |
+
def lch_similarity(self, other, verbose=False, simulate_root=True):
    """
    Leacock Chodorow Similarity:
    Score based on the shortest connecting path p and the maximum
    taxonomy depth d, computed as -log(p / 2d).  Both synsets must have
    the same part of speech.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when this synset needs one (True by default).
    :return: similarity score (normally > 0), or None if no connecting
        path could be found or the taxonomy depth is zero.
    :raises WordNetError: if the parts of speech differ.
    """
    if self._pos != other._pos:
        raise WordNetError(
            "Computing the lch similarity requires "
            "%s and %s to have the same part of speech." % (self, other)
        )

    need_root = self._needs_root()
    reader = self._wordnet_corpus_reader
    if self._pos not in reader._max_depth:
        reader._compute_max_depth(self._pos, need_root)
    taxonomy_depth = reader._max_depth[self._pos]

    distance = self.shortest_path_distance(
        other, simulate_root=simulate_root and need_root
    )
    if distance is None or distance < 0 or taxonomy_depth == 0:
        return None
    return -math.log((distance + 1) / (2.0 * taxonomy_depth))
|
| 924 |
+
|
| 925 |
+
def wup_similarity(self, other, verbose=False, simulate_root=True):
    """
    Wu-Palmer Similarity:
    Score based on the depths of the two senses and of their Least
    Common Subsumer (most specific ancestor).  Where multiple LCS
    candidates exist, the one with the longest path to the root is
    used, and the longer of its root paths is used for the depth.

    Note: scores may differ slightly from Pedersen's Perl
    implementation for nouns; with *simulate_root* they almost always
    agree for verbs.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type simulate_root: bool
    :param simulate_root: add a fake root connecting disjoint
        taxonomies when either synset needs one (True by default).
    :return: similarity score (normally > 0), or None if no connecting
        path between the two senses can be found.
    """
    need_root = self._needs_root() or other._needs_root()
    use_fake_root = simulate_root and need_root

    # use_min_depth=True preserves NLTK2 behavior; possibly more
    # accurate results without it -- untested.
    subsumers = self.lowest_common_hypernyms(
        other, simulate_root=use_fake_root, use_min_depth=True
    )
    if not subsumers:
        return None
    subsumer = self if self in subsumers else subsumers[0]

    # +1 because the path-length calculations count both endpoints.
    depth = subsumer.max_depth() + 1
    # (No extra +1 for non-nouns: the imaginary root is handled by
    # simulate_root.)

    len1 = self.shortest_path_distance(subsumer, simulate_root=use_fake_root)
    len2 = other.shortest_path_distance(subsumer, simulate_root=use_fake_root)
    if len1 is None or len2 is None:
        return None
    return (2.0 * depth) / (len1 + depth + len2 + depth)
|
| 1001 |
+
|
| 1002 |
+
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    The Information Content (IC) of the Least Common Subsumer of the
    two synsets.  Synsets whose LCS is the taxonomy root score 0.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score.
    """
    _, _, lcs_ic = _lcs_ic(self, other, ic)
    return lcs_ic
|
| 1021 |
+
|
| 1022 |
+
def jcn_similarity(self, other, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    1 / (IC(s1) + IC(s2) - 2 * IC(lcs)), where lcs is the Least Common
    Subsumer of the two synsets.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score.
    """
    if self == other:
        return _INF

    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)

    # Root synsets, or zero-frequency synsets (sparse data), score 0.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_difference = ic1 + ic2 - 2 * lcs_ic
    # Identical IC profiles give a degenerate (infinite) score.
    return _INF if ic_difference == 0 else 1 / ic_difference
|
| 1055 |
+
|
| 1056 |
+
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    2 * IC(lcs) / (IC(s1) + IC(s2)), where lcs is the Least Common
    Subsumer of the two synsets.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float similarity score in the range 0 to 1.
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return (2.0 * lcs_ic) / (ic1 + ic2)
|
| 1075 |
+
|
| 1076 |
+
def _iter_hypernym_lists(self):
    """
    Yield successive "levels" of hypernyms: first ``[self]``, then its
    direct (instance) hypernyms, then theirs, and so on, never
    revisiting a synset.
    """
    level = [self]
    visited = set()
    while level:
        visited.update(level)
        yield level
        level = [
            parent
            for synset in level
            for parent in synset.hypernyms() + synset.instance_hypernyms()
            if parent not in visited
        ]
|
| 1093 |
+
|
| 1094 |
+
def __repr__(self):
    """Return e.g. ``Synset('dog.n.01')``."""
    return f"{type(self).__name__}('{self._name}')"
|
| 1096 |
+
|
| 1097 |
+
def _related(self, relation_symbol, sort=True):
    """Return the synsets reached from this one via *relation_symbol* pointers."""
    if relation_symbol not in self._pointers:
        return []
    resolve = self._wordnet_corpus_reader.synset_from_pos_and_offset
    related = [
        resolve(pos, offset)
        for pos, offset in self._pointers[relation_symbol]
    ]
    return sorted(related) if sort else related
|
| 1106 |
+
|
| 1107 |
+
|
| 1108 |
+
######################################################################
|
| 1109 |
+
# WordNet Corpus Reader
|
| 1110 |
+
######################################################################
|
| 1111 |
+
|
| 1112 |
+
|
| 1113 |
+
class WordNetCorpusReader(CorpusReader):
|
| 1114 |
+
"""
|
| 1115 |
+
A corpus reader used to access wordnet or its variants.
|
| 1116 |
+
"""
|
| 1117 |
+
|
| 1118 |
+
_ENCODING = "utf8"
|
| 1119 |
+
|
| 1120 |
+
# { Part-of-speech constants
|
| 1121 |
+
ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
|
| 1122 |
+
# }
|
| 1123 |
+
|
| 1124 |
+
# { Filename constants
|
| 1125 |
+
_FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
|
| 1126 |
+
# }
|
| 1127 |
+
|
| 1128 |
+
# { Part of speech constants
|
| 1129 |
+
_pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
|
| 1130 |
+
_pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
|
| 1131 |
+
# }
|
| 1132 |
+
|
| 1133 |
+
#: A list of file identifiers for all the fileids used by this
|
| 1134 |
+
#: corpus reader.
|
| 1135 |
+
_FILES = (
|
| 1136 |
+
"cntlist.rev",
|
| 1137 |
+
"lexnames",
|
| 1138 |
+
"index.sense",
|
| 1139 |
+
"index.adj",
|
| 1140 |
+
"index.adv",
|
| 1141 |
+
"index.noun",
|
| 1142 |
+
"index.verb",
|
| 1143 |
+
"data.adj",
|
| 1144 |
+
"data.adv",
|
| 1145 |
+
"data.noun",
|
| 1146 |
+
"data.verb",
|
| 1147 |
+
"adj.exc",
|
| 1148 |
+
"adv.exc",
|
| 1149 |
+
"noun.exc",
|
| 1150 |
+
"verb.exc",
|
| 1151 |
+
)
|
| 1152 |
+
|
| 1153 |
+
    def __init__(self, root, omw_reader):
        """
        Construct a new wordnet corpus reader, with the given root
        directory.

        :param root: path to the unpacked WordNet data files.
        :param omw_reader: corpus reader holding Open Multilingual Wordnet
            data, or None to disable the multilingual functions.
        """

        super().__init__(root, self._FILES, encoding=self._ENCODING)

        # An index that provides the file offset
        # Map from lemma -> pos -> synset_index -> offset
        self._lemma_pos_offset_map = defaultdict(dict)

        # A cache so we don't have to reconstruct synsets
        # Map from pos -> offset -> synset
        self._synset_offset_cache = defaultdict(dict)

        # A lookup for the maximum depth of each part of speech. Useful for
        # the lch similarity metric.
        self._max_depth = defaultdict(dict)

        # Corpus reader containing omw data.
        self._omw_reader = omw_reader

        # Corpus reader containing extended_omw data (set by add_exomw()).
        self._exomw_reader = None

        # lang id -> provenance subdirectory; English lives at the root.
        self.provenances = defaultdict(str)
        self.provenances["eng"] = ""

        if self._omw_reader is None:
            warnings.warn(
                "The multilingual functions are not available with this Wordnet version"
            )

        self.omw_langs = set()

        # A cache to store the wordnet data of multiple languages
        self._lang_data = defaultdict(list)

        self._data_file_map = {}
        self._exception_map = {}
        self._lexnames = []
        self._key_count_file = None
        self._key_synset_file = None

        # Load the lexnames (index is asserted to match line number)
        with self.open("lexnames") as fp:
            for i, line in enumerate(fp):
                index, lexname, _ = line.split()
                assert int(index) == i
                self._lexnames.append(lexname)

        # Load the indices for lemmas and synset offsets
        self._load_lemma_pos_offset_map()

        # load the exception file data into memory
        self._load_exception_map()

        # Bookkeeping for the WN3.0 mapping: sources with no target /
        # sources with multiple candidate targets.
        self.nomap = []
        self.splits = {}

        # map from WordNet 3.0 for OMW data
        self.map30 = self.map_wn30()

        # Language data attributes
        self.lg_attrs = ["lemma", "none", "def", "exe"]
|
| 1219 |
+
|
| 1220 |
+
    def index_sense(self, version=None):
        """Read sense key to synset id mapping from index.sense file in corpus directory

        :param version: optional corpus name (e.g. "wordnet") of another
            installed WordNet version whose index should be read instead
            of this reader's own.
        :return: dict mapping each sense key to an "<offset>-<pos>" id.
        """
        fn = "index.sense"
        if version:
            # Load the index file from a different installed corpus.
            from nltk.corpus import CorpusReader, LazyCorpusLoader

            ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn)
        else:
            ixreader = self
        with ixreader.open(fn) as fp:
            sensekey_map = {}
            for line in fp:
                fields = line.strip().split()
                sensekey = fields[0]
                # The ss_type digit (between '%' and the first ':') selects
                # the one-letter pos tag.
                pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])]
                sensekey_map[sensekey] = f"{fields[1]}-{pos}"
        return sensekey_map
|
| 1237 |
+
|
| 1238 |
+
def map_to_many(self):
|
| 1239 |
+
sensekey_map1 = self.index_sense("wordnet")
|
| 1240 |
+
sensekey_map2 = self.index_sense()
|
| 1241 |
+
synset_to_many = {}
|
| 1242 |
+
for synsetid in set(sensekey_map1.values()):
|
| 1243 |
+
synset_to_many[synsetid] = []
|
| 1244 |
+
for sensekey in set(sensekey_map1.keys()).intersection(
|
| 1245 |
+
set(sensekey_map2.keys())
|
| 1246 |
+
):
|
| 1247 |
+
source = sensekey_map1[sensekey]
|
| 1248 |
+
target = sensekey_map2[sensekey]
|
| 1249 |
+
synset_to_many[source].append(target)
|
| 1250 |
+
return synset_to_many
|
| 1251 |
+
|
| 1252 |
+
def map_to_one(self):
|
| 1253 |
+
synset_to_many = self.map_to_many()
|
| 1254 |
+
synset_to_one = {}
|
| 1255 |
+
for source in synset_to_many:
|
| 1256 |
+
candidates_bag = synset_to_many[source]
|
| 1257 |
+
if candidates_bag:
|
| 1258 |
+
candidates_set = set(candidates_bag)
|
| 1259 |
+
if len(candidates_set) == 1:
|
| 1260 |
+
target = candidates_bag[0]
|
| 1261 |
+
else:
|
| 1262 |
+
counts = []
|
| 1263 |
+
for candidate in candidates_set:
|
| 1264 |
+
counts.append((candidates_bag.count(candidate), candidate))
|
| 1265 |
+
self.splits[source] = counts
|
| 1266 |
+
target = max(counts)[1]
|
| 1267 |
+
synset_to_one[source] = target
|
| 1268 |
+
if source[-1] == "s":
|
| 1269 |
+
# Add a mapping from "a" to target for applications like omw,
|
| 1270 |
+
# where only Lithuanian and Slovak use the "s" ss_type.
|
| 1271 |
+
synset_to_one[f"{source[:-1]}a"] = target
|
| 1272 |
+
else:
|
| 1273 |
+
self.nomap.append(source)
|
| 1274 |
+
return synset_to_one
|
| 1275 |
+
|
| 1276 |
+
def map_wn30(self):
|
| 1277 |
+
"""Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
|
| 1278 |
+
if self.get_version() == "3.0":
|
| 1279 |
+
return None
|
| 1280 |
+
else:
|
| 1281 |
+
return self.map_to_one()
|
| 1282 |
+
|
| 1283 |
+
# Open Multilingual WordNet functions, contributed by
|
| 1284 |
+
# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
|
| 1285 |
+
|
| 1286 |
+
def of2ss(self, of):
|
| 1287 |
+
"""take an id and return the synsets"""
|
| 1288 |
+
return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
|
| 1289 |
+
|
| 1290 |
+
def ss2of(self, ss):
|
| 1291 |
+
"""return the ID of the synset"""
|
| 1292 |
+
if ss:
|
| 1293 |
+
return f"{ss.offset():08d}-{ss.pos()}"
|
| 1294 |
+
|
| 1295 |
+
    def _load_lang_data(self, lang):
        """load the wordnet data of the requested language from the file to
        the cache, _lang_data

        :raises WordNetError: if the language id is not registered.
        """

        if lang in self._lang_data:
            return  # already cached

        # Register the base OMW languages on first multilingual access.
        if self._omw_reader and not self.omw_langs:
            self.add_omw()

        if lang not in self.langs():
            raise WordNetError("Language is not supported.")

        # Languages only provided by the extended OMW are read from that
        # reader; everything else comes from the base OMW reader.
        if self._exomw_reader and lang not in self.omw_langs:
            reader = self._exomw_reader
        else:
            reader = self._omw_reader

        prov = self.provenances[lang]
        if prov in ["cldr", "wikt"]:
            prov2 = prov
        else:
            prov2 = "data"

        # e.g. "wikt/wn-wikt-fr.tab"; a "_prov" suffix on lang is dropped
        # when forming the file name.
        with reader.open(f"{prov}/wn-{prov2}-{lang.split('_')[0]}.tab") as fp:
            self.custom_lemmas(fp, lang)
        self.disable_custom_lemmas(lang)
|
| 1322 |
+
|
| 1323 |
+
def add_provs(self, reader):
|
| 1324 |
+
"""Add languages from Multilingual Wordnet to the provenance dictionary"""
|
| 1325 |
+
fileids = reader.fileids()
|
| 1326 |
+
for fileid in fileids:
|
| 1327 |
+
prov, langfile = os.path.split(fileid)
|
| 1328 |
+
file_name, file_extension = os.path.splitext(langfile)
|
| 1329 |
+
if file_extension == ".tab":
|
| 1330 |
+
lang = file_name.split("-")[-1]
|
| 1331 |
+
if lang in self.provenances or prov in ["cldr", "wikt"]:
|
| 1332 |
+
# We already have another resource for this lang,
|
| 1333 |
+
# so we need to further specify the lang id:
|
| 1334 |
+
lang = f"{lang}_{prov}"
|
| 1335 |
+
self.provenances[lang] = prov
|
| 1336 |
+
|
| 1337 |
+
def add_omw(self):
|
| 1338 |
+
self.add_provs(self._omw_reader)
|
| 1339 |
+
self.omw_langs = set(self.provenances.keys())
|
| 1340 |
+
|
| 1341 |
+
    def add_exomw(self):
        """
        Add languages from Extended OMW

        >>> import nltk
        >>> from nltk.corpus import wordnet as wn
        >>> wn.add_exomw()
        >>> print(wn.synset('intrinsically.r.01').lemmas(lang="eng_wikt"))
        [Lemma('intrinsically.r.01.per_se'), Lemma('intrinsically.r.01.as_such')]
        """
        from nltk.corpus import extended_omw

        # Base OMW languages must be registered first so that provenance
        # collisions get disambiguated with a "_prov" suffix.
        self.add_omw()
        self._exomw_reader = extended_omw
        self.add_provs(self._exomw_reader)
|
| 1356 |
+
|
| 1357 |
+
def langs(self):
|
| 1358 |
+
"""return a list of languages supported by Multilingual Wordnet"""
|
| 1359 |
+
return list(self.provenances.keys())
|
| 1360 |
+
|
| 1361 |
+
    def _load_lemma_pos_offset_map(self):
        """Parse the four index.<pos> files into
        self._lemma_pos_offset_map (lemma -> pos -> list of offsets);
        ADJ entries are mirrored under ADJ_SAT.

        :raises WordNetError: on a malformed index line.
        """
        for suffix in self._FILEMAP.values():

            # parse each line of the file (ignoring comment lines)
            with self.open("index.%s" % suffix) as fp:
                for i, line in enumerate(fp):
                    # license/header lines start with whitespace
                    if line.startswith(" "):
                        continue

                    _iter = iter(line.split())

                    def _next_token():
                        return next(_iter)

                    try:

                        # get the lemma and part-of-speech
                        lemma = _next_token()
                        pos = _next_token()

                        # get the number of synsets for this lemma
                        n_synsets = int(_next_token())
                        assert n_synsets > 0

                        # get and ignore the pointer symbols for all synsets of
                        # this lemma
                        n_pointers = int(_next_token())
                        [_next_token() for _ in range(n_pointers)]

                        # same as number of synsets
                        n_senses = int(_next_token())
                        assert n_synsets == n_senses

                        # get and ignore number of senses ranked according to
                        # frequency
                        _next_token()

                        # get synset offsets
                        synset_offsets = [int(_next_token()) for _ in range(n_synsets)]

                    # raise more informative error with file name and line number
                    except (AssertionError, ValueError) as e:
                        tup = ("index.%s" % suffix), (i + 1), e
                        raise WordNetError("file %s, line %i: %s" % tup) from e

                    # map lemmas and parts of speech to synsets
                    self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                    if pos == ADJ:
                        # satellite adjectives share the adjective index
                        self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
|
| 1410 |
+
|
| 1411 |
+
def _load_exception_map(self):
|
| 1412 |
+
# load the exception file data into memory
|
| 1413 |
+
for pos, suffix in self._FILEMAP.items():
|
| 1414 |
+
self._exception_map[pos] = {}
|
| 1415 |
+
with self.open("%s.exc" % suffix) as fp:
|
| 1416 |
+
for line in fp:
|
| 1417 |
+
terms = line.split()
|
| 1418 |
+
self._exception_map[pos][terms[0]] = terms[1:]
|
| 1419 |
+
self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
|
| 1420 |
+
|
| 1421 |
+
def _compute_max_depth(self, pos, simulate_root):
|
| 1422 |
+
"""
|
| 1423 |
+
Compute the max depth for the given part of speech. This is
|
| 1424 |
+
used by the lch similarity metric.
|
| 1425 |
+
"""
|
| 1426 |
+
depth = 0
|
| 1427 |
+
for ii in self.all_synsets(pos):
|
| 1428 |
+
try:
|
| 1429 |
+
depth = max(depth, ii.max_depth())
|
| 1430 |
+
except RuntimeError:
|
| 1431 |
+
print(ii)
|
| 1432 |
+
if simulate_root:
|
| 1433 |
+
depth += 1
|
| 1434 |
+
self._max_depth[pos] = depth
|
| 1435 |
+
|
| 1436 |
+
    def get_version(self):
        """Parse the WordNet version number (e.g. "3.0") out of the
        copyright line in the ADJ data file, rewinding the file before
        returning.

        NOTE(review): if no copyright line matches, this falls through and
        returns None with the file pointer left at EOF — confirm callers
        tolerate that.
        """
        fh = self._data_file(ADJ)
        fh.seek(0)
        for line in fh:
            match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line)
            if match is not None:
                version = match.group(1)
                fh.seek(0)
                return version
|
| 1445 |
+
|
| 1446 |
+
#############################################################
|
| 1447 |
+
# Loading Lemmas
|
| 1448 |
+
#############################################################
|
| 1449 |
+
|
| 1450 |
+
def lemma(self, name, lang="eng"):
|
| 1451 |
+
"""Return lemma object that matches the name"""
|
| 1452 |
+
# cannot simply split on first '.',
|
| 1453 |
+
# e.g.: '.45_caliber.a.01..45_caliber'
|
| 1454 |
+
separator = SENSENUM_RE.search(name).end()
|
| 1455 |
+
|
| 1456 |
+
synset_name, lemma_name = name[: separator - 1], name[separator:]
|
| 1457 |
+
|
| 1458 |
+
synset = self.synset(synset_name)
|
| 1459 |
+
for lemma in synset.lemmas(lang):
|
| 1460 |
+
if lemma._name == lemma_name:
|
| 1461 |
+
return lemma
|
| 1462 |
+
raise WordNetError(f"No lemma {lemma_name!r} in {synset_name!r}")
|
| 1463 |
+
|
| 1464 |
+
def lemma_from_key(self, key):
|
| 1465 |
+
# Keys are case sensitive and always lower-case
|
| 1466 |
+
key = key.lower()
|
| 1467 |
+
|
| 1468 |
+
lemma_name, lex_sense = key.split("%")
|
| 1469 |
+
pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
|
| 1470 |
+
pos = self._pos_names[int(pos_number)]
|
| 1471 |
+
|
| 1472 |
+
# open the key -> synset file if necessary
|
| 1473 |
+
if self._key_synset_file is None:
|
| 1474 |
+
self._key_synset_file = self.open("index.sense")
|
| 1475 |
+
|
| 1476 |
+
# Find the synset for the lemma.
|
| 1477 |
+
synset_line = _binary_search_file(self._key_synset_file, key)
|
| 1478 |
+
if not synset_line:
|
| 1479 |
+
raise WordNetError("No synset found for key %r" % key)
|
| 1480 |
+
offset = int(synset_line.split()[1])
|
| 1481 |
+
synset = self.synset_from_pos_and_offset(pos, offset)
|
| 1482 |
+
# return the corresponding lemma
|
| 1483 |
+
for lemma in synset._lemmas:
|
| 1484 |
+
if lemma._key == key:
|
| 1485 |
+
return lemma
|
| 1486 |
+
raise WordNetError("No lemma found for for key %r" % key)
|
| 1487 |
+
|
| 1488 |
+
#############################################################
|
| 1489 |
+
# Loading Synsets
|
| 1490 |
+
#############################################################
|
| 1491 |
+
    def synset(self, name):
        """Return the synset with the given dotted name, e.g. 'dog.n.01'
        (lemma.pos.two-digit sense number).

        :raises WordNetError: if the lemma/pos pair is unknown, the sense
            index is out of range, or a satellite was requested but only a
            plain adjective exists.
        """
        # split name into lemma, part of speech and synset number
        lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
        synset_index = int(synset_index_str) - 1

        # get the offset for this synset
        try:
            offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
        except KeyError as e:
            raise WordNetError(f"No lemma {lemma!r} with part of speech {pos!r}") from e
        except IndexError as e:
            n_senses = len(self._lemma_pos_offset_map[lemma][pos])
            raise WordNetError(
                f"Lemma {lemma!r} with part of speech {pos!r} only "
                f"has {n_senses} {'sense' if n_senses == 1 else 'senses'}"
            ) from e

        # load synset information from the appropriate file
        synset = self.synset_from_pos_and_offset(pos, offset)

        # some basic sanity checks on loaded attributes
        if pos == "s" and synset._pos == "a":
            message = (
                "Adjective satellite requested but only plain "
                "adjective found for lemma %r"
            )
            raise WordNetError(message % lemma)
        assert synset._pos == pos or (pos == "a" and synset._pos == "s")

        # Return the synset object.
        return synset
|
| 1522 |
+
|
| 1523 |
+
def _data_file(self, pos):
|
| 1524 |
+
"""
|
| 1525 |
+
Return an open file pointer for the data file for the given
|
| 1526 |
+
part of speech.
|
| 1527 |
+
"""
|
| 1528 |
+
if pos == ADJ_SAT:
|
| 1529 |
+
pos = ADJ
|
| 1530 |
+
if self._data_file_map.get(pos) is None:
|
| 1531 |
+
fileid = "data.%s" % self._FILEMAP[pos]
|
| 1532 |
+
self._data_file_map[pos] = self.open(fileid)
|
| 1533 |
+
return self._data_file_map[pos]
|
| 1534 |
+
|
| 1535 |
+
def synset_from_pos_and_offset(self, pos, offset):
|
| 1536 |
+
"""
|
| 1537 |
+
- pos: The synset's part of speech, matching one of the module level
|
| 1538 |
+
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v').
|
| 1539 |
+
- offset: The byte offset of this synset in the WordNet dict file
|
| 1540 |
+
for this pos.
|
| 1541 |
+
|
| 1542 |
+
>>> from nltk.corpus import wordnet as wn
|
| 1543 |
+
>>> print(wn.synset_from_pos_and_offset('n', 1740))
|
| 1544 |
+
Synset('entity.n.01')
|
| 1545 |
+
"""
|
| 1546 |
+
# Check to see if the synset is in the cache
|
| 1547 |
+
if offset in self._synset_offset_cache[pos]:
|
| 1548 |
+
return self._synset_offset_cache[pos][offset]
|
| 1549 |
+
|
| 1550 |
+
data_file = self._data_file(pos)
|
| 1551 |
+
data_file.seek(offset)
|
| 1552 |
+
data_file_line = data_file.readline()
|
| 1553 |
+
# If valid, the offset equals the 8-digit 0-padded integer found at the start of the line:
|
| 1554 |
+
line_offset = data_file_line[:8]
|
| 1555 |
+
if (
|
| 1556 |
+
line_offset.isalnum()
|
| 1557 |
+
and line_offset == f"{'0'*(8-len(str(offset)))}{str(offset)}"
|
| 1558 |
+
):
|
| 1559 |
+
synset = self._synset_from_pos_and_line(pos, data_file_line)
|
| 1560 |
+
assert synset._offset == offset
|
| 1561 |
+
self._synset_offset_cache[pos][offset] = synset
|
| 1562 |
+
else:
|
| 1563 |
+
synset = None
|
| 1564 |
+
warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.")
|
| 1565 |
+
data_file.seek(0)
|
| 1566 |
+
return synset
|
| 1567 |
+
|
| 1568 |
+
    @deprecated("Use public method synset_from_pos_and_offset() instead")
    def _synset_from_pos_and_offset(self, *args, **kwargs):
        """
        Hack to help people like the readers of
        https://stackoverflow.com/a/27145655/1709587
        who were using this function before it was officially a public method

        Delegates unchanged to synset_from_pos_and_offset().
        """
        return self.synset_from_pos_and_offset(*args, **kwargs)
|
| 1576 |
+
|
| 1577 |
+
    def _synset_from_pos_and_line(self, pos, data_file_line):
        """Parse one line of a data.<pos> file into a fully populated
        Synset: gloss (definition + examples), lemmas, pointers, verb
        frames, per-lemma sense keys and the canonical dotted name.

        :raises WordNetError: if the line cannot be parsed.
        """
        # Construct a new (empty) synset.
        synset = Synset(self)

        # parse the entry for this synset
        try:

            # parse out the definitions and examples from the gloss
            columns_str, gloss = data_file_line.strip().split("|")
            definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
            examples = re.findall(r'"([^"]*)"', gloss)
            for example in examples:
                synset._examples.append(example)

            synset._definition = definition.strip("; ")

            # split the other info into fields
            _iter = iter(columns_str.split())

            def _next_token():
                return next(_iter)

            # get the offset
            synset._offset = int(_next_token())

            # determine the lexicographer file name
            lexname_index = int(_next_token())
            synset._lexname = self._lexnames[lexname_index]

            # get the part of speech
            synset._pos = _next_token()

            # create Lemma objects for each lemma (count is hexadecimal)
            n_lemmas = int(_next_token(), 16)
            for _ in range(n_lemmas):
                # get the lemma name
                lemma_name = _next_token()
                # get the lex_id (used for sense_keys)
                lex_id = int(_next_token(), 16)
                # If the lemma has a syntactic marker, extract it.
                m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
                lemma_name, syn_mark = m.groups()
                # create the lemma object
                lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
                synset._lemmas.append(lemma)
                synset._lemma_names.append(lemma._name)

            # collect the pointer tuples
            n_pointers = int(_next_token())
            for _ in range(n_pointers):
                symbol = _next_token()
                offset = int(_next_token())
                pos = _next_token()
                lemma_ids_str = _next_token()
                if lemma_ids_str == "0000":
                    # "0000" marks a synset-level pointer
                    synset._pointers[symbol].add((pos, offset))
                else:
                    # lemma-level pointer: source/target lemma numbers are
                    # packed as two 1-based hex bytes
                    source_index = int(lemma_ids_str[:2], 16) - 1
                    target_index = int(lemma_ids_str[2:], 16) - 1
                    source_lemma_name = synset._lemmas[source_index]._name
                    lemma_pointers = synset._lemma_pointers
                    tups = lemma_pointers[source_lemma_name, symbol]
                    tups.append((pos, offset, target_index))

            # read the verb frames
            try:
                frame_count = int(_next_token())
            except StopIteration:
                # non-verb entries have no frame section
                pass
            else:
                for _ in range(frame_count):
                    # read the plus sign
                    plus = _next_token()
                    assert plus == "+"
                    # read the frame and lemma number
                    frame_number = int(_next_token())
                    frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
                    lemma_number = int(_next_token(), 16)
                    # lemma number of 00 means all words in the synset
                    if lemma_number == 0:
                        synset._frame_ids.append(frame_number)
                        for lemma in synset._lemmas:
                            lemma._frame_ids.append(frame_number)
                            lemma._frame_strings.append(frame_string_fmt % lemma._name)
                    # only a specific word in the synset
                    else:
                        lemma = synset._lemmas[lemma_number - 1]
                        lemma._frame_ids.append(frame_number)
                        lemma._frame_strings.append(frame_string_fmt % lemma._name)

        # raise a more informative error with line text
        except ValueError as e:
            raise WordNetError(f"line {data_file_line!r}: {e}") from e

        # set sense keys for Lemma objects - note that this has to be
        # done afterwards so that the relations are available
        for lemma in synset._lemmas:
            if synset._pos == ADJ_SAT:
                # satellites borrow head word/id from their head synset
                head_lemma = synset.similar_tos()[0]._lemmas[0]
                head_name = head_lemma._name
                head_id = "%02d" % head_lemma._lex_id
            else:
                head_name = head_id = ""
            tup = (
                lemma._name,
                WordNetCorpusReader._pos_numbers[synset._pos],
                lemma._lexname_index,
                lemma._lex_id,
                head_name,
                head_id,
            )
            lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()

        # the canonical name is based on the first lemma
        lemma_name = synset._lemmas[0]._name.lower()
        offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
        sense_index = offsets.index(synset._offset)
        tup = lemma_name, synset._pos, sense_index + 1
        synset._name = "%s.%s.%02i" % tup

        return synset
|
| 1698 |
+
|
| 1699 |
+
def synset_from_sense_key(self, sense_key):
|
| 1700 |
+
"""
|
| 1701 |
+
Retrieves synset based on a given sense_key. Sense keys can be
|
| 1702 |
+
obtained from lemma.key()
|
| 1703 |
+
|
| 1704 |
+
From https://wordnet.princeton.edu/documentation/senseidx5wn:
|
| 1705 |
+
A sense_key is represented as::
|
| 1706 |
+
|
| 1707 |
+
lemma % lex_sense (e.g. 'dog%1:18:01::')
|
| 1708 |
+
|
| 1709 |
+
where lex_sense is encoded as::
|
| 1710 |
+
|
| 1711 |
+
ss_type:lex_filenum:lex_id:head_word:head_id
|
| 1712 |
+
|
| 1713 |
+
:lemma: ASCII text of word/collocation, in lower case
|
| 1714 |
+
:ss_type: synset type for the sense (1 digit int)
|
| 1715 |
+
The synset type is encoded as follows::
|
| 1716 |
+
|
| 1717 |
+
1 NOUN
|
| 1718 |
+
2 VERB
|
| 1719 |
+
3 ADJECTIVE
|
| 1720 |
+
4 ADVERB
|
| 1721 |
+
5 ADJECTIVE SATELLITE
|
| 1722 |
+
:lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
|
| 1723 |
+
:lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
|
| 1724 |
+
:head_word: lemma of the first word in satellite's head synset
|
| 1725 |
+
Only used if sense is in an adjective satellite synset
|
| 1726 |
+
:head_id: uniquely identifies sense in a lexicographer file when paired with head_word
|
| 1727 |
+
Only used if head_word is present (2 digit int)
|
| 1728 |
+
|
| 1729 |
+
>>> import nltk
|
| 1730 |
+
>>> from nltk.corpus import wordnet as wn
|
| 1731 |
+
>>> print(wn.synset_from_sense_key("drive%1:04:03::"))
|
| 1732 |
+
Synset('drive.n.06')
|
| 1733 |
+
|
| 1734 |
+
>>> print(wn.synset_from_sense_key("driving%1:04:03::"))
|
| 1735 |
+
Synset('drive.n.06')
|
| 1736 |
+
"""
|
| 1737 |
+
return self.lemma_from_key(sense_key).synset()
|
| 1738 |
+
|
| 1739 |
+
#############################################################
|
| 1740 |
+
# Retrieve synsets and lemmas.
|
| 1741 |
+
#############################################################
|
| 1742 |
+
|
| 1743 |
+
    def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
        """Load all synsets with a given lemma and part of speech tag.
        If no pos is specified, all synsets for all parts of speech
        will be loaded.
        If lang is specified, all the synsets associated with the lemma name
        of that language will be returned.

        :param check_exceptions: passed through to morphy when generating
            candidate base forms (English only).
        """
        lemma = lemma.lower()

        if lang == "eng":
            get_synset = self.synset_from_pos_and_offset
            index = self._lemma_pos_offset_map
            if pos is None:
                pos = POS_LIST
            # morphy yields candidate base forms for each pos; look each
            # one up in the lemma index.
            return [
                get_synset(p, offset)
                for p in pos
                for form in self._morphy(lemma, p, check_exceptions)
                for offset in index[form].get(p, [])
            ]

        else:
            self._load_lang_data(lang)
            synset_list = []
            # _lang_data[lang][1] maps lemma -> list of "<offset>-<pos>" ids.
            if lemma in self._lang_data[lang][1]:
                for l in self._lang_data[lang][1][lemma]:
                    if pos is not None and l[-1] != pos:
                        continue
                    synset_list.append(self.of2ss(l))
            return synset_list
|
| 1773 |
+
|
| 1774 |
+
def lemmas(self, lemma, pos=None, lang="eng"):
|
| 1775 |
+
"""Return all Lemma objects with a name matching the specified lemma
|
| 1776 |
+
name and part of speech tag. Matches any part of speech tag if none is
|
| 1777 |
+
specified."""
|
| 1778 |
+
|
| 1779 |
+
lemma = lemma.lower()
|
| 1780 |
+
if lang == "eng":
|
| 1781 |
+
return [
|
| 1782 |
+
lemma_obj
|
| 1783 |
+
for synset in self.synsets(lemma, pos)
|
| 1784 |
+
for lemma_obj in synset.lemmas()
|
| 1785 |
+
if lemma_obj.name().lower() == lemma
|
| 1786 |
+
]
|
| 1787 |
+
|
| 1788 |
+
else:
|
| 1789 |
+
self._load_lang_data(lang)
|
| 1790 |
+
lemmas = []
|
| 1791 |
+
syn = self.synsets(lemma, lang=lang)
|
| 1792 |
+
for s in syn:
|
| 1793 |
+
if pos is not None and s.pos() != pos:
|
| 1794 |
+
continue
|
| 1795 |
+
for lemma_obj in s.lemmas(lang=lang):
|
| 1796 |
+
if lemma_obj.name().lower() == lemma:
|
| 1797 |
+
lemmas.append(lemma_obj)
|
| 1798 |
+
return lemmas
|
| 1799 |
+
|
| 1800 |
+
def all_lemma_names(self, pos=None, lang="eng"):
|
| 1801 |
+
"""Return all lemma names for all synsets for the given
|
| 1802 |
+
part of speech tag and language or languages. If pos is
|
| 1803 |
+
not specified, all synsets for all parts of speech will
|
| 1804 |
+
be used."""
|
| 1805 |
+
|
| 1806 |
+
if lang == "eng":
|
| 1807 |
+
if pos is None:
|
| 1808 |
+
return iter(self._lemma_pos_offset_map)
|
| 1809 |
+
else:
|
| 1810 |
+
return (
|
| 1811 |
+
lemma
|
| 1812 |
+
for lemma in self._lemma_pos_offset_map
|
| 1813 |
+
if pos in self._lemma_pos_offset_map[lemma]
|
| 1814 |
+
)
|
| 1815 |
+
else:
|
| 1816 |
+
self._load_lang_data(lang)
|
| 1817 |
+
lemma = []
|
| 1818 |
+
for i in self._lang_data[lang][0]:
|
| 1819 |
+
if pos is not None and i[-1] != pos:
|
| 1820 |
+
continue
|
| 1821 |
+
lemma.extend(self._lang_data[lang][0][i])
|
| 1822 |
+
|
| 1823 |
+
lemma = iter(set(lemma))
|
| 1824 |
+
return lemma
|
| 1825 |
+
|
| 1826 |
+
    def all_omw_synsets(self, pos=None, lang=None):
        """Yield the synsets of the given OMW language, optionally
        restricted to one pos.

        NOTE(review): for an unsupported lang the early ``return None``
        just ends the generator (callers get an empty iterator, not
        None) — confirm that is the intended contract.
        """
        if lang not in self.langs():
            return None
        self._load_lang_data(lang)
        for of in self._lang_data[lang][0]:
            if not pos or of[-1] == pos:
                ss = self.of2ss(of)
                if ss:
                    yield ss

        # else:
        # A few OMW offsets don't exist in Wordnet 3.0.
        # warnings.warn(f"Language {lang}: no synset found for {of}")
|
| 1839 |
+
|
| 1840 |
+
def all_synsets(self, pos=None, lang="eng"):
|
| 1841 |
+
"""Iterate over all synsets with a given part of speech tag.
|
| 1842 |
+
If no pos is specified, all synsets for all parts of speech
|
| 1843 |
+
will be loaded.
|
| 1844 |
+
"""
|
| 1845 |
+
if lang == "eng":
|
| 1846 |
+
return self.all_eng_synsets(pos=pos)
|
| 1847 |
+
else:
|
| 1848 |
+
return self.all_omw_synsets(pos=pos, lang=lang)
|
| 1849 |
+
|
| 1850 |
+
    def all_eng_synsets(self, pos=None):
        """Yield every English synset for the given pos (or for all
        parts of speech when pos is None), reading each data.<pos>
        file sequentially and reusing the offset cache.
        """
        if pos is None:
            pos_tags = self._FILEMAP.keys()
        else:
            pos_tags = [pos]

        cache = self._synset_offset_cache
        from_pos_and_line = self._synset_from_pos_and_line

        # generate all synsets for each part of speech
        for pos_tag in pos_tags:
            # Open the file for reading.  Note that we can not re-use
            # the file pointers from self._data_file_map here, because
            # we're defining an iterator, and those file pointers might
            # be moved while we're not looking.
            if pos_tag == ADJ_SAT:
                pos_file = ADJ
            else:
                pos_file = pos_tag
            fileid = "data.%s" % self._FILEMAP[pos_file]
            data_file = self.open(fileid)

            try:
                # generate synsets for each line in the POS file
                offset = data_file.tell()
                line = data_file.readline()
                while line:
                    # data lines start at column 0; header lines are indented
                    if not line[0].isspace():
                        if offset in cache[pos_tag]:
                            # See if the synset is cached
                            synset = cache[pos_tag][offset]
                        else:
                            # Otherwise, parse the line
                            synset = from_pos_and_line(pos_tag, line)
                            cache[pos_tag][offset] = synset

                        # adjective satellites are in the same file as
                        # adjectives so only yield the synset if it's actually
                        # a satellite
                        if pos_tag == ADJ_SAT and synset._pos == ADJ_SAT:
                            yield synset
                        # for all other POS tags, yield all synsets (this means
                        # that adjectives also include adjective satellites)
                        elif pos_tag != ADJ_SAT:
                            yield synset
                    offset = data_file.tell()
                    line = data_file.readline()

            # close the extra file handle we opened (the bare except
            # re-raises, so no error is swallowed here)
            except:
                data_file.close()
                raise
            else:
                data_file.close()
|
| 1904 |
+
|
| 1905 |
+
    def words(self, lang="eng"):
        """return lemmas of the given language as list of words

        NOTE(review): this actually returns whatever iterator/generator
        all_lemma_names() produces, not a list — confirm callers don't
        index or re-iterate it.
        """
        return self.all_lemma_names(lang=lang)
|
| 1908 |
+
|
| 1909 |
+
def synonyms(self, word, lang="eng"):
|
| 1910 |
+
"""return nested list with the synonyms of the different senses of word in the given language"""
|
| 1911 |
+
return [
|
| 1912 |
+
sorted(list(set(ss.lemma_names(lang=lang)) - {word}))
|
| 1913 |
+
for ss in self.synsets(word, lang=lang)
|
| 1914 |
+
]
|
| 1915 |
+
|
| 1916 |
+
def doc(self, file="README", lang="eng"):
|
| 1917 |
+
"""Return the contents of readme, license or citation file
|
| 1918 |
+
use lang=lang to get the file for an individual language"""
|
| 1919 |
+
if lang == "eng":
|
| 1920 |
+
reader = self
|
| 1921 |
+
else:
|
| 1922 |
+
reader = self._omw_reader
|
| 1923 |
+
if lang in self.langs():
|
| 1924 |
+
file = f"{os.path.join(self.provenances[lang],file)}"
|
| 1925 |
+
try:
|
| 1926 |
+
with reader.open(file) as fp:
|
| 1927 |
+
return fp.read()
|
| 1928 |
+
except:
|
| 1929 |
+
if lang in self._lang_data:
|
| 1930 |
+
return f"Cannot determine {file} for {lang}"
|
| 1931 |
+
else:
|
| 1932 |
+
return f"Language {lang} is not supported."
|
| 1933 |
+
|
| 1934 |
+
def license(self, lang="eng"):
    """Return the contents of LICENSE (for omw).

    Use ``lang=lang`` to get the license for an individual language.
    """
    return self.doc(file="LICENSE", lang=lang)
|
| 1938 |
+
|
| 1939 |
+
def readme(self, lang="eng"):
    """Return the contents of README (for omw).

    Use ``lang=lang`` to get the readme for an individual language.
    """
    return self.doc(file="README", lang=lang)
|
| 1943 |
+
|
| 1944 |
+
def citation(self, lang="eng"):
    """Return the contents of the citation.bib file (for omw).

    Use ``lang=lang`` to get the citation for an individual language.
    """
    return self.doc(file="citation.bib", lang=lang)
|
| 1948 |
+
|
| 1949 |
+
#############################################################
|
| 1950 |
+
# Misc
|
| 1951 |
+
#############################################################
|
| 1952 |
+
def lemma_count(self, lemma):
    """Return the frequency count for this Lemma."""
    # Counts are currently available for English only.
    if lemma._lang != "eng":
        return 0
    # Lazily open the count file the first time a count is requested.
    if self._key_count_file is None:
        self._key_count_file = self.open("cntlist.rev")
    # Locate the sense key in the (sorted) counts file; the count is
    # the last whitespace-separated field of the matching line.
    line = _binary_search_file(self._key_count_file, lemma._key)
    return int(line.rsplit(" ", 1)[-1]) if line else 0
|
| 1966 |
+
|
| 1967 |
+
# Thin delegating wrappers so similarity metrics can be called on the
# corpus reader as well as on Synset objects.  Docstrings are copied
# from the corresponding Synset methods.

def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

path_similarity.__doc__ = Synset.path_similarity.__doc__

def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

lch_similarity.__doc__ = Synset.lch_similarity.__doc__

def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(
        synset2, verbose=verbose, simulate_root=simulate_root
    )

wup_similarity.__doc__ = Synset.wup_similarity.__doc__

def res_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose=verbose)

res_similarity.__doc__ = Synset.res_similarity.__doc__

def jcn_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose=verbose)

jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__

def lin_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose=verbose)

lin_similarity.__doc__ = Synset.lin_similarity.__doc__
|
| 1996 |
+
|
| 1997 |
+
#############################################################
|
| 1998 |
+
# Morphy
|
| 1999 |
+
#############################################################
|
| 2000 |
+
# Morphy, adapted from Oliver Steele's pywordnet
|
| 2001 |
+
def morphy(self, form, pos=None, check_exceptions=True):
    """
    Find a possible base form for the given form, with the given
    part of speech, by checking WordNet's list of exceptional
    forms, and by recursively stripping affixes for this part of
    speech until a form in WordNet is found.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.morphy('dogs'))
    dog
    >>> print(wn.morphy('churches'))
    church
    >>> print(wn.morphy('aardwolves'))
    aardwolf
    >>> print(wn.morphy('abaci'))
    abacus
    >>> wn.morphy('hardrock', wn.ADV)
    >>> print(wn.morphy('book', wn.NOUN))
    book
    >>> wn.morphy('book', wn.ADJ)
    """
    if pos is None:
        # No POS given: try every part of speech, lazily, in order.
        analyses = chain.from_iterable(
            self._morphy(form, p) for p in POS_LIST
        )
    else:
        analyses = self._morphy(form, pos, check_exceptions)

    # Return the first analysis found, or None if there is none.
    return next(iter(analyses), None)
|
| 2035 |
+
|
| 2036 |
+
# Morphy's suffix "rules of detachment": each (old, new) pair means
# "if the form ends with `old`, strip it and append `new`" to produce
# a candidate base form.  NOTE(review): list order appears significant
# (rules are applied in order by _morphy), so do not reorder.
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN: [
        ("s", ""),
        ("ses", "s"),
        ("ves", "f"),
        ("xes", "x"),
        ("zes", "z"),
        ("ches", "ch"),
        ("shes", "sh"),
        ("men", "man"),
        ("ies", "y"),
    ],
    VERB: [
        ("s", ""),
        ("ies", "y"),
        ("es", "e"),
        ("es", ""),
        ("ed", "e"),
        ("ed", ""),
        ("ing", "e"),
        ("ing", ""),
    ],
    ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
    # Adverbs have no detachment rules.
    ADV: [],
}

# Adjective satellites use the same rules as adjectives.
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
|
| 2063 |
+
|
| 2064 |
+
def _morphy(self, form, pos, check_exceptions=True):
    # Morphy algorithm (after jordanbg).  Given an original string x:
    #   1. Apply the substitution rules once to get y1, y2, y3, ...
    #   2. Return all candidates found in the database.
    #   3. If there are no matches, keep applying rules until a match
    #      is found or no further candidates can be generated.
    exceptions = self._exception_map[pos]
    substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]

    def apply_rules(candidates):
        # One round of suffix rewriting over every candidate form.
        return [
            cand[: -len(old)] + new
            for cand in candidates
            for old, new in substitutions
            if cand.endswith(old)
        ]

    def filter_forms(candidates):
        # Keep only forms WordNet knows for this POS, preserving order
        # and dropping duplicates (dict preserves insertion order).
        known = dict.fromkeys(
            cand
            for cand in candidates
            if pos in self._lemma_pos_offset_map.get(cand, ())
        )
        return list(known)

    # 0. Check the exception lists (irregular forms).
    if check_exceptions and form in exceptions:
        return filter_forms([form] + exceptions[form])

    # 1. Apply the rules once to the input.
    candidates = apply_rules([form])

    # 2. Return everything already in the database (including the
    # original form itself).
    matches = filter_forms([form] + candidates)
    if matches:
        return matches

    # 3. Keep rewriting until we hit the database or run out of
    # candidate forms.
    while candidates:
        candidates = apply_rules(candidates)
        matches = filter_forms(candidates)
        if matches:
            return matches

    # Nothing could be found.
    return []
|
| 2116 |
+
|
| 2117 |
+
#############################################################
|
| 2118 |
+
# Create information content from corpus
|
| 2119 |
+
#############################################################
|
| 2120 |
+
def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
    """
    Creates an information content lookup dictionary from a corpus.

    :type corpus: CorpusReader
    :param corpus: The corpus from which we create an information
        content dictionary.
    :type weight_senses_equally: bool
    :param weight_senses_equally: If this is True, gives all
        possible senses equal weight rather than dividing by the
        number of possible senses.  (If a word has 3 synsets, each
        sense gets 0.3333 per appearance when this is False, 1.0 when
        it is true.)
    :param smoothing: How much do we smooth synset counts (default is 1.0)
    :type smoothing: float
    :return: An information content dictionary
    """
    # Raw token frequencies over the whole corpus.
    counts = FreqDist()
    for ww in corpus.words():
        counts[ww] += 1

    # One float-valued table per part of speech; key 0 holds the
    # (artificial) root count for that POS.
    ic = {}
    for pp in POS_LIST:
        ic[pp] = defaultdict(float)

    # Initialize the counts with the smoothing value
    if smoothing > 0.0:
        for pp in POS_LIST:
            ic[pp][0] = smoothing
        for ss in self.all_synsets():
            pos = ss._pos
            # Adjective satellites are tallied under ADJ.
            if pos == ADJ_SAT:
                pos = ADJ
            ic[pos][ss._offset] = smoothing

    for ww in counts:
        possible_synsets = self.synsets(ww)
        if len(possible_synsets) == 0:
            continue

        # Distribute weight among possible synsets
        weight = float(counts[ww])
        if not weight_senses_equally:
            weight /= float(len(possible_synsets))

        for ss in possible_synsets:
            pos = ss._pos
            if pos == ADJ_SAT:
                pos = ADJ
            # Each occurrence also counts for every hypernym of the
            # synset, because synset counts are cumulative up the tree.
            for level in ss._iter_hypernym_lists():
                for hh in level:
                    ic[pos][hh._offset] += weight
            # Add the weight to the root
            ic[pos][0] += weight
    return ic
|
| 2175 |
+
|
| 2176 |
+
def custom_lemmas(self, tab_file, lang):
    """
    Reads a custom tab file containing mappings of lemmas in the given
    language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
    WordNet functions to then be used with that language.

    See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
    documentation on the Multilingual WordNet tab file format.

    :param tab_file: Tab file as a file or file-like object
    :type: lang str
    :param: lang ISO 639-3 code of the language of the tab file
    """
    # Language codes may carry a variant suffix (e.g. "zh_s"); only
    # the 3-letter base code is validated here.
    lg = lang.split("_")[0]
    if len(lg) != 3:
        raise ValueError("lang should be a (3 character) ISO 639-3 code")
    # Four tables, indexed by position in self.lg_attrs
    # (presumably [lemma-by-offset, lemma-by-name, ...] — TODO confirm
    # against the lg_attrs definition).
    self._lang_data[lang] = [
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
    ]
    for line in tab_file.readlines():
        if isinstance(line, bytes):
            # Support byte-stream files (e.g. as returned by Python 2's
            # open() function) as well as text-stream ones
            line = line.decode("utf-8")
        if not line.startswith("#"):
            # Expected format: offset-pos <TAB> label <TAB> ... <TAB> value
            triple = line.strip().split("\t")
            if len(triple) < 3:
                continue
            offset_pos, label = triple[:2]
            val = triple[-1]
            if self.map30:
                if offset_pos in self.map30:
                    # Map offset_pos to current Wordnet version:
                    offset_pos = self.map30[offset_pos]
                else:
                    # Some OMW offsets were never in Wordnet:
                    if (
                        offset_pos not in self.nomap
                        and offset_pos.replace("a", "s") not in self.nomap
                    ):
                        warnings.warn(
                            f"{lang}: invalid offset {offset_pos} in '{line}'"
                        )
                    continue
            elif offset_pos[-1] == "a":
                wnss = self.of2ss(offset_pos)
                if wnss and wnss.pos() == "s":  # Wordnet pos is "s"
                    # Label OMW adjective satellites back to their Wordnet pos ("s")
                    offset_pos = self.ss2of(wnss)
            # The label may be qualified with a language prefix
            # ("lg:attr"); unqualified labels apply to every language.
            pair = label.split(":")
            attr = pair[-1]
            if len(pair) == 1 or pair[0] == lg:
                if attr == "lemma":
                    # Lemmas are stored lowercased with spaces joined
                    # by underscores, mapping name -> offsets.
                    val = val.strip().replace(" ", "_")
                    self._lang_data[lang][1][val.lower()].append(offset_pos)
                if attr in self.lg_attrs:
                    # And in the attribute's own table, offset -> values.
                    self._lang_data[lang][self.lg_attrs.index(attr)][
                        offset_pos
                    ].append(val)
|
| 2238 |
+
|
| 2239 |
+
def disable_custom_lemmas(self, lang):
    """Prevent synsets from being mistakenly added."""
    # Freeze each per-attribute defaultdict so that lookups of unknown
    # keys raise KeyError instead of silently inserting empty lists.
    for table in self._lang_data[lang][: len(self.lg_attrs)]:
        table.default_factory = None
|
| 2243 |
+
|
| 2244 |
+
######################################################################
|
| 2245 |
+
# Visualize WordNet relation graphs using Graphviz
|
| 2246 |
+
######################################################################
|
| 2247 |
+
|
| 2248 |
+
def digraph(
    self,
    inputs,
    rel=lambda s: s.hypernyms(),
    pos=None,
    maxdepth=-1,
    shapes=None,
    attr=None,
    verbose=False,
):
    """
    Produce a graphical representation from 'inputs' (a list of
    start nodes, which can be a mix of Synsets, Lemmas and/or words),
    and a synset relation, for drawing with the 'dot' graph visualisation
    program from the Graphviz package.

    Return a string in the DOT graph file language, which can then be
    converted to an image by nltk.parse.dependencygraph.dot2img(dot_string).

    Optional Parameters:
    :rel: Wordnet synset relation
    :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r'
    :maxdepth: limit the longest path
    :shapes: dictionary of strings that trigger a specified shape
    :attr: dictionary with global graph attributes
    :verbose: warn about cycles

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.digraph([wn.synset('dog.n.01')]))
    digraph G {
    "Synset('animal.n.01')" -> "Synset('organism.n.01')";
    "Synset('canine.n.02')" -> "Synset('carnivore.n.01')";
    "Synset('carnivore.n.01')" -> "Synset('placental.n.01')";
    "Synset('chordate.n.01')" -> "Synset('animal.n.01')";
    "Synset('dog.n.01')" -> "Synset('canine.n.02')";
    "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')";
    "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')";
    "Synset('living_thing.n.01')" -> "Synset('whole.n.02')";
    "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')";
    "Synset('object.n.01')" -> "Synset('physical_entity.n.01')";
    "Synset('organism.n.01')" -> "Synset('living_thing.n.01')";
    "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')";
    "Synset('placental.n.01')" -> "Synset('mammal.n.01')";
    "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')";
    "Synset('whole.n.02')" -> "Synset('object.n.01')";
    }
    <BLANKLINE>
    """
    # Imported locally to avoid a circular import at module load time
    # — TODO confirm; nltk.util may import from this module.
    from nltk.util import edge_closure, edges2dot

    synsets = set()
    edges = set()
    # Avoid mutable default arguments by creating fresh dicts here.
    if not shapes:
        shapes = dict()
    if not attr:
        attr = dict()

    def add_lemma(lem):
        # A Lemma contributes its synset plus a lemma->synset edge.
        ss = lem.synset()
        synsets.add(ss)
        edges.add((lem, ss))

    # Normalize the mixed input list to a set of start synsets.
    for node in inputs:
        typ = type(node)
        if typ == Synset:
            synsets.add(node)
        elif typ == Lemma:
            add_lemma(node)
        elif typ == str:
            for lemma in self.lemmas(node, pos):
                add_lemma(lemma)

    # Expand each start synset along `rel` (up to maxdepth) and
    # render the accumulated edge set as DOT.
    for ss in synsets:
        edges = edges.union(edge_closure(ss, rel, maxdepth, verbose))
    dot_string = edges2dot(sorted(list(edges)), shapes=shapes, attr=attr)
    return dot_string
|
| 2324 |
+
|
| 2325 |
+
|
| 2326 |
+
######################################################################
|
| 2327 |
+
# WordNet Information Content Corpus Reader
|
| 2328 |
+
######################################################################
|
| 2329 |
+
|
| 2330 |
+
|
| 2331 |
+
class WordNetICCorpusReader(CorpusReader):
    """
    A corpus reader for the WordNet information content corpus.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, encoding="utf8")

    # this load function would be more efficient if the data was pickled
    # Note that we can't use NLTK's frequency distributions because
    # synsets are overlapping (each instance of a synset also counts
    # as an instance of its hypernyms)
    def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary.  This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        synsets to information content values.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {}
        ic[NOUN] = defaultdict(float)
        ic[VERB] = defaultdict(float)
        with self.open(icfile) as fp:
            for num, line in enumerate(fp):
                if num == 0:  # skip the header
                    continue
                # Each record is "<offset><pos-char> <value> [ROOT]".
                fields = line.split()
                offset = int(fields[0][:-1])
                value = float(fields[1])
                pos = _get_pos(fields[0])
                if len(fields) == 3 and fields[2] == "ROOT":
                    # Store root count.
                    ic[pos][0] += value
                # Zero-valued entries are omitted; defaultdict returns
                # 0.0 for them on lookup anyway.
                if value != 0:
                    ic[pos][offset] = value
        return ic
|
| 2371 |
+
|
| 2372 |
+
|
| 2373 |
+
######################################################################
|
| 2374 |
+
# Similarity metrics
|
| 2375 |
+
######################################################################
|
| 2376 |
+
|
| 2377 |
+
# TODO: Add in the option to manually add a new root node; this will be
|
| 2378 |
+
# useful for verb similarity as there exist multiple verb taxonomies.
|
| 2379 |
+
|
| 2380 |
+
# More information about the metrics is available at
|
| 2381 |
+
# http://marimba.d.umn.edu/similarity/measures.html
|
| 2382 |
+
|
| 2383 |
+
|
| 2384 |
+
# Module-level similarity helpers.  Each simply delegates to the
# corresponding Synset method; docstrings are copied below.


def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(synset2, verbose, simulate_root)


def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(synset2, verbose, simulate_root)


def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(synset2, verbose, simulate_root)


def res_similarity(synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose)


def jcn_similarity(synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose)


def lin_similarity(synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose)


path_similarity.__doc__ = Synset.path_similarity.__doc__
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
res_similarity.__doc__ = Synset.res_similarity.__doc__
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
|
| 2416 |
+
|
| 2417 |
+
|
| 2418 |
+
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.  If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content object (as returned by ``load_ic()``).
    :return: The information content of the two synsets and their most
        informative subsumer
    """
    # The metric is undefined across parts of speech.
    if synset1._pos != synset2._pos:
        raise WordNetError(
            "Computing the least common subsumer requires "
            "%s and %s to have the same part of speech." % (synset1, synset2)
        )

    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    subsumers = synset1.common_hypernyms(synset2)
    # No explicit common subsumer -> artificial root with IC 0.
    subsumer_ic = max(
        (information_content(s, ic) for s in subsumers), default=0
    )

    if verbose:
        print("> LCS Subsumer by content:", subsumer_ic)

    return ic1, ic2, subsumer_ic
|
| 2453 |
+
|
| 2454 |
+
|
| 2455 |
+
# Utility functions
|
| 2456 |
+
|
| 2457 |
+
|
| 2458 |
+
def information_content(synset, ic):
    """Return the information content of *synset* under the IC table *ic*."""
    # Adjective satellites are tabulated under ADJ.
    pos = ADJ if synset._pos == ADJ_SAT else synset._pos
    try:
        icpos = ic[pos]
    except KeyError as e:
        msg = "Information content file has no entries for part-of-speech: %s"
        raise WordNetError(msg % pos) from e

    counts = icpos[synset._offset]
    # A zero count means the synset was never observed: infinite IC.
    # Otherwise IC = -log(p), with p relative to the root count at key 0.
    return _INF if counts == 0 else -math.log(counts / icpos[0])
|
| 2473 |
+
|
| 2474 |
+
|
| 2475 |
+
# get the part of speech (NOUN or VERB) from the information content record
|
| 2476 |
+
# (each identifier has a 'n' or 'v' suffix)
|
| 2477 |
+
|
| 2478 |
+
|
| 2479 |
+
def _get_pos(field):
    """Return NOUN or VERB from an information-content record identifier.

    Each identifier carries an 'n' or 'v' suffix.
    """
    suffix = field[-1]
    if suffix == "n":
        return NOUN
    if suffix == "v":
        return VERB
    raise ValueError(
        "Unidentified part of speech in WordNet Information Content file "
        "for field %s" % field
    )
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: XML Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for corpora whose documents are xml files.
|
| 10 |
+
|
| 11 |
+
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import codecs
|
| 15 |
+
from xml.etree import ElementTree
|
| 16 |
+
|
| 17 |
+
from nltk.corpus.reader.api import CorpusReader
|
| 18 |
+
from nltk.corpus.reader.util import *
|
| 19 |
+
from nltk.data import SeekableUnicodeStreamReader
|
| 20 |
+
from nltk.internals import ElementWrapper
|
| 21 |
+
from nltk.tokenize import WordPunctTokenizer
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # When True, parsed elements are wrapped in ElementWrapper
        # before being returned by xml().
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """Return the XML document for *fileid* as an ElementTree element
        (wrapped in an ``ElementWrapper`` if requested at construction).
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # Element.getiterator() was removed in Python 3.9; Element.iter()
        # has existed since Python 2.7, so use it unconditionally instead
        # of the old try/bare-except fallback (which also masked
        # unrelated errors raised during iteration setup).
        out = []
        for node in elt.iter():
            text = node.text
            if text is not None:
                # Support byte-valued text nodes as well as str ones.
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                out.extend(word_tokenizer.tokenize(text))
        return out
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class XMLCorpusView(StreamBackedCorpusView):
|
| 82 |
+
"""
|
| 83 |
+
A corpus view that selects out specified elements from an XML
|
| 84 |
+
file, and provides a flat list-like interface for accessing them.
|
| 85 |
+
(Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
|
| 86 |
+
but may be used by subclasses of ``XMLCorpusReader``.)
|
| 87 |
+
|
| 88 |
+
Every XML corpus view has a "tag specification", indicating what
|
| 89 |
+
XML elements should be included in the view; and each (non-nested)
|
| 90 |
+
element that matches this specification corresponds to one item in
|
| 91 |
+
the view. Tag specifications are regular expressions over tag
|
| 92 |
+
paths, where a tag path is a list of element tag names, separated
|
| 93 |
+
by '/', indicating the ancestry of the element. Some examples:
|
| 94 |
+
|
| 95 |
+
- ``'foo'``: A top-level element whose tag is ``foo``.
|
| 96 |
+
- ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
|
| 97 |
+
is a top-level element whose tag is ``foo``.
|
| 98 |
+
- ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
|
| 99 |
+
in the xml tree.
|
| 100 |
+
- ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
|
| 101 |
+
appearing anywhere in the xml tree.
|
| 102 |
+
|
| 103 |
+
The view items are generated from the selected XML elements via
|
| 104 |
+
the method ``handle_elt()``. By default, this method returns the
|
| 105 |
+
element as-is (i.e., as an ElementTree object); but it can be
|
| 106 |
+
overridden, either via subclassing or via the ``elt_handler``
|
| 107 |
+
constructor parameter.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
#: If true, then display debugging output to stdout when reading
|
| 111 |
+
#: blocks.
|
| 112 |
+
_DEBUG = False
|
| 113 |
+
|
| 114 |
+
#: The number of characters read at a time by this corpus reader.
|
| 115 |
+
_BLOCK_SIZE = 1024
|
| 116 |
+
|
| 117 |
+
def __init__(self, fileid, tagspec, elt_handler=None):
    """
    Create a new corpus view based on a specified XML file.

    Note that the ``XMLCorpusView`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is
    specified by the XML files themselves.

    :type tagspec: str
    :param tagspec: A tag specification, indicating what XML
        elements should be included in the view.  Each non-nested
        element that matches this specification corresponds to one
        item in the view.

    :param elt_handler: A function used to transform each element
        to a value for the view.  If no handler is specified, then
        ``self.handle_elt()`` is called, which returns the element
        as an ElementTree object.  The signature of elt_handler is::

            elt_handler(elt, tagspec) -> value
    """
    # A caller-supplied handler shadows the default handle_elt method
    # on this instance only.
    if elt_handler:
        self.handle_elt = elt_handler

    # Anchor the pattern so the tag path must match in full.
    self._tagspec = re.compile(tagspec + r"\Z")
    """The tag specification for this corpus view."""

    self._tag_context = {0: ()}
    """A dictionary mapping from file positions (as returned by
    ``stream.seek()`` to XML contexts.  An XML context is a
    tuple of XML tag names, indicating which tags have not yet
    been closed."""

    # Sniff the file's encoding (BOM / XML declaration) before handing
    # it to the stream-backed base class.
    encoding = self._detect_encoding(fileid)
    StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
|
| 152 |
+
|
| 153 |
+
def _detect_encoding(self, fileid):
    """
    Sniff the character encoding of the XML file ``fileid``.

    A Unicode byte-order mark takes precedence; otherwise an
    ``encoding="..."`` (or ``encoding='...'``) attribute in the XML
    declaration on the first line is used.  Falls back to ``"utf-8"``
    when neither is present.

    :param fileid: A path string or ``PathPointer`` naming the XML file.
    :return: The detected encoding name.
    :rtype: str
    """
    if isinstance(fileid, PathPointer):
        # Open *before* the try block: if open() itself raises, the
        # original code's finally clause referenced an unbound
        # ``infile``, masking the real error with an UnboundLocalError.
        infile = fileid.open()
        try:
            s = infile.readline()
        finally:
            infile.close()
    else:
        with open(fileid, "rb") as infile:
            s = infile.readline()
    # Check UTF-32 BOMs before UTF-16: the UTF-32-LE BOM
    # (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE), so the
    # reverse order misdetects UTF-32-LE files as UTF-16-LE.
    if s.startswith(codecs.BOM_UTF32_BE):
        return "utf-32-be"
    if s.startswith(codecs.BOM_UTF32_LE):
        return "utf-32-le"
    if s.startswith(codecs.BOM_UTF16_BE):
        return "utf-16-be"
    if s.startswith(codecs.BOM_UTF16_LE):
        return "utf-16-le"
    if s.startswith(codecs.BOM_UTF8):
        return "utf-8"
    m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1).decode()
    m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1).decode()
    # No BOM and no declaration -- default to UTF-8.
    return "utf-8"
|
| 181 |
+
|
| 182 |
+
def handle_elt(self, elt, context):
    """
    Convert an element into an appropriate value for inclusion in
    the view.

    Unless overridden by a subclass or by the ``elt_handler``
    constructor argument, this is the identity transform: the
    element itself is used as the view value.

    :type elt: ElementTree
    :param elt: The element that should be converted.

    :type context: str
    :param context: A string composed of element tags separated by
        forward slashes, indicating the XML context of the given
        element.  For example, the string ``'foo/bar/baz'``
        indicates that the element is a ``baz`` element whose
        parent is a ``bar`` element and whose grandparent is a
        top-level ``foo`` element.

    :return: The view value corresponding to ``elt``.
    """
    return elt
|
| 203 |
+
|
| 204 |
+
#: A regular expression that matches XML fragments that do not
#: contain any un-closed tags.  Note the CDATA alternative is
#: written without escaped brackets -- preserved as-is; it still
#: matches the CDATA forms seen in practice.  TODO confirm upstream.
_VALID_XML_RE = re.compile(
    r"""
    [^<]*
    (
      ((<!--.*?-->) | # comment
       (<![CDATA[.*?]]) | # raw character data
       (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
       (<[^!>][^>]*>)) # tag or PI
      [^<]*)*
    \Z""",
    re.DOTALL | re.VERBOSE,
)

#: A regular expression used to extract the tag name from a start tag,
#: end tag, or empty-elt tag string.  An optional leading '/' (with
#: surrounding whitespace) is skipped so end tags yield the bare name.
_XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

#: A regular expression used to find all start-tags, end-tags, and
#: empty-elt tags in an XML file.  This regexp is more lenient than
#: the XML spec -- e.g., it allows spaces in some places where the
#: spec does not.  The named groups are consumed by ``read_block``.
_XML_PIECE = re.compile(
    r"""
    # Include these so we can skip them:
    (?P<COMMENT> <!--.*?--> )|
    (?P<CDATA> <![CDATA[.*?]]> )|
    (?P<PI> <\?.*?\?> )|
    (?P<DOCTYPE> <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
    # These are the ones we actually care about:
    (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
    (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
    (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
    re.DOTALL | re.VERBOSE,
)
|
| 240 |
+
|
| 241 |
+
def _read_xml_fragment(self, stream):
    """
    Read a string from the given stream that does not contain any
    un-closed tags.  In particular, this function first reads a
    block from the stream of size ``self._BLOCK_SIZE``.  It then
    checks if that block contains an un-closed tag.  If it does,
    then this function either backtracks to the last '<', or reads
    another block.

    :raises ValueError: If a stray '>' is found, or if the file ends
        inside an un-closed tag.
    """
    fragment = ""

    if isinstance(stream, SeekableUnicodeStreamReader):
        startpos = stream.tell()
    while True:
        # Read a block and add it to the fragment.
        xml_block = stream.read(self._BLOCK_SIZE)
        fragment += xml_block

        # Do we have a well-formed xml fragment?
        if self._VALID_XML_RE.match(fragment):
            return fragment

        # Do we have a fragment that will never be well-formed?
        # (Search once and reuse the match -- the original code ran
        # the same re.search twice.)
        bracket = re.search("[<>]", fragment)
        if bracket.group(0) == ">":
            pos = stream.tell() - (len(fragment) - bracket.end())
            raise ValueError('Unexpected ">" near char %s' % pos)

        # End of file?
        if not xml_block:
            raise ValueError("Unexpected end of file: tag not closed")

        # If not, then we must be in the middle of a <..tag..>.
        # If appropriate, backtrack to the most recent '<'
        # character.
        last_open_bracket = fragment.rfind("<")
        if last_open_bracket > 0:
            if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                if isinstance(stream, SeekableUnicodeStreamReader):
                    # Character-accurate reposition for decoded streams.
                    stream.seek(startpos)
                    stream.char_seek_forward(last_open_bracket)
                else:
                    stream.seek(-(len(fragment) - last_open_bracket), 1)
                return fragment[:last_open_bracket]

        # Otherwise, read another block. (i.e., return to the
        # top of the loop.)
|
| 290 |
+
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Read from ``stream`` until we find at least one element that
    matches ``tagspec``, and return the result of applying
    ``elt_handler`` to each element found.

    :param stream: The character stream to read from; its position
        must be one recorded in ``self._tag_context``.
    :param tagspec: Compiled or string regex matched against the
        slash-joined tag context; defaults to ``self._tagspec``.
    :param elt_handler: Per-element transform; defaults to
        ``self.handle_elt``.
    """
    if tagspec is None:
        tagspec = self._tagspec
    if elt_handler is None:
        elt_handler = self.handle_elt

    # Use a stack of strings to keep track of our context:
    context = list(self._tag_context.get(stream.tell()))
    # NOTE(review): list(None) would already raise TypeError above,
    # so this assert can never fire -- kept as in the original.
    assert context is not None  # check this -- could it ever happen?

    elts = []

    elt_start = None  # where does the elt start
    elt_depth = None  # what context depth
    elt_text = ""

    # Loop until at least one element is collected AND we are not in
    # the middle of an element spanning a fragment boundary.
    while elts == [] or elt_start is not None:
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        xml_fragment = self._read_xml_fragment(stream)

        # End of file.
        if not xml_fragment:
            if elt_start is None:
                break
            else:
                raise ValueError("Unexpected end of file")

        # Process each <tag> in the xml fragment.
        for piece in self._XML_PIECE.finditer(xml_fragment):
            if self._DEBUG:
                print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

            if piece.group("START_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Keep context up-to-date.
                context.append(name)
                # Is this one of the elts we're looking for?
                if elt_start is None:
                    if re.match(tagspec, "/".join(context)):
                        elt_start = piece.start()
                        elt_depth = len(context)

            elif piece.group("END_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # sanity checks:
                if not context:
                    raise ValueError("Unmatched tag </%s>" % name)
                if name != context[-1]:
                    raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                # Is this the end of an element?
                if elt_start is not None and elt_depth == len(context):
                    elt_text += xml_fragment[elt_start : piece.end()]
                    elts.append((elt_text, "/".join(context)))
                    elt_start = elt_depth = None
                    elt_text = ""
                # Keep context up-to-date
                context.pop()

            elif piece.group("EMPTY_ELT_TAG"):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # An empty element is complete by itself -- record it
                # directly if it matches the tagspec.
                if elt_start is None:
                    if re.match(tagspec, "/".join(context) + "/" + name):
                        elts.append((piece.group(), "/".join(context) + "/" + name))

        # If we ended the fragment inside an element:
        if elt_start is not None:
            # If we haven't found any elements yet, then keep
            # looping until we do.
            if elts == []:
                elt_text += xml_fragment[elt_start:]
                elt_start = 0

            # If we've found at least one element, then try
            # backtracking to the start of the element that we're
            # inside of.
            else:
                # take back the last start-tag, and return what
                # we've gotten so far (elts is non-empty).
                if self._DEBUG:
                    print(" " * 36 + "(backtrack)")
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(elt_start)
                else:
                    stream.seek(-(len(xml_fragment) - elt_start), 1)
                context = context[: elt_depth - 1]
                elt_start = elt_depth = None
                elt_text = ""

    # Update the _tag_context dict so a later read resuming at this
    # position knows which tags are still open.
    pos = stream.tell()
    if pos in self._tag_context:
        assert tuple(context) == self._tag_context[pos]
    else:
        self._tag_context[pos] = tuple(context)

    # Entity-escape non-ASCII characters so ElementTree can parse the
    # collected text regardless of the source encoding.
    return [
        elt_handler(
            ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
            context,
        )
        for (elt, context) in elts
    ]
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2015 NLTK Project
|
| 4 |
+
# Author: Selina Dennis <selina@tranzfusion.net>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
|
| 10 |
+
English Prose (YCOE), a 1.5 million word syntactically-annotated
|
| 11 |
+
corpus of Old English prose texts. The corpus is distributed by the
|
| 12 |
+
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
|
| 13 |
+
with NLTK.
|
| 14 |
+
|
| 15 |
+
The YCOE corpus is divided into 100 files, each representing
|
| 16 |
+
an Old English prose text. Tags used within each text complies
|
| 17 |
+
to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
|
| 23 |
+
from nltk.corpus.reader.api import *
|
| 24 |
+
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
| 25 |
+
from nltk.corpus.reader.tagged import TaggedCorpusReader
|
| 26 |
+
from nltk.corpus.reader.util import *
|
| 27 |
+
from nltk.tokenize import RegexpTokenizer
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    Each document is stored as two files: a ``.pos`` file of tagged
    text (read by a :class:`YCOETaggedCorpusReader`) and a ``.psd``
    file of bracketed parse trees (read by a
    :class:`YCOEParseCorpusReader`).
    """

    def __init__(self, root, encoding="utf8"):
        # First pass: initialize with an empty fileid list so that
        # ``self.root`` is available while building the sub-readers.
        CorpusReader.__init__(self, root, [], encoding)

        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Second pass: re-initialize with the real fileid list.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: report the specific offending file id; the
                # original interpolated the entire ``fileids`` list.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :raises ValueError: If a file identifier (rather than a
            document identifier) was given, or if the document is
            unknown.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in (".pos", ".psd"):
                        # Bug fix: close the parenthesis opened in
                        # the error message.
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers.)"
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Remove (CODE ...) and (ID ...) annotation nodes before
        # handing the string to the standard bracket parser.
        cleaned = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree reduced to an empty pair of brackets has no content.
        if re.match(r"\s*\(\s*\)\s*$", cleaned):
            return None
        return BracketParseCorpusReader._parse(self, cleaned)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged-corpus reader for the '.pos' files of the YCOE corpus."""

    def __init__(self, root, items, encoding="utf8"):
        # Sentence boundaries follow tokens tagged '/.'; tokens tagged
        # _CODE or _ID are annotation artifacts, dropped as gaps.
        # NOTE(review): ``encoding`` is accepted but deliberately not
        # forwarded to TaggedCorpusReader, matching the original --
        # YCOECorpusReader passes ".pos" positionally into this slot.
        sentence_splitter = RegexpTokenizer(
            r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*", gaps=True
        )
        TaggedCorpusReader.__init__(
            self, root, items, sep="_", sent_tokenizer=sentence_splitter
        )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
#: A list of all documents and their titles in ycoe.
|
| 155 |
+
documents = {
|
| 156 |
+
"coadrian.o34": "Adrian and Ritheus",
|
| 157 |
+
"coaelhom.o3": "Ælfric, Supplemental Homilies",
|
| 158 |
+
"coaelive.o3": "Ælfric's Lives of Saints",
|
| 159 |
+
"coalcuin": "Alcuin De virtutibus et vitiis",
|
| 160 |
+
"coalex.o23": "Alexander's Letter to Aristotle",
|
| 161 |
+
"coapollo.o3": "Apollonius of Tyre",
|
| 162 |
+
"coaugust": "Augustine",
|
| 163 |
+
"cobede.o2": "Bede's History of the English Church",
|
| 164 |
+
"cobenrul.o3": "Benedictine Rule",
|
| 165 |
+
"coblick.o23": "Blickling Homilies",
|
| 166 |
+
"coboeth.o2": "Boethius' Consolation of Philosophy",
|
| 167 |
+
"cobyrhtf.o3": "Byrhtferth's Manual",
|
| 168 |
+
"cocanedgD": "Canons of Edgar (D)",
|
| 169 |
+
"cocanedgX": "Canons of Edgar (X)",
|
| 170 |
+
"cocathom1.o3": "Ælfric's Catholic Homilies I",
|
| 171 |
+
"cocathom2.o3": "Ælfric's Catholic Homilies II",
|
| 172 |
+
"cochad.o24": "Saint Chad",
|
| 173 |
+
"cochdrul": "Chrodegang of Metz, Rule",
|
| 174 |
+
"cochristoph": "Saint Christopher",
|
| 175 |
+
"cochronA.o23": "Anglo-Saxon Chronicle A",
|
| 176 |
+
"cochronC": "Anglo-Saxon Chronicle C",
|
| 177 |
+
"cochronD": "Anglo-Saxon Chronicle D",
|
| 178 |
+
"cochronE.o34": "Anglo-Saxon Chronicle E",
|
| 179 |
+
"cocura.o2": "Cura Pastoralis",
|
| 180 |
+
"cocuraC": "Cura Pastoralis (Cotton)",
|
| 181 |
+
"codicts.o34": "Dicts of Cato",
|
| 182 |
+
"codocu1.o1": "Documents 1 (O1)",
|
| 183 |
+
"codocu2.o12": "Documents 2 (O1/O2)",
|
| 184 |
+
"codocu2.o2": "Documents 2 (O2)",
|
| 185 |
+
"codocu3.o23": "Documents 3 (O2/O3)",
|
| 186 |
+
"codocu3.o3": "Documents 3 (O3)",
|
| 187 |
+
"codocu4.o24": "Documents 4 (O2/O4)",
|
| 188 |
+
"coeluc1": "Honorius of Autun, Elucidarium 1",
|
| 189 |
+
"coeluc2": "Honorius of Autun, Elucidarium 1",
|
| 190 |
+
"coepigen.o3": "Ælfric's Epilogue to Genesis",
|
| 191 |
+
"coeuphr": "Saint Euphrosyne",
|
| 192 |
+
"coeust": "Saint Eustace and his companions",
|
| 193 |
+
"coexodusP": "Exodus (P)",
|
| 194 |
+
"cogenesiC": "Genesis (C)",
|
| 195 |
+
"cogregdC.o24": "Gregory's Dialogues (C)",
|
| 196 |
+
"cogregdH.o23": "Gregory's Dialogues (H)",
|
| 197 |
+
"coherbar": "Pseudo-Apuleius, Herbarium",
|
| 198 |
+
"coinspolD.o34": "Wulfstan's Institute of Polity (D)",
|
| 199 |
+
"coinspolX": "Wulfstan's Institute of Polity (X)",
|
| 200 |
+
"cojames": "Saint James",
|
| 201 |
+
"colacnu.o23": "Lacnunga",
|
| 202 |
+
"colaece.o2": "Leechdoms",
|
| 203 |
+
"colaw1cn.o3": "Laws, Cnut I",
|
| 204 |
+
"colaw2cn.o3": "Laws, Cnut II",
|
| 205 |
+
"colaw5atr.o3": "Laws, Æthelred V",
|
| 206 |
+
"colaw6atr.o3": "Laws, Æthelred VI",
|
| 207 |
+
"colawaf.o2": "Laws, Alfred",
|
| 208 |
+
"colawafint.o2": "Alfred's Introduction to Laws",
|
| 209 |
+
"colawger.o34": "Laws, Gerefa",
|
| 210 |
+
"colawine.ox2": "Laws, Ine",
|
| 211 |
+
"colawnorthu.o3": "Northumbra Preosta Lagu",
|
| 212 |
+
"colawwllad.o4": "Laws, William I, Lad",
|
| 213 |
+
"coleofri.o4": "Leofric",
|
| 214 |
+
"colsigef.o3": "Ælfric's Letter to Sigefyrth",
|
| 215 |
+
"colsigewB": "Ælfric's Letter to Sigeweard (B)",
|
| 216 |
+
"colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
|
| 217 |
+
"colwgeat": "Ælfric's Letter to Wulfgeat",
|
| 218 |
+
"colwsigeT": "Ælfric's Letter to Wulfsige (T)",
|
| 219 |
+
"colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
|
| 220 |
+
"colwstan1.o3": "Ælfric's Letter to Wulfstan I",
|
| 221 |
+
"colwstan2.o3": "Ælfric's Letter to Wulfstan II",
|
| 222 |
+
"comargaC.o34": "Saint Margaret (C)",
|
| 223 |
+
"comargaT": "Saint Margaret (T)",
|
| 224 |
+
"comart1": "Martyrology, I",
|
| 225 |
+
"comart2": "Martyrology, II",
|
| 226 |
+
"comart3.o23": "Martyrology, III",
|
| 227 |
+
"comarvel.o23": "Marvels of the East",
|
| 228 |
+
"comary": "Mary of Egypt",
|
| 229 |
+
"coneot": "Saint Neot",
|
| 230 |
+
"conicodA": "Gospel of Nicodemus (A)",
|
| 231 |
+
"conicodC": "Gospel of Nicodemus (C)",
|
| 232 |
+
"conicodD": "Gospel of Nicodemus (D)",
|
| 233 |
+
"conicodE": "Gospel of Nicodemus (E)",
|
| 234 |
+
"coorosiu.o2": "Orosius",
|
| 235 |
+
"cootest.o3": "Heptateuch",
|
| 236 |
+
"coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
|
| 237 |
+
"coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
|
| 238 |
+
"coprefcura.o2": "Preface to the Cura Pastoralis",
|
| 239 |
+
"coprefgen.o3": "Ælfric's Preface to Genesis",
|
| 240 |
+
"copreflives.o3": "Ælfric's Preface to Lives of Saints",
|
| 241 |
+
"coprefsolilo": "Preface to Augustine's Soliloquies",
|
| 242 |
+
"coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
|
| 243 |
+
"corood": "History of the Holy Rood-Tree",
|
| 244 |
+
"cosevensl": "Seven Sleepers",
|
| 245 |
+
"cosolilo": "St. Augustine's Soliloquies",
|
| 246 |
+
"cosolsat1.o4": "Solomon and Saturn I",
|
| 247 |
+
"cosolsat2": "Solomon and Saturn II",
|
| 248 |
+
"cotempo.o3": "Ælfric's De Temporibus Anni",
|
| 249 |
+
"coverhom": "Vercelli Homilies",
|
| 250 |
+
"coverhomE": "Vercelli Homilies (E)",
|
| 251 |
+
"coverhomL": "Vercelli Homilies (L)",
|
| 252 |
+
"covinceB": "Saint Vincent (Bodley 343)",
|
| 253 |
+
"covinsal": "Vindicta Salvatoris",
|
| 254 |
+
"cowsgosp.o3": "West-Saxon Gospels",
|
| 255 |
+
"cowulf.o34": "Wulfstan's Homilies",
|
| 256 |
+
}
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to MaltParser
|
| 2 |
+
#
|
| 3 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 4 |
+
# Contributor: Liling Tan, Mustufain, osamamukhtar11
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
import inspect
|
| 11 |
+
import os
|
| 12 |
+
import subprocess
|
| 13 |
+
import sys
|
| 14 |
+
import tempfile
|
| 15 |
+
|
| 16 |
+
from nltk.data import ZipFilePathPointer
|
| 17 |
+
from nltk.internals import find_dir, find_file, find_jars_within_path
|
| 18 |
+
from nltk.parse.api import ParserI
|
| 19 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 20 |
+
from nltk.parse.util import taggedsents_to_conll
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def malt_regex_tagger():
    """Return a naive regex-based POS tagging function.

    Used by :class:`MaltParser` as the fallback tagger when the caller
    supplies none.  Rules are tried in order; the final ``.*`` rule
    tags any otherwise-unmatched token as ``NN``.
    """
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger(
        [
            (r"\.$", "."),
            (r"\,$", ","),
            (r"\?$", "?"),  # fullstop, comma, Qmark
            (r"\($", "("),
            (r"\)$", ")"),  # round brackets
            (r"\[$", "["),
            (r"\]$", "]"),  # square brackets
            (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
            (r"(The|the|A|a|An|an)$", "DT"),  # articles
            (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
            (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possessive
            (r"(my|Your|your|Yours|yours)$", "PRP$"),  # possessive
            (r"(on|On|in|In|at|At|since|Since)$", "IN"),  # time prepositions
            (r"(for|For|ago|Ago|before|Before)$", "IN"),  # time prepositions
            (r"(till|Till|until|Until)$", "IN"),  # time prepositions
            (r"(by|By|beside|Beside)$", "IN"),  # space prepositions
            (r"(under|Under|below|Below)$", "IN"),  # space prepositions
            (r"(over|Over|above|Above)$", "IN"),  # space prepositions
            (r"(across|Across|through|Through)$", "IN"),  # space prepositions
            (r"(into|Into|towards|Towards)$", "IN"),  # space prepositions
            (r"(onto|Onto|from|From)$", "IN"),  # space prepositions
            (r".*able$", "JJ"),  # adjectives
            (r".*ness$", "NN"),  # nouns formed from adjectives
            (r".*ly$", "RB"),  # adverbs
            (r".*s$", "NNS"),  # plural nouns
            (r".*ing$", "VBG"),  # gerunds
            (r".*ed$", "VBD"),  # past tense verbs
            (r".*", "NN"),  # nouns (default)
        ]
    )
    return _tagger.tag
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    :param parser_dirname: Either a full path to the maltparser
        directory, or a bare directory name resolved via the
        ``MALT_PARSER`` environment variable.
    :return: A list of paths to all .jar files found under the
        maltparser directory.
    :raises AssertionError: If the required dependency jars or a
        ``maltparser-*.jar`` are missing.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
    # Check that the found directory contains all the necessary .jar files.
    # (The original code first assigned a dead ["", "", ""] placeholder
    # to ``malt_dependencies`` that was immediately overwritten -- removed.)
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}

    assert malt_dependencies.issubset(_jars)
    assert any(
        filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
    )
    return list(_malt_jars)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def find_malt_model(model_filename):
    """
    Locate a pre-trained MaltParser model.

    :param model_filename: ``None`` for the default temporary model
        name; an existing path is returned unchanged; any other name
        is resolved via the ``MALT_MODEL`` environment variable.
    :return: The model path, or the default model name ``'malt_temp.mco'``.
    """
    # Guard clauses replace the original if/elif/else chain.
    if model_filename is None:
        return "malt_temp.mco"
    if os.path.exists(model_filename):  # A full path was given.
        return model_filename
    # Otherwise search the locations named by MALT_MODEL.
    return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - (optionally) a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
    >>> from nltk.parse import malt
    >>> # With MALT_PARSER and MALT_MODEL environment set.
    >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
    >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
    (shot I (elephant an) (in (pajamas my)) .)
    >>> # Without MALT_PARSER and MALT_MODEL environment.
    >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
    >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
    (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname="",
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
            one can use when calling Maltparser, usually this is the heapsize
            limits, e.g. `additional_java_args=['-Xmx1024m']`
            (see https://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """
        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model.  The sentinel "malt_temp.mco" means "no
        # pre-trained model"; in that case train() must be called first.
        self.model = find_malt_model(model_filename)
        self._trained = self.model != "malt_temp.mco"
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger.
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        :raises Exception: if the parser has not been trained, or if the
            MaltParser subprocess exits with a non-zero status.
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        # delete=False so the files survive close() and can be handed to the
        # external MaltParser process; they are removed explicitly below.
        with tempfile.NamedTemporaryFile(
            prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            with tempfile.NamedTemporaryFile(
                prefix="malt_output.conll.",
                dir=self.working_dir,
                mode="w",
                delete=False,
            ) as output_file:
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(
                    input_file.name, output_file.name, mode="parse"
                )

                # This is a maltparser quirk, it needs to be run
                # where the model file is. otherwise it goes into an awkward
                # missing .jars or strange -w working_dir problem.
                _current_path = os.getcwd()  # Remembers the current path.
                try:  # Change to modelfile path
                    os.chdir(os.path.split(self.model)[0])
                except OSError:
                    # The model path may have no directory component (e.g. the
                    # default "malt_temp.mco"); stay where we are.  A narrow
                    # OSError replaces the original bare `except:`, which also
                    # swallowed KeyboardInterrupt/SystemExit.
                    pass
                try:
                    ret = self._execute(cmd, verbose)  # Run command.
                finally:
                    # Always restore the working directory, even if the
                    # subprocess invocation itself raises.
                    os.chdir(_current_path)

                if ret != 0:
                    raise Exception(
                        "MaltParser parsing (%s) failed with exit "
                        "code %d" % (" ".join(cmd), ret)
                    )

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split("\n\n"):
                        yield (
                            iter(
                                [
                                    DependencyGraph(
                                        tree_str, top_relation_label=top_relation_label
                                    )
                                ]
                            )
                        )

        # NOTE: as this method is a generator, the cleanup below only runs
        # once the caller has exhausted the yielded parses.
        os.remove(input_file.name)
        os.remove(output_file.name)

    def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :return: iter(DependencyGraph)
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file (only used when
            ``mode="parse"``)
        :type outputfilename: str
        :param mode: either ``"parse"`` or ``"learn"``
        :type mode: str
        :return: the argv list for the java invocation
        :rtype: list(str)
        """
        cmd = ["java"]
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ";" if sys.platform.startswith("win") else ":"
        cmd += [
            "-cp",
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ["org.maltparser.Malt"]  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            cmd += ["-c", os.path.split(self.model)[-1]]
        else:  # when learning
            cmd += ["-c", self.model]

        cmd += ["-i", inputfilename]
        if mode == "parse":
            cmd += ["-o", outputfilename]
        cmd += ["-m", mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        """Run *cmd* as a subprocess; suppress its output unless verbose."""
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        """
        # Write the conll_str to malt_train.conll file in /tmp/
        with tempfile.NamedTemporaryFile(
            prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
            input_file.write(str(input_str))
        # Trains the model with the malt_train.conll
        self.train_from_file(input_file.name, verbose=verbose)
        # Removes the malt_train.conll once training finishes.
        os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :type conll_file: str
        :raises Exception: if the MaltParser subprocess exits with a
            non-zero status.
        """
        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            with tempfile.NamedTemporaryFile(
                prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
            ) as input_file:
                with conll_file.open() as conll_input_file:
                    conll_str = conll_input_file.read()
                    input_file.write(str(conll_str))
                return self.train_from_file(input_file.name, verbose=verbose)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (" ".join(cmd), ret)
            )
        self._trained = True
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
if __name__ == "__main__":
    """
    A demonstration function to show how NLTK users can use the malt parser API.

    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> mp = MaltParser()
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print(parsed_sent1.tree())
    (sees John Mary .)
    >>> print(parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.9.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    """

    # Run the doctests embedded in this module.  The demo above exercises the
    # external MaltParser binary and requires MALT_PARSER/MALT_MODEL to be set.
    import doctest

    doctest.testmod()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py
ADDED
|
@@ -0,0 +1,772 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Dependency Grammars
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Jason Narad <jason.narad@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import math
|
| 12 |
+
|
| 13 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
#################################################################
|
| 18 |
+
# DependencyScorerI - Interface for Graph-Edge Weight Calculation
|
| 19 |
+
#################################################################
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DependencyScorerI:
    """
    Interface for assigning weights to the edges of a weighted dependency
    graph.  A ``ProbabilisticNonprojectiveParser`` uses an implementation of
    this interface to initialize the edge weights of a ``DependencyGraph``.
    Although such a scorer would typically be a trained binary classifier,
    any object that can produce a multidimensional list representation of
    the edge weights may implement the interface; no particular fields are
    required.
    """

    def __init__(self):
        # Guard against direct instantiation of the abstract interface;
        # subclasses pass through without error.
        if type(self) is DependencyScorerI:
            raise TypeError("DependencyScorerI is an abstract interface")

    def train(self, graphs):
        """
        Train the scorer on a collection of dependency graphs.

        :type graphs: list(DependencyGraph)
        :param graphs: Dependency graphs used as training material.
            Typically the edges present in the graphs serve as positive
            examples, and the absent edges as negative ones.
        """
        raise NotImplementedError()

    def score(self, graph):
        """
        Score every potential edge of *graph*.

        :type graph: DependencyGraph
        :param graph: A dependency graph whose set of edges need to be
            scored.
        :rtype: A three-dimensional list of numbers.
        :return: ``scores[head][dep]`` is the list of scores for arcs from
            node ``head`` to node ``dep``; a node's number is given by its
            'address' field.  For illustration, the score list matching
            Fig.2 of Keith Hall's 'K-best Spanning Tree Parsing' paper::

                scores = [[[], [5],  [1],  [1]],
                          [[], [],   [11], [4]],
                          [[], [10], [],   [5]],
                          [[], [8],  [8],  []]]

            When backed by a MaxEnt classifier, each score would be the
            classifier's confidence that the edge belongs with the positive
            training examples.
        """
        raise NotImplementedError()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
#################################################################
|
| 77 |
+
# NaiveBayesDependencyScorer
|
| 78 |
+
#################################################################
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class NaiveBayesDependencyScorer(DependencyScorerI):
    """
    A dependency scorer built around a MaxEnt classifier. In this
    particular class that classifier is a ``NaiveBayesClassifier``.
    It uses head-word, head-tag, child-word, and child-tag features
    for classification.

    >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2

    >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
    >>> npp = ProbabilisticNonprojectiveParser()
    >>> npp.train(graphs, NaiveBayesDependencyScorer())
    >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
    >>> len(list(parses))
    1

    """

    def __init__(self):
        pass  # Do nothing without throwing error

    def train(self, graphs):
        """
        Trains a ``NaiveBayesClassifier`` using the edges present in
        graphs list as positive examples, the edges not present as
        negative examples. Uses a feature vector of head-word,
        head-tag, child-word, and child-tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """
        from nltk.classify import NaiveBayesClassifier

        # Create labeled training examples: one example per (head, child)
        # node pair, labeled "T" when the arc exists in the gold graph.
        labeled_examples = []
        for graph in graphs:
            for head_node in graph.nodes.values():
                for child_index, child_node in graph.nodes.items():
                    if child_index in head_node["deps"]:
                        label = "T"
                    else:
                        label = "F"
                    labeled_examples.append(
                        (
                            dict(
                                a=head_node["word"],
                                b=head_node["tag"],
                                c=child_node["word"],
                                d=child_node["tag"],
                            ),
                            label,
                        )
                    )

        self.classifier = NaiveBayesClassifier.train(labeled_examples)

    def score(self, graph):
        """
        Converts the graph into a feature-based representation of
        each edge, and then assigns a score to each based on the
        confidence of the classifier in assigning it to the
        positive label. Scores are returned in a multidimensional list.

        :type graph: DependencyGraph
        :param graph: A dependency graph to score.
        :rtype: 3 dimensional list
        :return: Edge scores for the graph parameter.
        """
        # Convert graph to feature representation: one feature dict per
        # ordered (head, child) node pair, in row-major order.
        edges = []
        for head_node in graph.nodes.values():
            for child_node in graph.nodes.values():
                edges.append(
                    dict(
                        a=head_node["word"],
                        b=head_node["tag"],
                        c=child_node["word"],
                        d=child_node["tag"],
                    )
                )

        # Score edges, regrouping the flat classifier output back into one
        # row per head node (each row has len(graph.nodes) entries).
        edge_scores = []
        row = []
        count = 0
        for pdist in self.classifier.prob_classify_many(edges):
            logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
            # smoothing in case the probability = 0
            row.append([math.log(pdist.prob("T") + 0.00000000001)])
            count += 1
            if count == len(graph.nodes):
                edge_scores.append(row)
                row = []
                count = 0
        return edge_scores
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
#################################################################
|
| 180 |
+
# A Scorer for Demo Purposes
|
| 181 |
+
#################################################################
|
| 182 |
+
# A short class necessary to show parsing example from paper
|
| 183 |
+
class DemoScorer(DependencyScorerI):
    """Toy scorer used for demonstration purposes: ``score`` returns the
    fixed weight matrix of the worked example in Keith Hall's 'K-best
    Spanning Tree Parsing' paper, and ``train`` is a no-op."""

    def train(self, graphs):
        # No actual training happens; just announce the call.
        print("Training...")

    def score(self, graph):
        # Hard-coded scores for Keith Hall 'K-best Spanning Tree Parsing'
        # paper, one row per head node.
        row0 = [[], [5], [1], [1]]
        row1 = [[], [], [11], [4]]
        row2 = [[], [10], [], [5]]
        row3 = [[], [8], [8], []]
        return [row0, row1, row2, row3]
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
#################################################################
|
| 198 |
+
# Non-Projective Probabilistic Parsing
|
| 199 |
+
#################################################################
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class ProbabilisticNonprojectiveParser:
|
| 203 |
+
"""A probabilistic non-projective dependency parser.
|
| 204 |
+
|
| 205 |
+
Nonprojective dependencies allows for "crossing branches" in the parse tree
|
| 206 |
+
which is necessary for representing particular linguistic phenomena, or even
|
| 207 |
+
typical parses in some languages. This parser follows the MST parsing
|
| 208 |
+
algorithm, outlined in McDonald(2005), which likens the search for the best
|
| 209 |
+
non-projective parse to finding the maximum spanning tree in a weighted
|
| 210 |
+
directed graph.
|
| 211 |
+
|
| 212 |
+
>>> class Scorer(DependencyScorerI):
|
| 213 |
+
... def train(self, graphs):
|
| 214 |
+
... pass
|
| 215 |
+
...
|
| 216 |
+
... def score(self, graph):
|
| 217 |
+
... return [
|
| 218 |
+
... [[], [5], [1], [1]],
|
| 219 |
+
... [[], [], [11], [4]],
|
| 220 |
+
... [[], [10], [], [5]],
|
| 221 |
+
... [[], [8], [8], []],
|
| 222 |
+
... ]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
>>> npp = ProbabilisticNonprojectiveParser()
|
| 226 |
+
>>> npp.train([], Scorer())
|
| 227 |
+
|
| 228 |
+
>>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
|
| 229 |
+
>>> len(list(parses))
|
| 230 |
+
1
|
| 231 |
+
|
| 232 |
+
Rule based example
|
| 233 |
+
|
| 234 |
+
>>> from nltk.grammar import DependencyGrammar
|
| 235 |
+
|
| 236 |
+
>>> grammar = DependencyGrammar.fromstring('''
|
| 237 |
+
... 'taught' -> 'play' | 'man'
|
| 238 |
+
... 'man' -> 'the' | 'in'
|
| 239 |
+
... 'in' -> 'corner'
|
| 240 |
+
... 'corner' -> 'the'
|
| 241 |
+
... 'play' -> 'golf' | 'dachshund' | 'to'
|
| 242 |
+
... 'dachshund' -> 'his'
|
| 243 |
+
... ''')
|
| 244 |
+
|
| 245 |
+
>>> ndp = NonprojectiveDependencyParser(grammar)
|
| 246 |
+
>>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
|
| 247 |
+
>>> len(list(parses))
|
| 248 |
+
4
|
| 249 |
+
|
| 250 |
+
"""
|
| 251 |
+
|
| 252 |
+
def __init__(self):
|
| 253 |
+
"""
|
| 254 |
+
Creates a new non-projective parser.
|
| 255 |
+
"""
|
| 256 |
+
logging.debug("initializing prob. nonprojective...")
|
| 257 |
+
|
| 258 |
+
def train(self, graphs, dependency_scorer):
|
| 259 |
+
"""
|
| 260 |
+
Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
|
| 261 |
+
and establishes this as the parser's scorer. This is used to
|
| 262 |
+
initialize the scores on a ``DependencyGraph`` during the parsing
|
| 263 |
+
procedure.
|
| 264 |
+
|
| 265 |
+
:type graphs: list(DependencyGraph)
|
| 266 |
+
:param graphs: A list of dependency graphs to train the scorer.
|
| 267 |
+
:type dependency_scorer: DependencyScorerI
|
| 268 |
+
:param dependency_scorer: A scorer which implements the
|
| 269 |
+
``DependencyScorerI`` interface.
|
| 270 |
+
"""
|
| 271 |
+
self._scorer = dependency_scorer
|
| 272 |
+
self._scorer.train(graphs)
|
| 273 |
+
|
| 274 |
+
def initialize_edge_scores(self, graph):
|
| 275 |
+
"""
|
| 276 |
+
Assigns a score to every edge in the ``DependencyGraph`` graph.
|
| 277 |
+
These scores are generated via the parser's scorer which
|
| 278 |
+
was assigned during the training process.
|
| 279 |
+
|
| 280 |
+
:type graph: DependencyGraph
|
| 281 |
+
:param graph: A dependency graph to assign scores to.
|
| 282 |
+
"""
|
| 283 |
+
self.scores = self._scorer.score(graph)
|
| 284 |
+
|
| 285 |
+
    def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
        """
        Takes a list of nodes that have been identified to belong to a cycle,
        and collapses them into one larger node. The arcs of all nodes in
        the graph must be updated to account for this.

        :type new_node: Node.
        :param new_node: A Node (Dictionary) to collapse the cycle nodes into.
        :type cycle_path: A list of integers.
        :param cycle_path: A list of node addresses, each of which is in the cycle.
        :type g_graph, b_graph, c_graph: DependencyGraph
        :param g_graph, b_graph, c_graph: Graphs which need to be updated.
        """
        # NOTE(review): only g_graph is modified here; b_graph and c_graph
        # are accepted but untouched — confirm whether that is intentional.
        logger.debug("Collapsing nodes...")
        # Collapse all cycle nodes into v_n+1 in G_Graph
        for cycle_node_index in cycle_path:
            g_graph.remove_by_address(cycle_node_index)
        # Insert the replacement node and repoint arcs that targeted any
        # removed cycle member at the new node's address.
        g_graph.add_node(new_node)
        g_graph.redirect_arcs(cycle_path, new_node["address"])
|
| 304 |
+
|
| 305 |
+
    def update_edge_scores(self, new_node, cycle_path):
        """
        Updates the edge scores to reflect a collapse operation into
        new_node.

        :type new_node: A Node.
        :param new_node: The node which cycle nodes are collapsed into.
        :type cycle_path: A list of integers.
        :param cycle_path: A list of node addresses that belong to the cycle.
        """
        logger.debug("cycle %s", cycle_path)

        # Expand collapsed members back to their original addresses so the
        # score matrix (indexed by original addresses) can be updated.
        cycle_path = self.compute_original_indexes(cycle_path)

        logger.debug("old cycle %s", cycle_path)
        logger.debug("Prior to update: %s", self.scores)

        # First pass: for every arc entering the cycle from outside,
        # subtract the weight of the best incoming cycle-internal arc.
        for i, row in enumerate(self.scores):
            for j, column in enumerate(self.scores[i]):
                logger.debug(self.scores[i][j])
                if j in cycle_path and i not in cycle_path and self.scores[i][j]:
                    subtract_val = self.compute_max_subtract_score(j, cycle_path)

                    logger.debug("%s - %s", self.scores[i][j], subtract_val)

                    new_vals = []
                    for cur_val in self.scores[i][j]:
                        new_vals.append(cur_val - subtract_val)

                    self.scores[i][j] = new_vals

        # Second pass: clear all arcs internal to the cycle, since the
        # collapsed node replaces them.
        for i, row in enumerate(self.scores):
            for j, cell in enumerate(self.scores[i]):
                if i in cycle_path and j in cycle_path:
                    self.scores[i][j] = []

        logger.debug("After update: %s", self.scores)
|
| 342 |
+
|
| 343 |
+
def compute_original_indexes(self, new_indexes):
|
| 344 |
+
"""
|
| 345 |
+
As nodes are collapsed into others, they are replaced
|
| 346 |
+
by the new node in the graph, but it's still necessary
|
| 347 |
+
to keep track of what these original nodes were. This
|
| 348 |
+
takes a list of node addresses and replaces any collapsed
|
| 349 |
+
node addresses with their original addresses.
|
| 350 |
+
|
| 351 |
+
:type new_indexes: A list of integers.
|
| 352 |
+
:param new_indexes: A list of node addresses to check for
|
| 353 |
+
subsumed nodes.
|
| 354 |
+
"""
|
| 355 |
+
swapped = True
|
| 356 |
+
while swapped:
|
| 357 |
+
originals = []
|
| 358 |
+
swapped = False
|
| 359 |
+
for new_index in new_indexes:
|
| 360 |
+
if new_index in self.inner_nodes:
|
| 361 |
+
for old_val in self.inner_nodes[new_index]:
|
| 362 |
+
if old_val not in originals:
|
| 363 |
+
originals.append(old_val)
|
| 364 |
+
swapped = True
|
| 365 |
+
else:
|
| 366 |
+
originals.append(new_index)
|
| 367 |
+
new_indexes = originals
|
| 368 |
+
return new_indexes
|
| 369 |
+
|
| 370 |
+
def compute_max_subtract_score(self, column_index, cycle_indexes):
|
| 371 |
+
"""
|
| 372 |
+
When updating scores the score of the highest-weighted incoming
|
| 373 |
+
arc is subtracted upon collapse. This returns the correct
|
| 374 |
+
amount to subtract from that edge.
|
| 375 |
+
|
| 376 |
+
:type column_index: integer.
|
| 377 |
+
:param column_index: A index representing the column of incoming arcs
|
| 378 |
+
to a particular node being updated
|
| 379 |
+
:type cycle_indexes: A list of integers.
|
| 380 |
+
:param cycle_indexes: Only arcs from cycle nodes are considered. This
|
| 381 |
+
is a list of such nodes addresses.
|
| 382 |
+
"""
|
| 383 |
+
max_score = -100000
|
| 384 |
+
for row_index in cycle_indexes:
|
| 385 |
+
for subtract_val in self.scores[row_index][column_index]:
|
| 386 |
+
if subtract_val > max_score:
|
| 387 |
+
max_score = subtract_val
|
| 388 |
+
return max_score
|
| 389 |
+
|
| 390 |
+
def best_incoming_arc(self, node_index):
|
| 391 |
+
"""
|
| 392 |
+
Returns the source of the best incoming arc to the
|
| 393 |
+
node with address: node_index
|
| 394 |
+
|
| 395 |
+
:type node_index: integer.
|
| 396 |
+
:param node_index: The address of the 'destination' node,
|
| 397 |
+
the node that is arced to.
|
| 398 |
+
"""
|
| 399 |
+
originals = self.compute_original_indexes([node_index])
|
| 400 |
+
logger.debug("originals: %s", originals)
|
| 401 |
+
|
| 402 |
+
max_arc = None
|
| 403 |
+
max_score = None
|
| 404 |
+
for row_index in range(len(self.scores)):
|
| 405 |
+
for col_index in range(len(self.scores[row_index])):
|
| 406 |
+
if col_index in originals and (
|
| 407 |
+
max_score is None or self.scores[row_index][col_index] > max_score
|
| 408 |
+
):
|
| 409 |
+
max_score = self.scores[row_index][col_index]
|
| 410 |
+
max_arc = row_index
|
| 411 |
+
logger.debug("%s, %s", row_index, col_index)
|
| 412 |
+
|
| 413 |
+
logger.debug(max_score)
|
| 414 |
+
|
| 415 |
+
for key in self.inner_nodes:
|
| 416 |
+
replaced_nodes = self.inner_nodes[key]
|
| 417 |
+
if max_arc in replaced_nodes:
|
| 418 |
+
return key
|
| 419 |
+
|
| 420 |
+
return max_arc
|
| 421 |
+
|
| 422 |
+
def original_best_arc(self, node_index):
|
| 423 |
+
originals = self.compute_original_indexes([node_index])
|
| 424 |
+
max_arc = None
|
| 425 |
+
max_score = None
|
| 426 |
+
max_orig = None
|
| 427 |
+
for row_index in range(len(self.scores)):
|
| 428 |
+
for col_index in range(len(self.scores[row_index])):
|
| 429 |
+
if col_index in originals and (
|
| 430 |
+
max_score is None or self.scores[row_index][col_index] > max_score
|
| 431 |
+
):
|
| 432 |
+
max_score = self.scores[row_index][col_index]
|
| 433 |
+
max_arc = row_index
|
| 434 |
+
max_orig = col_index
|
| 435 |
+
return [max_arc, max_orig]
|
| 436 |
+
|
| 437 |
+
def parse(self, tokens, tags):
|
| 438 |
+
"""
|
| 439 |
+
Parses a list of tokens in accordance to the MST parsing algorithm
|
| 440 |
+
for non-projective dependency parses. Assumes that the tokens to
|
| 441 |
+
be parsed have already been tagged and those tags are provided. Various
|
| 442 |
+
scoring methods can be used by implementing the ``DependencyScorerI``
|
| 443 |
+
interface and passing it to the training algorithm.
|
| 444 |
+
|
| 445 |
+
:type tokens: list(str)
|
| 446 |
+
:param tokens: A list of words or punctuation to be parsed.
|
| 447 |
+
:type tags: list(str)
|
| 448 |
+
:param tags: A list of tags corresponding by index to the words in the tokens list.
|
| 449 |
+
:return: An iterator of non-projective parses.
|
| 450 |
+
:rtype: iter(DependencyGraph)
|
| 451 |
+
"""
|
| 452 |
+
self.inner_nodes = {}
|
| 453 |
+
|
| 454 |
+
# Initialize g_graph
|
| 455 |
+
g_graph = DependencyGraph()
|
| 456 |
+
for index, token in enumerate(tokens):
|
| 457 |
+
g_graph.nodes[index + 1].update(
|
| 458 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
# Fully connect non-root nodes in g_graph
|
| 462 |
+
g_graph.connect_graph()
|
| 463 |
+
original_graph = DependencyGraph()
|
| 464 |
+
for index, token in enumerate(tokens):
|
| 465 |
+
original_graph.nodes[index + 1].update(
|
| 466 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 467 |
+
)
|
| 468 |
+
|
| 469 |
+
b_graph = DependencyGraph()
|
| 470 |
+
c_graph = DependencyGraph()
|
| 471 |
+
|
| 472 |
+
for index, token in enumerate(tokens):
|
| 473 |
+
c_graph.nodes[index + 1].update(
|
| 474 |
+
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
| 475 |
+
)
|
| 476 |
+
|
| 477 |
+
# Assign initial scores to g_graph edges
|
| 478 |
+
self.initialize_edge_scores(g_graph)
|
| 479 |
+
logger.debug(self.scores)
|
| 480 |
+
# Initialize a list of unvisited vertices (by node address)
|
| 481 |
+
unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
|
| 482 |
+
# Iterate over unvisited vertices
|
| 483 |
+
nr_vertices = len(tokens)
|
| 484 |
+
betas = {}
|
| 485 |
+
while unvisited_vertices:
|
| 486 |
+
# Mark current node as visited
|
| 487 |
+
current_vertex = unvisited_vertices.pop(0)
|
| 488 |
+
logger.debug("current_vertex: %s", current_vertex)
|
| 489 |
+
# Get corresponding node n_i to vertex v_i
|
| 490 |
+
current_node = g_graph.get_by_address(current_vertex)
|
| 491 |
+
logger.debug("current_node: %s", current_node)
|
| 492 |
+
# Get best in-edge node b for current node
|
| 493 |
+
best_in_edge = self.best_incoming_arc(current_vertex)
|
| 494 |
+
betas[current_vertex] = self.original_best_arc(current_vertex)
|
| 495 |
+
logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
|
| 496 |
+
# b_graph = Union(b_graph, b)
|
| 497 |
+
for new_vertex in [current_vertex, best_in_edge]:
|
| 498 |
+
b_graph.nodes[new_vertex].update(
|
| 499 |
+
{"word": "TEMP", "rel": "NTOP", "address": new_vertex}
|
| 500 |
+
)
|
| 501 |
+
b_graph.add_arc(best_in_edge, current_vertex)
|
| 502 |
+
# Beta(current node) = b - stored for parse recovery
|
| 503 |
+
# If b_graph contains a cycle, collapse it
|
| 504 |
+
cycle_path = b_graph.contains_cycle()
|
| 505 |
+
if cycle_path:
|
| 506 |
+
# Create a new node v_n+1 with address = len(nodes) + 1
|
| 507 |
+
new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
|
| 508 |
+
# c_graph = Union(c_graph, v_n+1)
|
| 509 |
+
c_graph.add_node(new_node)
|
| 510 |
+
# Collapse all nodes in cycle C into v_n+1
|
| 511 |
+
self.update_edge_scores(new_node, cycle_path)
|
| 512 |
+
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
|
| 513 |
+
for cycle_index in cycle_path:
|
| 514 |
+
c_graph.add_arc(new_node["address"], cycle_index)
|
| 515 |
+
# self.replaced_by[cycle_index] = new_node['address']
|
| 516 |
+
|
| 517 |
+
self.inner_nodes[new_node["address"]] = cycle_path
|
| 518 |
+
|
| 519 |
+
# Add v_n+1 to list of unvisited vertices
|
| 520 |
+
unvisited_vertices.insert(0, nr_vertices + 1)
|
| 521 |
+
|
| 522 |
+
# increment # of nodes counter
|
| 523 |
+
nr_vertices += 1
|
| 524 |
+
|
| 525 |
+
# Remove cycle nodes from b_graph; B = B - cycle c
|
| 526 |
+
for cycle_node_address in cycle_path:
|
| 527 |
+
b_graph.remove_by_address(cycle_node_address)
|
| 528 |
+
|
| 529 |
+
logger.debug("g_graph: %s", g_graph)
|
| 530 |
+
logger.debug("b_graph: %s", b_graph)
|
| 531 |
+
logger.debug("c_graph: %s", c_graph)
|
| 532 |
+
logger.debug("Betas: %s", betas)
|
| 533 |
+
logger.debug("replaced nodes %s", self.inner_nodes)
|
| 534 |
+
|
| 535 |
+
# Recover parse tree
|
| 536 |
+
logger.debug("Final scores: %s", self.scores)
|
| 537 |
+
|
| 538 |
+
logger.debug("Recovering parse...")
|
| 539 |
+
for i in range(len(tokens) + 1, nr_vertices + 1):
|
| 540 |
+
betas[betas[i][1]] = betas[i]
|
| 541 |
+
|
| 542 |
+
logger.debug("Betas: %s", betas)
|
| 543 |
+
for node in original_graph.nodes.values():
|
| 544 |
+
# TODO: It's dangerous to assume that deps it a dictionary
|
| 545 |
+
# because it's a default dictionary. Ideally, here we should not
|
| 546 |
+
# be concerned how dependencies are stored inside of a dependency
|
| 547 |
+
# graph.
|
| 548 |
+
node["deps"] = {}
|
| 549 |
+
for i in range(1, len(tokens) + 1):
|
| 550 |
+
original_graph.add_arc(betas[i][0], betas[i][1])
|
| 551 |
+
|
| 552 |
+
logger.debug("Done.")
|
| 553 |
+
yield original_graph
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
#################################################################
|
| 557 |
+
# Rule-based Non-Projective Parser
|
| 558 |
+
#################################################################
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
class NonprojectiveDependencyParser:
|
| 562 |
+
"""
|
| 563 |
+
A non-projective, rule-based, dependency parser. This parser
|
| 564 |
+
will return the set of all possible non-projective parses based on
|
| 565 |
+
the word-to-word relations defined in the parser's dependency
|
| 566 |
+
grammar, and will allow the branches of the parse tree to cross
|
| 567 |
+
in order to capture a variety of linguistic phenomena that a
|
| 568 |
+
projective parser will not.
|
| 569 |
+
"""
|
| 570 |
+
|
| 571 |
+
def __init__(self, dependency_grammar):
|
| 572 |
+
"""
|
| 573 |
+
Creates a new ``NonprojectiveDependencyParser``.
|
| 574 |
+
|
| 575 |
+
:param dependency_grammar: a grammar of word-to-word relations.
|
| 576 |
+
:type dependency_grammar: DependencyGrammar
|
| 577 |
+
"""
|
| 578 |
+
self._grammar = dependency_grammar
|
| 579 |
+
|
| 580 |
+
def parse(self, tokens):
|
| 581 |
+
"""
|
| 582 |
+
Parses the input tokens with respect to the parser's grammar. Parsing
|
| 583 |
+
is accomplished by representing the search-space of possible parses as
|
| 584 |
+
a fully-connected directed graph. Arcs that would lead to ungrammatical
|
| 585 |
+
parses are removed and a lattice is constructed of length n, where n is
|
| 586 |
+
the number of input tokens, to represent all possible grammatical
|
| 587 |
+
traversals. All possible paths through the lattice are then enumerated
|
| 588 |
+
to produce the set of non-projective parses.
|
| 589 |
+
|
| 590 |
+
param tokens: A list of tokens to parse.
|
| 591 |
+
type tokens: list(str)
|
| 592 |
+
return: An iterator of non-projective parses.
|
| 593 |
+
rtype: iter(DependencyGraph)
|
| 594 |
+
"""
|
| 595 |
+
# Create graph representation of tokens
|
| 596 |
+
self._graph = DependencyGraph()
|
| 597 |
+
|
| 598 |
+
for index, token in enumerate(tokens):
|
| 599 |
+
self._graph.nodes[index] = {
|
| 600 |
+
"word": token,
|
| 601 |
+
"deps": [],
|
| 602 |
+
"rel": "NTOP",
|
| 603 |
+
"address": index,
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
for head_node in self._graph.nodes.values():
|
| 607 |
+
deps = []
|
| 608 |
+
for dep_node in self._graph.nodes.values():
|
| 609 |
+
if (
|
| 610 |
+
self._grammar.contains(head_node["word"], dep_node["word"])
|
| 611 |
+
and head_node["word"] != dep_node["word"]
|
| 612 |
+
):
|
| 613 |
+
deps.append(dep_node["address"])
|
| 614 |
+
head_node["deps"] = deps
|
| 615 |
+
|
| 616 |
+
# Create lattice of possible heads
|
| 617 |
+
roots = []
|
| 618 |
+
possible_heads = []
|
| 619 |
+
for i, word in enumerate(tokens):
|
| 620 |
+
heads = []
|
| 621 |
+
for j, head in enumerate(tokens):
|
| 622 |
+
if (i != j) and self._grammar.contains(head, word):
|
| 623 |
+
heads.append(j)
|
| 624 |
+
if len(heads) == 0:
|
| 625 |
+
roots.append(i)
|
| 626 |
+
possible_heads.append(heads)
|
| 627 |
+
|
| 628 |
+
# Set roots to attempt
|
| 629 |
+
if len(roots) < 2:
|
| 630 |
+
if len(roots) == 0:
|
| 631 |
+
for i in range(len(tokens)):
|
| 632 |
+
roots.append(i)
|
| 633 |
+
|
| 634 |
+
# Traverse lattice
|
| 635 |
+
analyses = []
|
| 636 |
+
for _ in roots:
|
| 637 |
+
stack = []
|
| 638 |
+
analysis = [[] for i in range(len(possible_heads))]
|
| 639 |
+
i = 0
|
| 640 |
+
forward = True
|
| 641 |
+
while i >= 0:
|
| 642 |
+
if forward:
|
| 643 |
+
if len(possible_heads[i]) == 1:
|
| 644 |
+
analysis[i] = possible_heads[i][0]
|
| 645 |
+
elif len(possible_heads[i]) == 0:
|
| 646 |
+
analysis[i] = -1
|
| 647 |
+
else:
|
| 648 |
+
head = possible_heads[i].pop()
|
| 649 |
+
analysis[i] = head
|
| 650 |
+
stack.append([i, head])
|
| 651 |
+
if not forward:
|
| 652 |
+
index_on_stack = False
|
| 653 |
+
for stack_item in stack:
|
| 654 |
+
if stack_item[0] == i:
|
| 655 |
+
index_on_stack = True
|
| 656 |
+
orig_length = len(possible_heads[i])
|
| 657 |
+
|
| 658 |
+
if index_on_stack and orig_length == 0:
|
| 659 |
+
for j in range(len(stack) - 1, -1, -1):
|
| 660 |
+
stack_item = stack[j]
|
| 661 |
+
if stack_item[0] == i:
|
| 662 |
+
possible_heads[i].append(stack.pop(j)[1])
|
| 663 |
+
|
| 664 |
+
elif index_on_stack and orig_length > 0:
|
| 665 |
+
head = possible_heads[i].pop()
|
| 666 |
+
analysis[i] = head
|
| 667 |
+
stack.append([i, head])
|
| 668 |
+
forward = True
|
| 669 |
+
|
| 670 |
+
if i + 1 == len(possible_heads):
|
| 671 |
+
analyses.append(analysis[:])
|
| 672 |
+
forward = False
|
| 673 |
+
if forward:
|
| 674 |
+
i += 1
|
| 675 |
+
else:
|
| 676 |
+
i -= 1
|
| 677 |
+
|
| 678 |
+
# Filter parses
|
| 679 |
+
# ensure 1 root, every thing has 1 head
|
| 680 |
+
for analysis in analyses:
|
| 681 |
+
if analysis.count(-1) > 1:
|
| 682 |
+
# there are several root elements!
|
| 683 |
+
continue
|
| 684 |
+
|
| 685 |
+
graph = DependencyGraph()
|
| 686 |
+
graph.root = graph.nodes[analysis.index(-1) + 1]
|
| 687 |
+
|
| 688 |
+
for address, (token, head_index) in enumerate(
|
| 689 |
+
zip(tokens, analysis), start=1
|
| 690 |
+
):
|
| 691 |
+
head_address = head_index + 1
|
| 692 |
+
|
| 693 |
+
node = graph.nodes[address]
|
| 694 |
+
node.update({"word": token, "address": address})
|
| 695 |
+
|
| 696 |
+
if head_address == 0:
|
| 697 |
+
rel = "ROOT"
|
| 698 |
+
else:
|
| 699 |
+
rel = ""
|
| 700 |
+
graph.nodes[head_index + 1]["deps"][rel].append(address)
|
| 701 |
+
|
| 702 |
+
# TODO: check for cycles
|
| 703 |
+
yield graph
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
#################################################################
|
| 707 |
+
# Demos
|
| 708 |
+
#################################################################
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def demo():
|
| 712 |
+
# hall_demo()
|
| 713 |
+
nonprojective_conll_parse_demo()
|
| 714 |
+
rule_based_demo()
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
def hall_demo():
|
| 718 |
+
npp = ProbabilisticNonprojectiveParser()
|
| 719 |
+
npp.train([], DemoScorer())
|
| 720 |
+
for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
|
| 721 |
+
print(parse_graph)
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
def nonprojective_conll_parse_demo():
|
| 725 |
+
from nltk.parse.dependencygraph import conll_data2
|
| 726 |
+
|
| 727 |
+
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
| 728 |
+
npp = ProbabilisticNonprojectiveParser()
|
| 729 |
+
npp.train(graphs, NaiveBayesDependencyScorer())
|
| 730 |
+
for parse_graph in npp.parse(
|
| 731 |
+
["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
|
| 732 |
+
):
|
| 733 |
+
print(parse_graph)
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
def rule_based_demo():
|
| 737 |
+
from nltk.grammar import DependencyGrammar
|
| 738 |
+
|
| 739 |
+
grammar = DependencyGrammar.fromstring(
|
| 740 |
+
"""
|
| 741 |
+
'taught' -> 'play' | 'man'
|
| 742 |
+
'man' -> 'the' | 'in'
|
| 743 |
+
'in' -> 'corner'
|
| 744 |
+
'corner' -> 'the'
|
| 745 |
+
'play' -> 'golf' | 'dachshund' | 'to'
|
| 746 |
+
'dachshund' -> 'his'
|
| 747 |
+
"""
|
| 748 |
+
)
|
| 749 |
+
print(grammar)
|
| 750 |
+
ndp = NonprojectiveDependencyParser(grammar)
|
| 751 |
+
graphs = ndp.parse(
|
| 752 |
+
[
|
| 753 |
+
"the",
|
| 754 |
+
"man",
|
| 755 |
+
"in",
|
| 756 |
+
"the",
|
| 757 |
+
"corner",
|
| 758 |
+
"taught",
|
| 759 |
+
"his",
|
| 760 |
+
"dachshund",
|
| 761 |
+
"to",
|
| 762 |
+
"play",
|
| 763 |
+
"golf",
|
| 764 |
+
]
|
| 765 |
+
)
|
| 766 |
+
print("Graphs:")
|
| 767 |
+
for graph in graphs:
|
| 768 |
+
print(graph)
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
if __name__ == "__main__":
|
| 772 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py
ADDED
|
@@ -0,0 +1,684 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Recursive Descent Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from nltk.grammar import Nonterminal
|
| 10 |
+
from nltk.parse.api import ParserI
|
| 11 |
+
from nltk.tree import ImmutableTree, Tree
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Recursive Descent Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
class RecursiveDescentParser(ParserI):
|
| 18 |
+
"""
|
| 19 |
+
A simple top-down CFG parser that parses texts by recursively
|
| 20 |
+
expanding the fringe of a Tree, and matching it against a
|
| 21 |
+
text.
|
| 22 |
+
|
| 23 |
+
``RecursiveDescentParser`` uses a list of tree locations called a
|
| 24 |
+
"frontier" to remember which subtrees have not yet been expanded
|
| 25 |
+
and which leaves have not yet been matched against the text. Each
|
| 26 |
+
tree location consists of a list of child indices specifying the
|
| 27 |
+
path from the root of the tree to a subtree or a leaf; see the
|
| 28 |
+
reference documentation for Tree for more information
|
| 29 |
+
about tree locations.
|
| 30 |
+
|
| 31 |
+
When the parser begins parsing a text, it constructs a tree
|
| 32 |
+
containing only the start symbol, and a frontier containing the
|
| 33 |
+
location of the tree's root node. It then extends the tree to
|
| 34 |
+
cover the text, using the following recursive procedure:
|
| 35 |
+
|
| 36 |
+
- If the frontier is empty, and the text is covered by the tree,
|
| 37 |
+
then return the tree as a possible parse.
|
| 38 |
+
- If the frontier is empty, and the text is not covered by the
|
| 39 |
+
tree, then return no parses.
|
| 40 |
+
- If the first element of the frontier is a subtree, then
|
| 41 |
+
use CFG productions to "expand" it. For each applicable
|
| 42 |
+
production, add the expanded subtree's children to the
|
| 43 |
+
frontier, and recursively find all parses that can be
|
| 44 |
+
generated by the new tree and frontier.
|
| 45 |
+
- If the first element of the frontier is a token, then "match"
|
| 46 |
+
it against the next token from the text. Remove the token
|
| 47 |
+
from the frontier, and recursively find all parses that can be
|
| 48 |
+
generated by the new tree and frontier.
|
| 49 |
+
|
| 50 |
+
:see: ``nltk.grammar``
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
def __init__(self, grammar, trace=0):
|
| 54 |
+
"""
|
| 55 |
+
Create a new ``RecursiveDescentParser``, that uses ``grammar``
|
| 56 |
+
to parse texts.
|
| 57 |
+
|
| 58 |
+
:type grammar: CFG
|
| 59 |
+
:param grammar: The grammar used to parse texts.
|
| 60 |
+
:type trace: int
|
| 61 |
+
:param trace: The level of tracing that should be used when
|
| 62 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 63 |
+
and higher numbers will produce more verbose tracing
|
| 64 |
+
output.
|
| 65 |
+
"""
|
| 66 |
+
self._grammar = grammar
|
| 67 |
+
self._trace = trace
|
| 68 |
+
|
| 69 |
+
def grammar(self):
|
| 70 |
+
return self._grammar
|
| 71 |
+
|
| 72 |
+
def parse(self, tokens):
|
| 73 |
+
# Inherit docs from ParserI
|
| 74 |
+
|
| 75 |
+
tokens = list(tokens)
|
| 76 |
+
self._grammar.check_coverage(tokens)
|
| 77 |
+
|
| 78 |
+
# Start a recursive descent parse, with an initial tree
|
| 79 |
+
# containing just the start symbol.
|
| 80 |
+
start = self._grammar.start().symbol()
|
| 81 |
+
initial_tree = Tree(start, [])
|
| 82 |
+
frontier = [()]
|
| 83 |
+
if self._trace:
|
| 84 |
+
self._trace_start(initial_tree, frontier, tokens)
|
| 85 |
+
return self._parse(tokens, initial_tree, frontier)
|
| 86 |
+
|
| 87 |
+
def _parse(self, remaining_text, tree, frontier):
|
| 88 |
+
"""
|
| 89 |
+
Recursively expand and match each elements of ``tree``
|
| 90 |
+
specified by ``frontier``, to cover ``remaining_text``. Return
|
| 91 |
+
a list of all parses found.
|
| 92 |
+
|
| 93 |
+
:return: An iterator of all parses that can be generated by
|
| 94 |
+
matching and expanding the elements of ``tree``
|
| 95 |
+
specified by ``frontier``.
|
| 96 |
+
:rtype: iter(Tree)
|
| 97 |
+
:type tree: Tree
|
| 98 |
+
:param tree: A partial structure for the text that is
|
| 99 |
+
currently being parsed. The elements of ``tree``
|
| 100 |
+
that are specified by ``frontier`` have not yet been
|
| 101 |
+
expanded or matched.
|
| 102 |
+
:type remaining_text: list(str)
|
| 103 |
+
:param remaining_text: The portion of the text that is not yet
|
| 104 |
+
covered by ``tree``.
|
| 105 |
+
:type frontier: list(tuple(int))
|
| 106 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 107 |
+
all subtrees that have not yet been expanded, and all
|
| 108 |
+
leaves that have not yet been matched. This list sorted
|
| 109 |
+
in left-to-right order of location within the tree.
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# If the tree covers the text, and there's nothing left to
|
| 113 |
+
# expand, then we've found a complete parse; return it.
|
| 114 |
+
if len(remaining_text) == 0 and len(frontier) == 0:
|
| 115 |
+
if self._trace:
|
| 116 |
+
self._trace_succeed(tree, frontier)
|
| 117 |
+
yield tree
|
| 118 |
+
|
| 119 |
+
# If there's still text, but nothing left to expand, we failed.
|
| 120 |
+
elif len(frontier) == 0:
|
| 121 |
+
if self._trace:
|
| 122 |
+
self._trace_backtrack(tree, frontier)
|
| 123 |
+
|
| 124 |
+
# If the next element on the frontier is a tree, expand it.
|
| 125 |
+
elif isinstance(tree[frontier[0]], Tree):
|
| 126 |
+
yield from self._expand(remaining_text, tree, frontier)
|
| 127 |
+
|
| 128 |
+
# If the next element on the frontier is a token, match it.
|
| 129 |
+
else:
|
| 130 |
+
yield from self._match(remaining_text, tree, frontier)
|
| 131 |
+
|
| 132 |
+
def _match(self, rtext, tree, frontier):
|
| 133 |
+
"""
|
| 134 |
+
:rtype: iter(Tree)
|
| 135 |
+
:return: an iterator of all parses that can be generated by
|
| 136 |
+
matching the first element of ``frontier`` against the
|
| 137 |
+
first token in ``rtext``. In particular, if the first
|
| 138 |
+
element of ``frontier`` has the same type as the first
|
| 139 |
+
token in ``rtext``, then substitute the token into
|
| 140 |
+
``tree``; and return all parses that can be generated by
|
| 141 |
+
matching and expanding the remaining elements of
|
| 142 |
+
``frontier``. If the first element of ``frontier`` does not
|
| 143 |
+
have the same type as the first token in ``rtext``, then
|
| 144 |
+
return empty list.
|
| 145 |
+
|
| 146 |
+
:type tree: Tree
|
| 147 |
+
:param tree: A partial structure for the text that is
|
| 148 |
+
currently being parsed. The elements of ``tree``
|
| 149 |
+
that are specified by ``frontier`` have not yet been
|
| 150 |
+
expanded or matched.
|
| 151 |
+
:type rtext: list(str)
|
| 152 |
+
:param rtext: The portion of the text that is not yet
|
| 153 |
+
covered by ``tree``.
|
| 154 |
+
:type frontier: list of tuple of int
|
| 155 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 156 |
+
all subtrees that have not yet been expanded, and all
|
| 157 |
+
leaves that have not yet been matched.
|
| 158 |
+
"""
|
| 159 |
+
|
| 160 |
+
tree_leaf = tree[frontier[0]]
|
| 161 |
+
if len(rtext) > 0 and tree_leaf == rtext[0]:
|
| 162 |
+
# If it's a terminal that matches rtext[0], then substitute
|
| 163 |
+
# in the token, and continue parsing.
|
| 164 |
+
newtree = tree.copy(deep=True)
|
| 165 |
+
newtree[frontier[0]] = rtext[0]
|
| 166 |
+
if self._trace:
|
| 167 |
+
self._trace_match(newtree, frontier[1:], rtext[0])
|
| 168 |
+
yield from self._parse(rtext[1:], newtree, frontier[1:])
|
| 169 |
+
else:
|
| 170 |
+
# If it's a non-matching terminal, fail.
|
| 171 |
+
if self._trace:
|
| 172 |
+
self._trace_backtrack(tree, frontier, rtext[:1])
|
| 173 |
+
|
| 174 |
+
def _expand(self, remaining_text, tree, frontier, production=None):
|
| 175 |
+
"""
|
| 176 |
+
:rtype: iter(Tree)
|
| 177 |
+
:return: An iterator of all parses that can be generated by
|
| 178 |
+
expanding the first element of ``frontier`` with
|
| 179 |
+
``production``. In particular, if the first element of
|
| 180 |
+
``frontier`` is a subtree whose node type is equal to
|
| 181 |
+
``production``'s left hand side, then add a child to that
|
| 182 |
+
subtree for each element of ``production``'s right hand
|
| 183 |
+
side; and return all parses that can be generated by
|
| 184 |
+
matching and expanding the remaining elements of
|
| 185 |
+
``frontier``. If the first element of ``frontier`` is not a
|
| 186 |
+
subtree whose node type is equal to ``production``'s left
|
| 187 |
+
hand side, then return an empty list. If ``production`` is
|
| 188 |
+
not specified, then return a list of all parses that can
|
| 189 |
+
be generated by expanding the first element of ``frontier``
|
| 190 |
+
with *any* CFG production.
|
| 191 |
+
|
| 192 |
+
:type tree: Tree
|
| 193 |
+
:param tree: A partial structure for the text that is
|
| 194 |
+
currently being parsed. The elements of ``tree``
|
| 195 |
+
that are specified by ``frontier`` have not yet been
|
| 196 |
+
expanded or matched.
|
| 197 |
+
:type remaining_text: list(str)
|
| 198 |
+
:param remaining_text: The portion of the text that is not yet
|
| 199 |
+
covered by ``tree``.
|
| 200 |
+
:type frontier: list(tuple(int))
|
| 201 |
+
:param frontier: A list of the locations within ``tree`` of
|
| 202 |
+
all subtrees that have not yet been expanded, and all
|
| 203 |
+
leaves that have not yet been matched.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
if production is None:
|
| 207 |
+
productions = self._grammar.productions()
|
| 208 |
+
else:
|
| 209 |
+
productions = [production]
|
| 210 |
+
|
| 211 |
+
for production in productions:
|
| 212 |
+
lhs = production.lhs().symbol()
|
| 213 |
+
if lhs == tree[frontier[0]].label():
|
| 214 |
+
subtree = self._production_to_tree(production)
|
| 215 |
+
if frontier[0] == ():
|
| 216 |
+
newtree = subtree
|
| 217 |
+
else:
|
| 218 |
+
newtree = tree.copy(deep=True)
|
| 219 |
+
newtree[frontier[0]] = subtree
|
| 220 |
+
new_frontier = [
|
| 221 |
+
frontier[0] + (i,) for i in range(len(production.rhs()))
|
| 222 |
+
]
|
| 223 |
+
if self._trace:
|
| 224 |
+
self._trace_expand(newtree, new_frontier, production)
|
| 225 |
+
yield from self._parse(
|
| 226 |
+
remaining_text, newtree, new_frontier + frontier[1:]
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
def _production_to_tree(self, production):
|
| 230 |
+
"""
|
| 231 |
+
:rtype: Tree
|
| 232 |
+
:return: The Tree that is licensed by ``production``.
|
| 233 |
+
In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
|
| 234 |
+
return a tree that has a node ``lhs.symbol``, and
|
| 235 |
+
``n`` children. For each nonterminal element
|
| 236 |
+
``elt[i]`` in the production, the tree token has a
|
| 237 |
+
childless subtree with node value ``elt[i].symbol``; and
|
| 238 |
+
for each terminal element ``elt[j]``, the tree token has
|
| 239 |
+
a leaf token with type ``elt[j]``.
|
| 240 |
+
|
| 241 |
+
:param production: The CFG production that licenses the tree
|
| 242 |
+
token that should be returned.
|
| 243 |
+
:type production: Production
|
| 244 |
+
"""
|
| 245 |
+
children = []
|
| 246 |
+
for elt in production.rhs():
|
| 247 |
+
if isinstance(elt, Nonterminal):
|
| 248 |
+
children.append(Tree(elt.symbol(), []))
|
| 249 |
+
else:
|
| 250 |
+
# This will be matched.
|
| 251 |
+
children.append(elt)
|
| 252 |
+
return Tree(production.lhs().symbol(), children)
|
| 253 |
+
|
| 254 |
+
def trace(self, trace=2):
|
| 255 |
+
"""
|
| 256 |
+
Set the level of tracing output that should be generated when
|
| 257 |
+
parsing a text.
|
| 258 |
+
|
| 259 |
+
:type trace: int
|
| 260 |
+
:param trace: The trace level. A trace level of ``0`` will
|
| 261 |
+
generate no tracing output; and higher trace levels will
|
| 262 |
+
produce more verbose tracing output.
|
| 263 |
+
:rtype: None
|
| 264 |
+
"""
|
| 265 |
+
self._trace = trace
|
| 266 |
+
|
| 267 |
+
def _trace_fringe(self, tree, treeloc=None):
|
| 268 |
+
"""
|
| 269 |
+
Print trace output displaying the fringe of ``tree``. The
|
| 270 |
+
fringe of ``tree`` consists of all of its leaves and all of
|
| 271 |
+
its childless subtrees.
|
| 272 |
+
|
| 273 |
+
:rtype: None
|
| 274 |
+
"""
|
| 275 |
+
|
| 276 |
+
if treeloc == ():
|
| 277 |
+
print("*", end=" ")
|
| 278 |
+
if isinstance(tree, Tree):
|
| 279 |
+
if len(tree) == 0:
|
| 280 |
+
print(repr(Nonterminal(tree.label())), end=" ")
|
| 281 |
+
for i in range(len(tree)):
|
| 282 |
+
if treeloc is not None and i == treeloc[0]:
|
| 283 |
+
self._trace_fringe(tree[i], treeloc[1:])
|
| 284 |
+
else:
|
| 285 |
+
self._trace_fringe(tree[i])
|
| 286 |
+
else:
|
| 287 |
+
print(repr(tree), end=" ")
|
| 288 |
+
|
| 289 |
+
def _trace_tree(self, tree, frontier, operation):
|
| 290 |
+
"""
|
| 291 |
+
Print trace output displaying the parser's current state.
|
| 292 |
+
|
| 293 |
+
:param operation: A character identifying the operation that
|
| 294 |
+
generated the current state.
|
| 295 |
+
:rtype: None
|
| 296 |
+
"""
|
| 297 |
+
if self._trace == 2:
|
| 298 |
+
print(" %c [" % operation, end=" ")
|
| 299 |
+
else:
|
| 300 |
+
print(" [", end=" ")
|
| 301 |
+
if len(frontier) > 0:
|
| 302 |
+
self._trace_fringe(tree, frontier[0])
|
| 303 |
+
else:
|
| 304 |
+
self._trace_fringe(tree)
|
| 305 |
+
print("]")
|
| 306 |
+
|
| 307 |
+
def _trace_start(self, tree, frontier, text):
|
| 308 |
+
print("Parsing %r" % " ".join(text))
|
| 309 |
+
if self._trace > 2:
|
| 310 |
+
print("Start:")
|
| 311 |
+
if self._trace > 1:
|
| 312 |
+
self._trace_tree(tree, frontier, " ")
|
| 313 |
+
|
| 314 |
+
def _trace_expand(self, tree, frontier, production):
|
| 315 |
+
if self._trace > 2:
|
| 316 |
+
print("Expand: %s" % production)
|
| 317 |
+
if self._trace > 1:
|
| 318 |
+
self._trace_tree(tree, frontier, "E")
|
| 319 |
+
|
| 320 |
+
def _trace_match(self, tree, frontier, tok):
|
| 321 |
+
if self._trace > 2:
|
| 322 |
+
print("Match: %r" % tok)
|
| 323 |
+
if self._trace > 1:
|
| 324 |
+
self._trace_tree(tree, frontier, "M")
|
| 325 |
+
|
| 326 |
+
def _trace_succeed(self, tree, frontier):
|
| 327 |
+
if self._trace > 2:
|
| 328 |
+
print("GOOD PARSE:")
|
| 329 |
+
if self._trace == 1:
|
| 330 |
+
print("Found a parse:\n%s" % tree)
|
| 331 |
+
if self._trace > 1:
|
| 332 |
+
self._trace_tree(tree, frontier, "+")
|
| 333 |
+
|
| 334 |
+
def _trace_backtrack(self, tree, frontier, toks=None):
|
| 335 |
+
if self._trace > 2:
|
| 336 |
+
if toks:
|
| 337 |
+
print("Backtrack: %r match failed" % toks[0])
|
| 338 |
+
else:
|
| 339 |
+
print("Backtrack")
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
##//////////////////////////////////////////////////////
|
| 343 |
+
## Stepping Recursive Descent Parser
|
| 344 |
+
##//////////////////////////////////////////////////////
|
| 345 |
+
class SteppingRecursiveDescentParser(RecursiveDescentParser):
|
| 346 |
+
"""
|
| 347 |
+
A ``RecursiveDescentParser`` that allows you to step through the
|
| 348 |
+
parsing process, performing a single operation at a time.
|
| 349 |
+
|
| 350 |
+
The ``initialize`` method is used to start parsing a text.
|
| 351 |
+
``expand`` expands the first element on the frontier using a single
|
| 352 |
+
CFG production, and ``match`` matches the first element on the
|
| 353 |
+
frontier against the next text token. ``backtrack`` undoes the most
|
| 354 |
+
recent expand or match operation. ``step`` performs a single
|
| 355 |
+
expand, match, or backtrack operation. ``parses`` returns the set
|
| 356 |
+
of parses that have been found by the parser.
|
| 357 |
+
|
| 358 |
+
:ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
|
| 359 |
+
containing the previous states of the parser. This history is
|
| 360 |
+
used to implement the ``backtrack`` operation.
|
| 361 |
+
:ivar _tried_e: A record of all productions that have been tried
|
| 362 |
+
for a given tree. This record is used by ``expand`` to perform
|
| 363 |
+
the next untried production.
|
| 364 |
+
:ivar _tried_m: A record of what tokens have been matched for a
|
| 365 |
+
given tree. This record is used by ``step`` to decide whether
|
| 366 |
+
or not to match a token.
|
| 367 |
+
:see: ``nltk.grammar``
|
| 368 |
+
"""
|
| 369 |
+
|
| 370 |
+
def __init__(self, grammar, trace=0):
|
| 371 |
+
super().__init__(grammar, trace)
|
| 372 |
+
self._rtext = None
|
| 373 |
+
self._tree = None
|
| 374 |
+
self._frontier = [()]
|
| 375 |
+
self._tried_e = {}
|
| 376 |
+
self._tried_m = {}
|
| 377 |
+
self._history = []
|
| 378 |
+
self._parses = []
|
| 379 |
+
|
| 380 |
+
# [XX] TEMPORARY HACK WARNING! This should be replaced with
|
| 381 |
+
# something nicer when we get the chance.
|
| 382 |
+
def _freeze(self, tree):
|
| 383 |
+
c = tree.copy()
|
| 384 |
+
# for pos in c.treepositions('leaves'):
|
| 385 |
+
# c[pos] = c[pos].freeze()
|
| 386 |
+
return ImmutableTree.convert(c)
|
| 387 |
+
|
| 388 |
+
def parse(self, tokens):
|
| 389 |
+
tokens = list(tokens)
|
| 390 |
+
self.initialize(tokens)
|
| 391 |
+
while self.step() is not None:
|
| 392 |
+
pass
|
| 393 |
+
return self.parses()
|
| 394 |
+
|
| 395 |
+
def initialize(self, tokens):
|
| 396 |
+
"""
|
| 397 |
+
Start parsing a given text. This sets the parser's tree to
|
| 398 |
+
the start symbol, its frontier to the root node, and its
|
| 399 |
+
remaining text to ``token['SUBTOKENS']``.
|
| 400 |
+
"""
|
| 401 |
+
|
| 402 |
+
self._rtext = tokens
|
| 403 |
+
start = self._grammar.start().symbol()
|
| 404 |
+
self._tree = Tree(start, [])
|
| 405 |
+
self._frontier = [()]
|
| 406 |
+
self._tried_e = {}
|
| 407 |
+
self._tried_m = {}
|
| 408 |
+
self._history = []
|
| 409 |
+
self._parses = []
|
| 410 |
+
if self._trace:
|
| 411 |
+
self._trace_start(self._tree, self._frontier, self._rtext)
|
| 412 |
+
|
| 413 |
+
def remaining_text(self):
|
| 414 |
+
"""
|
| 415 |
+
:return: The portion of the text that is not yet covered by the
|
| 416 |
+
tree.
|
| 417 |
+
:rtype: list(str)
|
| 418 |
+
"""
|
| 419 |
+
return self._rtext
|
| 420 |
+
|
| 421 |
+
def frontier(self):
|
| 422 |
+
"""
|
| 423 |
+
:return: A list of the tree locations of all subtrees that
|
| 424 |
+
have not yet been expanded, and all leaves that have not
|
| 425 |
+
yet been matched.
|
| 426 |
+
:rtype: list(tuple(int))
|
| 427 |
+
"""
|
| 428 |
+
return self._frontier
|
| 429 |
+
|
| 430 |
+
def tree(self):
|
| 431 |
+
"""
|
| 432 |
+
:return: A partial structure for the text that is
|
| 433 |
+
currently being parsed. The elements specified by the
|
| 434 |
+
frontier have not yet been expanded or matched.
|
| 435 |
+
:rtype: Tree
|
| 436 |
+
"""
|
| 437 |
+
return self._tree
|
| 438 |
+
|
| 439 |
+
def step(self):
|
| 440 |
+
"""
|
| 441 |
+
Perform a single parsing operation. If an untried match is
|
| 442 |
+
possible, then perform the match, and return the matched
|
| 443 |
+
token. If an untried expansion is possible, then perform the
|
| 444 |
+
expansion, and return the production that it is based on. If
|
| 445 |
+
backtracking is possible, then backtrack, and return True.
|
| 446 |
+
Otherwise, return None.
|
| 447 |
+
|
| 448 |
+
:return: None if no operation was performed; a token if a match
|
| 449 |
+
was performed; a production if an expansion was performed;
|
| 450 |
+
and True if a backtrack operation was performed.
|
| 451 |
+
:rtype: Production or String or bool
|
| 452 |
+
"""
|
| 453 |
+
# Try matching (if we haven't already)
|
| 454 |
+
if self.untried_match():
|
| 455 |
+
token = self.match()
|
| 456 |
+
if token is not None:
|
| 457 |
+
return token
|
| 458 |
+
|
| 459 |
+
# Try expanding.
|
| 460 |
+
production = self.expand()
|
| 461 |
+
if production is not None:
|
| 462 |
+
return production
|
| 463 |
+
|
| 464 |
+
# Try backtracking
|
| 465 |
+
if self.backtrack():
|
| 466 |
+
self._trace_backtrack(self._tree, self._frontier)
|
| 467 |
+
return True
|
| 468 |
+
|
| 469 |
+
# Nothing left to do.
|
| 470 |
+
return None
|
| 471 |
+
|
| 472 |
+
def expand(self, production=None):
|
| 473 |
+
"""
|
| 474 |
+
Expand the first element of the frontier. In particular, if
|
| 475 |
+
the first element of the frontier is a subtree whose node type
|
| 476 |
+
is equal to ``production``'s left hand side, then add a child
|
| 477 |
+
to that subtree for each element of ``production``'s right hand
|
| 478 |
+
side. If ``production`` is not specified, then use the first
|
| 479 |
+
untried expandable production. If all expandable productions
|
| 480 |
+
have been tried, do nothing.
|
| 481 |
+
|
| 482 |
+
:return: The production used to expand the frontier, if an
|
| 483 |
+
expansion was performed. If no expansion was performed,
|
| 484 |
+
return None.
|
| 485 |
+
:rtype: Production or None
|
| 486 |
+
"""
|
| 487 |
+
|
| 488 |
+
# Make sure we *can* expand.
|
| 489 |
+
if len(self._frontier) == 0:
|
| 490 |
+
return None
|
| 491 |
+
if not isinstance(self._tree[self._frontier[0]], Tree):
|
| 492 |
+
return None
|
| 493 |
+
|
| 494 |
+
# If they didn't specify a production, check all untried ones.
|
| 495 |
+
if production is None:
|
| 496 |
+
productions = self.untried_expandable_productions()
|
| 497 |
+
else:
|
| 498 |
+
productions = [production]
|
| 499 |
+
|
| 500 |
+
parses = []
|
| 501 |
+
for prod in productions:
|
| 502 |
+
# Record that we've tried this production now.
|
| 503 |
+
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
|
| 504 |
+
|
| 505 |
+
# Try expanding.
|
| 506 |
+
for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
|
| 507 |
+
return prod
|
| 508 |
+
|
| 509 |
+
# We didn't expand anything.
|
| 510 |
+
return None
|
| 511 |
+
|
| 512 |
+
def match(self):
|
| 513 |
+
"""
|
| 514 |
+
Match the first element of the frontier. In particular, if
|
| 515 |
+
the first element of the frontier has the same type as the
|
| 516 |
+
next text token, then substitute the text token into the tree.
|
| 517 |
+
|
| 518 |
+
:return: The token matched, if a match operation was
|
| 519 |
+
performed. If no match was performed, return None
|
| 520 |
+
:rtype: str or None
|
| 521 |
+
"""
|
| 522 |
+
|
| 523 |
+
# Record that we've tried matching this token.
|
| 524 |
+
tok = self._rtext[0]
|
| 525 |
+
self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
|
| 526 |
+
|
| 527 |
+
# Make sure we *can* match.
|
| 528 |
+
if len(self._frontier) == 0:
|
| 529 |
+
return None
|
| 530 |
+
if isinstance(self._tree[self._frontier[0]], Tree):
|
| 531 |
+
return None
|
| 532 |
+
|
| 533 |
+
for _result in self._match(self._rtext, self._tree, self._frontier):
|
| 534 |
+
# Return the token we just matched.
|
| 535 |
+
return self._history[-1][0][0]
|
| 536 |
+
return None
|
| 537 |
+
|
| 538 |
+
def backtrack(self):
|
| 539 |
+
"""
|
| 540 |
+
Return the parser to its state before the most recent
|
| 541 |
+
match or expand operation. Calling ``undo`` repeatedly return
|
| 542 |
+
the parser to successively earlier states. If no match or
|
| 543 |
+
expand operations have been performed, ``undo`` will make no
|
| 544 |
+
changes.
|
| 545 |
+
|
| 546 |
+
:return: true if an operation was successfully undone.
|
| 547 |
+
:rtype: bool
|
| 548 |
+
"""
|
| 549 |
+
if len(self._history) == 0:
|
| 550 |
+
return False
|
| 551 |
+
(self._rtext, self._tree, self._frontier) = self._history.pop()
|
| 552 |
+
return True
|
| 553 |
+
|
| 554 |
+
def expandable_productions(self):
|
| 555 |
+
"""
|
| 556 |
+
:return: A list of all the productions for which expansions
|
| 557 |
+
are available for the current parser state.
|
| 558 |
+
:rtype: list(Production)
|
| 559 |
+
"""
|
| 560 |
+
# Make sure we *can* expand.
|
| 561 |
+
if len(self._frontier) == 0:
|
| 562 |
+
return []
|
| 563 |
+
frontier_child = self._tree[self._frontier[0]]
|
| 564 |
+
if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
|
| 565 |
+
return []
|
| 566 |
+
|
| 567 |
+
return [
|
| 568 |
+
p
|
| 569 |
+
for p in self._grammar.productions()
|
| 570 |
+
if p.lhs().symbol() == frontier_child.label()
|
| 571 |
+
]
|
| 572 |
+
|
| 573 |
+
def untried_expandable_productions(self):
|
| 574 |
+
"""
|
| 575 |
+
:return: A list of all the untried productions for which
|
| 576 |
+
expansions are available for the current parser state.
|
| 577 |
+
:rtype: list(Production)
|
| 578 |
+
"""
|
| 579 |
+
|
| 580 |
+
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
|
| 581 |
+
return [p for p in self.expandable_productions() if p not in tried_expansions]
|
| 582 |
+
|
| 583 |
+
def untried_match(self):
|
| 584 |
+
"""
|
| 585 |
+
:return: Whether the first element of the frontier is a token
|
| 586 |
+
that has not yet been matched.
|
| 587 |
+
:rtype: bool
|
| 588 |
+
"""
|
| 589 |
+
|
| 590 |
+
if len(self._rtext) == 0:
|
| 591 |
+
return False
|
| 592 |
+
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
|
| 593 |
+
return self._rtext[0] not in tried_matches
|
| 594 |
+
|
| 595 |
+
def currently_complete(self):
|
| 596 |
+
"""
|
| 597 |
+
:return: Whether the parser's current state represents a
|
| 598 |
+
complete parse.
|
| 599 |
+
:rtype: bool
|
| 600 |
+
"""
|
| 601 |
+
return len(self._frontier) == 0 and len(self._rtext) == 0
|
| 602 |
+
|
| 603 |
+
def _parse(self, remaining_text, tree, frontier):
|
| 604 |
+
"""
|
| 605 |
+
A stub version of ``_parse`` that sets the parsers current
|
| 606 |
+
state to the given arguments. In ``RecursiveDescentParser``,
|
| 607 |
+
the ``_parse`` method is used to recursively continue parsing a
|
| 608 |
+
text. ``SteppingRecursiveDescentParser`` overrides it to
|
| 609 |
+
capture these recursive calls. It records the parser's old
|
| 610 |
+
state in the history (to allow for backtracking), and updates
|
| 611 |
+
the parser's new state using the given arguments. Finally, it
|
| 612 |
+
returns ``[1]``, which is used by ``match`` and ``expand`` to
|
| 613 |
+
detect whether their operations were successful.
|
| 614 |
+
|
| 615 |
+
:return: ``[1]``
|
| 616 |
+
:rtype: list of int
|
| 617 |
+
"""
|
| 618 |
+
self._history.append((self._rtext, self._tree, self._frontier))
|
| 619 |
+
self._rtext = remaining_text
|
| 620 |
+
self._tree = tree
|
| 621 |
+
self._frontier = frontier
|
| 622 |
+
|
| 623 |
+
# Is it a good parse? If so, record it.
|
| 624 |
+
if len(frontier) == 0 and len(remaining_text) == 0:
|
| 625 |
+
self._parses.append(tree)
|
| 626 |
+
self._trace_succeed(self._tree, self._frontier)
|
| 627 |
+
|
| 628 |
+
return [1]
|
| 629 |
+
|
| 630 |
+
def parses(self):
|
| 631 |
+
"""
|
| 632 |
+
:return: An iterator of the parses that have been found by this
|
| 633 |
+
parser so far.
|
| 634 |
+
:rtype: list of Tree
|
| 635 |
+
"""
|
| 636 |
+
return iter(self._parses)
|
| 637 |
+
|
| 638 |
+
def set_grammar(self, grammar):
|
| 639 |
+
"""
|
| 640 |
+
Change the grammar used to parse texts.
|
| 641 |
+
|
| 642 |
+
:param grammar: The new grammar.
|
| 643 |
+
:type grammar: CFG
|
| 644 |
+
"""
|
| 645 |
+
self._grammar = grammar
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
##//////////////////////////////////////////////////////
|
| 649 |
+
## Demonstration Code
|
| 650 |
+
##//////////////////////////////////////////////////////
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk import CFG, parse

    grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    # Show the grammar, then parse a sample sentence with full tracing.
    for production in grammar.productions():
        print(production)

    sent = "I saw a man in the park".split()
    parser = parse.RecursiveDescentParser(grammar, trace=2)
    for result in parser.parse(sent):
        print(result)


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py
ADDED
|
@@ -0,0 +1,479 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Shift-Reduce Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from nltk.grammar import Nonterminal
|
| 10 |
+
from nltk.parse.api import ParserI
|
| 11 |
+
from nltk.tree import Tree
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Shift/Reduce Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
class ShiftReduceParser(ParserI):
|
| 18 |
+
"""
|
| 19 |
+
A simple bottom-up CFG parser that uses two operations, "shift"
|
| 20 |
+
and "reduce", to find a single parse for a text.
|
| 21 |
+
|
| 22 |
+
``ShiftReduceParser`` maintains a stack, which records the
|
| 23 |
+
structure of a portion of the text. This stack is a list of
|
| 24 |
+
strings and Trees that collectively cover a portion of
|
| 25 |
+
the text. For example, while parsing the sentence "the dog saw
|
| 26 |
+
the man" with a typical grammar, ``ShiftReduceParser`` will produce
|
| 27 |
+
the following stack, which covers "the dog saw"::
|
| 28 |
+
|
| 29 |
+
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
|
| 30 |
+
|
| 31 |
+
``ShiftReduceParser`` attempts to extend the stack to cover the
|
| 32 |
+
entire text, and to combine the stack elements into a single tree,
|
| 33 |
+
producing a complete parse for the sentence.
|
| 34 |
+
|
| 35 |
+
Initially, the stack is empty. It is extended to cover the text,
|
| 36 |
+
from left to right, by repeatedly applying two operations:
|
| 37 |
+
|
| 38 |
+
- "shift" moves a token from the beginning of the text to the
|
| 39 |
+
end of the stack.
|
| 40 |
+
- "reduce" uses a CFG production to combine the rightmost stack
|
| 41 |
+
elements into a single Tree.
|
| 42 |
+
|
| 43 |
+
Often, more than one operation can be performed on a given stack.
|
| 44 |
+
In this case, ``ShiftReduceParser`` uses the following heuristics
|
| 45 |
+
to decide which operation to perform:
|
| 46 |
+
|
| 47 |
+
- Only shift if no reductions are available.
|
| 48 |
+
- If multiple reductions are available, then apply the reduction
|
| 49 |
+
whose CFG production is listed earliest in the grammar.
|
| 50 |
+
|
| 51 |
+
Note that these heuristics are not guaranteed to choose an
|
| 52 |
+
operation that leads to a parse of the text. Also, if multiple
|
| 53 |
+
parses exists, ``ShiftReduceParser`` will return at most one of
|
| 54 |
+
them.
|
| 55 |
+
|
| 56 |
+
:see: ``nltk.grammar``
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
def __init__(self, grammar, trace=0):
|
| 60 |
+
"""
|
| 61 |
+
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
|
| 62 |
+
parse texts.
|
| 63 |
+
|
| 64 |
+
:type grammar: Grammar
|
| 65 |
+
:param grammar: The grammar used to parse texts.
|
| 66 |
+
:type trace: int
|
| 67 |
+
:param trace: The level of tracing that should be used when
|
| 68 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 69 |
+
and higher numbers will produce more verbose tracing
|
| 70 |
+
output.
|
| 71 |
+
"""
|
| 72 |
+
self._grammar = grammar
|
| 73 |
+
self._trace = trace
|
| 74 |
+
self._check_grammar()
|
| 75 |
+
|
| 76 |
+
def grammar(self):
|
| 77 |
+
return self._grammar
|
| 78 |
+
|
| 79 |
+
def parse(self, tokens):
|
| 80 |
+
tokens = list(tokens)
|
| 81 |
+
self._grammar.check_coverage(tokens)
|
| 82 |
+
|
| 83 |
+
# initialize the stack.
|
| 84 |
+
stack = []
|
| 85 |
+
remaining_text = tokens
|
| 86 |
+
|
| 87 |
+
# Trace output.
|
| 88 |
+
if self._trace:
|
| 89 |
+
print("Parsing %r" % " ".join(tokens))
|
| 90 |
+
self._trace_stack(stack, remaining_text)
|
| 91 |
+
|
| 92 |
+
# iterate through the text, pushing the token onto
|
| 93 |
+
# the stack, then reducing the stack.
|
| 94 |
+
while len(remaining_text) > 0:
|
| 95 |
+
self._shift(stack, remaining_text)
|
| 96 |
+
while self._reduce(stack, remaining_text):
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
# Did we reduce everything?
|
| 100 |
+
if len(stack) == 1:
|
| 101 |
+
# Did we end up with the right category?
|
| 102 |
+
if stack[0].label() == self._grammar.start().symbol():
|
| 103 |
+
yield stack[0]
|
| 104 |
+
|
| 105 |
+
def _shift(self, stack, remaining_text):
|
| 106 |
+
"""
|
| 107 |
+
Move a token from the beginning of ``remaining_text`` to the
|
| 108 |
+
end of ``stack``.
|
| 109 |
+
|
| 110 |
+
:type stack: list(str and Tree)
|
| 111 |
+
:param stack: A list of strings and Trees, encoding
|
| 112 |
+
the structure of the text that has been parsed so far.
|
| 113 |
+
:type remaining_text: list(str)
|
| 114 |
+
:param remaining_text: The portion of the text that is not yet
|
| 115 |
+
covered by ``stack``.
|
| 116 |
+
:rtype: None
|
| 117 |
+
"""
|
| 118 |
+
stack.append(remaining_text[0])
|
| 119 |
+
remaining_text.remove(remaining_text[0])
|
| 120 |
+
if self._trace:
|
| 121 |
+
self._trace_shift(stack, remaining_text)
|
| 122 |
+
|
| 123 |
+
def _match_rhs(self, rhs, rightmost_stack):
|
| 124 |
+
"""
|
| 125 |
+
:rtype: bool
|
| 126 |
+
:return: true if the right hand side of a CFG production
|
| 127 |
+
matches the rightmost elements of the stack. ``rhs``
|
| 128 |
+
matches ``rightmost_stack`` if they are the same length,
|
| 129 |
+
and each element of ``rhs`` matches the corresponding
|
| 130 |
+
element of ``rightmost_stack``. A nonterminal element of
|
| 131 |
+
``rhs`` matches any Tree whose node value is equal
|
| 132 |
+
to the nonterminal's symbol. A terminal element of ``rhs``
|
| 133 |
+
matches any string whose type is equal to the terminal.
|
| 134 |
+
:type rhs: list(terminal and Nonterminal)
|
| 135 |
+
:param rhs: The right hand side of a CFG production.
|
| 136 |
+
:type rightmost_stack: list(string and Tree)
|
| 137 |
+
:param rightmost_stack: The rightmost elements of the parser's
|
| 138 |
+
stack.
|
| 139 |
+
"""
|
| 140 |
+
|
| 141 |
+
if len(rightmost_stack) != len(rhs):
|
| 142 |
+
return False
|
| 143 |
+
for i in range(len(rightmost_stack)):
|
| 144 |
+
if isinstance(rightmost_stack[i], Tree):
|
| 145 |
+
if not isinstance(rhs[i], Nonterminal):
|
| 146 |
+
return False
|
| 147 |
+
if rightmost_stack[i].label() != rhs[i].symbol():
|
| 148 |
+
return False
|
| 149 |
+
else:
|
| 150 |
+
if isinstance(rhs[i], Nonterminal):
|
| 151 |
+
return False
|
| 152 |
+
if rightmost_stack[i] != rhs[i]:
|
| 153 |
+
return False
|
| 154 |
+
return True
|
| 155 |
+
|
| 156 |
+
def _reduce(self, stack, remaining_text, production=None):
|
| 157 |
+
"""
|
| 158 |
+
Find a CFG production whose right hand side matches the
|
| 159 |
+
rightmost stack elements; and combine those stack elements
|
| 160 |
+
into a single Tree, with the node specified by the
|
| 161 |
+
production's left-hand side. If more than one CFG production
|
| 162 |
+
matches the stack, then use the production that is listed
|
| 163 |
+
earliest in the grammar. The new Tree replaces the
|
| 164 |
+
elements in the stack.
|
| 165 |
+
|
| 166 |
+
:rtype: Production or None
|
| 167 |
+
:return: If a reduction is performed, then return the CFG
|
| 168 |
+
production that the reduction is based on; otherwise,
|
| 169 |
+
return false.
|
| 170 |
+
:type stack: list(string and Tree)
|
| 171 |
+
:param stack: A list of strings and Trees, encoding
|
| 172 |
+
the structure of the text that has been parsed so far.
|
| 173 |
+
:type remaining_text: list(str)
|
| 174 |
+
:param remaining_text: The portion of the text that is not yet
|
| 175 |
+
covered by ``stack``.
|
| 176 |
+
"""
|
| 177 |
+
if production is None:
|
| 178 |
+
productions = self._grammar.productions()
|
| 179 |
+
else:
|
| 180 |
+
productions = [production]
|
| 181 |
+
|
| 182 |
+
# Try each production, in order.
|
| 183 |
+
for production in productions:
|
| 184 |
+
rhslen = len(production.rhs())
|
| 185 |
+
|
| 186 |
+
# check if the RHS of a production matches the top of the stack
|
| 187 |
+
if self._match_rhs(production.rhs(), stack[-rhslen:]):
|
| 188 |
+
|
| 189 |
+
# combine the tree to reflect the reduction
|
| 190 |
+
tree = Tree(production.lhs().symbol(), stack[-rhslen:])
|
| 191 |
+
stack[-rhslen:] = [tree]
|
| 192 |
+
|
| 193 |
+
# We reduced something
|
| 194 |
+
if self._trace:
|
| 195 |
+
self._trace_reduce(stack, production, remaining_text)
|
| 196 |
+
return production
|
| 197 |
+
|
| 198 |
+
# We didn't reduce anything
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
def trace(self, trace=2):
|
| 202 |
+
"""
|
| 203 |
+
Set the level of tracing output that should be generated when
|
| 204 |
+
parsing a text.
|
| 205 |
+
|
| 206 |
+
:type trace: int
|
| 207 |
+
:param trace: The trace level. A trace level of ``0`` will
|
| 208 |
+
generate no tracing output; and higher trace levels will
|
| 209 |
+
produce more verbose tracing output.
|
| 210 |
+
:rtype: None
|
| 211 |
+
"""
|
| 212 |
+
# 1: just show shifts.
|
| 213 |
+
# 2: show shifts & reduces
|
| 214 |
+
# 3: display which tokens & productions are shifed/reduced
|
| 215 |
+
self._trace = trace
|
| 216 |
+
|
| 217 |
+
def _trace_stack(self, stack, remaining_text, marker=" "):
|
| 218 |
+
"""
|
| 219 |
+
Print trace output displaying the given stack and text.
|
| 220 |
+
|
| 221 |
+
:rtype: None
|
| 222 |
+
:param marker: A character that is printed to the left of the
|
| 223 |
+
stack. This is used with trace level 2 to print 'S'
|
| 224 |
+
before shifted stacks and 'R' before reduced stacks.
|
| 225 |
+
"""
|
| 226 |
+
s = " " + marker + " [ "
|
| 227 |
+
for elt in stack:
|
| 228 |
+
if isinstance(elt, Tree):
|
| 229 |
+
s += repr(Nonterminal(elt.label())) + " "
|
| 230 |
+
else:
|
| 231 |
+
s += repr(elt) + " "
|
| 232 |
+
s += "* " + " ".join(remaining_text) + "]"
|
| 233 |
+
print(s)
|
| 234 |
+
|
| 235 |
+
def _trace_shift(self, stack, remaining_text):
|
| 236 |
+
"""
|
| 237 |
+
Print trace output displaying that a token has been shifted.
|
| 238 |
+
|
| 239 |
+
:rtype: None
|
| 240 |
+
"""
|
| 241 |
+
if self._trace > 2:
|
| 242 |
+
print("Shift %r:" % stack[-1])
|
| 243 |
+
if self._trace == 2:
|
| 244 |
+
self._trace_stack(stack, remaining_text, "S")
|
| 245 |
+
elif self._trace > 0:
|
| 246 |
+
self._trace_stack(stack, remaining_text)
|
| 247 |
+
|
| 248 |
+
def _trace_reduce(self, stack, production, remaining_text):
|
| 249 |
+
"""
|
| 250 |
+
Print trace output displaying that ``production`` was used to
|
| 251 |
+
reduce ``stack``.
|
| 252 |
+
|
| 253 |
+
:rtype: None
|
| 254 |
+
"""
|
| 255 |
+
if self._trace > 2:
|
| 256 |
+
rhs = " ".join(production.rhs())
|
| 257 |
+
print(f"Reduce {production.lhs()!r} <- {rhs}")
|
| 258 |
+
if self._trace == 2:
|
| 259 |
+
self._trace_stack(stack, remaining_text, "R")
|
| 260 |
+
elif self._trace > 1:
|
| 261 |
+
self._trace_stack(stack, remaining_text)
|
| 262 |
+
|
| 263 |
+
def _check_grammar(self):
|
| 264 |
+
"""
|
| 265 |
+
Check to make sure that all of the CFG productions are
|
| 266 |
+
potentially useful. If any productions can never be used,
|
| 267 |
+
then print a warning.
|
| 268 |
+
|
| 269 |
+
:rtype: None
|
| 270 |
+
"""
|
| 271 |
+
productions = self._grammar.productions()
|
| 272 |
+
|
| 273 |
+
# Any production whose RHS is an extension of another production's RHS
|
| 274 |
+
# will never be used.
|
| 275 |
+
for i in range(len(productions)):
|
| 276 |
+
for j in range(i + 1, len(productions)):
|
| 277 |
+
rhs1 = productions[i].rhs()
|
| 278 |
+
rhs2 = productions[j].rhs()
|
| 279 |
+
if rhs1[: len(rhs2)] == rhs2:
|
| 280 |
+
print("Warning: %r will never be used" % productions[i])
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
##//////////////////////////////////////////////////////
|
| 284 |
+
## Stepping Shift/Reduce Parser
|
| 285 |
+
##//////////////////////////////////////////////////////
|
| 286 |
+
class SteppingShiftReduceParser(ShiftReduceParser):
    """
    A ``ShiftReduceParser`` that can be driven one operation at a time,
    which makes it possible to step through the parsing process and even
    to swap in a different grammar midway through parsing a text.

    Call ``initialize`` to begin parsing a text.  ``shift`` performs a
    single shift operation and ``reduce`` a single reduce operation;
    ``step`` tries a reduce first and falls back to a shift.  ``parses``
    yields the parses found so far.

    :ivar _history: A list of ``(stack, remaining_text)`` pairs holding
        every earlier parser state; it backs the ``undo`` operation.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super().__init__(grammar, trace)
        self._stack = None
        self._remaining_text = None
        self._history = []

    def parse(self, tokens):
        # Drive the parser to completion one step at a time, then report
        # whatever complete parses were found.
        token_list = list(tokens)
        self.initialize(token_list)
        progressing = True
        while progressing:
            progressing = bool(self.step())
        return self.parses()

    def stack(self):
        """
        :return: The parser's stack.
        :rtype: list(str and Tree)
        """
        return self._stack

    def remaining_text(self):
        """
        :return: The portion of the text not yet covered by the stack.
        :rtype: list(str)
        """
        return self._remaining_text

    def initialize(self, tokens):
        """
        Start parsing a given text: reset the stack to ``[]``, set the
        remaining text to ``tokens``, and clear the undo history.
        """
        self._stack = []
        self._remaining_text = tokens
        self._history = []

    def step(self):
        """
        Perform a single parsing operation.  A reduction is attempted
        first; if one applies, the production it used is returned.
        Otherwise a shift is attempted, returning True on success and
        False if neither operation was possible.

        :return: False if nothing was done; True if a shift was
            performed; the CFG production used if a reduction was
            performed.
        :rtype: Production or bool
        """
        reduction = self.reduce()
        if reduction:
            return reduction
        return self.shift()

    def shift(self):
        """
        Move one token from the front of the remaining text to the top
        of the stack.  Does nothing when no tokens remain.

        :return: True if the shift operation was successful.
        :rtype: bool
        """
        if not self._remaining_text:
            return False
        # Snapshot the current state so the operation can be undone.
        self._history.append((self._stack[:], self._remaining_text[:]))
        self._shift(self._stack, self._remaining_text)
        return True

    def reduce(self, production=None):
        """
        Combine the rightmost stack elements into a single Tree using
        ``production``.  Does nothing if ``production`` does not match
        the rightmost stack elements.

        :return: The production used to reduce the stack, or None when
            no reduction was performed.
        :rtype: Production or None
        """
        # Snapshot first; roll the snapshot back if nothing happened.
        self._history.append((self._stack[:], self._remaining_text[:]))
        applied = self._reduce(self._stack, self._remaining_text, production)
        if not applied:
            self._history.pop()
        return applied

    def undo(self):
        """
        Restore the parser state preceding the most recent shift or
        reduce.  Repeated calls walk back through successively earlier
        states; with an empty history nothing changes.

        :return: true if an operation was successfully undone.
        :rtype: bool
        """
        if not self._history:
            return False
        self._stack, self._remaining_text = self._history.pop()
        return True

    def reducible_productions(self):
        """
        :return: A list of the productions for which reductions are
            available in the current parser state.
        :rtype: list(Production)
        """
        return [
            candidate
            for candidate in self._grammar.productions()
            if self._match_rhs(
                candidate.rhs(), self._stack[-len(candidate.rhs()) :]
            )
        ]

    def parses(self):
        """
        :return: An iterator over the parses found so far: yields the
            single stack element when the text is consumed and the stack
            holds one tree labeled with the grammar's start symbol.
        :rtype: iter(Tree)
        """
        if not self._remaining_text and len(self._stack) == 1:
            if self._stack[0].label() == self._grammar.start().symbol():
                yield self._stack[0]

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
##//////////////////////////////////////////////////////
|
| 446 |
+
## Demonstration Code
|
| 447 |
+
##//////////////////////////////////////////////////////
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def demo():
    """
    A demonstration of the shift-reduce parser.
    """

    from nltk import CFG, parse

    toy_grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    tokens = "I saw a man in the park".split()

    # trace=2 makes the parser print each shift/reduce step it takes.
    demo_parser = parse.ShiftReduceParser(toy_grammar, trace=2)
    for tree in demo_parser.parse(tokens):
        print(tree)
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# Run the shift-reduce demonstration when executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to the Stanford Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Xu <xxu@student.unimelb.edu.au>
|
| 5 |
+
#
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import tempfile
|
| 11 |
+
import warnings
|
| 12 |
+
from subprocess import PIPE
|
| 13 |
+
|
| 14 |
+
from nltk.internals import (
|
| 15 |
+
_java_options,
|
| 16 |
+
config_java,
|
| 17 |
+
find_jar_iter,
|
| 18 |
+
find_jars_within_path,
|
| 19 |
+
java,
|
| 20 |
+
)
|
| 21 |
+
from nltk.parse.api import ParserI
|
| 22 |
+
from nltk.parse.dependencygraph import DependencyGraph
|
| 23 |
+
from nltk.tree import Tree
|
| 24 |
+
|
| 25 |
+
# Download page for the standalone Stanford lexicalized parser; shown to
# the user when the required jar files cannot be located.
_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser.

    Runs the parser as an external Java process: input sentences are
    written to a temporary file (or piped on stdin), the configured main
    class is invoked, and its textual output is parsed back into trees
    via the subclass hook ``_make_tree``.
    """

    # Regex patterns used to locate the parser code jar and models jar.
    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
    _JAR = r"stanford-parser\.jar"
    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

    # When True, input is piped to the Java process on stdin rather than
    # passed as a file-name argument (see _execute).
    _USE_STDIN = False
    # When True, _parse_trees_output treats a single blank line as a
    # tree separator and a double blank line as a sentence separator.
    _DOUBLE_SPACED_OUTPUT = False

    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        encoding="utf8",
        verbose=False,
        java_options="-mx4g",
        corenlp_options="",
    ):

        # find the most recent code and model jar
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            # NOTE(review): this lambda parameter shadows the
            # ``model_path`` argument above; it only receives jar paths.
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # self._classpath = (stanford_jar, model_jar)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options

    def _parse_trees_output(self, output_):
        """Split the parser's raw text output into per-sentence tree
        iterators.

        Blank lines delimit units: with ``_DOUBLE_SPACED_OUTPUT`` set,
        one blank line ends a tree and two consecutive blank lines end a
        sentence; otherwise each blank line ends a one-tree sentence.

        :param output_: raw decoded stdout of the Java process
        :rtype: iter(iter(Tree))
        """
        res = []
        cur_lines = []
        cur_trees = []
        blank = False
        for line in output_.splitlines(False):
            if line == "":
                if blank:
                    # Second blank line in a row: flush the sentence.
                    res.append(iter(cur_trees))
                    cur_trees = []
                    blank = False
                elif self._DOUBLE_SPACED_OUTPUT:
                    # First blank line: one tree of the sentence is done.
                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
                    cur_lines = []
                    blank = True
                else:
                    # Single-spaced output: blank line ends the sentence.
                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
                    cur_lines = []
            else:
                cur_lines.append(line)
                blank = False
        return iter(res)

    def parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this StanfordParser instance's
        tagger.
        If whitespaces exists inside a token, then the token will be treated as
        separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-escaper",
            "edu.stanford.nlp.process.PTBEscapingProcessor",
        ]
        return self._parse_trees_output(
            self._execute(
                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
            )
        )

    def raw_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by
        the Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        return next(self.raw_parse_sents([sentence], verbose))

    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
        ]
        return self._parse_trees_output(
            self._execute(cmd, "\n".join(sentences), verbose)
        )

    def tagged_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        return next(self.tagged_parse_sents([sentence], verbose))

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        """
        tag_separator = "/"
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-tagSeparator",
            tag_separator,
            "-tokenizerFactory",
            "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerMethod",
            "newCoreLabelTokenizerFactory",
        ]
        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
        return self._parse_trees_output(
            self._execute(
                cmd,
                "\n".join(
                    " ".join(tag_separator.join(tagged) for tagged in sentence)
                    for sentence in sentences
                ),
                verbose,
            )
        )

    def _execute(self, cmd, input_, verbose=False):
        """Run the Java main class on ``input_`` and return decoded stdout.

        ``input_`` is written to a temporary file which is either passed
        as the final command-line argument or piped on stdin, depending
        on ``_USE_STDIN``.  Java options are temporarily overridden and
        restored before returning.

        :param cmd: command-line arguments for the Java main class
        :param input_: the sentences to parse, str or bytes
        :rtype: str
        """
        encoding = self._encoding
        cmd.extend(["-encoding", encoding])
        if self.corenlp_options:
            cmd.extend(self.corenlp_options.split())

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, str) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(
                    cmd,
                    classpath=self._classpath,
                    stdin=input_file,
                    stdout=PIPE,
                    stderr=PIPE,
                )
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(
                    cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                )

            # Normalize non-breaking-space byte sequences before decoding.
            stdout = stdout.replace(b"\xc2\xa0", b" ")
            stdout = stdout.replace(b"\x00\xa0", b" ")
            stdout = stdout.decode(encoding)

        # Remove the temp file ourselves since it was created delete=False.
        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class StanfordParser(GenericStanfordParser):
    """
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    """

    # Ask the lexicalized parser for Penn-treebank bracketed output.
    _OUTPUT_FORMAT = "penn"

    def __init__(self, *args, **kwargs):
        # This wrapper is deprecated in favour of the CoreNLP server API.
        deprecation_message = (
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # A Penn-bracketed string loads directly as a constituency Tree.
        return Tree.fromstring(result)
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
class StanfordDependencyParser(GenericStanfordParser):
    """
    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    """

    # Request CoNLL-2007 dependency output instead of bracketed trees.
    _OUTPUT_FORMAT = "conll2007"

    def __init__(self, *args, **kwargs):
        # This wrapper is deprecated in favour of the CoreNLP server API.
        deprecation_message = (
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead."
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # CoNLL output is loaded as a DependencyGraph rooted at "root".
        return DependencyGraph(result, top_relation_label="root")
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
class StanfordNeuralDependencyParser(GenericStanfordParser):
    """
    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
    u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
    Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    """

    # CoreNLP pipeline configuration: CoNLL dependency output, input
    # piped on stdin, parses separated by blank lines.
    _OUTPUT_FORMAT = "conll"
    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        """Emit the deprecation warning, locate the CoreNLP jars via the
        base class, and append the pipeline annotators required for
        dependency parsing to ``corenlp_options``."""
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)
        # BUG FIX: previously the annotator flag was concatenated onto any
        # user-supplied ``corenlp_options`` with no separating whitespace,
        # fusing it with the preceding token (e.g. "-foo-annotators") and
        # producing a broken command line in ``_execute``.  Insert a space
        # when options are already present; the default (empty) case is
        # unchanged.
        if self.corenlp_options:
            self.corenlp_options += " "
        self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        """
        raise NotImplementedError(
            "tagged_parse[_sents] is not supported by "
            "StanfordNeuralDependencyParser; use "
            "parse[_sents] or raw_parse[_sents] instead."
        )

    def _make_tree(self, result):
        # CoNLL output is loaded as a DependencyGraph rooted at "ROOT".
        return DependencyGraph(result, top_relation_label="ROOT")
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py
ADDED
|
@@ -0,0 +1,794 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
|
| 2 |
+
#
|
| 3 |
+
# Author: Long Duong <longdt219@gmail.com>
|
| 4 |
+
#
|
| 5 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import pickle
|
| 10 |
+
import tempfile
|
| 11 |
+
from copy import deepcopy
|
| 12 |
+
from operator import itemgetter
|
| 13 |
+
from os import remove
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from numpy import array
|
| 17 |
+
from scipy import sparse
|
| 18 |
+
from sklearn import svm
|
| 19 |
+
from sklearn.datasets import load_svmlight_file
|
| 20 |
+
except ImportError:
|
| 21 |
+
pass
|
| 22 |
+
|
| 23 |
+
from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Configuration:
    """
    Class for holding configuration which is the partial analysis of the input sentence.
    The transition based parser aims at finding set of operators that transfer the initial
    configuration to the terminal configuration.

    The configuration includes:
        - Stack: for storing partially proceeded words
        - Buffer: for storing remaining input words
        - Set of arcs: for storing partially built dependency tree

    This class also provides a method to represent a configuration as list of features.
    """

    def __init__(self, dep_graph):
        """
        :param dep_graph: the representation of an input in the form of dependency graph.
        :type dep_graph: DependencyGraph where the dependencies are not specified.
        """
        # dep_graph.nodes maps addresses to token dicts; address 0 is the
        # artificial root, addresses 1..n are the sentence words.
        self.stack = [0]  # the root element starts on the stack
        self.buffer = list(range(1, len(dep_graph.nodes)))  # remaining words
        self.arcs = []  # collected (head, relation, dependent) arcs
        self._tokens = dep_graph.nodes
        self._max_address = len(self.buffer)

    def __str__(self):
        return f"Stack : {self.stack}  Buffer : {self.buffer}  Arcs : {self.arcs}".replace("  ", " ")

    def _check_informative(self, feat, flag=False):
        """
        Check whether a feature is informative.
        The flag controls whether "_" counts as informative or not.
        """
        if feat is None or feat == "":
            return False
        if feat == "_" and not flag:
            return False
        return True

    def _append_token_features(self, result, address, prefix, with_form, with_extras):
        """
        Append the features of the token at ``address`` to ``result``,
        tagged with ``prefix`` (e.g. "STK_0").  ``with_form`` enables the
        word-form feature; ``with_extras`` enables lemma and morphology.
        """
        token = self._tokens[address]
        if with_form and self._check_informative(token["word"], True):
            result.append(prefix + "_FORM_" + token["word"])
        if with_extras and "lemma" in token and self._check_informative(token["lemma"]):
            result.append(prefix + "_LEMMA_" + token["lemma"])
        if self._check_informative(token["tag"]):
            result.append(prefix + "_POS_" + token["tag"])
        if with_extras and "feats" in token and self._check_informative(token["feats"]):
            for feat in token["feats"].split("|"):
                result.append(prefix + "_FEATS_" + feat)

    def _extreme_dependents(self, head_address):
        """
        Return (leftmost-relation, rightmost-relation) among the already
        collected dependents of ``head_address``; "" when there is none.
        """
        leftmost = 1000000
        rightmost = -1
        rel_left = ""
        rel_right = ""
        for (parent, rel, child) in self.arcs:
            if parent != head_address:
                continue
            if child > parent and child > rightmost:
                rightmost = child
                rel_right = rel
            if child < parent and child < leftmost:
                leftmost = child
                rel_left = rel
        return rel_left, rel_right

    def extract_features(self):
        """
        Extract the set of features for the current configuration. Implement standard features as describe in
        Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre.
        Please note that these features are very basic.
        :return: list(str)
        """
        result = []
        # Todo : can come up with more complicated features set for better
        # performance.
        if self.stack:
            top = self.stack[-1]
            self._append_token_features(result, top, "STK_0", True, True)
            if len(self.stack) > 1:
                # Only the POS tag is used for the second stack element.
                self._append_token_features(result, self.stack[-2], "STK_1", False, False)
            rel_left, rel_right = self._extreme_dependents(top)
            if self._check_informative(rel_left):
                result.append("STK_0_LDEP_" + rel_left)
            if self._check_informative(rel_right):
                result.append("STK_0_RDEP_" + rel_right)

        if self.buffer:
            front = self.buffer[0]
            self._append_token_features(result, front, "BUF_0", True, True)
            if len(self.buffer) > 1:
                self._append_token_features(result, self.buffer[1], "BUF_1", True, False)
            if len(self.buffer) > 2:
                self._append_token_features(result, self.buffer[2], "BUF_2", False, False)
            if len(self.buffer) > 3:
                self._append_token_features(result, self.buffer[3], "BUF_3", False, False)
            rel_left, rel_right = self._extreme_dependents(front)
            if self._check_informative(rel_left):
                result.append("BUF_0_LDEP_" + rel_left)
            if self._check_informative(rel_right):
                result.append("BUF_0_RDEP_" + rel_right)

        return result
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class Transition:
    """
    This class defines a set of transition which is applied to a configuration to get another configuration
    Note that for different parsing algorithm, the transition is different.
    """

    # Transition identifiers as written into the training data / model.
    LEFT_ARC = "LEFTARC"
    RIGHT_ARC = "RIGHTARC"
    SHIFT = "SHIFT"
    REDUCE = "REDUCE"

    def __init__(self, alg_option):
        """
        :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
        :type alg_option: str
        """
        self._algo = alg_option
        supported = (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
        if alg_option not in supported:
            raise ValueError(
                " Currently we only support %s and %s "
                % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
            )

    def left_arc(self, conf, relation):
        """
        Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if conf.buffer[0] == 0:
            # The artificial root may never become a dependent.
            return -1

        idx_wi = conf.stack[-1]

        if self._algo == TransitionParser.ARC_EAGER:
            # Arc-eager precondition: the stack top must not already have
            # a head among the collected arcs.
            for (parent, rel, child) in conf.arcs:
                if child == idx_wi:
                    return -1

        conf.stack.pop()
        conf.arcs.append((conf.buffer[0], relation, idx_wi))

    def right_arc(self, conf, relation):
        """
        Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if self._algo == TransitionParser.ARC_STANDARD:
            # Pop the head from the stack and put it back at the front of
            # the buffer in place of its new dependent.
            head = conf.stack.pop()
            dependent = conf.buffer[0]
            conf.buffer[0] = head
            conf.arcs.append((head, relation, dependent))
        else:  # arc-eager
            head = conf.stack[-1]
            dependent = conf.buffer.pop(0)
            conf.stack.append(dependent)
            conf.arcs.append((head, relation, dependent))

    def reduce(self, conf):
        """
        Note that the algorithm for reduce is only available for arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if self._algo != TransitionParser.ARC_EAGER:
            return -1
        if not conf.stack:
            return -1

        top = conf.stack[-1]
        # Only reducible once the stack top has received a head.
        if not any(child == top for (parent, rel, child) in conf.arcs):
            return -1
        conf.stack.pop()

    def shift(self, conf):
        """
        Note that the algorithm for shift is the SAME for arc-standard and arc-eager

        :param configuration: is the current configuration
        :return: A new configuration or -1 if the pre-condition is not satisfied
        """
        if not conf.buffer:
            return -1
        conf.stack.append(conf.buffer.pop(0))
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class TransitionParser(ParserI):

    """
    Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager"
    """

    ARC_STANDARD = "arc-standard"
    ARC_EAGER = "arc-eager"

    def __init__(self, algorithm):
        """
        :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
        :type algorithm: str
        """
        if algorithm not in (self.ARC_STANDARD, self.ARC_EAGER):
            raise ValueError(
                " Currently we only support %s and %s "
                % (self.ARC_STANDARD, self.ARC_EAGER)
            )
        self._algorithm = algorithm

        # feature string -> feature id (libsvm column index)
        self._dictionary = {}
        # transition key (e.g. "LEFTARC:SBJ") -> class label id
        self._transition = {}
        # inverse of _transition: class label id -> transition key
        self._match_transition = {}

    def _get_dep_relation(self, idx_parent, idx_child, depgraph):
        """
        Return the gold relation label if ``idx_parent`` is the head of
        ``idx_child`` in ``depgraph``, otherwise ``None``.
        """
        p_node = depgraph.nodes[idx_parent]
        c_node = depgraph.nodes[idx_child]

        if c_node["word"] is None:
            return None  # Root word

        if c_node["head"] == p_node["address"]:
            return c_node["rel"]
        return None

    def _convert_to_binary_features(self, features):
        """
        :param features: list of feature string which is needed to convert to binary features
        :type features: list(str)
        :return : string of binary features in libsvm format which is 'featureID:value' pairs
        """
        unsorted_result = []
        for feature in features:
            # Assign a fresh id to each previously unseen feature.
            self._dictionary.setdefault(feature, len(self._dictionary))
            unsorted_result.append(self._dictionary[feature])

        # Default value of each feature is 1.0; libsvm requires the
        # column indices in ascending order.
        return " ".join(
            str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
        )

    def _is_projective(self, depgraph):
        """
        Return True if ``depgraph`` contains no crossing arcs.
        """
        arc_list = []
        for key in depgraph.nodes:
            node = depgraph.nodes[key]

            if "head" in node:
                childIdx = node["address"]
                parentIdx = node["head"]
                if parentIdx is not None:
                    arc_list.append((parentIdx, childIdx))

        # Use a set for O(1) membership tests in the nested loops below.
        arc_set = set(arc_list)
        for (parentIdx, childIdx) in arc_list:
            # Ensure that childIdx < parentIdx
            if childIdx > parentIdx:
                parentIdx, childIdx = childIdx, parentIdx
            # A crossing arc links a position strictly inside the span
            # (childIdx, parentIdx) with a position strictly outside it.
            for k in range(childIdx + 1, parentIdx):
                for m in range(len(depgraph.nodes)):
                    if (m < childIdx) or (m > parentIdx):
                        if (k, m) in arc_set or (m, k) in arc_set:
                            return False
        return True

    def _write_to_file(self, key, binary_features, input_file):
        """
        write the binary features to input file and update the transition dictionary
        """
        # Class labels start at 1, as conventional for libsvm files.
        self._transition.setdefault(key, len(self._transition) + 1)
        self._match_transition[self._transition[key]] = key

        input_str = str(self._transition[key]) + " " + binary_features + "\n"
        input_file.write(input_str.encode("utf-8"))

    def _create_training_examples_arc_std(self, depgraphs, input_file):
        """
        Create the training example in the libsvm format and write it to the input_file.
        Reference : Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009)
        """
        operation = Transition(self.ARC_STANDARD)
        count_proj = 0
        training_seq = []

        for depgraph in depgraphs:
            # The static oracle below is only valid for projective trees.
            if not self._is_projective(depgraph):
                continue

            count_proj += 1
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                b0 = conf.buffer[0]
                features = conf.extract_features()
                binary_features = self._convert_to_binary_features(features)

                if len(conf.stack) > 0:
                    s0 = conf.stack[len(conf.stack) - 1]
                    # Left-arc operation
                    rel = self._get_dep_relation(b0, s0, depgraph)
                    if rel is not None:
                        key = Transition.LEFT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.left_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Right-arc operation
                    rel = self._get_dep_relation(s0, b0, depgraph)
                    if rel is not None:
                        precondition = True
                        # Get the max-index of buffer
                        maxID = conf._max_address

                        # Arc-standard may attach b0 to its head only after
                        # b0 has collected all of its own dependents.
                        for w in range(maxID + 1):
                            if w != b0:
                                relw = self._get_dep_relation(b0, w, depgraph)
                                if relw is not None:
                                    if (b0, relw, w) not in conf.arcs:
                                        precondition = False

                        if precondition:
                            key = Transition.RIGHT_ARC + ":" + rel
                            self._write_to_file(key, binary_features, input_file)
                            operation.right_arc(conf, rel)
                            training_seq.append(key)
                            continue

                # Shift operation as the default
                key = Transition.SHIFT
                self._write_to_file(key, binary_features, input_file)
                operation.shift(conf)
                training_seq.append(key)

        print(" Number of training examples : " + str(len(depgraphs)))
        print(" Number of valid (projective) examples : " + str(count_proj))
        return training_seq

    def _create_training_examples_arc_eager(self, depgraphs, input_file):
        """
        Create the training example in the libsvm format and write it to the input_file.
        Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre
        """
        operation = Transition(self.ARC_EAGER)
        countProj = 0
        training_seq = []

        for depgraph in depgraphs:
            # The static oracle below is only valid for projective trees.
            if not self._is_projective(depgraph):
                continue

            countProj += 1
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                b0 = conf.buffer[0]
                features = conf.extract_features()
                binary_features = self._convert_to_binary_features(features)

                if len(conf.stack) > 0:
                    s0 = conf.stack[len(conf.stack) - 1]
                    # Left-arc operation
                    rel = self._get_dep_relation(b0, s0, depgraph)
                    if rel is not None:
                        key = Transition.LEFT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.left_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Right-arc operation
                    rel = self._get_dep_relation(s0, b0, depgraph)
                    if rel is not None:
                        key = Transition.RIGHT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.right_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # reduce operation: allowed once the stack top blocks a
                    # pending arc between b0 and some earlier word.
                    flag = False
                    for k in range(s0):
                        if self._get_dep_relation(k, b0, depgraph) is not None:
                            flag = True
                        if self._get_dep_relation(b0, k, depgraph) is not None:
                            flag = True
                    if flag:
                        key = Transition.REDUCE
                        self._write_to_file(key, binary_features, input_file)
                        operation.reduce(conf)
                        training_seq.append(key)
                        continue

                # Shift operation as the default
                key = Transition.SHIFT
                self._write_to_file(key, binary_features, input_file)
                operation.shift(conf)
                training_seq.append(key)

        print(" Number of training examples : " + str(len(depgraphs)))
        print(" Number of valid (projective) examples : " + str(countProj))
        return training_seq

    def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """
        # Create the temp file before entering the try-block so the
        # finally-clause can never reference an unbound name when the
        # creation itself fails (the original code raised NameError here).
        input_file = tempfile.NamedTemporaryFile(
            prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
        )
        try:
            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameter is set according to the paper:
            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
            # Todo : because of probability = True => very slow due to
            # cross-validation. Need to improve the speed here
            model = svm.SVC(
                kernel="poly",
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True,
            )

            model.fit(x_train, y_train)
            # Save the model (as pickle) via a context manager so the file
            # handle is closed deterministically instead of leaked.
            with open(modelfile, "wb") as model_out:
                pickle.dump(model, model_out)
        finally:
            # Closing an already-closed file is a no-op, so this is safe on
            # every path; then delete the temporary training file.
            input_file.close()
            remove(input_file.name)

    def parse(self, depgraphs, modelFile):
        """
        :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy
        :type depgraphs: list(DependencyGraph)
        :param modelfile: the model file
        :type modelfile: str
        :return: list (DependencyGraph) with the 'head' and 'rel' information
        """
        result = []
        # First load the model.  NOTE: pickle.load executes arbitrary code
        # from the file — only load model files you trust.
        with open(modelFile, "rb") as model_in:
            model = pickle.load(model_in)
        operation = Transition(self._algorithm)

        for depgraph in depgraphs:
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                features = conf.extract_features()
                col = []
                row = []
                data = []
                for feature in features:
                    # Features unseen at training time are silently dropped.
                    if feature in self._dictionary:
                        col.append(self._dictionary[feature])
                        row.append(0)
                        data.append(1.0)
                np_col = array(sorted(col))  # NB : index must be sorted
                np_row = array(row)
                np_data = array(data)

                x_test = sparse.csr_matrix(
                    (np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
                )

                # It's best to use decision function as follow BUT it's not supported yet for sparse SVM
                # Using decision function to build the votes array
                # dec_func = model.decision_function(x_test)[0]
                # votes = {}
                # k = 0
                # for i in range(len(model.classes_)):
                #    for j in range(i+1, len(model.classes_)):
                #        #if  dec_func[k] > 0:
                #            votes.setdefault(i,0)
                #            votes[i] +=1
                #        else:
                #           votes.setdefault(j,0)
                #           votes[j] +=1
                #        k +=1
                # Sort votes according to the values
                # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)

                # We will use predict_proba instead of decision_function
                prob_dict = {}
                pred_prob = model.predict_proba(x_test)[0]
                for i in range(len(pred_prob)):
                    prob_dict[i] = pred_prob[i]
                sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)

                # Try the transitions from most to least probable until one
                # satisfies its precondition; SHIFT is always valid while
                # the buffer is non-empty, so the loop makes progress.
                for (y_pred_idx, confidence) in sorted_Prob:
                    # y_pred = model.predict(x_test)[0]
                    # From the prediction match to the operation
                    y_pred = model.classes_[y_pred_idx]

                    if y_pred in self._match_transition:
                        strTransition = self._match_transition[y_pred]
                        baseTransition = strTransition.split(":")[0]

                        if baseTransition == Transition.LEFT_ARC:
                            if (
                                operation.left_arc(conf, strTransition.split(":")[1])
                                != -1
                            ):
                                break
                        elif baseTransition == Transition.RIGHT_ARC:
                            if (
                                operation.right_arc(conf, strTransition.split(":")[1])
                                != -1
                            ):
                                break
                        elif baseTransition == Transition.REDUCE:
                            if operation.reduce(conf) != -1:
                                break
                        elif baseTransition == Transition.SHIFT:
                            if operation.shift(conf) != -1:
                                break
                    else:
                        raise ValueError(
                            "The predicted transition is not recognized, expected errors"
                        )

            # Finish with operations build the dependency graph from Conf.arcs

            new_depgraph = deepcopy(depgraph)
            for key in new_depgraph.nodes:
                node = new_depgraph.nodes[key]
                node["rel"] = ""
                # With the default, all the token depend on the Root
                node["head"] = 0
            for (head, rel, child) in conf.arcs:
                c_node = new_depgraph.nodes[child]
                c_node["head"] = head
                c_node["rel"] = rel
            result.append(new_depgraph)

        return result
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def demo():
    """
    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
    >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
    >>> gold_sent = DependencyGraph(\"""
    ... Economic JJ 2 ATT
    ... news NN 3 SBJ
    ... has VBD 0 ROOT
    ... little JJ 5 ATT
    ... effect NN 3 OBJ
    ... on IN 5 ATT
    ... financial JJ 8 ATT
    ... markets NNS 6 PC
    ... . . 3 PU
    ... \""")

    >>> conf = Configuration(gold_sent)

    ###################### Check the Initial Feature ########################

    >>> print(', '.join(conf.extract_features()))
    STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ

    ###################### Check The Transition #######################
    Check the Initialized Configuration
    >>> print(conf)
    Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []

    A. Do some transition checks for ARC-STANDARD

    >>> operation = Transition('arc-standard')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,"SBJ")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")

    Middle Configuration and Features Check
    >>> print(conf)
    Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]

    >>> print(', '.join(conf.extract_features()))
    STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT

    >>> operation.right_arc(conf, "PC")
    >>> operation.right_arc(conf, "ATT")
    >>> operation.right_arc(conf, "OBJ")
    >>> operation.shift(conf)
    >>> operation.right_arc(conf, "PU")
    >>> operation.right_arc(conf, "ROOT")
    >>> operation.shift(conf)

    Terminated Configuration Check
    >>> print(conf)
    Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]


    B. Do some transition checks for ARC-EAGER

    >>> conf = Configuration(gold_sent)
    >>> operation = Transition('arc-eager')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'SBJ')
    >>> operation.right_arc(conf,'ROOT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'OBJ')
    >>> operation.right_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'PC')
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.right_arc(conf,'PU')
    >>> print(conf)
    Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]

    ###################### Check The Training Function #######################

    A. Check the ARC-STANDARD training
    >>> import tempfile
    >>> import os
    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)

    >>> parser_std = TransitionParser('arc-standard')
    >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
    Number of training examples : 1
    Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT

    >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
    Number of training examples : 1
    Number of valid (projective) examples : 1
    >>> input_file.close()
    >>> remove(input_file.name)

    B. Check the ARC-EAGER training

    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
    >>> parser_eager = TransitionParser('arc-eager')
    >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
    Number of training examples : 1
    Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU

    >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
    Number of training examples : 1
    Number of valid (projective) examples : 1

    >>> input_file.close()
    >>> remove(input_file.name)

    ###################### Check The Parsing Function ########################

    A. Check the ARC-STANDARD parser

    >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    B. Check the ARC-EAGER parser
    >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    Remove test temporary files
    >>> remove('temp.arceager.model')
    >>> remove('temp.arcstd.model')

    Note that result is very poor because of only one training example.
    """
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Parser Utility Functions
|
| 2 |
+
#
|
| 3 |
+
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
| 4 |
+
# Tom Aarsen <>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
"""
|
| 12 |
+
Utility functions for parsers.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from nltk.data import load
|
| 16 |
+
from nltk.grammar import CFG, PCFG, FeatureGrammar
|
| 17 |
+
from nltk.parse.chart import Chart, ChartParser
|
| 18 |
+
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
|
| 19 |
+
from nltk.parse.pchart import InsideChartParser
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_parser(
    grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.  Only used for CFGs and
        feature CFGs.  If None, the chart class depends on the grammar
        format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, or a subclass thereof.")

    # Probabilistic grammars use a bottom-up probabilistic parser and
    # honor the beam size; all other grammars use a chart-based parser.
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    if isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
    else:
        # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
    return parser(grammar, trace=trace, chart_class=chart_class)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def taggedsent_to_conll(sentence):
    """
    Convert a single POS-tagged sentence into CONLL format,
    yielding one tab-separated line per token.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
    ...     print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _

    :param sentence: A single input sentence to parse
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding a single sentence in CONLL format.
    """
    # CONLL columns: ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL, PHEAD, PDEPREL.
    # Unavailable fields are filled with the "_" placeholder; HEAD/DEPREL get
    # the dummy values "0"/"a" since no parse exists yet.
    for position, (token, postag) in enumerate(sentence, start=1):
        fields = (str(position), token, "_", postag, postag, "_", "0", "a", "_", "_")
        yield "\t".join(fields) + "\n"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def taggedsents_to_conll(sentences):
    """
    Convert a POS-tagged document stream
    (i.e. list of list of tuples, a list of sentences) and yield lines
    in CONLL format.  This module yields one line per word and two newlines
    for end of sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
    ...     if line:
    ...         print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>
    1 Is _ VBZ VBZ _ 0 a _ _
    2 that _ IN IN _ 0 a _ _
    3 right _ NN NN _ 0 a _ _
    4 ? _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to parse
    :type sentence: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    for tagged_sentence in sentences:
        # Emit every token line of the sentence, then a sentence separator
        # (two newlines, per the CONLL convention).
        for conll_line in taggedsent_to_conll(tagged_sentence):
            yield conll_line
        yield "\n\n"
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
######################################################################
|
| 144 |
+
# { Test Suites
|
| 145 |
+
######################################################################
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class TestGrammar:
    """
    Unit tests for CFG.

    Builds a parser from ``grammar`` and checks it against a test
    ``suite`` of sentences that should be accepted or rejected.
    """

    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar
        # Parser is constructed once and reused for every test sentence.
        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.

        :param show_trees: If True, print each parse tree found for an
            accepted sentence.
        :raises ValueError: If an ``accept`` sentence fails to parse, or a
            ``reject`` sentence receives a parse.
        """
        for test in self.suite:
            print(test["doc"] + ":", end=" ")
            # Initialize the per-test flags up front.  Previously they were
            # only assigned inside the sentence loop, so a test whose
            # "accept" or "reject" list was empty raised NameError at the
            # final check instead of simply not printing the success line.
            accepted = False
            rejected = False
            for key in ["accept", "reject"]:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == "accept":
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse'" % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse'" % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees is should have,

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuple of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    parsed = []
    for line in string.split("\n"):
        # Skip blank lines and comment lines.
        if not line or line[0] in comment_chars:
            continue
        # An optional "<bool>:" or "<int>:" prefix carries the expected result.
        prefix, sep, remainder = line.partition(":")
        expected = None
        if sep:
            if prefix in ("True", "true", "False", "false"):
                expected = prefix in ("True", "true")
            else:
                expected = int(prefix)
            line = remainder
        tokens = line.split()
        if not tokens:
            continue
        parsed.append((tokens, expected))
    return parsed
|
.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Viterbi Probabilistic Parser
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from functools import reduce
|
| 10 |
+
|
| 11 |
+
from nltk.parse.api import ParserI
|
| 12 |
+
from nltk.tree import ProbabilisticTree, Tree
|
| 13 |
+
|
| 14 |
+
##//////////////////////////////////////////////////////
|
| 15 |
+
## Viterbi PCFG Parser
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ViterbiParser(ParserI):
|
| 20 |
+
"""
|
| 21 |
+
A bottom-up ``PCFG`` parser that uses dynamic programming to find
|
| 22 |
+
the single most likely parse for a text. The ``ViterbiParser`` parser
|
| 23 |
+
parses texts by filling in a "most likely constituent table".
|
| 24 |
+
This table records the most probable tree representation for any
|
| 25 |
+
given span and node value. In particular, it has an entry for
|
| 26 |
+
every start index, end index, and node value, recording the most
|
| 27 |
+
likely subtree that spans from the start index to the end index,
|
| 28 |
+
and has the given node value.
|
| 29 |
+
|
| 30 |
+
The ``ViterbiParser`` parser fills in this table incrementally. It starts
|
| 31 |
+
by filling in all entries for constituents that span one element
|
| 32 |
+
of text (i.e., entries where the end index is one greater than the
|
| 33 |
+
start index). After it has filled in all table entries for
|
| 34 |
+
constituents that span one element of text, it fills in the
|
| 35 |
+
entries for constitutants that span two elements of text. It
|
| 36 |
+
continues filling in the entries for constituents spanning larger
|
| 37 |
+
and larger portions of the text, until the entire table has been
|
| 38 |
+
filled. Finally, it returns the table entry for a constituent
|
| 39 |
+
spanning the entire text, whose node value is the grammar's start
|
| 40 |
+
symbol.
|
| 41 |
+
|
| 42 |
+
In order to find the most likely constituent with a given span and
|
| 43 |
+
node value, the ``ViterbiParser`` parser considers all productions that
|
| 44 |
+
could produce that node value. For each production, it finds all
|
| 45 |
+
children that collectively cover the span and have the node values
|
| 46 |
+
specified by the production's right hand side. If the probability
|
| 47 |
+
of the tree formed by applying the production to the children is
|
| 48 |
+
greater than the probability of the current entry in the table,
|
| 49 |
+
then the table is updated with this new tree.
|
| 50 |
+
|
| 51 |
+
A pseudo-code description of the algorithm used by
|
| 52 |
+
``ViterbiParser`` is:
|
| 53 |
+
|
| 54 |
+
| Create an empty most likely constituent table, *MLC*.
|
| 55 |
+
| For width in 1...len(text):
|
| 56 |
+
| For start in 1...len(text)-width:
|
| 57 |
+
| For prod in grammar.productions:
|
| 58 |
+
| For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
|
| 59 |
+
| where t[i].label()==prod.rhs[i],
|
| 60 |
+
| and the sequence covers [start:start+width]:
|
| 61 |
+
| old_p = MLC[start, start+width, prod.lhs]
|
| 62 |
+
| new_p = P(t[1])P(t[1])...P(t[n])P(prod)
|
| 63 |
+
| if new_p > old_p:
|
| 64 |
+
| new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
|
| 65 |
+
| MLC[start, start+width, prod.lhs] = new_tree
|
| 66 |
+
| Return MLC[0, len(text), start_symbol]
|
| 67 |
+
|
| 68 |
+
:type _grammar: PCFG
|
| 69 |
+
:ivar _grammar: The grammar used to parse sentences.
|
| 70 |
+
:type _trace: int
|
| 71 |
+
:ivar _trace: The level of tracing output that should be generated
|
| 72 |
+
when parsing a text.
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
def __init__(self, grammar, trace=0):
|
| 76 |
+
"""
|
| 77 |
+
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
|
| 78 |
+
parse texts.
|
| 79 |
+
|
| 80 |
+
:type grammar: PCFG
|
| 81 |
+
:param grammar: The grammar used to parse texts.
|
| 82 |
+
:type trace: int
|
| 83 |
+
:param trace: The level of tracing that should be used when
|
| 84 |
+
parsing a text. ``0`` will generate no tracing output;
|
| 85 |
+
and higher numbers will produce more verbose tracing
|
| 86 |
+
output.
|
| 87 |
+
"""
|
| 88 |
+
self._grammar = grammar
|
| 89 |
+
self._trace = trace
|
| 90 |
+
|
| 91 |
+
    def grammar(self):
        """Return the grammar this parser uses (set in ``__init__``)."""
        return self._grammar
|
| 93 |
+
|
| 94 |
+
    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace
|
| 106 |
+
|
| 107 |
+
    def parse(self, tokens):
        # Inherit docs from ParserI
        """
        Yield the single most likely parse for ``tokens`` (or nothing, if
        no constituent covering the whole input has the grammar's start
        symbol), filling a most-likely-constituent table bottom-up.
        """
        tokens = list(tokens)
        # Raises an error if any token is not covered by the grammar.
        self._grammar.check_coverage(tokens)

        # The most likely constituent table. This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens. For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value. For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text (each token trivially covers its own length-1 span).
        if self._trace:
            print("Inserting tokens into the most likely" + " constituents table...")
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index + 1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        # Shorter spans must be complete before longer ones are attempted.
        for length in range(1, len(tokens) + 1):
            if self._trace:
                print(
                    "Finding the most likely constituents"
                    + " spanning %d text elements..." % length
                )
            for start in range(len(tokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, tokens)

        # Return the tree that spans the entire text & have the right cat
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is not None:
            yield tree
|
| 147 |
+
|
| 148 |
+
    def _add_constituents_spanning(self, span, constituents, tokens):
        """
        Find any constituents that might cover ``span``, and add them
        to the most likely constituents table.

        :rtype: None
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find possible constituents.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent.  I.e., the constituent should cover
            ``text[span[0]:span[1]]``, where ``text`` is the text
            that we are parsing.

        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  In particular,
            ``constituents(s,e,nv)`` is the most likely
            ``ProbabilisticTree`` that covers ``text[s:e]``
            and has a node value ``nv.symbol()``, where ``text``
            is the text that we are parsing.  When
            ``_add_constituents_spanning`` is called, ``constituents``
            should contain all possible constituents that are shorter
            than ``span``.

        :type tokens: list of tokens
        :param tokens: The text we are parsing.  This is only used for
            trace output.
        """
        # Since some of the grammar productions may be unary, we need to
        # repeatedly try all of the productions until none of them add any
        # new constituents.  (A unary production over this same span can
        # only fire after its child constituent has been entered.)
        changed = True
        while changed:
            changed = False

            # Find all ways instantiations of the grammar productions that
            # cover the span.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, add a new
            # ProbabilisticTree whose probability is the product
            # of the childrens' probabilities and the production's
            # probability.
            for (production, children) in instantiations:
                # Tokens carry no probability; only subtree probs multiply in.
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If it's new a constituent, then add it to the
                # constituents dictionary.
                c = constituents.get((span[0], span[1], production.lhs()))
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print(" Insert:", end=" ")
                        else:
                            print(" Discard:", end=" ")
                        self._trace_production(production, p, span, len(tokens))
                # Keep only the highest-probability tree for this (span, lhs).
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = True
|
| 215 |
+
|
| 216 |
+
def _find_instantiations(self, span, constituents):
|
| 217 |
+
"""
|
| 218 |
+
:return: a list of the production instantiations that cover a
|
| 219 |
+
given span of the text. A "production instantiation" is
|
| 220 |
+
a tuple containing a production and a list of children,
|
| 221 |
+
where the production's right hand side matches the list of
|
| 222 |
+
children; and the children cover ``span``. :rtype: list
|
| 223 |
+
of ``pair`` of ``Production``, (list of
|
| 224 |
+
(``ProbabilisticTree`` or token.
|
| 225 |
+
|
| 226 |
+
:type span: tuple(int, int)
|
| 227 |
+
:param span: The section of the text for which we are
|
| 228 |
+
trying to find production instantiations. The span is
|
| 229 |
+
specified as a pair of integers, where the first integer
|
| 230 |
+
is the index of the first token that should be covered by
|
| 231 |
+
the production instantiation; and the second integer is
|
| 232 |
+
the index of the first token that should not be covered by
|
| 233 |
+
the production instantiation.
|
| 234 |
+
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
| 235 |
+
:param constituents: The most likely constituents table. This
|
| 236 |
+
table records the most probable tree representation for
|
| 237 |
+
any given span and node value. See the module
|
| 238 |
+
documentation for more information.
|
| 239 |
+
"""
|
| 240 |
+
rv = []
|
| 241 |
+
for production in self._grammar.productions():
|
| 242 |
+
childlists = self._match_rhs(production.rhs(), span, constituents)
|
| 243 |
+
|
| 244 |
+
for childlist in childlists:
|
| 245 |
+
rv.append((production, childlist))
|
| 246 |
+
return rv
|
| 247 |
+
|
| 248 |
+
def _match_rhs(self, rhs, span, constituents):
    """
    :return: a set of all the lists of children that cover ``span``
        and that match ``rhs``.
    :rtype: list(list(ProbabilisticTree or token)

    :type rhs: list(Nonterminal or any)
    :param rhs: The list specifying what kinds of children need to
        cover ``span``.  Each nonterminal in ``rhs`` specifies
        that the corresponding child should be a tree whose node
        value is that nonterminal's symbol.  Each terminal in ``rhs``
        specifies that the corresponding child should be a token
        whose type is that terminal.
    :type span: tuple(int, int)
    :param span: The section of the text for which we are
        trying to find child lists.  The span is specified as a
        pair of integers, where the first integer is the index of
        the first token that should be covered by the child list;
        and the second integer is the index of the first token
        that should not be covered by the child list.
    :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
    :param constituents: The most likely constituents table.  This
        table records the most probable tree representation for
        any given span and node value.  See the module
        documentation for more information.
    """
    start, end = span

    # Base cases: an empty span matches exactly the empty rhs (yielding a
    # single empty child list); otherwise an empty span or empty rhs has
    # no matches.
    if start >= end:
        return [[]] if rhs == () else []
    if rhs == ():
        return []

    # Try every split point where a constituent for the first rhs symbol
    # ends, and recursively match the remaining symbols on the rest of
    # the span.
    first_sym, rest_syms = rhs[0], rhs[1:]
    matches = []
    for boundary in range(start, end + 1):
        candidate = constituents.get((start, boundary, first_sym))
        if candidate is None:
            continue
        for tail in self._match_rhs(rest_syms, (boundary, end), constituents):
            matches.append([candidate] + tail)

    return matches
def _trace_production(self, production, p, span, width):
    """
    Print trace output indicating that a given production has been
    applied at a given location.

    :param production: The production that has been applied
    :type production: Production
    :param p: The probability of the tree produced by the production.
    :type p: float
    :param span: The span of the production
    :type span: tuple
    :rtype: None
    """
    # Build a span diagram like "|..===..| PRODUCTION" (local renamed
    # from ``str`` to avoid shadowing the builtin).
    covered = "=" * (span[1] - span[0])
    trailing = "." * (width - span[1])
    line = "|" + "." * span[0] + covered + trailing + "| "
    line += "%s" % production
    if self._trace > 2:
        # At high trace levels, also show the tree's probability.
        line = f"{line:<40} {p:12.10f} "

    print(line)
def _trace_lexical_insertion(self, token, index, width):
    """Print trace output for inserting *token* at position *index*."""
    # Diagram marks the token's position with "=" among "." placeholders.
    before = "." * index
    after = "." * (width - index - 1)
    print(f"   Insert: |{before}={after}| {token}")
def __repr__(self):
    """Return an unambiguous string representation of this parser."""
    return f"<ViterbiParser for {self._grammar!r}>"
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
##//////////////////////////////////////////////////////
|
| 325 |
+
## Test Code
|
| 326 |
+
##//////////////////////////////////////////////////////
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not silently swallowed; only a malformed or out-of-range
        # selection is treated as "bad".
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from ``time`` to avoid rebinding (shadowing) the time module.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
|
| 452 |
+
# Run the interactive demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py
ADDED
|
@@ -0,0 +1,1605 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to Boxer
|
| 2 |
+
# <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
|
| 3 |
+
#
|
| 4 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
An interface to Boxer.
|
| 12 |
+
|
| 13 |
+
This interface relies on the latest version of the development (subversion) version of
|
| 14 |
+
C&C and Boxer.
|
| 15 |
+
|
| 16 |
+
Usage
|
| 17 |
+
=====
|
| 18 |
+
|
| 19 |
+
Set the environment variable CANDC to the bin directory of your CandC installation.
|
| 20 |
+
The models directory should be in the CandC root directory.
|
| 21 |
+
For example::
|
| 22 |
+
|
| 23 |
+
/path/to/candc/
|
| 24 |
+
bin/
|
| 25 |
+
candc
|
| 26 |
+
boxer
|
| 27 |
+
models/
|
| 28 |
+
boxer/
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import operator
|
| 32 |
+
import os
|
| 33 |
+
import re
|
| 34 |
+
import subprocess
|
| 35 |
+
import tempfile
|
| 36 |
+
from functools import reduce
|
| 37 |
+
from optparse import OptionParser
|
| 38 |
+
|
| 39 |
+
from nltk.internals import find_binary
|
| 40 |
+
from nltk.sem.drt import (
|
| 41 |
+
DRS,
|
| 42 |
+
DrtApplicationExpression,
|
| 43 |
+
DrtEqualityExpression,
|
| 44 |
+
DrtNegatedExpression,
|
| 45 |
+
DrtOrExpression,
|
| 46 |
+
DrtParser,
|
| 47 |
+
DrtProposition,
|
| 48 |
+
DrtTokens,
|
| 49 |
+
DrtVariableExpression,
|
| 50 |
+
)
|
| 51 |
+
from nltk.sem.logic import (
|
| 52 |
+
ExpectedMoreTokensException,
|
| 53 |
+
LogicalExpressionException,
|
| 54 |
+
UnexpectedTokenException,
|
| 55 |
+
Variable,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class Boxer:
    """
    This class is an interface to Johan Bos's program Boxer, a wide-coverage
    semantic parser that produces Discourse Representation Structures (DRSs).
    """

    def __init__(
        self,
        boxer_drs_interpreter=None,
        elimeq=False,
        bin_dir=None,
        verbose=False,
        resolve=True,
    ):
        """
        :param boxer_drs_interpreter: A class that converts from the
            ``AbstractBoxerDrs`` object hierarchy to a different object.  The
            default is ``NltkDrtBoxerDrsInterpreter``, which converts to the
            NLTK DRT hierarchy.
        :param elimeq: When set to true, Boxer removes all equalities from the
            DRSs and discourse referents standing in the equality relation are
            unified, but only if this can be done in a meaning-preserving manner.
        :param resolve: When set to true, Boxer will resolve all anaphoric DRSs
            and perform merge-reduction.  Resolution follows Van der Sandt's
            theory of binding and accommodation.
        """
        if boxer_drs_interpreter is None:
            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
        self._boxer_drs_interpreter = boxer_drs_interpreter

        self._resolve = resolve
        self._elimeq = elimeq

        self.set_bin_dir(bin_dir, verbose)

    def set_bin_dir(self, bin_dir, verbose=False):
        """Locate the ``candc`` and ``boxer`` binaries and the models path."""
        self._candc_bin = self._find_binary("candc", bin_dir, verbose)
        # The models directory is assumed to sit next to the "bin" directory
        # in the C&C installation root; ``self._candc_bin`` ends in "candc",
        # so stripping the last 5 characters yields its containing directory.
        self._candc_models_path = os.path.normpath(
            os.path.join(self._candc_bin[:-5], "../models")
        )
        self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)

    def interpret(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: str Input sentence to parse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: list of str Input sentences to parse as a single discourse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of str Input sentences to parse as individual discourses
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :return: list of ``drt.DrtExpression``
        """
        # Wrap each sentence as a one-sentence discourse and delegate.
        return self.interpret_multi_sents(
            [[input] for input in inputs], discourse_ids, question, verbose
        )

    def interpret_multi_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of list of str Input discourses to parse
        :param occur_index: bool Should predicates be occurrence indexed?
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :return: ``drt.DrtExpression``
        """
        if discourse_ids is not None:
            assert len(inputs) == len(discourse_ids)
            assert reduce(operator.and_, (id is not None for id in discourse_ids))
            use_disc_id = True
        else:
            # Fall back to positional ids when the caller supplied none.
            discourse_ids = list(map(str, range(len(inputs))))
            use_disc_id = False

        candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
        boxer_out = self._call_boxer(candc_out, verbose=verbose)

        # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
        #    raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)

        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
        # Preserve input order; missing discourses yield None.
        return [drs_dict.get(id, None) for id in discourse_ids]

    def _call_candc(self, inputs, discourse_ids, question, verbose=False):
        """
        Call the ``candc`` binary with the given input.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :param filename: str A filename for the output file
        :return: stdout
        """
        args = [
            "--models",
            os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
            "--candc-printer",
            "boxer",
        ]
        # Each discourse is prefixed with a <META> line carrying its id.
        return self._call(
            "\n".join(
                sum(
                    ([f"<META>'{id}'"] + d for d, id in zip(inputs, discourse_ids)),
                    [],
                )
            ),
            self._candc_bin,
            args,
            verbose,
        )

    def _call_boxer(self, candc_out, verbose=False):
        """
        Call the ``boxer`` binary with the given input.

        :param candc_out: str output from C&C parser
        :return: stdout
        """
        # Boxer reads its input from a file, so stage the C&C output in a
        # temporary file that is removed once Boxer has run.
        f = None
        try:
            fd, temp_filename = tempfile.mkstemp(
                prefix="boxer-", suffix=".in", text=True
            )
            f = os.fdopen(fd, "w")
            f.write(candc_out.decode("utf-8"))
        finally:
            if f:
                f.close()

        args = [
            "--box",
            "false",
            "--semantics",
            "drs",
            #'--flat', 'false', # removed from boxer
            "--resolve",
            ["false", "true"][self._resolve],
            "--elimeq",
            ["false", "true"][self._elimeq],
            "--format",
            "prolog",
            "--instantiate",
            "true",
            "--input",
            temp_filename,
        ]
        stdout = self._call(None, self._boxer_bin, args, verbose)
        os.remove(temp_filename)
        return stdout

    def _find_binary(self, name, bin_dir, verbose=False):
        """Locate a C&C binary by name, honoring the CANDC env variable."""
        return find_binary(
            name,
            path_to_bin=bin_dir,
            env_vars=["CANDC"],
            url="http://svn.ask.it.usyd.edu.au/trac/candc/",
            binary_names=[name, name + ".exe"],
            verbose=verbose,
        )

    def _call(self, input_str, binary, args=None, verbose=False):
        """
        Call the binary with the given input.

        :param input_str: A string whose contents are used as stdin.
        :param binary: The location of the binary to call
        :param args: A list of command-line arguments.
        :return: stdout
        """
        # Avoid a mutable default argument; None means "no extra arguments".
        if args is None:
            args = []
        if verbose:
            print("Calling:", binary)
            print("Args:", args)
            print("Input:", input_str)
            print("Command:", binary + " " + " ".join(args))

        # Call via a subprocess.  The input (if any) is fed through stdin
        # rather than an ``echo "..." | ...`` shell pipeline: running with
        # shell=False avoids shell-quoting/injection problems when the
        # sentence text contains quotes, backticks, or other metacharacters.
        cmd = [binary] + args
        if input_str is None:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        else:
            p = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # ``echo`` appended a trailing newline; preserve that behavior.
            stdout, stderr = p.communicate(input=(input_str + "\n").encode("utf-8"))

        if verbose:
            print("Return code:", p.returncode)
            if stdout:
                print("stdout:\n", stdout, "\n")
            if stderr:
                print("stderr:\n", stderr, "\n")
        if p.returncode != 0:
            raise Exception(
                "ERROR CALLING: {} {}\nReturncode: {}\n{}".format(
                    binary, " ".join(args), p.returncode, stderr
                )
            )

        return stdout

    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
        """Parse Boxer's Prolog output into a {discourse_id: DRS} mapping."""
        lines = boxer_out.decode("utf-8").split("\n")
        drs_dict = {}
        i = 0
        while i < len(lines):
            line = lines[i]
            if line.startswith("id("):
                # Line looks like: id('<discourse_id>',<drs_id>).
                comma_idx = line.index(",")
                discourse_id = line[3:comma_idx]
                if discourse_id[0] == "'" and discourse_id[-1] == "'":
                    discourse_id = discourse_id[1:-1]
                drs_id = line[comma_idx + 1 : line.index(")")]
                i += 1
                line = lines[i]
                assert line.startswith(f"sem({drs_id},")
                if line[-4:] == "').'":
                    line = line[:-4] + ")."
                assert line.endswith(")."), f"can't parse line: {line}"

                # Skip past the word-tag list (balanced [...]) that precedes
                # the DRS term itself.
                search_start = len(f"sem({drs_id},[")
                brace_count = 1
                drs_start = -1
                for j, c in enumerate(line[search_start:]):
                    if c == "[":
                        brace_count += 1
                    if c == "]":
                        brace_count -= 1
                    if brace_count == 0:
                        drs_start = search_start + j + 1
                        if line[drs_start : drs_start + 3] == "','":
                            drs_start = drs_start + 3
                        else:
                            drs_start = drs_start + 1
                        break
                assert drs_start > -1

                drs_input = line[drs_start:-2].strip()
                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
            i += 1
        return drs_dict

    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
        """Parse a single Prolog DRS string, optionally tagged with its id."""
        return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
| 335 |
+
|
| 336 |
+
class BoxerOutputDrsParser(DrtParser):
    """Parse the Prolog DRS output from Boxer into a hierarchy of
    ``AbstractBoxerDrs`` objects.

    Boxer tags every referent/condition with packed indices of the form
    ``1000 * (sent + 1) + (word + 1)``; this parser unpacks them into
    0-based (sentence, word) positions.
    """

    def __init__(self, discourse_id=None):
        """
        :param discourse_id: identifier attached to every parsed condition,
            or ``None`` when discourse ids are not tracked
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id
        self.sentence_id_offset = None
        self.quote_chars = [("'", "'", "\\", False)]

    def parse(self, data, signature=None):
        return DrtParser.parse(self, data, signature)

    def get_all_symbols(self):
        return ["(", ")", ",", "[", "]", ":"]

    def handle(self, tok, context):
        return self.handle_drs(tok)

    def attempt_adjuncts(self, expression, context):
        # Boxer output carries no adjuncts; return the expression untouched.
        return expression

    def parse_condition(self, indices):
        """
        Parse a DRS condition

        :return: list of ``DrtExpression``
        """
        tok = self.token()
        accum = self.handle_condition(tok, indices)
        if accum is None:
            raise UnexpectedTokenException(tok)
        return accum

    def handle_drs(self, tok):
        # Top-level DRS constructors; returns None for unknown tokens.
        if tok == "drs":
            return self.parse_drs()
        elif tok in ["merge", "smerge"]:
            return self._handle_binary_expression(self._make_merge_expression)(None, [])
        elif tok in ["alfa"]:
            return self._handle_alfa(self._make_merge_expression)(None, [])

    def handle_condition(self, tok, indices):
        """
        Handle a DRS condition

        :param indices: list of int
        :return: list of ``DrtExpression``
        """
        if tok == "not":
            return [self._handle_not()]

        if tok == "or":
            conds = [self._handle_binary_expression(self._make_or_expression)]
        elif tok == "imp":
            conds = [self._handle_binary_expression(self._make_imp_expression)]
        elif tok == "eq":
            conds = [self._handle_eq()]
        elif tok == "prop":
            conds = [self._handle_prop()]

        elif tok == "pred":
            conds = [self._handle_pred()]
        elif tok == "named":
            conds = [self._handle_named()]
        elif tok == "rel":
            conds = [self._handle_rel()]
        elif tok == "timex":
            conds = self._handle_timex()
        elif tok == "card":
            conds = [self._handle_card()]

        elif tok == "whq":
            conds = [self._handle_whq()]
        elif tok == "duplex":
            conds = [self._handle_duplex()]

        else:
            conds = []

        # Instantiate each parsed condition once per (sentence, word-indices)
        # pair carried by ``indices``.
        return sum(
            (
                [cond(sent_index, word_indices) for cond in conds]
                for sent_index, word_indices in self._sent_and_word_indices(indices)
            ),
            [],
        )

    def _handle_not(self):
        self.assertToken(self.token(), "(")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return BoxerNot(drs)

    def _handle_pred(self):
        # pred(_G3943, dog, n, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        pos = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")

        def _handle_pred_f(sent_index, word_indices):
            return BoxerPred(
                self.discourse_id, sent_index, word_indices, variable, name, pos, sense
            )

        return _handle_pred_f

    def _handle_duplex(self):
        # duplex(whq, drs(...), var, drs(...))
        self.assertToken(self.token(), "(")
        # Answer types are not parsed from duplex conditions; the list stays
        # empty (the bracketed answer-type syntax only occurs in ``whq``).
        ans_types = []

        self.assertToken(self.token(), "whq")
        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _handle_named(self):
        # named(x0, john, per, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        type = self.token()
        self.assertToken(self.token(), ",")
        sense = self.token()  # as per boxer rev 2554
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerNamed(
            self.discourse_id, sent_index, word_indices, variable, name, type, sense
        )

    def _handle_rel(self):
        # rel(_G3993, _G3943, agent, 0)
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ",")
        rel = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerRel(
            self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
        )

    def _handle_timex(self):
        # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
        self.assertToken(self.token(), "(")
        arg = self.parse_variable()
        self.assertToken(self.token(), ",")
        new_conds = self._handle_time_expression(arg)
        self.assertToken(self.token(), ")")
        return new_conds

    def _handle_time_expression(self, arg):
        # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
        tok = self.token()
        self.assertToken(self.token(), "(")
        if tok == "date":
            conds = self._handle_date(arg)
        elif tok == "time":
            conds = self._handle_time(arg)
        else:
            return None
        self.assertToken(self.token(), ")")
        # BUGFIX: ``cond`` is bound as a default argument; a plain closure is
        # late-bound, so every lambda would have returned the *last* cond.
        return [
            lambda sent_index, word_indices: BoxerPred(
                self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
            )
        ] + [lambda sent_index, word_indices, cond=cond: cond for cond in conds]

    def _handle_date(self, arg):
        # []: (+), []:'XXXX', [1004]:'04', []:'XX'
        conds = []
        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        self.assertToken(self.token(), "(")
        pol = self.token()
        self.assertToken(self.token(), ")")
        conds.append(
            BoxerPred(
                self.discourse_id,
                sent_index,
                word_indices,
                arg,
                f"date_pol_{pol}",
                "a",
                0,
            )
        )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        year = self.token()
        if year != "XXXX":
            year = year.replace(":", "_")
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_year_{year}",
                    "a",
                    0,
                )
            )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        month = self.token()
        if month != "XX":
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_month_{month}",
                    "a",
                    0,
                )
            )
        self.assertToken(self.token(), ",")

        ((sent_index, word_indices),) = self._sent_and_word_indices(
            self._parse_index_list()
        )
        day = self.token()
        if day != "XX":
            conds.append(
                BoxerPred(
                    self.discourse_id,
                    sent_index,
                    word_indices,
                    arg,
                    f"date_day_{day}",
                    "a",
                    0,
                )
            )

        return conds

    def _handle_time(self, arg):
        # time([1018]:'18', []:'XX', []:'XX')
        conds = []
        self._parse_index_list()
        hour = self.token()
        if hour != "XX":
            conds.append(self._make_atom("r_hour_2", arg, hour))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        min = self.token()
        if min != "XX":
            conds.append(self._make_atom("r_min_2", arg, min))
        self.assertToken(self.token(), ",")

        self._parse_index_list()
        sec = self.token()
        if sec != "XX":
            conds.append(self._make_atom("r_sec_2", arg, sec))

        return conds

    def _handle_card(self):
        # card(_G18535, 28, ge)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        value = self.token()
        self.assertToken(self.token(), ",")
        type = self.token()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerCard(
            self.discourse_id, sent_index, word_indices, variable, value, type
        )

    def _handle_prop(self):
        # prop(_G15949, drs(...))
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerProp(
            self.discourse_id, sent_index, word_indices, variable, drs
        )

    def _parse_index_list(self):
        # [1001,1002]:
        indices = []
        self.assertToken(self.token(), "[")
        while self.token(0) != "]":
            indices.append(self.parse_index())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ":")
        return indices

    def parse_drs(self):
        # drs([[1001]:_G3943],
        #     [[1002]:pred(_G3943, dog, n, 0)]
        #    )
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        refs = set()
        while self.token(0) != "]":
            indices = self._parse_index_list()
            refs.add(self.parse_variable())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ",")
        self.assertToken(self.token(), "[")
        conds = []
        while self.token(0) != "]":
            indices = self._parse_index_list()
            conds.extend(self.parse_condition(indices))
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ")")
        return BoxerDrs(list(refs), conds)

    def _handle_binary_expression(self, make_callback):
        self.assertToken(self.token(), "(")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_alfa(self, make_callback):
        self.assertToken(self.token(), "(")
        type = self.token()
        self.assertToken(self.token(), ",")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )

    def _handle_eq(self):
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerEq(
            self.discourse_id, sent_index, word_indices, var1, var2
        )

    def _handle_whq(self):
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        ans_types = []
        while self.token(0) != "]":
            cat = self.token()
            self.assertToken(self.token(), ":")
            if cat == "des":
                ans_types.append(self.token())
            elif cat == "num":
                ans_types.append("number")
                typ = self.token()
                if typ == "cou":
                    ans_types.append("count")
                else:
                    ans_types.append(typ)
            else:
                ans_types.append(self.token())
        self.token()  # swallow the ']'

        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )

    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)

    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)

    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
        # An implication is represented as a DRS with a consequent.
        return BoxerDrs(drs1.refs, drs1.conds, drs2)

    def parse_variable(self):
        var = self.token()
        assert re.match(r"^[exps]\d+$", var), var
        return var

    def parse_index(self):
        return int(self.token())

    def _sent_and_word_indices(self, indices):
        """
        Unpack Boxer indices (``1000 * sent + word``, both 1-based) into
        per-sentence groups of 0-based word indices.

        :return: list of (sent_index, word_indices) tuples
        """
        # BUGFIX: integer division. Under Python 3, ``/`` produced float
        # sentence indices (e.g. 1004 -> 0.004 instead of 0).
        sent_indices = {(i // 1000) - 1 for i in indices if i >= 0}
        if sent_indices:
            pairs = []
            for sent_index in sent_indices:
                word_indices = [
                    (i % 1000) - 1 for i in indices if sent_index == (i // 1000) - 1
                ]
                pairs.append((sent_index, word_indices))
            return pairs
        else:
            word_indices = [(i % 1000) - 1 for i in indices]
            return [(None, word_indices)]
|
| 798 |
+
class BoxerDrsParser(DrtParser):
    """
    Reparse the str form of subclasses of ``AbstractBoxerDrs``
    """

    def __init__(self, discourse_id=None):
        """
        :param discourse_id: if given, overrides the discourse id embedded in
            the input string for every condition
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id

    def get_all_symbols(self):
        return [
            DrtTokens.OPEN,
            DrtTokens.CLOSE,
            DrtTokens.COMMA,
            DrtTokens.OPEN_BRACKET,
            DrtTokens.CLOSE_BRACKET,
        ]

    def attempt_adjuncts(self, expression, context):
        return expression

    def _parse_discourse_id(self):
        # Prefer the parser-wide discourse id; otherwise consume one from the
        # input stream. (This expression was previously repeated in every
        # branch of ``handle``.)
        return self.discourse_id if self.discourse_id is not None else self.token()

    def handle(self, tok, context):
        """Dispatch on the condition name ``tok`` and rebuild the matching
        ``AbstractBoxerDrs`` subclass instance.

        :raises LogicalExpressionException: wrapping any failure while parsing
            the condition's arguments
        """
        try:
            if tok == "pred":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                pos = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
            elif tok == "named":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: was ``int(self.token())``, which could not round-trip
                # a ``None`` sentence index printed by BoxerNamed.__repr__.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map; a bare map iterator stored on
                # the condition breaks repeated __repr__/__eq__ under Python 3.
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                ntype = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNamed(
                    disc_id, sent_id, word_ids, variable, name, ntype, sense
                )
            elif tok == "rel":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                rel = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
            elif tok == "prop":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: was ``int(self.token())`` — see ``named`` above.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
            elif tok == "not":
                self.assertNextToken(DrtTokens.OPEN)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNot(drs)
            elif tok == "imp":
                self.assertNextToken(DrtTokens.OPEN)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerDrs(drs1.refs, drs1.conds, drs2)
            elif tok == "or":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map (see ``named`` above).
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
            elif tok == "eq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
            elif tok == "card":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUGFIX: materialize the map (see ``named`` above).
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                value = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                ctype = self.token()
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerCard(disc_id, sent_id, word_ids, var, value, ctype)
            elif tok == "whq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._parse_discourse_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                ans_types = self.handle_refs()
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
        except Exception as e:
            raise LogicalExpressionException(self._currentIndex, str(e)) from e
        # Unknown condition name: no branch matched and nothing raised.
        assert False, repr(tok)

    def nullableIntToken(self):
        # The str form prints a missing sentence id as the literal "None".
        t = self.token()
        return int(t) if t != "None" else None

    def get_next_token_variable(self, description):
        try:
            return self.token()
        except ExpectedMoreTokensException as e:
            raise ExpectedMoreTokensException(e.index, "Variable expected.") from e
+
|
| 997 |
+
class AbstractBoxerDrs:
    """Common behaviour shared by all parsed Boxer DRS nodes.

    Subclasses override ``_variables``, ``atoms``, ``clean`` and
    ``renumber_sentences``; the defaults here describe a node with no
    variables, no atoms and nothing to rewrite.
    """

    def variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        all_vars, events, props = self._variables()
        # Events and propositions are carved out of the plain variables, and
        # anything that is an event cannot also count as a proposition.
        plain = all_vars - (events | props)
        return (plain, events, props - events)

    def variable_types(self):
        """Map every variable to a one-letter type code: z/e/p."""
        return {
            var: kind
            for kind, group in zip(("z", "e", "p"), self.variables())
            for var in group
        }

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        return (set(), set(), set())

    def atoms(self):
        return set()

    def clean(self):
        # Nothing to normalize at this level.
        return self

    def _clean_name(self, name):
        # Normalize characters that are unsafe in logic identifiers.
        return name.replace("-", "_").replace("'", "_")

    def renumber_sentences(self, f):
        # No sentence indices at this level.
        return self

    def __hash__(self):
        return hash(f"{self}")
| 1033 |
+
|
| 1034 |
+
class BoxerDrs(AbstractBoxerDrs):
    """A DRS box: a list of discourse referents plus a list of conditions.

    When ``consequent`` is set, the whole node represents an implication
    whose antecedent is this box and whose consequent is the given DRS.
    """

    def __init__(self, refs, conds, consequent=None):
        """
        :param refs: list of discourse referents
        :param conds: list of ``AbstractBoxerDrs`` conditions
        :param consequent: optional consequent DRS (makes this an ``imp``)
        """
        AbstractBoxerDrs.__init__(self)
        self.refs = refs
        self.conds = conds
        self.consequent = consequent

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        variables = (set(), set(), set())
        for cond in self.conds:
            for s, v in zip(variables, cond._variables()):
                s.update(v)
        if self.consequent is not None:
            for s, v in zip(variables, self.consequent._variables()):
                s.update(v)
        return variables

    def atoms(self):
        atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set())
        if self.consequent is not None:
            atoms.update(self.consequent.atoms())
        return atoms

    def clean(self):
        consequent = self.consequent.clean() if self.consequent else None
        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)

    def renumber_sentences(self, f):
        consequent = self.consequent.renumber_sentences(f) if self.consequent else None
        return BoxerDrs(
            self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
        )

    def __repr__(self):
        s = "drs([{}], [{}])".format(
            ", ".join("%s" % r for r in self.refs),
            ", ".join("%s" % c for c in self.conds),
        )
        if self.consequent is not None:
            s = f"imp({s}, {self.consequent})"
        return s

    def __eq__(self, other):
        # BUGFIX: ``all`` replaces ``reduce(operator.and_, ...)``, which
        # raised TypeError when both DRSs had empty condition lists
        # (reduce of an empty sequence with no initial value).
        return (
            self.__class__ == other.__class__
            and self.refs == other.refs
            and len(self.conds) == len(other.conds)
            and all(c1 == c2 for c1, c2 in zip(self.conds, other.conds))
            and self.consequent == other.consequent
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
+
|
| 1093 |
+
class BoxerNot(AbstractBoxerDrs):
    """Negation node: wraps exactly one sub-DRS and delegates everything
    except construction to it."""

    def __init__(self, drs):
        AbstractBoxerDrs.__init__(self)
        self.drs = drs

    def _variables(self):
        return self.drs._variables()

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        # Rebuild around the normalized inner DRS.
        return BoxerNot(self.drs.clean())

    def renumber_sentences(self, f):
        return BoxerNot(self.drs.renumber_sentences(f))

    def __repr__(self):
        return "not(%s)" % (self.drs)

    def __eq__(self, other):
        if self.__class__ != other.__class__:
            return False
        return self.drs == other.drs

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
|
| 1121 |
+
|
| 1122 |
+
class BoxerIndexed(AbstractBoxerDrs):
    """Base class for atomic Boxer conditions tagged with a source location:
    a discourse id, a sentence index and the word indices that produced them.

    Subclasses implement ``__iter__`` (yielding their own arguments, used by
    ``__eq__``/``__repr__``) and ``_pred`` (their condition name).
    """

    def __init__(self, discourse_id, sent_index, word_indices):
        AbstractBoxerDrs.__init__(self)
        self.discourse_id = discourse_id
        self.sent_index = sent_index
        self.word_indices = word_indices

    def atoms(self):
        # An indexed condition is itself an atom.
        return {self}

    def __eq__(self, other):
        # BUGFIX: ``all`` replaces ``reduce(operator.and_, ...)``, which would
        # raise TypeError if a subclass yielded no arguments (empty sequence);
        # ``all`` is correctly True in that case.
        return (
            self.__class__ == other.__class__
            and self.discourse_id == other.discourse_id
            and self.sent_index == other.sent_index
            and self.word_indices == other.word_indices
            and all(s == o for s, o in zip(self, other))
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__

    def __repr__(self):
        s = "{}({}, {}, [{}]".format(
            self._pred(),
            self.discourse_id,
            self.sent_index,
            ", ".join("%s" % wi for wi in self.word_indices),
        )
        for v in self:
            s += ", %s" % v
        return s + ")"
|
| 1157 |
+
|
| 1158 |
+
class BoxerPred(BoxerIndexed):
    """A ``pred`` condition: a unary predicate (name, POS, sense) applied to
    a single discourse variable."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.pos = pos
        self.sense = sense

    def _variables(self):
        # Only the predicate's argument counts as a plain variable.
        return ({self.var}, set(), set())

    def change_var(self, var):
        """Return a copy of this predicate applied to ``var`` instead."""
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.pos, self.sense,
        )

    def clean(self):
        """Return a copy whose name is normalized for logic output."""
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.pos, self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through ``f``."""
        return BoxerPred(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.pos, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.pos
        yield self.sense

    def _pred(self):
        return "pred"
|
| 1209 |
+
|
| 1210 |
+
class BoxerNamed(BoxerIndexed):
    """A ``named`` condition: a named entity (name, entity type, sense)
    attached to a single discourse variable."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.type = type
        self.sense = sense

    def _variables(self):
        # The named entity's referent is a plain variable.
        return ({self.var}, set(), set())

    def change_var(self, var):
        """Return a copy of this condition attached to ``var`` instead."""
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.type, self.sense,
        )

    def clean(self):
        """Return a copy whose name is normalized for logic output."""
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.type, self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through ``f``."""
        return BoxerNamed(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.type, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.type
        yield self.sense

    def _pred(self):
        return "named"
| 1260 |
+
|
| 1261 |
+
class BoxerRel(BoxerIndexed):
    """An indexed Boxer ``rel`` condition: binary relation ``rel`` (with
    word sense ``sense``) between referents ``var1`` and ``var2``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1, self.var2 = var1, var2
        self.rel, self.sense = rel, sense

    def _variables(self):
        # Both relation arguments count as referents.
        return ({self.var1, self.var2}, set(), set())

    def clean(self):
        """Return a copy with the relation name normalized."""
        return BoxerRel(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var1, self.var2, self._clean_name(self.rel), self.sense,
        )

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerRel(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2, self.rel, self.sense,
        )

    def __iter__(self):
        return iter((self.var1, self.var2, self.rel, self.sense))

    def _pred(self):
        return "rel"
|
| 1299 |
+
|
| 1300 |
+
|
| 1301 |
+
class BoxerProp(BoxerIndexed):
    """An indexed Boxer ``prop`` condition: proposition variable ``var``
    labelling the embedded DRS ``drs``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.drs = drs

    def _variables(self):
        # ``var`` goes into the third (proposition) slot; union in whatever
        # the embedded DRS contributes slot-by-slot.
        return tuple(
            map(operator.or_, (set(), set(), {self.var}), self.drs._variables())
        )

    def referenced_labels(self):
        """Return the set of DRSs this condition points at."""
        return {self.drs}

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        """Return a copy with the embedded DRS cleaned."""
        return BoxerProp(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self.drs.clean(),
        )

    def renumber_sentences(self, f):
        """Return a copy with the sentence index (recursively) remapped."""
        return BoxerProp(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.drs.renumber_sentences(f),
        )

    def __iter__(self):
        return iter((self.var, self.drs))

    def _pred(self):
        return "prop"
|
| 1341 |
+
|
| 1342 |
+
|
| 1343 |
+
class BoxerEq(BoxerIndexed):
    """An indexed Boxer ``eq`` condition asserting ``var1`` = ``var2``."""

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2

    def _variables(self):
        # Both sides of the equality are referents.
        return ({self.var1, self.var2}, set(), set())

    def atoms(self):
        # Equalities contribute no predicate atoms.
        return set()

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerEq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2,
        )

    def __iter__(self):
        return iter((self.var1, self.var2))

    def _pred(self):
        return "eq"
|
| 1369 |
+
|
| 1370 |
+
|
| 1371 |
+
class BoxerCard(BoxerIndexed):
    """An indexed Boxer ``card`` (cardinality) condition: referent ``var``
    has count ``value`` with comparison ``type``.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.value, self.type = var, value, type

    def _variables(self):
        return ({self.var}, set(), set())

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is mapped through *f*."""
        return BoxerCard(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.value, self.type,
        )

    def __iter__(self):
        return iter((self.var, self.value, self.type))

    def _pred(self):
        return "card"
|
| 1396 |
+
|
| 1397 |
+
|
| 1398 |
+
class BoxerOr(BoxerIndexed):
    """An indexed Boxer ``or`` condition: disjunction of two DRSs."""

    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.drs1 = drs1
        self.drs2 = drs2

    def _variables(self):
        # Slot-wise union of the two disjuncts' variable triples.
        return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy with both disjunct DRSs cleaned."""
        return BoxerOr(
            self.discourse_id, self.sent_index, self.word_indices,
            self.drs1.clean(), self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        # NOTE(review): unlike BoxerProp, the nested DRSs are NOT renumbered
        # here -- behavior preserved as-is; confirm whether that is intended.
        return BoxerOr(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.drs1, self.drs2,
        )

    def __iter__(self):
        return iter((self.drs1, self.drs2))

    def _pred(self):
        return "or"
|
| 1433 |
+
|
| 1434 |
+
|
| 1435 |
+
class BoxerWhq(BoxerIndexed):
    """An indexed Boxer ``whq`` (wh-question) condition: answer types
    ``ans_types``, restrictor ``drs1``, queried ``variable``, body ``drs2``.
    """

    def __init__(
        self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
    ):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.ans_types = ans_types
        self.drs1 = drs1
        self.variable = variable
        self.drs2 = drs2

    def _variables(self):
        # Slot-wise union: the queried variable plus both sub-DRS triples.
        return tuple(
            map(
                operator.or_,
                ({self.variable}, set(), set()),
                self.drs1._variables(),
                self.drs2._variables(),
            )
        )

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy with both sub-DRSs cleaned."""
        return BoxerWhq(
            self.discourse_id, self.sent_index, self.word_indices,
            self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        # NOTE(review): nested DRSs are not renumbered here (cf. BoxerProp);
        # behavior preserved as-is.
        return BoxerWhq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.ans_types, self.drs1, self.variable, self.drs2,
        )

    def __iter__(self):
        # The answer types are rendered as a single "[t1,t2,...]" token.
        return iter(
            ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
        )

    def _pred(self):
        return "whq"
|
| 1487 |
+
|
| 1488 |
+
|
| 1489 |
+
class PassthroughBoxerDrsInterpreter:
    """Interpreter that leaves Boxer expressions untouched."""

    def interpret(self, ex):
        """Return *ex* unchanged."""
        return ex
|
| 1492 |
+
|
| 1493 |
+
|
| 1494 |
+
class NltkDrtBoxerDrsInterpreter:
    """Convert Boxer's abstract DRS expressions into NLTK DRT expressions."""

    def __init__(self, occur_index=False):
        # When True, predicate names are suffixed with discourse/sentence/word
        # occurrence information (see _add_occur_indexing).
        self._occur_index = occur_index

    def interpret(self, ex):
        """
        Recursively translate one Boxer expression into NLTK DRT.

        :param ex: ``AbstractBoxerDrs``
        :return: ``DrtExpression``
        """
        if isinstance(ex, BoxerDrs):
            # Plain box: referents become Variables, conditions recurse.
            drs = DRS(
                [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))
            )
            if ex.consequent is not None:
                drs.consequent = self.interpret(ex.consequent)
            return drs
        elif isinstance(ex, BoxerNot):
            return DrtNegatedExpression(self.interpret(ex.drs))
        elif isinstance(ex, BoxerPred):
            # Unary predicate: name is "<pos>_<name>", applied to var.
            pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerNamed):
            # Named entity: name is "ne_<type>_<name>".
            pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerRel):
            # Binary relation between var1 and var2.
            pred = self._add_occur_indexing("%s" % (ex.rel), ex)
            return self._make_atom(pred, ex.var1, ex.var2)
        elif isinstance(ex, BoxerProp):
            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
        elif isinstance(ex, BoxerEq):
            return DrtEqualityExpression(
                DrtVariableExpression(Variable(ex.var1)),
                DrtVariableExpression(Variable(ex.var2)),
            )
        elif isinstance(ex, BoxerCard):
            # Cardinality becomes a unary "card_<type>_<value>" predicate.
            pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerOr):
            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
        elif isinstance(ex, BoxerWhq):
            # A wh-question is flattened by merging restrictor and body boxes.
            drs1 = self.interpret(ex.drs1)
            drs2 = self.interpret(ex.drs2)
            return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
        # Any other expression type is a programming error.
        assert False, f"{ex.__class__.__name__}: {ex}"

    def _make_atom(self, pred, *args):
        """Build the curried application ``pred(arg1)(arg2)...``."""
        accum = DrtVariableExpression(Variable(pred))
        for arg in args:
            accum = DrtApplicationExpression(
                accum, DrtVariableExpression(Variable(arg))
            )
        return accum

    def _add_occur_indexing(self, base, ex):
        """Optionally append "_<discourse>_s<sent>_w<word>" to *base*.

        Only applies when occurrence indexing is on and *ex* carries a
        sentence index; the word index used is the smallest one.
        """
        if self._occur_index and ex.sent_index is not None:
            if ex.discourse_id:
                base += "_%s" % ex.discourse_id
            base += "_s%s" % ex.sent_index
            base += "_w%s" % sorted(ex.word_indices)[0]
        return base
|
| 1554 |
+
|
| 1555 |
+
|
| 1556 |
+
class UnparseableInputException(Exception):
    """Raised when Boxer's output cannot be parsed."""
|
| 1558 |
+
|
| 1559 |
+
|
| 1560 |
+
if __name__ == "__main__":
    # Command-line driver: run Boxer on the given text and print the DRS
    # (or its first-order translation with --fol).
    opts = OptionParser("usage: %prog TEXT [options]")
    opts.add_option(
        "--verbose",
        "-v",
        help="display verbose logs",
        action="store_true",
        default=False,
        dest="verbose",
    )
    opts.add_option(
        "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol"
    )
    opts.add_option(
        "--question",
        "-q",
        help="input is a question",
        action="store_true",
        default=False,
        dest="question",
    )
    opts.add_option(
        "--occur",
        "-o",
        help="occurrence index",
        action="store_true",
        default=False,
        dest="occur_index",
    )
    (options, args) = opts.parse_args()

    if len(args) != 1:
        opts.error("incorrect number of arguments")

    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
    # NOTE: r"\n" is a literal backslash-n, so sentences are separated by the
    # two characters '\' 'n' in the shell argument -- presumably intentional;
    # confirm against how callers pass TEXT.
    drs = Boxer(interpreter).interpret_multi(
        args[0].split(r"\n"), question=options.question, verbose=options.verbose
    )
    if drs is None:
        print(None)
    else:
        # Simplify and remove equalities before display.
        drs = drs.simplify().eliminate_equality()
        if options.fol:
            print(drs.fol().normalize())
        else:
            drs.pretty_print()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse
|
| 2 |
+
# Representation Theory (DRT) as meaning language
|
| 3 |
+
#
|
| 4 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
|
| 12 |
+
from tkinter.font import Font
|
| 13 |
+
|
| 14 |
+
from nltk.draw.util import CanvasFrame, ShowText
|
| 15 |
+
|
| 16 |
+
except ImportError:
|
| 17 |
+
"""Ignore ImportError because tkinter might not be available."""
|
| 18 |
+
|
| 19 |
+
from nltk.parse import MaltParser
|
| 20 |
+
from nltk.sem.drt import DrsDrawer, DrtVariableExpression
|
| 21 |
+
from nltk.sem.glue import DrtGlue
|
| 22 |
+
from nltk.sem.logic import Variable
|
| 23 |
+
from nltk.tag import RegexpTagger
|
| 24 |
+
from nltk.util import in_idle
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DrtGlueDemo:
|
| 28 |
+
def __init__(self, examples):
    """Build the demo GUI for the given list of example sentences."""
    # Set up the main window.
    self._top = Tk()
    self._top.title("DRT Glue Demo")

    # Set up key bindings.
    self._init_bindings()

    # Initialize the fonts.
    self._init_fonts(self._top)

    self._examples = examples
    # One cache slot per example: a list of readings, or an error expression.
    self._readingCache = [None for example in examples]

    # The user can hide the grammar.
    self._show_grammar = IntVar(self._top)
    self._show_grammar.set(1)

    # Set the data to None
    self._curExample = -1
    self._readings = []
    self._drs = None
    self._drsWidget = None
    self._error = None

    self._init_glue()

    # Create the basic frames.
    self._init_menubar(self._top)
    self._init_buttons(self._top)
    self._init_exampleListbox(self._top)
    self._init_readingListbox(self._top)
    self._init_canvas(self._top)

    # Resize callback
    self._canvas.bind("<Configure>", self._configure)
|
| 64 |
+
|
| 65 |
+
#########################################
|
| 66 |
+
## Initialization Helpers
|
| 67 |
+
#########################################
|
| 68 |
+
|
| 69 |
+
def _init_glue(self):
    """Create the DRT Glue parser backed by a MaltParser with a demo tagger."""
    # Regexp tagger covering only the demo's closed vocabulary.
    tagger = RegexpTagger(
        [
            ("^(David|Mary|John)$", "NNP"),
            (
                "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
                "VB",
            ),
            ("^(go|order|vanish|find|approach)$", "VB"),
            ("^(a)$", "ex_quant"),
            ("^(every)$", "univ_quant"),
            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
            ("^(big|gray|former)$", "JJ"),
            ("^(him|himself)$", "PRP"),
        ]
    )

    depparser = MaltParser(tagger=tagger)
    self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
|
| 88 |
+
|
| 89 |
+
def _init_fonts(self, root):
    """Create the fonts used by the demo, scaled from the system font."""
    # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
    self._sysfont = Font(font=Button()["font"])
    root.option_add("*Font", self._sysfont)

    # What's our font size (default=same as sysfont)
    self._size = IntVar(root)
    self._size.set(self._sysfont.cget("size"))

    self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
    self._font = Font(family="helvetica", size=self._size.get())
    # Negative Tk font sizes are in pixels, so grow away from zero either way.
    if self._size.get() < 0:
        big = self._size.get() - 2
    else:
        big = self._size.get() + 2
    self._bigfont = Font(family="helvetica", weight="bold", size=big)
|
| 105 |
+
|
| 106 |
+
def _init_exampleListbox(self, parent):
    """Build the left-hand listbox holding the example sentences."""
    self._exampleFrame = listframe = Frame(parent)
    self._exampleFrame.pack(fill="both", side="left", padx=2)
    self._exampleList_label = Label(
        self._exampleFrame, font=self._boldfont, text="Examples"
    )
    self._exampleList_label.pack()
    self._exampleList = Listbox(
        self._exampleFrame,
        selectmode="single",
        relief="groove",
        background="white",
        foreground="#909090",
        font=self._font,
        selectforeground="#004040",
        selectbackground="#c0f0c0",
    )

    self._exampleList.pack(side="right", fill="both", expand=1)

    for example in self._examples:
        self._exampleList.insert("end", ("  %s" % example))
    self._exampleList.config(height=min(len(self._examples), 25), width=40)

    # Add a scrollbar if there are more than 25 examples.
    if len(self._examples) > 25:
        listscroll = Scrollbar(self._exampleFrame, orient="vertical")
        self._exampleList.config(yscrollcommand=listscroll.set)
        listscroll.config(command=self._exampleList.yview)
        listscroll.pack(side="left", fill="y")

    # If they select a example, apply it.
    self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
|
| 139 |
+
|
| 140 |
+
def _init_readingListbox(self, parent):
    """Build the listbox holding the numbered readings of the current example."""
    self._readingFrame = listframe = Frame(parent)
    self._readingFrame.pack(fill="both", side="left", padx=2)
    self._readingList_label = Label(
        self._readingFrame, font=self._boldfont, text="Readings"
    )
    self._readingList_label.pack()
    self._readingList = Listbox(
        self._readingFrame,
        selectmode="single",
        relief="groove",
        background="white",
        foreground="#909090",
        font=self._font,
        selectforeground="#004040",
        selectbackground="#c0f0c0",
    )

    self._readingList.pack(side="right", fill="both", expand=1)

    # Add a scrollbar if there are more than 25 examples.
    listscroll = Scrollbar(self._readingFrame, orient="vertical")
    self._readingList.config(yscrollcommand=listscroll.set)
    listscroll.config(command=self._readingList.yview)
    listscroll.pack(side="right", fill="y")

    self._populate_readingListbox()
|
| 167 |
+
|
| 168 |
+
def _populate_readingListbox(self):
|
| 169 |
+
# Populate the listbox with integers
|
| 170 |
+
self._readingList.delete(0, "end")
|
| 171 |
+
for i in range(len(self._readings)):
|
| 172 |
+
self._readingList.insert("end", (" %s" % (i + 1)))
|
| 173 |
+
self._readingList.config(height=min(len(self._readings), 25), width=5)
|
| 174 |
+
|
| 175 |
+
# If they select a example, apply it.
|
| 176 |
+
self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
|
| 177 |
+
|
| 178 |
+
def _init_bindings(self):
    """Install the demo's keyboard shortcuts on the main window."""
    # Key bindings are a good thing.
    self._top.bind("<Control-q>", self.destroy)
    self._top.bind("<Control-x>", self.destroy)
    self._top.bind("<Escape>", self.destroy)
    self._top.bind("n", self.next)
    self._top.bind("<space>", self.next)
    self._top.bind("p", self.prev)
    self._top.bind("<BackSpace>", self.prev)
|
| 187 |
+
|
| 188 |
+
def _init_buttons(self, parent):
    """Build the Prev/Next navigation buttons along the bottom."""
    # Set up the frames.
    self._buttonframe = buttonframe = Frame(parent)
    buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
    Button(
        buttonframe,
        text="Prev",
        background="#90c0d0",
        foreground="black",
        command=self.prev,
    ).pack(side="left")
    Button(
        buttonframe,
        text="Next",
        background="#90c0d0",
        foreground="black",
        command=self.next,
    ).pack(side="left")
|
| 206 |
+
|
| 207 |
+
def _configure(self, event):
    """Canvas <Configure> callback: track the new height and redraw."""
    self._autostep = 0
    (x1, y1, x2, y2) = self._cframe.scrollregion()
    # Keep the scrollregion's bottom just inside the new window height.
    y2 = event.height - 6
    self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
    self._redraw()
|
| 213 |
+
|
| 214 |
+
def _init_canvas(self, parent):
    """Create the drawing canvas that displays the selected DRS."""
    self._cframe = CanvasFrame(
        parent,
        background="white",
        # width=525, height=250,
        closeenough=10,
        border=2,
        relief="sunken",
    )
    self._cframe.pack(expand=1, fill="both", side="top", pady=2)
    canvas = self._canvas = self._cframe.canvas()

    # Initially, there's no tree or text
    self._tree = None
    self._textwidgets = []
    self._textline = None
|
| 230 |
+
|
| 231 |
+
def _init_menubar(self, parent):
    """Build the File / Action / Options / View / Help menu bar."""
    menubar = Menu(parent)

    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(
        label="Exit", underline=1, command=self.destroy, accelerator="q"
    )
    menubar.add_cascade(label="File", underline=0, menu=filemenu)

    actionmenu = Menu(menubar, tearoff=0)
    actionmenu.add_command(
        label="Next", underline=0, command=self.next, accelerator="n, Space"
    )
    actionmenu.add_command(
        label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
    )
    menubar.add_cascade(label="Action", underline=0, menu=actionmenu)

    optionmenu = Menu(menubar, tearoff=0)
    optionmenu.add_checkbutton(
        label="Remove Duplicates",
        underline=0,
        variable=self._glue.remove_duplicates,
        command=self._toggle_remove_duplicates,
        accelerator="r",
    )
    menubar.add_cascade(label="Options", underline=0, menu=optionmenu)

    # Font-size radio buttons; values are point sizes fed to resize().
    viewmenu = Menu(menubar, tearoff=0)
    viewmenu.add_radiobutton(
        label="Tiny",
        variable=self._size,
        underline=0,
        value=10,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Small",
        variable=self._size,
        underline=0,
        value=12,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Medium",
        variable=self._size,
        underline=0,
        value=14,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Large",
        variable=self._size,
        underline=0,
        value=18,
        command=self.resize,
    )
    viewmenu.add_radiobutton(
        label="Huge",
        variable=self._size,
        underline=0,
        value=24,
        command=self.resize,
    )
    menubar.add_cascade(label="View", underline=0, menu=viewmenu)

    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label="About", underline=0, command=self.about)
    menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

    parent.config(menu=menubar)
|
| 302 |
+
|
| 303 |
+
#########################################
|
| 304 |
+
## Main draw procedure
|
| 305 |
+
#########################################
|
| 306 |
+
|
| 307 |
+
def _redraw(self):
    """Clear the canvas and redraw the current DRS or error widget."""
    canvas = self._canvas

    # Delete the old DRS, widgets, etc.
    if self._drsWidget is not None:
        self._drsWidget.clear()

    if self._drs:
        self._drsWidget = DrsWidget(self._canvas, self._drs)
        self._drsWidget.draw()

    if self._error:
        # Errors are displayed the same way, as a (pseudo-)DRS widget.
        self._drsWidget = DrsWidget(self._canvas, self._error)
        self._drsWidget.draw()
|
| 321 |
+
|
| 322 |
+
#########################################
|
| 323 |
+
## Button Callbacks
|
| 324 |
+
#########################################
|
| 325 |
+
|
| 326 |
+
def destroy(self, *e):
|
| 327 |
+
self._autostep = 0
|
| 328 |
+
if self._top is None:
|
| 329 |
+
return
|
| 330 |
+
self._top.destroy()
|
| 331 |
+
self._top = None
|
| 332 |
+
|
| 333 |
+
def prev(self, *e):
|
| 334 |
+
selection = self._readingList.curselection()
|
| 335 |
+
readingListSize = self._readingList.size()
|
| 336 |
+
|
| 337 |
+
# there are readings
|
| 338 |
+
if readingListSize > 0:
|
| 339 |
+
# if one reading is currently selected
|
| 340 |
+
if len(selection) == 1:
|
| 341 |
+
index = int(selection[0])
|
| 342 |
+
|
| 343 |
+
# if it's on (or before) the first item
|
| 344 |
+
if index <= 0:
|
| 345 |
+
self._select_previous_example()
|
| 346 |
+
else:
|
| 347 |
+
self._readingList_store_selection(index - 1)
|
| 348 |
+
|
| 349 |
+
else:
|
| 350 |
+
# select its first reading
|
| 351 |
+
self._readingList_store_selection(readingListSize - 1)
|
| 352 |
+
|
| 353 |
+
else:
|
| 354 |
+
self._select_previous_example()
|
| 355 |
+
|
| 356 |
+
def _select_previous_example(self):
|
| 357 |
+
# if the current example is not the first example
|
| 358 |
+
if self._curExample > 0:
|
| 359 |
+
self._exampleList_store_selection(self._curExample - 1)
|
| 360 |
+
else:
|
| 361 |
+
# go to the last example
|
| 362 |
+
self._exampleList_store_selection(len(self._examples) - 1)
|
| 363 |
+
|
| 364 |
+
def next(self, *e):
|
| 365 |
+
selection = self._readingList.curselection()
|
| 366 |
+
readingListSize = self._readingList.size()
|
| 367 |
+
|
| 368 |
+
# if there are readings
|
| 369 |
+
if readingListSize > 0:
|
| 370 |
+
# if one reading is currently selected
|
| 371 |
+
if len(selection) == 1:
|
| 372 |
+
index = int(selection[0])
|
| 373 |
+
|
| 374 |
+
# if it's on (or past) the last item
|
| 375 |
+
if index >= (readingListSize - 1):
|
| 376 |
+
self._select_next_example()
|
| 377 |
+
else:
|
| 378 |
+
self._readingList_store_selection(index + 1)
|
| 379 |
+
|
| 380 |
+
else:
|
| 381 |
+
# select its first reading
|
| 382 |
+
self._readingList_store_selection(0)
|
| 383 |
+
|
| 384 |
+
else:
|
| 385 |
+
self._select_next_example()
|
| 386 |
+
|
| 387 |
+
def _select_next_example(self):
|
| 388 |
+
# if the current example is not the last example
|
| 389 |
+
if self._curExample < len(self._examples) - 1:
|
| 390 |
+
self._exampleList_store_selection(self._curExample + 1)
|
| 391 |
+
else:
|
| 392 |
+
# go to the first example
|
| 393 |
+
self._exampleList_store_selection(0)
|
| 394 |
+
|
| 395 |
+
def about(self, *e):
    """Display an "About" dialog, falling back to a plain ShowText window.

    The fallback covers environments where ``tkinter.messagebox`` is
    unavailable or fails to display.
    """
    ABOUT = (
        "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
        + "Written by Daniel H. Garrette"
    )
    TITLE = "About: NLTK DRT Glue Demo"
    try:
        from tkinter.messagebox import Message

        Message(message=ABOUT, title=TITLE).show()
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed; any display failure still falls back.
    except Exception:
        ShowText(self._top, TITLE, ABOUT)
|
| 407 |
+
|
| 408 |
+
def postscript(self, *e):
    """Export the current canvas to a PostScript file (via CanvasFrame)."""
    self._autostep = 0
    self._cframe.print_to_file()
|
| 411 |
+
|
| 412 |
+
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.
    """
    # Skip the mainloop under IDLE, which runs its own event loop.
    if in_idle():
        return
    self._top.mainloop(*args, **kwargs)
|
| 422 |
+
|
| 423 |
+
def resize(self, size=None):
    """Change the demo's font size and redraw.

    :param size: the new size; if None, re-apply the currently stored size.
    """
    if size is not None:
        self._size.set(size)
    current = self._size.get()
    # Apply the (negated absolute) size to every font, as the original does.
    for font in (self._font, self._boldfont, self._sysfont):
        font.configure(size=-(abs(current)))
    self._bigfont.configure(size=-(abs(current + 2)))
    self._redraw()
|
| 432 |
+
|
| 433 |
+
def _toggle_remove_duplicates(self):
    """Flip the glue engine's duplicate-removal flag and reset every piece
    of cached/derived state, since previously computed readings may no
    longer be valid under the new setting."""
    self._glue.remove_duplicates = not self._glue.remove_duplicates

    self._exampleList.selection_clear(0, "end")
    self._readings = []
    self._populate_readingListbox()
    self._readingCache = [None] * len(self._examples)
    self._curExample = -1
    self._error = None
    self._drs = None
    self._redraw()
|
| 445 |
+
|
| 446 |
+
def _exampleList_select(self, event):
    """Listbox callback: store the clicked example, ignoring the event
    unless exactly one entry is highlighted."""
    chosen = self._exampleList.curselection()
    if len(chosen) == 1:
        self._exampleList_store_selection(int(chosen[0]))
|
| 451 |
+
|
| 452 |
+
def _exampleList_store_selection(self, index):
    """Make the example at ``index`` the current one.

    Uses the per-example cache when available (a list means cached
    readings; anything else is a cached error expression).  Otherwise
    parses the sentence, caching either the readings or the error, and
    marks failed examples with a trailing '*' in the listbox.  Finally
    refreshes the reading listbox and redraws with no DRS selected.
    """
    self._curExample = index
    example = self._examples[index]

    self._exampleList.selection_clear(0, "end")
    if example:
        cache = self._readingCache[index]
        if cache:
            if isinstance(cache, list):
                # Cached success: the list of readings.
                self._readings = cache
                self._error = None
            else:
                # Cached failure: the stored error expression.
                self._readings = []
                self._error = cache
        else:
            try:
                self._readings = self._glue.parse_to_meaning(example)
                self._error = None
                self._readingCache[index] = self._readings
            except Exception as e:
                self._readings = []
                # Wrap the error message in a DRT variable so it can be shown
                # in place of a reading.
                self._error = DrtVariableExpression(Variable("Error: " + str(e)))
                self._readingCache[index] = self._error

                # add a star to the end of the example
                self._exampleList.delete(index)
                self._exampleList.insert(index, (" %s *" % example))
                self._exampleList.config(
                    height=min(len(self._examples), 25), width=40
                )

        self._populate_readingListbox()

        self._exampleList.selection_set(index)

        # Clear the displayed DRS until a reading is chosen.
        self._drs = None
        self._redraw()
|
| 489 |
+
|
| 490 |
+
def _readingList_select(self, event):
    """Listbox callback: store the clicked reading, ignoring the event
    unless exactly one entry is highlighted."""
    chosen = self._readingList.curselection()
    if len(chosen) == 1:
        self._readingList_store_selection(int(chosen[0]))
|
| 495 |
+
|
| 496 |
+
def _readingList_store_selection(self, index):
    """Select the reading at ``index`` and display its simplified,
    normalized, anaphora-resolved DRS."""
    chosen = self._readings[index]

    self._readingList.selection_clear(0, "end")
    if not chosen:
        return
    self._readingList.selection_set(index)
    self._drs = chosen.simplify().normalize().resolve_anaphora()
    self._redraw()
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
class DrsWidget:
    """Renders a single DRS onto a Tkinter canvas and can erase it again."""

    def __init__(self, canvas, drs, **attribs):
        self._drs = drs
        self._canvas = canvas
        # Capture the canvas's default text font (via a throwaway text item)
        # so the drawer can measure text.
        blank_item = canvas.create_text(0, 0, text="")
        canvas.font = Font(font=canvas.itemcget(blank_item, "font"))
        # Pixel padding; presumably consumed by DrsDrawer — TODO confirm.
        canvas._BUFFER = 3
        self.bbox = (0, 0, 0, 0)

    def draw(self):
        """Draw the DRS and remember the area it occupies for clear()."""
        drawer = DrsDrawer(self._drs, canvas=self._canvas)
        (right, bottom) = drawer.draw()
        self.bbox = (0, 0, right + 1, bottom + 1)

    def clear(self):
        """Blank out the region covered by the most recent draw()."""
        self._canvas.create_rectangle(self.bbox, fill="white", width="0")
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def demo():
    """Launch the DRT Glue demo GUI with a fixed set of example sentences.

    The commented-out sentences are additional examples kept for manual
    experimentation.
    """
    examples = [
        "John walks",
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        # 'every man believes a dog yawns',
        # 'John gives David a sandwich',
        "John chases himself",
        # 'John persuades David to order a pizza',
        # 'John tries to go',
        # 'John tries to find a unicorn',
        # 'John seems to vanish',
        # 'a unicorn seems to approach',
        # 'every big cat leaves',
        # 'every gray cat leaves',
        # 'every big gray cat leaves',
        # 'a former senator leaves',
        # 'John likes a cat',
        # 'John likes every cat',
        # 'he walks',
        # 'John walks and he leaves'
    ]
    DrtGlueDemo(examples).mainloop()


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py
ADDED
|
@@ -0,0 +1,835 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Glue Semantics
|
| 2 |
+
#
|
| 3 |
+
# Author: Dan Garrette <dhgarrette@gmail.com>
|
| 4 |
+
#
|
| 5 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
from itertools import chain
|
| 11 |
+
|
| 12 |
+
import nltk
|
| 13 |
+
from nltk.internals import Counter
|
| 14 |
+
from nltk.sem import drt, linearlogic
|
| 15 |
+
from nltk.sem.logic import (
|
| 16 |
+
AbstractVariableExpression,
|
| 17 |
+
Expression,
|
| 18 |
+
LambdaExpression,
|
| 19 |
+
Variable,
|
| 20 |
+
VariableExpression,
|
| 21 |
+
)
|
| 22 |
+
from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger
|
| 23 |
+
|
| 24 |
+
# Mapping from specifier (determiner) words to the semtype names used when
# looking up glue-dictionary entries; the 'default' entry is used for
# specifiers not listed here (see GlueDict.get_semtypes).
SPEC_SEMTYPES = {
    "a": "ex_quant",
    "an": "ex_quant",
    "every": "univ_quant",
    "the": "def_art",
    "no": "no_quant",
    "default": "ex_quant",
}

# Dependency relations ignored when matching a node's relationship set
# against a glue-dictionary entry (see GlueDict._lookup_semtype_option).
OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class GlueFormula:
    """A pairing of a meaning term (a logic ``Expression``) with a linear
    logic glue term, plus a set of integer indices used to ensure each
    compiled premise is consumed exactly once during derivation."""

    def __init__(self, meaning, glue, indices=None):
        """
        :param meaning: a logic expression or a string to parse into one
        :param glue: a linear logic expression or a string to parse into one
        :param indices: set of ints identifying the compiled premises this
            formula was built from; defaults to the empty set
        :raise RuntimeError: if meaning or glue is neither a string nor an
            expression of the expected type
        """
        if not indices:
            indices = set()

        if isinstance(meaning, str):
            self.meaning = Expression.fromstring(meaning)
        elif isinstance(meaning, Expression):
            self.meaning = meaning
        else:
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )

        if isinstance(glue, str):
            self.glue = linearlogic.LinearLogicParser().parse(glue)
        elif isinstance(glue, linearlogic.Expression):
            self.glue = glue
        else:
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )

        self.indices = indices

    def applyto(self, arg):
        """Apply this glue formula to ``arg``, combining both meaning and
        glue sides.

        Example::

            self = (\\x.(walk x), (subj -o f))
            arg  = (john, subj)
            returns ((walk john), f)

        :raise linearlogic.LinearLogicApplicationException: if the index
            sets overlap (a premise would be used twice) or the glue-side
            application fails
        """
        if self.indices & arg.indices:  # if the sets are NOT disjoint
            raise linearlogic.LinearLogicApplicationException(
                f"'{self}' applied to '{arg}'. Indices are not disjoint."
            )
        else:  # if the sets ARE disjoint
            return_indices = self.indices | arg.indices

        try:
            return_glue = linearlogic.ApplicationExpression(
                self.glue, arg.glue, arg.indices
            )
        except linearlogic.LinearLogicApplicationException as e:
            raise linearlogic.LinearLogicApplicationException(
                f"'{self.simplify()}' applied to '{arg.simplify()}'"
            ) from e

        # Lambda-abstract the argument's meaning over each dependency of the
        # antecedent so that dependent variables are bound in the result.
        arg_meaning_abstracted = arg.meaning
        if return_indices:
            for dep in self.glue.simplify().antecedent.dependencies[
                ::-1
            ]:  # if self.glue is (A -o B), dep is in A.dependencies
                arg_meaning_abstracted = self.make_LambdaExpression(
                    Variable("v%s" % dep), arg_meaning_abstracted
                )
        return_meaning = self.meaning.applyto(arg_meaning_abstracted)

        return self.__class__(return_meaning, return_glue, return_indices)

    def make_VariableExpression(self, name):
        # Factory hook; subclasses (e.g. DRT glue) override to build their
        # own expression types.
        return VariableExpression(name)

    def make_LambdaExpression(self, variable, term):
        # Factory hook; subclasses override to build their own lambda type.
        return LambdaExpression(variable, term)

    def lambda_abstract(self, other):
        """Abstract this formula over ``other`` (which must carry a simple
        variable meaning), producing a formula whose glue is
        ``other.glue -o self.glue``."""
        assert isinstance(other, GlueFormula)
        assert isinstance(other.meaning, AbstractVariableExpression)
        return self.__class__(
            self.make_LambdaExpression(other.meaning.variable, self.meaning),
            linearlogic.ImpExpression(other.glue, self.glue),
        )

    def compile(self, counter=None):
        """From Iddo Lev's PhD Dissertation p108-109

        Compile the glue term into indexed premises; returns the list of
        newly generated formulas plus this formula with its compiled glue
        and a fresh index from ``counter``.
        """
        if not counter:
            counter = Counter()
        (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
            counter, self.__class__
        )
        return new_forms + [
            self.__class__(self.meaning, compiled_glue, {counter.get()})
        ]

    def simplify(self):
        """Return a new formula with both meaning and glue simplified;
        indices are carried over unchanged."""
        return self.__class__(
            self.meaning.simplify(), self.glue.simplify(), self.indices
        )

    def __eq__(self, other):
        # Equality ignores indices: only class, meaning and glue matter.
        return (
            self.__class__ == other.__class__
            and self.meaning == other.meaning
            and self.glue == other.glue
        )

    def __ne__(self, other):
        return not self == other

    # sorting for use in doctests which must be deterministic
    def __lt__(self, other):
        return str(self) < str(other)

    def __str__(self):
        assert isinstance(self.indices, set)
        accum = f"{self.meaning} : {self.glue}"
        if self.indices:
            accum += (
                " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}"
            )
        return accum

    def __repr__(self):
        return "%s" % self
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class GlueDict(dict):
    """A dictionary of glue-formula templates loaded from a ``.semtype``
    file.

    Maps ``semtype name -> {relationship frozenset (or None) ->
    [[meaning_template, glue_template], ...]}``, and converts dependency
    graphs into lists of :class:`GlueFormula` objects.
    """

    def __init__(self, filename, encoding=None):
        """
        :param filename: path (or nltk.data resource name) of the semtype file
        :param encoding: text encoding passed through to ``nltk.data.load``
        """
        self.filename = filename
        self.file_encoding = encoding
        self.read_file()

    def read_file(self, empty_first=True):
        """Parse the semtype file and populate this dict.

        Each non-comment line has the form
        ``semtype[(supertype)] : (meaning, glue)[, (meaning, glue)...] : [rels]``
        where the relationship list is optional.  Entries inherit the glue
        formulas of their supertype.

        :param empty_first: if True, clear existing entries before loading
        :raise RuntimeError: on malformed formula syntax
        """
        if empty_first:
            self.clear()

        try:
            contents = nltk.data.load(
                self.filename, format="text", encoding=self.file_encoding
            )
            # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
        except LookupError as e:
            # Retry with an explicit file: URL before giving up.
            try:
                contents = nltk.data.load(
                    "file:" + self.filename, format="text", encoding=self.file_encoding
                )
            except LookupError:
                raise e
        lines = contents.splitlines()

        for line in lines:  # example: 'n : (\\x.(<word> x), (v-or))'
            #     lambdacalc -^  linear logic -^
            line = line.strip()  # remove trailing newline
            if not len(line):
                continue  # skip empty lines
            if line[0] == "#":
                continue  # skip commented out lines

            parts = line.split(
                " : ", 2
            )  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            paren_count = 0
            tuple_start = 0
            tuple_comma = 0

            relationships = None

            # Hand-rolled character scan over the '(meaning, glue)' tuples;
            # paren_count tracks nesting depth so only top-level parens and
            # commas delimit the tuple parts.
            if len(parts) > 1:
                for (i, c) in enumerate(parts[1]):
                    if c == "(":
                        if paren_count == 0:  # if it's the first '(' of a tuple
                            tuple_start = i + 1  # then save the index
                        paren_count += 1
                    elif c == ")":
                        paren_count -= 1
                        if paren_count == 0:  # if it's the last ')' of a tuple
                            meaning_term = parts[1][
                                tuple_start:tuple_comma
                            ]  # '\\x.(<word> x)'
                            glue_term = parts[1][tuple_comma + 1 : i]  # '(v-r)'
                            glue_formulas.append(
                                [meaning_term, glue_term]
                            )  # add the GlueFormula to the list
                    elif c == ",":
                        if (
                            paren_count == 1
                        ):  # if it's a comma separating the parts of the tuple
                            tuple_comma = i  # then save the index
                    elif c == "#":  # skip comments at the ends of lines
                        if (
                            paren_count != 0
                        ):  # if the line hasn't parsed correctly so far
                            raise RuntimeError(
                                "Formula syntax is incorrect for entry " + line
                            )
                        break  # break to the next line

            if len(parts) > 2:  # if there is a relationship entry at the end
                rel_start = parts[2].index("[") + 1
                rel_end = parts[2].index("]")
                if rel_start == rel_end:
                    relationships = frozenset()
                else:
                    relationships = frozenset(
                        r.strip() for r in parts[2][rel_start:rel_end].split(",")
                    )

            # A semtype may declare a supertype as 'name(super)'.
            try:
                start_inheritance = parts[0].index("(")
                end_inheritance = parts[0].index(")")
                sem = parts[0][:start_inheritance].strip()
                supertype = parts[0][start_inheritance + 1 : end_inheritance]
            except:
                sem = parts[0].strip()
                supertype = None

            if sem not in self:
                self[sem] = {}

            if (
                relationships is None
            ):  # if not specified for a specific relationship set
                # add all relationship entries for parents
                if supertype:
                    for rels in self[supertype]:
                        if rels not in self[sem]:
                            self[sem][rels] = []
                        glue = self[supertype][rels]
                        self[sem][rels].extend(glue)
                        self[sem][rels].extend(
                            glue_formulas
                        )  # add the glue formulas to every rel entry
                else:
                    if None not in self[sem]:
                        self[sem][None] = []
                    self[sem][None].extend(
                        glue_formulas
                    )  # add the glue formulas to every rel entry
            else:
                if relationships not in self[sem]:
                    self[sem][relationships] = []
                if supertype:
                    self[sem][relationships].extend(self[supertype][relationships])
                self[sem][relationships].extend(
                    glue_formulas
                )  # add the glue entry to the dictionary

    def __str__(self):
        """Render entries one formula per line, with the relationship set
        shown after the last formula of each group."""
        accum = ""
        for pos in self:
            str_pos = "%s" % pos
            for relset in self[pos]:
                i = 1
                for gf in self[pos][relset]:
                    if i == 1:
                        accum += str_pos + ": "
                    else:
                        accum += " " * (len(str_pos) + 2)
                    accum += "%s" % gf
                    if relset and i == len(self[pos][relset]):
                        accum += " : %s" % relset
                    accum += "\n"
                    i += 1
        return accum

    def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
        """Recursively collect glue formulas for ``node`` and all of its
        dependents.  When called without a node, starts from the graph's
        root (the first dependent of the artificial top node)."""
        if node is None:
            # TODO: should it be depgraph.root? Is this code tested?
            top = depgraph.nodes[0]
            depList = list(chain.from_iterable(top["deps"].values()))
            root = depgraph.nodes[depList[0]]

            return self.to_glueformula_list(depgraph, root, Counter(), verbose)

        glueformulas = self.lookup(node, depgraph, counter)
        for dep_idx in chain.from_iterable(node["deps"].values()):
            dep = depgraph.nodes[dep_idx]
            glueformulas.extend(
                self.to_glueformula_list(depgraph, dep, counter, verbose)
            )
        return glueformulas

    def lookup(self, node, depgraph, counter):
        """Return the glue formulas for ``node``, or [] if no semtype entry
        exists for it.

        :raise KeyError: if a semtype entry exists but has no option
            matching the node's relationships
        """
        semtype_names = self.get_semtypes(node)

        # Take the first plausible semtype that has an entry.
        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node["word"], node, depgraph, counter
        )

    def add_missing_dependencies(self, node, depgraph):
        """For a 'main' node, copy the head's 'subj' dependency onto this
        node so its glue entry can reference the subject."""
        rel = node["rel"].lower()

        if rel == "main":
            headnode = depgraph.nodes[node["head"]]
            subj = self.lookup_unique("subj", headnode, depgraph)
            relation = subj["rel"]
            node["deps"].setdefault(relation, [])
            node["deps"][relation].append(subj["address"])
            # node['deps'].append(subj['address'])

    def _lookup_semtype_option(self, semtype, node, depgraph):
        """Pick the glue-formula list within ``semtype`` whose relationship
        set best matches the node's (non-optional) dependency relations."""
        relationships = frozenset(
            depgraph.nodes[dep]["rel"].lower()
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
        )

        try:
            lookup = semtype[relationships]
        except KeyError:
            # An exact match is not found, so find the best match where
            # 'best' is defined as the glue entry whose relationship set has the
            # most relations of any possible relationship set that is a subset
            # of the actual depgraph
            best_match = frozenset()
            for relset_option in set(semtype) - {None}:
                if (
                    len(relset_option) > len(best_match)
                    and relset_option < relationships
                ):
                    best_match = relset_option
            if not best_match:
                if None in semtype:
                    best_match = None
                else:
                    return None
            lookup = semtype[best_match]

        return lookup

    def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node["rel"].lower()
        word = node["word"].lower()

        if rel == "spec":
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES["default"]]
        elif rel in ["nmod", "vmod"]:
            # Modifiers may match either by POS tag or by relation name.
            return [node["tag"], rel]
        else:
            return [node["tag"]]

    def get_glueformulas_from_semtype_entry(
        self, lookup, word, node, depgraph, counter
    ):
        """Instantiate each (meaning, glue) template in ``lookup`` for
        ``word``, labeling the glue side relative to the node's position in
        the dependency graph."""
        glueformulas = []

        glueFormulaFactory = self.get_GlueFormula_factory()
        for meaning, glue in lookup:
            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
            # First formula is named after the word; later ones get a
            # numeric suffix ('word2', 'word3', ...).
            if not len(glueformulas):
                gf.word = word
            else:
                gf.word = f"{word}{len(glueformulas) + 1}"

            gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())

            glueformulas.append(gf)
        return glueformulas

    def get_meaning_formula(self, generic, word):
        """
        :param generic: A meaning formula string containing the
            parameter "<word>"
        :param word: The actual word to be replace "<word>"
        """
        word = word.replace(".", "")
        return generic.replace("<word>", word)

    def initialize_labels(self, expr, node, depgraph, unique_index):
        """Recursively replace template label names in a glue expression
        with concrete labels derived from the dependency graph.  Uppercase
        names become linear-logic variables, lowercase become constants."""
        if isinstance(expr, linearlogic.AtomicExpression):
            name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
            if name[0].isupper():
                return linearlogic.VariableExpression(name)
            else:
                return linearlogic.ConstantExpression(name)
        else:
            return linearlogic.ImpExpression(
                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
                self.initialize_labels(expr.consequent, node, depgraph, unique_index),
            )

    def find_label_name(self, name, node, depgraph, unique_index):
        """Resolve a (possibly dotted) template label name relative to
        ``node``: 'super' moves to the head node, any other prefix moves to
        the unique dependent with that relation."""
        try:
            dot = name.index(".")

            before_dot = name[:dot]
            after_dot = name[dot + 1 :]
            if before_dot == "super":
                return self.find_label_name(
                    after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
                )
            else:
                return self.find_label_name(
                    after_dot,
                    self.lookup_unique(before_dot, node, depgraph),
                    depgraph,
                    unique_index,
                )
        except ValueError:
            # No dot: 'name' is a simple label keyword.
            lbl = self.get_label(node)
            if name == "f":
                return lbl
            elif name == "v":
                return "%sv" % lbl
            elif name == "r":
                return "%sr" % lbl
            elif name == "super":
                return self.get_label(depgraph.nodes[node["head"]])
            elif name == "var":
                return f"{lbl.upper()}{unique_index}"
            elif name == "a":
                return self.get_label(self.lookup_unique("conja", node, depgraph))
            elif name == "b":
                return self.get_label(self.lookup_unique("conjb", node, depgraph))
            else:
                return self.get_label(self.lookup_unique(name, node, depgraph))

    def get_label(self, node):
        """
        Pick an alphabetic character as identifier for an entity in the model.

        :param value: where to index into the list of characters
        :type value: int

        NOTE(review): the index ``value - 1`` is not taken modulo 26, so a
        node address greater than 26 raises IndexError — confirm intended
        input range.
        """
        value = node["address"]

        letter = [
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
            "v",
            "w",
            "x",
            "y",
            "z",
            "a",
            "b",
            "c",
            "d",
            "e",
        ][value - 1]
        num = int(value) // 26
        if num > 0:
            return letter + str(num)
        else:
            return letter

    def lookup_unique(self, rel, node, depgraph):
        """
        Lookup 'key'. There should be exactly one item in the associated relation.

        :raise KeyError: if the node has zero or more than one dependent
            with relation ``rel``
        """
        deps = [
            depgraph.nodes[dep]
            for dep in chain.from_iterable(node["deps"].values())
            if depgraph.nodes[dep]["rel"].lower() == rel.lower()
        ]

        if len(deps) == 0:
            raise KeyError(
                "'{}' doesn't contain a feature '{}'".format(node["word"], rel)
            )
        elif len(deps) > 1:
            raise KeyError(
                "'{}' should only have one feature '{}'".format(node["word"], rel)
            )
        else:
            return deps[0]

    def get_GlueFormula_factory(self):
        # Factory hook; subclasses (e.g. DRT glue) return their own
        # GlueFormula subclass.
        return GlueFormula
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
class Glue:
|
| 541 |
+
def __init__(
|
| 542 |
+
self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
|
| 543 |
+
):
|
| 544 |
+
self.verbose = verbose
|
| 545 |
+
self.remove_duplicates = remove_duplicates
|
| 546 |
+
self.depparser = depparser
|
| 547 |
+
|
| 548 |
+
from nltk import Prover9
|
| 549 |
+
|
| 550 |
+
self.prover = Prover9()
|
| 551 |
+
|
| 552 |
+
if semtype_file:
|
| 553 |
+
self.semtype_file = semtype_file
|
| 554 |
+
else:
|
| 555 |
+
self.semtype_file = os.path.join(
|
| 556 |
+
"grammars", "sample_grammars", "glue.semtype"
|
| 557 |
+
)
|
| 558 |
+
|
| 559 |
+
def train_depparser(self, depgraphs=None):
|
| 560 |
+
if depgraphs:
|
| 561 |
+
self.depparser.train(depgraphs)
|
| 562 |
+
else:
|
| 563 |
+
self.depparser.train_from_file(
|
| 564 |
+
nltk.data.find(
|
| 565 |
+
os.path.join("grammars", "sample_grammars", "glue_train.conll")
|
| 566 |
+
)
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
def parse_to_meaning(self, sentence):
|
| 570 |
+
readings = []
|
| 571 |
+
for agenda in self.parse_to_compiled(sentence):
|
| 572 |
+
readings.extend(self.get_readings(agenda))
|
| 573 |
+
return readings
|
| 574 |
+
|
| 575 |
+
def get_readings(self, agenda):
|
| 576 |
+
readings = []
|
| 577 |
+
agenda_length = len(agenda)
|
| 578 |
+
atomics = dict()
|
| 579 |
+
nonatomics = dict()
|
| 580 |
+
while agenda: # is not empty
|
| 581 |
+
cur = agenda.pop()
|
| 582 |
+
glue_simp = cur.glue.simplify()
|
| 583 |
+
if isinstance(
|
| 584 |
+
glue_simp, linearlogic.ImpExpression
|
| 585 |
+
): # if cur.glue is non-atomic
|
| 586 |
+
for key in atomics:
|
| 587 |
+
try:
|
| 588 |
+
if isinstance(cur.glue, linearlogic.ApplicationExpression):
|
| 589 |
+
bindings = cur.glue.bindings
|
| 590 |
+
else:
|
| 591 |
+
bindings = linearlogic.BindingDict()
|
| 592 |
+
glue_simp.antecedent.unify(key, bindings)
|
| 593 |
+
for atomic in atomics[key]:
|
| 594 |
+
if not (
|
| 595 |
+
cur.indices & atomic.indices
|
| 596 |
+
): # if the sets of indices are disjoint
|
| 597 |
+
try:
|
| 598 |
+
agenda.append(cur.applyto(atomic))
|
| 599 |
+
except linearlogic.LinearLogicApplicationException:
|
| 600 |
+
pass
|
| 601 |
+
except linearlogic.UnificationException:
|
| 602 |
+
pass
|
| 603 |
+
try:
|
| 604 |
+
nonatomics[glue_simp.antecedent].append(cur)
|
| 605 |
+
except KeyError:
|
| 606 |
+
nonatomics[glue_simp.antecedent] = [cur]
|
| 607 |
+
|
| 608 |
+
else: # else cur.glue is atomic
|
| 609 |
+
for key in nonatomics:
|
| 610 |
+
for nonatomic in nonatomics[key]:
|
| 611 |
+
try:
|
| 612 |
+
if isinstance(
|
| 613 |
+
nonatomic.glue, linearlogic.ApplicationExpression
|
| 614 |
+
):
|
| 615 |
+
bindings = nonatomic.glue.bindings
|
| 616 |
+
else:
|
| 617 |
+
bindings = linearlogic.BindingDict()
|
| 618 |
+
glue_simp.unify(key, bindings)
|
| 619 |
+
if not (
|
| 620 |
+
cur.indices & nonatomic.indices
|
| 621 |
+
): # if the sets of indices are disjoint
|
| 622 |
+
try:
|
| 623 |
+
agenda.append(nonatomic.applyto(cur))
|
| 624 |
+
except linearlogic.LinearLogicApplicationException:
|
| 625 |
+
pass
|
| 626 |
+
except linearlogic.UnificationException:
|
| 627 |
+
pass
|
| 628 |
+
try:
|
| 629 |
+
atomics[glue_simp].append(cur)
|
| 630 |
+
except KeyError:
|
| 631 |
+
atomics[glue_simp] = [cur]
|
| 632 |
+
|
| 633 |
+
for entry in atomics:
|
| 634 |
+
for gf in atomics[entry]:
|
| 635 |
+
if len(gf.indices) == agenda_length:
|
| 636 |
+
self._add_to_reading_list(gf, readings)
|
| 637 |
+
for entry in nonatomics:
|
| 638 |
+
for gf in nonatomics[entry]:
|
| 639 |
+
if len(gf.indices) == agenda_length:
|
| 640 |
+
self._add_to_reading_list(gf, readings)
|
| 641 |
+
return readings
|
| 642 |
+
|
| 643 |
+
def _add_to_reading_list(self, glueformula, reading_list):
|
| 644 |
+
add_reading = True
|
| 645 |
+
if self.remove_duplicates:
|
| 646 |
+
for reading in reading_list:
|
| 647 |
+
try:
|
| 648 |
+
if reading.equiv(glueformula.meaning, self.prover):
|
| 649 |
+
add_reading = False
|
| 650 |
+
break
|
| 651 |
+
except Exception as e:
|
| 652 |
+
# if there is an exception, the syntax of the formula
|
| 653 |
+
# may not be understandable by the prover, so don't
|
| 654 |
+
# throw out the reading.
|
| 655 |
+
print("Error when checking logical equality of statements", e)
|
| 656 |
+
|
| 657 |
+
if add_reading:
|
| 658 |
+
reading_list.append(glueformula.meaning)
|
| 659 |
+
|
| 660 |
+
def parse_to_compiled(self, sentence):
|
| 661 |
+
gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
|
| 662 |
+
return [self.gfl_to_compiled(gfl) for gfl in gfls]
|
| 663 |
+
|
| 664 |
+
def dep_parse(self, sentence):
|
| 665 |
+
"""
|
| 666 |
+
Return a dependency graph for the sentence.
|
| 667 |
+
|
| 668 |
+
:param sentence: the sentence to be parsed
|
| 669 |
+
:type sentence: list(str)
|
| 670 |
+
:rtype: DependencyGraph
|
| 671 |
+
"""
|
| 672 |
+
|
| 673 |
+
# Lazy-initialize the depparser
|
| 674 |
+
if self.depparser is None:
|
| 675 |
+
from nltk.parse import MaltParser
|
| 676 |
+
|
| 677 |
+
self.depparser = MaltParser(tagger=self.get_pos_tagger())
|
| 678 |
+
if not self.depparser._trained:
|
| 679 |
+
self.train_depparser()
|
| 680 |
+
return self.depparser.parse(sentence, verbose=self.verbose)
|
| 681 |
+
|
| 682 |
+
def depgraph_to_glue(self, depgraph):
|
| 683 |
+
return self.get_glue_dict().to_glueformula_list(depgraph)
|
| 684 |
+
|
| 685 |
+
def get_glue_dict(self):
|
| 686 |
+
return GlueDict(self.semtype_file)
|
| 687 |
+
|
| 688 |
+
def gfl_to_compiled(self, gfl):
|
| 689 |
+
index_counter = Counter()
|
| 690 |
+
return_list = []
|
| 691 |
+
for gf in gfl:
|
| 692 |
+
return_list.extend(gf.compile(index_counter))
|
| 693 |
+
|
| 694 |
+
if self.verbose:
|
| 695 |
+
print("Compiled Glue Premises:")
|
| 696 |
+
for cgf in return_list:
|
| 697 |
+
print(cgf)
|
| 698 |
+
|
| 699 |
+
return return_list
|
| 700 |
+
|
| 701 |
+
def get_pos_tagger(self):
|
| 702 |
+
from nltk.corpus import brown
|
| 703 |
+
|
| 704 |
+
regexp_tagger = RegexpTagger(
|
| 705 |
+
[
|
| 706 |
+
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
|
| 707 |
+
(r"(The|the|A|a|An|an)$", "AT"), # articles
|
| 708 |
+
(r".*able$", "JJ"), # adjectives
|
| 709 |
+
(r".*ness$", "NN"), # nouns formed from adjectives
|
| 710 |
+
(r".*ly$", "RB"), # adverbs
|
| 711 |
+
(r".*s$", "NNS"), # plural nouns
|
| 712 |
+
(r".*ing$", "VBG"), # gerunds
|
| 713 |
+
(r".*ed$", "VBD"), # past tense verbs
|
| 714 |
+
(r".*", "NN"), # nouns (default)
|
| 715 |
+
]
|
| 716 |
+
)
|
| 717 |
+
brown_train = brown.tagged_sents(categories="news")
|
| 718 |
+
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
|
| 719 |
+
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
|
| 720 |
+
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
|
| 721 |
+
|
| 722 |
+
# Override particular words
|
| 723 |
+
main_tagger = RegexpTagger(
|
| 724 |
+
[(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
|
| 725 |
+
backoff=trigram_tagger,
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
return main_tagger
|
| 729 |
+
|
| 730 |
+
|
| 731 |
+
class DrtGlueFormula(GlueFormula):
|
| 732 |
+
def __init__(self, meaning, glue, indices=None):
|
| 733 |
+
if not indices:
|
| 734 |
+
indices = set()
|
| 735 |
+
|
| 736 |
+
if isinstance(meaning, str):
|
| 737 |
+
self.meaning = drt.DrtExpression.fromstring(meaning)
|
| 738 |
+
elif isinstance(meaning, drt.DrtExpression):
|
| 739 |
+
self.meaning = meaning
|
| 740 |
+
else:
|
| 741 |
+
raise RuntimeError(
|
| 742 |
+
"Meaning term neither string or expression: %s, %s"
|
| 743 |
+
% (meaning, meaning.__class__)
|
| 744 |
+
)
|
| 745 |
+
|
| 746 |
+
if isinstance(glue, str):
|
| 747 |
+
self.glue = linearlogic.LinearLogicParser().parse(glue)
|
| 748 |
+
elif isinstance(glue, linearlogic.Expression):
|
| 749 |
+
self.glue = glue
|
| 750 |
+
else:
|
| 751 |
+
raise RuntimeError(
|
| 752 |
+
"Glue term neither string or expression: %s, %s"
|
| 753 |
+
% (glue, glue.__class__)
|
| 754 |
+
)
|
| 755 |
+
|
| 756 |
+
self.indices = indices
|
| 757 |
+
|
| 758 |
+
def make_VariableExpression(self, name):
|
| 759 |
+
return drt.DrtVariableExpression(name)
|
| 760 |
+
|
| 761 |
+
def make_LambdaExpression(self, variable, term):
|
| 762 |
+
return drt.DrtLambdaExpression(variable, term)
|
| 763 |
+
|
| 764 |
+
|
| 765 |
+
class DrtGlueDict(GlueDict):
|
| 766 |
+
def get_GlueFormula_factory(self):
|
| 767 |
+
return DrtGlueFormula
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
class DrtGlue(Glue):
|
| 771 |
+
def __init__(
|
| 772 |
+
self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
|
| 773 |
+
):
|
| 774 |
+
if not semtype_file:
|
| 775 |
+
semtype_file = os.path.join(
|
| 776 |
+
"grammars", "sample_grammars", "drt_glue.semtype"
|
| 777 |
+
)
|
| 778 |
+
Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
|
| 779 |
+
|
| 780 |
+
def get_glue_dict(self):
|
| 781 |
+
return DrtGlueDict(self.semtype_file)
|
| 782 |
+
|
| 783 |
+
|
| 784 |
+
def demo(show_example=-1):
|
| 785 |
+
from nltk.parse import MaltParser
|
| 786 |
+
|
| 787 |
+
examples = [
|
| 788 |
+
"David sees Mary",
|
| 789 |
+
"David eats a sandwich",
|
| 790 |
+
"every man chases a dog",
|
| 791 |
+
"every man believes a dog sleeps",
|
| 792 |
+
"John gives David a sandwich",
|
| 793 |
+
"John chases himself",
|
| 794 |
+
]
|
| 795 |
+
# 'John persuades David to order a pizza',
|
| 796 |
+
# 'John tries to go',
|
| 797 |
+
# 'John tries to find a unicorn',
|
| 798 |
+
# 'John seems to vanish',
|
| 799 |
+
# 'a unicorn seems to approach',
|
| 800 |
+
# 'every big cat leaves',
|
| 801 |
+
# 'every gray cat leaves',
|
| 802 |
+
# 'every big gray cat leaves',
|
| 803 |
+
# 'a former senator leaves',
|
| 804 |
+
|
| 805 |
+
print("============== DEMO ==============")
|
| 806 |
+
|
| 807 |
+
tagger = RegexpTagger(
|
| 808 |
+
[
|
| 809 |
+
("^(David|Mary|John)$", "NNP"),
|
| 810 |
+
(
|
| 811 |
+
"^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
|
| 812 |
+
"VB",
|
| 813 |
+
),
|
| 814 |
+
("^(go|order|vanish|find|approach)$", "VB"),
|
| 815 |
+
("^(a)$", "ex_quant"),
|
| 816 |
+
("^(every)$", "univ_quant"),
|
| 817 |
+
("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
|
| 818 |
+
("^(big|gray|former)$", "JJ"),
|
| 819 |
+
("^(him|himself)$", "PRP"),
|
| 820 |
+
]
|
| 821 |
+
)
|
| 822 |
+
|
| 823 |
+
depparser = MaltParser(tagger=tagger)
|
| 824 |
+
glue = Glue(depparser=depparser, verbose=False)
|
| 825 |
+
|
| 826 |
+
for (i, sentence) in enumerate(examples):
|
| 827 |
+
if i == show_example or show_example == -1:
|
| 828 |
+
print(f"[[[Example {i}]]] {sentence}")
|
| 829 |
+
for reading in glue.parse_to_meaning(sentence.split()):
|
| 830 |
+
print(reading.simplify())
|
| 831 |
+
print("")
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
if __name__ == "__main__":
|
| 835 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Logic
|
| 2 |
+
#
|
| 3 |
+
# Author: Peter Wang
|
| 4 |
+
# Updated by: Dan Garrette <dhgarrette@gmail.com>
|
| 5 |
+
#
|
| 6 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
An implementation of the Hole Semantics model, following Blackburn and Bos,
|
| 12 |
+
Representation and Inference for Natural Language (CSLI, 2005).
|
| 13 |
+
|
| 14 |
+
The semantic representations are built by the grammar hole.fcfg.
|
| 15 |
+
This module contains driver code to read in sentences and parse them
|
| 16 |
+
according to a hole semantics grammar.
|
| 17 |
+
|
| 18 |
+
After parsing, the semantic representation is in the form of an underspecified
|
| 19 |
+
representation that is not easy to read. We use a "plugging" algorithm to
|
| 20 |
+
convert that representation into first-order logic formulas.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from functools import reduce
|
| 24 |
+
|
| 25 |
+
from nltk.parse import load_parser
|
| 26 |
+
from nltk.sem.logic import (
|
| 27 |
+
AllExpression,
|
| 28 |
+
AndExpression,
|
| 29 |
+
ApplicationExpression,
|
| 30 |
+
ExistsExpression,
|
| 31 |
+
IffExpression,
|
| 32 |
+
ImpExpression,
|
| 33 |
+
LambdaExpression,
|
| 34 |
+
NegatedExpression,
|
| 35 |
+
OrExpression,
|
| 36 |
+
)
|
| 37 |
+
from nltk.sem.skolemize import skolemize
|
| 38 |
+
|
| 39 |
+
# Note that in this code there may be multiple types of trees being referred to:
|
| 40 |
+
#
|
| 41 |
+
# 1. parse trees
|
| 42 |
+
# 2. the underspecified representation
|
| 43 |
+
# 3. first-order logic formula trees
|
| 44 |
+
# 4. the search space when plugging (search tree)
|
| 45 |
+
#
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Constants:
|
| 49 |
+
ALL = "ALL"
|
| 50 |
+
EXISTS = "EXISTS"
|
| 51 |
+
NOT = "NOT"
|
| 52 |
+
AND = "AND"
|
| 53 |
+
OR = "OR"
|
| 54 |
+
IMP = "IMP"
|
| 55 |
+
IFF = "IFF"
|
| 56 |
+
PRED = "PRED"
|
| 57 |
+
LEQ = "LEQ"
|
| 58 |
+
HOLE = "HOLE"
|
| 59 |
+
LABEL = "LABEL"
|
| 60 |
+
|
| 61 |
+
MAP = {
|
| 62 |
+
ALL: lambda v, e: AllExpression(v.variable, e),
|
| 63 |
+
EXISTS: lambda v, e: ExistsExpression(v.variable, e),
|
| 64 |
+
NOT: NegatedExpression,
|
| 65 |
+
AND: AndExpression,
|
| 66 |
+
OR: OrExpression,
|
| 67 |
+
IMP: ImpExpression,
|
| 68 |
+
IFF: IffExpression,
|
| 69 |
+
PRED: ApplicationExpression,
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class HoleSemantics:
|
| 74 |
+
"""
|
| 75 |
+
This class holds the broken-down components of a hole semantics, i.e. it
|
| 76 |
+
extracts the holes, labels, logic formula fragments and constraints out of
|
| 77 |
+
a big conjunction of such as produced by the hole semantics grammar. It
|
| 78 |
+
then provides some operations on the semantics dealing with holes, labels
|
| 79 |
+
and finding legal ways to plug holes with labels.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
def __init__(self, usr):
|
| 83 |
+
"""
|
| 84 |
+
Constructor. `usr' is a ``sem.Expression`` representing an
|
| 85 |
+
Underspecified Representation Structure (USR). A USR has the following
|
| 86 |
+
special predicates:
|
| 87 |
+
ALL(l,v,n),
|
| 88 |
+
EXISTS(l,v,n),
|
| 89 |
+
AND(l,n,n),
|
| 90 |
+
OR(l,n,n),
|
| 91 |
+
IMP(l,n,n),
|
| 92 |
+
IFF(l,n,n),
|
| 93 |
+
PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions,
|
| 94 |
+
LEQ(n,n),
|
| 95 |
+
HOLE(n),
|
| 96 |
+
LABEL(n)
|
| 97 |
+
where l is the label of the node described by the predicate, n is either
|
| 98 |
+
a label or a hole, and v is a variable.
|
| 99 |
+
"""
|
| 100 |
+
self.holes = set()
|
| 101 |
+
self.labels = set()
|
| 102 |
+
self.fragments = {} # mapping of label -> formula fragment
|
| 103 |
+
self.constraints = set() # set of Constraints
|
| 104 |
+
self._break_down(usr)
|
| 105 |
+
self.top_most_labels = self._find_top_most_labels()
|
| 106 |
+
self.top_hole = self._find_top_hole()
|
| 107 |
+
|
| 108 |
+
def is_node(self, x):
|
| 109 |
+
"""
|
| 110 |
+
Return true if x is a node (label or hole) in this semantic
|
| 111 |
+
representation.
|
| 112 |
+
"""
|
| 113 |
+
return x in (self.labels | self.holes)
|
| 114 |
+
|
| 115 |
+
def _break_down(self, usr):
|
| 116 |
+
"""
|
| 117 |
+
Extract holes, labels, formula fragments and constraints from the hole
|
| 118 |
+
semantics underspecified representation (USR).
|
| 119 |
+
"""
|
| 120 |
+
if isinstance(usr, AndExpression):
|
| 121 |
+
self._break_down(usr.first)
|
| 122 |
+
self._break_down(usr.second)
|
| 123 |
+
elif isinstance(usr, ApplicationExpression):
|
| 124 |
+
func, args = usr.uncurry()
|
| 125 |
+
if func.variable.name == Constants.LEQ:
|
| 126 |
+
self.constraints.add(Constraint(args[0], args[1]))
|
| 127 |
+
elif func.variable.name == Constants.HOLE:
|
| 128 |
+
self.holes.add(args[0])
|
| 129 |
+
elif func.variable.name == Constants.LABEL:
|
| 130 |
+
self.labels.add(args[0])
|
| 131 |
+
else:
|
| 132 |
+
label = args[0]
|
| 133 |
+
assert label not in self.fragments
|
| 134 |
+
self.fragments[label] = (func, args[1:])
|
| 135 |
+
else:
|
| 136 |
+
raise ValueError(usr.label())
|
| 137 |
+
|
| 138 |
+
def _find_top_nodes(self, node_list):
|
| 139 |
+
top_nodes = node_list.copy()
|
| 140 |
+
for f in self.fragments.values():
|
| 141 |
+
# the label is the first argument of the predicate
|
| 142 |
+
args = f[1]
|
| 143 |
+
for arg in args:
|
| 144 |
+
if arg in node_list:
|
| 145 |
+
top_nodes.discard(arg)
|
| 146 |
+
return top_nodes
|
| 147 |
+
|
| 148 |
+
def _find_top_most_labels(self):
|
| 149 |
+
"""
|
| 150 |
+
Return the set of labels which are not referenced directly as part of
|
| 151 |
+
another formula fragment. These will be the top-most labels for the
|
| 152 |
+
subtree that they are part of.
|
| 153 |
+
"""
|
| 154 |
+
return self._find_top_nodes(self.labels)
|
| 155 |
+
|
| 156 |
+
def _find_top_hole(self):
|
| 157 |
+
"""
|
| 158 |
+
Return the hole that will be the top of the formula tree.
|
| 159 |
+
"""
|
| 160 |
+
top_holes = self._find_top_nodes(self.holes)
|
| 161 |
+
assert len(top_holes) == 1 # it must be unique
|
| 162 |
+
return top_holes.pop()
|
| 163 |
+
|
| 164 |
+
def pluggings(self):
|
| 165 |
+
"""
|
| 166 |
+
Calculate and return all the legal pluggings (mappings of labels to
|
| 167 |
+
holes) of this semantics given the constraints.
|
| 168 |
+
"""
|
| 169 |
+
record = []
|
| 170 |
+
self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record)
|
| 171 |
+
return record
|
| 172 |
+
|
| 173 |
+
def _plug_nodes(self, queue, potential_labels, plug_acc, record):
|
| 174 |
+
"""
|
| 175 |
+
Plug the nodes in `queue' with the labels in `potential_labels'.
|
| 176 |
+
|
| 177 |
+
Each element of `queue' is a tuple of the node to plug and the list of
|
| 178 |
+
ancestor holes from the root of the graph to that node.
|
| 179 |
+
|
| 180 |
+
`potential_labels' is a set of the labels which are still available for
|
| 181 |
+
plugging.
|
| 182 |
+
|
| 183 |
+
`plug_acc' is the incomplete mapping of holes to labels made on the
|
| 184 |
+
current branch of the search tree so far.
|
| 185 |
+
|
| 186 |
+
`record' is a list of all the complete pluggings that we have found in
|
| 187 |
+
total so far. It is the only parameter that is destructively updated.
|
| 188 |
+
"""
|
| 189 |
+
if queue != []:
|
| 190 |
+
(node, ancestors) = queue[0]
|
| 191 |
+
if node in self.holes:
|
| 192 |
+
# The node is a hole, try to plug it.
|
| 193 |
+
self._plug_hole(
|
| 194 |
+
node, ancestors, queue[1:], potential_labels, plug_acc, record
|
| 195 |
+
)
|
| 196 |
+
else:
|
| 197 |
+
assert node in self.labels
|
| 198 |
+
# The node is a label. Replace it in the queue by the holes and
|
| 199 |
+
# labels in the formula fragment named by that label.
|
| 200 |
+
args = self.fragments[node][1]
|
| 201 |
+
head = [(a, ancestors) for a in args if self.is_node(a)]
|
| 202 |
+
self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
|
| 203 |
+
else:
|
| 204 |
+
raise Exception("queue empty")
|
| 205 |
+
|
| 206 |
+
def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
|
| 207 |
+
"""
|
| 208 |
+
Try all possible ways of plugging a single hole.
|
| 209 |
+
See _plug_nodes for the meanings of the parameters.
|
| 210 |
+
"""
|
| 211 |
+
# Add the current hole we're trying to plug into the list of ancestors.
|
| 212 |
+
assert hole not in ancestors0
|
| 213 |
+
ancestors = [hole] + ancestors0
|
| 214 |
+
|
| 215 |
+
# Try each potential label in this hole in turn.
|
| 216 |
+
for l in potential_labels0:
|
| 217 |
+
# Is the label valid in this hole?
|
| 218 |
+
if self._violates_constraints(l, ancestors):
|
| 219 |
+
continue
|
| 220 |
+
|
| 221 |
+
plug_acc = plug_acc0.copy()
|
| 222 |
+
plug_acc[hole] = l
|
| 223 |
+
potential_labels = potential_labels0.copy()
|
| 224 |
+
potential_labels.remove(l)
|
| 225 |
+
|
| 226 |
+
if len(potential_labels) == 0:
|
| 227 |
+
# No more potential labels. That must mean all the holes have
|
| 228 |
+
# been filled so we have found a legal plugging so remember it.
|
| 229 |
+
#
|
| 230 |
+
# Note that the queue might not be empty because there might
|
| 231 |
+
# be labels on there that point to formula fragments with
|
| 232 |
+
# no holes in them. _sanity_check_plugging will make sure
|
| 233 |
+
# all holes are filled.
|
| 234 |
+
self._sanity_check_plugging(plug_acc, self.top_hole, [])
|
| 235 |
+
record.append(plug_acc)
|
| 236 |
+
else:
|
| 237 |
+
# Recursively try to fill in the rest of the holes in the
|
| 238 |
+
# queue. The label we just plugged into the hole could have
|
| 239 |
+
# holes of its own so at the end of the queue. Putting it on
|
| 240 |
+
# the end of the queue gives us a breadth-first search, so that
|
| 241 |
+
# all the holes at level i of the formula tree are filled
|
| 242 |
+
# before filling level i+1.
|
| 243 |
+
# A depth-first search would work as well since the trees must
|
| 244 |
+
# be finite but the bookkeeping would be harder.
|
| 245 |
+
self._plug_nodes(
|
| 246 |
+
queue + [(l, ancestors)], potential_labels, plug_acc, record
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
def _violates_constraints(self, label, ancestors):
|
| 250 |
+
"""
|
| 251 |
+
Return True if the `label' cannot be placed underneath the holes given
|
| 252 |
+
by the set `ancestors' because it would violate the constraints imposed
|
| 253 |
+
on it.
|
| 254 |
+
"""
|
| 255 |
+
for c in self.constraints:
|
| 256 |
+
if c.lhs == label:
|
| 257 |
+
if c.rhs not in ancestors:
|
| 258 |
+
return True
|
| 259 |
+
return False
|
| 260 |
+
|
| 261 |
+
def _sanity_check_plugging(self, plugging, node, ancestors):
|
| 262 |
+
"""
|
| 263 |
+
Make sure that a given plugging is legal. We recursively go through
|
| 264 |
+
each node and make sure that no constraints are violated.
|
| 265 |
+
We also check that all holes have been filled.
|
| 266 |
+
"""
|
| 267 |
+
if node in self.holes:
|
| 268 |
+
ancestors = [node] + ancestors
|
| 269 |
+
label = plugging[node]
|
| 270 |
+
else:
|
| 271 |
+
label = node
|
| 272 |
+
assert label in self.labels
|
| 273 |
+
for c in self.constraints:
|
| 274 |
+
if c.lhs == label:
|
| 275 |
+
assert c.rhs in ancestors
|
| 276 |
+
args = self.fragments[label][1]
|
| 277 |
+
for arg in args:
|
| 278 |
+
if self.is_node(arg):
|
| 279 |
+
self._sanity_check_plugging(plugging, arg, [label] + ancestors)
|
| 280 |
+
|
| 281 |
+
def formula_tree(self, plugging):
|
| 282 |
+
"""
|
| 283 |
+
Return the first-order logic formula tree for this underspecified
|
| 284 |
+
representation using the plugging given.
|
| 285 |
+
"""
|
| 286 |
+
return self._formula_tree(plugging, self.top_hole)
|
| 287 |
+
|
| 288 |
+
def _formula_tree(self, plugging, node):
|
| 289 |
+
if node in plugging:
|
| 290 |
+
return self._formula_tree(plugging, plugging[node])
|
| 291 |
+
elif node in self.fragments:
|
| 292 |
+
pred, args = self.fragments[node]
|
| 293 |
+
children = [self._formula_tree(plugging, arg) for arg in args]
|
| 294 |
+
return reduce(Constants.MAP[pred.variable.name], children)
|
| 295 |
+
else:
|
| 296 |
+
return node
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class Constraint:
|
| 300 |
+
"""
|
| 301 |
+
This class represents a constraint of the form (L =< N),
|
| 302 |
+
where L is a label and N is a node (a label or a hole).
|
| 303 |
+
"""
|
| 304 |
+
|
| 305 |
+
def __init__(self, lhs, rhs):
|
| 306 |
+
self.lhs = lhs
|
| 307 |
+
self.rhs = rhs
|
| 308 |
+
|
| 309 |
+
def __eq__(self, other):
|
| 310 |
+
if self.__class__ == other.__class__:
|
| 311 |
+
return self.lhs == other.lhs and self.rhs == other.rhs
|
| 312 |
+
else:
|
| 313 |
+
return False
|
| 314 |
+
|
| 315 |
+
def __ne__(self, other):
|
| 316 |
+
return not (self == other)
|
| 317 |
+
|
| 318 |
+
def __hash__(self):
|
| 319 |
+
return hash(repr(self))
|
| 320 |
+
|
| 321 |
+
def __repr__(self):
|
| 322 |
+
return f"({self.lhs} < {self.rhs})"
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def hole_readings(sentence, grammar_filename=None, verbose=False):
|
| 326 |
+
if not grammar_filename:
|
| 327 |
+
grammar_filename = "grammars/sample_grammars/hole.fcfg"
|
| 328 |
+
|
| 329 |
+
if verbose:
|
| 330 |
+
print("Reading grammar file", grammar_filename)
|
| 331 |
+
|
| 332 |
+
parser = load_parser(grammar_filename)
|
| 333 |
+
|
| 334 |
+
# Parse the sentence.
|
| 335 |
+
tokens = sentence.split()
|
| 336 |
+
trees = list(parser.parse(tokens))
|
| 337 |
+
if verbose:
|
| 338 |
+
print("Got %d different parses" % len(trees))
|
| 339 |
+
|
| 340 |
+
all_readings = []
|
| 341 |
+
for tree in trees:
|
| 342 |
+
# Get the semantic feature from the top of the parse tree.
|
| 343 |
+
sem = tree.label()["SEM"].simplify()
|
| 344 |
+
|
| 345 |
+
# Print the raw semantic representation.
|
| 346 |
+
if verbose:
|
| 347 |
+
print("Raw: ", sem)
|
| 348 |
+
|
| 349 |
+
# Skolemize away all quantifiers. All variables become unique.
|
| 350 |
+
while isinstance(sem, LambdaExpression):
|
| 351 |
+
sem = sem.term
|
| 352 |
+
skolemized = skolemize(sem)
|
| 353 |
+
|
| 354 |
+
if verbose:
|
| 355 |
+
print("Skolemized:", skolemized)
|
| 356 |
+
|
| 357 |
+
# Break the hole semantics representation down into its components
|
| 358 |
+
# i.e. holes, labels, formula fragments and constraints.
|
| 359 |
+
hole_sem = HoleSemantics(skolemized)
|
| 360 |
+
|
| 361 |
+
# Maybe show the details of the semantic representation.
|
| 362 |
+
if verbose:
|
| 363 |
+
print("Holes: ", hole_sem.holes)
|
| 364 |
+
print("Labels: ", hole_sem.labels)
|
| 365 |
+
print("Constraints: ", hole_sem.constraints)
|
| 366 |
+
print("Top hole: ", hole_sem.top_hole)
|
| 367 |
+
print("Top labels: ", hole_sem.top_most_labels)
|
| 368 |
+
print("Fragments:")
|
| 369 |
+
for l, f in hole_sem.fragments.items():
|
| 370 |
+
print(f"\t{l}: {f}")
|
| 371 |
+
|
| 372 |
+
# Find all the possible ways to plug the formulas together.
|
| 373 |
+
pluggings = hole_sem.pluggings()
|
| 374 |
+
|
| 375 |
+
# Build FOL formula trees using the pluggings.
|
| 376 |
+
readings = list(map(hole_sem.formula_tree, pluggings))
|
| 377 |
+
|
| 378 |
+
# Print out the formulas in a textual format.
|
| 379 |
+
if verbose:
|
| 380 |
+
for i, r in enumerate(readings):
|
| 381 |
+
print()
|
| 382 |
+
print("%d. %s" % (i, r))
|
| 383 |
+
print()
|
| 384 |
+
|
| 385 |
+
all_readings.extend(readings)
|
| 386 |
+
|
| 387 |
+
return all_readings
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
if __name__ == "__main__":
|
| 391 |
+
for r in hole_readings("a dog barks"):
|
| 392 |
+
print(r)
|
| 393 |
+
print()
|
| 394 |
+
for r in hole_readings("every girl chases a dog"):
|
| 395 |
+
print(r)
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
NLTK Stemmers
|
| 12 |
+
|
| 13 |
+
Interfaces used to remove morphological affixes from words, leaving
|
| 14 |
+
only the word stem. Stemming algorithms aim to remove those affixes
|
| 15 |
+
required for eg. grammatical role, tense, derivational morphology
|
| 16 |
+
leaving only the stem of the word. This is a difficult problem due to
|
| 17 |
+
irregular words (eg. common verbs in English), complicated
|
| 18 |
+
morphological rules, and part-of-speech and sense ambiguities
|
| 19 |
+
(eg. ``ceil-`` is not the stem of ``ceiling``).
|
| 20 |
+
|
| 21 |
+
StemmerI defines a standard interface for stemmers.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from nltk.stem.api import StemmerI
|
| 25 |
+
from nltk.stem.arlstem import ARLSTem
|
| 26 |
+
from nltk.stem.arlstem2 import ARLSTem2
|
| 27 |
+
from nltk.stem.cistem import Cistem
|
| 28 |
+
from nltk.stem.isri import ISRIStemmer
|
| 29 |
+
from nltk.stem.lancaster import LancasterStemmer
|
| 30 |
+
from nltk.stem.porter import PorterStemmer
|
| 31 |
+
from nltk.stem.regexp import RegexpStemmer
|
| 32 |
+
from nltk.stem.rslp import RSLPStemmer
|
| 33 |
+
from nltk.stem.snowball import SnowballStemmer
|
| 34 |
+
from nltk.stem.wordnet import WordNetLemmatizer
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmer Interface
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# Steven Bird <stevenbird1@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
from abc import ABCMeta, abstractmethod
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class StemmerI(metaclass=ABCMeta):
|
| 14 |
+
"""
|
| 15 |
+
A processing interface for removing morphological affixes from
|
| 16 |
+
words. This process is known as stemming.
|
| 17 |
+
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
@abstractmethod
|
| 21 |
+
def stem(self, token):
|
| 22 |
+
"""
|
| 23 |
+
Strip affixes from the token and return the stem.
|
| 24 |
+
|
| 25 |
+
:param token: The token that should be stemmed.
|
| 26 |
+
:type token: str
|
| 27 |
+
"""
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Stemmers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
|
| 10 |
+
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
|
| 11 |
+
"""
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
+
from nltk.stem.api import StemmerI
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

    >>> from nltk.stem.lancaster import LancasterStemmer
    >>> st = LancasterStemmer()
    >>> st.stem('maximum')     # Remove "-um" when word is intact
    'maxim'
    >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
    'presum'
    >>> st.stem('multiply')    # No action taken if word ends with "-ply"
    'multiply'
    >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
    'provid'
    >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
    'ow'
    >>> st.stem('ear')         # ditto
    'ear'
    >>> st.stem('saying')      # Words starting with consonant must contain at least 3
    'say'
    >>> st.stem('crying')      # letters and one of those letters must be a vowel
    'cry'
    >>> st.stem('string')      # ditto
    'string'
    >>> st.stem('meant')       # ditto
    'meant'
    >>> st.stem('cement')      # ditto
    'cem'
    >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
    >>> st_pre.stem('kilometer') # Test Prefix
    'met'
    >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
    >>> st_custom.stem("ness") # Change s to t
    'nest'
    """

    # The rule list is static since it doesn't change between instances.
    # Each rule is encoded as: <reversed ending><optional '*'><digits to
    # remove><letters to append><'>' continue | '.' stop>.  '*' means the
    # rule only fires when the word is still intact (unstemmed).
    default_rule_tuple = (
        "ai*2.",  # -ia > - if intact
        "a*1.",  # -a > - if intact
        "bb1.",  # -bb > -b
        "city3s.",  # -ytic > -ys
        "ci2>",  # -ic > -
        "cn1t>",  # -nc > -nt
        "dd1.",  # -dd > -d
        "dei3y>",  # -ied > -y
        "deec2ss.",  # -ceed >", -cess
        "dee1.",  # -eed > -ee
        "de2>",  # -ed > -
        "dooh4>",  # -hood > -
        "e1>",  # -e > -
        "feil1v.",  # -lief > -liev
        "fi2>",  # -if > -
        "gni3>",  # -ing > -
        "gai3y.",  # -iag > -y
        "ga2>",  # -ag > -
        "gg1.",  # -gg > -g
        "ht*2.",  # -th > - if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",  # -ish > -
        "i*1.",  # -i > - if intact
        "i1y>",  # -i > -y
        "ji1d.",  # -ij > -id -- see nois4j> & vis3j>
        "juf1s.",  # -fuj > -fus
        "ju1d.",  # -uj > -ud
        "jo1d.",  # -oj > -od
        "jeh1r.",  # -hej > -her
        "jrev1t.",  # -verj > -vert
        "jsim2t.",  # -misj > -mit
        "jn1d.",  # -nj > -nd
        "j1s.",  # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",  # -iabl > -y
        "lba3>",  # -abl > -
        "lbi3.",  # -ibl > -
        "lib2l>",  # -bil > -bl
        "lc1.",  # -cl > c
        "lufi4y.",  # -iful > -y
        "luf3>",  # -ful > -
        "lu2.",  # -ul > -
        "lai3>",  # -ial > -
        "lau3>",  # -ual > -
        "la2>",  # -al > -
        "ll1.",  # -ll > -l
        "mui3.",  # -ium > -
        "mu*2.",  # -um > - if intact
        "msi3>",  # -ism > -
        "mm1.",  # -mm > -m
        "nois4j>",  # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",  # -ion > -
        "nai3>",  # -ian > -
        "na2>",  # -an > -
        "nee0.",  # protect -een
        "ne2>",  # -en > -
        "nn1.",  # -nn > -n
        "pihs4>",  # -ship > -
        "pp1.",  # -pp > -p
        "re2>",  # -er > -
        "rae0.",  # protect -ear
        "ra2.",  # -ar > -
        "ro2>",  # -or > -
        "ru2>",  # -ur > -
        "rr1.",  # -rr > -r
        "rt1>",  # -tr > -t
        "rei3y>",  # -ier > -y
        "sei3y>",  # -ies > -y
        "sis2.",  # -sis > -s
        "si2>",  # -is > -
        "ssen4>",  # -ness > -
        "ss0.",  # protect -ss
        "suo3>",  # -ous > -
        "su*2.",  # -us > - if intact
        "s*1>",  # -s > - if intact
        "s0.",  # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",  # -at > -
        "tnem4>",  # -ment > -
        "tne3>",  # -ent > -
        "tna3>",  # -ant > -
        "tpir2b.",  # -ript > -rib
        "tpro2b.",  # -orpt > -orb
        "tcud1.",  # -duct > -duc
        "tpmus2.",  # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",  # -olut > -olv
        "tsis0.",  # protect -sist
        "tsi3>",  # -ist > -
        "tt1.",  # -tt > -t
        "uqi3.",  # -iqu > -
        "ugo1.",  # -ogu > -og
        "vis3j>",  # -siv > -j
        "vie0.",  # protect -eiv
        "vi2>",  # -iv > -
        "ylb1>",  # -bly > -bl
        "yli3y>",  # -ily > -y
        "ylp0.",  # protect -ply
        "yl2>",  # -ly > -
        "ygo1.",  # -ogy > -og
        "yhp1.",  # -phy > -ph
        "ymo1.",  # -omy > -om
        "ypo1.",  # -opy > -op
        "yti3>",  # -ity > -
        "yte3>",  # -ety > -
        "ytl2.",  # -lty > -l
        "yrtsi5.",  # -istry > -
        "yra3>",  # -ary > -
        "yro3>",  # -ory > -
        "yfi3.",  # -ify > -
        "ycn2t>",  # -ncy > -nt
        "yca3>",  # -acy > -
        "zi2>",  # -iz > -
        "zy1s.",  # -yz > -ys
    )

    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
        """Create an instance of the Lancaster stemmer.

        :param rule_tuple: optional custom rule set; defaults to
            ``default_rule_tuple`` when omitted.
        :param strip_prefix_flag: when True, strip a known measurement
            prefix (kilo-, micro-, ...) before stemming.
        """
        # Setup an empty rule dictionary - this will be filled in lazily,
        # the first time stem() is called (or by calling parseRules()).
        self.rule_dictionary = {}
        # Check if a user wants to strip prefix
        self._strip_prefix = strip_prefix_flag
        # Check if a user wants to use their own rule tuple.
        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

    def parseRules(self, rule_tuple=None):
        """Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        :raises ValueError: if any rule does not match the expected encoding.
        """
        # If there is no argument for the function, use class' own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
        valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not valid_rule.match(rule):
                raise ValueError(f"The rule {rule} is invalid")
            # Index rules by their first character (the last letter of the
            # ending they match, since endings are stored reversed).
            self.rule_dictionary.setdefault(rule[0], []).append(rule)

    def stem(self, word):
        """Stem a word using the Lancaster stemmer."""
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word, needed by the '*' (intact) rules
        intact_word = word

        # If rule dictionary is empty, parse rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming.

        Repeatedly applies the first matching rule for the word's last
        letter until either a terminating ('.') rule fires or no rule
        applies.
        """
        valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching
            # that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                proceed = False
            else:
                rule_was_applied = False

                # Go through each rule that matches the word's final letter
                for rule in self.rule_dictionary[word[last_letter_position]]:
                    rule_match = valid_rule.match(rule)
                    if not rule_match:
                        continue
                    (
                        ending_string,
                        intact_flag,
                        remove_total,
                        append_string,
                        cont_flag,
                    ) = rule_match.groups()

                    # Convert the number of chars to remove when stemming
                    # from a string to an integer
                    remove_total = int(remove_total)

                    # Rule endings are stored reversed; skip unless the
                    # word's ending matches the rule's word ending.
                    if not word.endswith(ending_string[::-1]):
                        continue

                    # An intact ('*') rule may only fire while the word is
                    # still unmodified.
                    if intact_flag and word != intact_word:
                        continue

                    if self.__isAcceptable(word, remove_total):
                        word = self.__applyRule(word, remove_total, append_string)
                        rule_was_applied = True
                        if cont_flag == ".":
                            # Terminating rule: stop stemming entirely.
                            proceed = False
                        break

                # If no rules apply, the word doesn't need any more stemming
                if not rule_was_applied:
                    proceed = False
        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last alphabetic character in this string.

        Scans from the start and stops at the first non-alphabetic
        character; returns -1 when the word starts with a non-letter.
        """
        last_letter = -1
        for position in range(len(word)):
            if word[position].isalpha():
                last_letter = position
            else:
                break
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming."""
        word_is_acceptable = False
        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            if len(word) - remove_total >= 2:
                word_is_acceptable = True
        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        elif len(word) - remove_total >= 3:
            if word[1] in "aeiouy":
                word_is_acceptable = True
            elif word[2] in "aeiouy":
                word_is_acceptable = True
        return word_is_acceptable

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word."""
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove a known measurement prefix from a word.

        This function originally taken from Whoosh.
        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return "<LancasterStemmer>"
|
.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: RSLP Stemmer
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Tiago Tresoldi <tresoldi@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
# This code is based on the algorithm presented in the paper "A Stemming
|
| 9 |
+
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
|
| 10 |
+
# Christian Huyck, which unfortunately I had no access to. The code is a
|
| 11 |
+
# Python version, with some minor modifications of mine, to the description
|
| 12 |
+
# presented at https://www.webcitation.org/5NnvdIzOb and to the C source code
|
| 13 |
+
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
|
| 14 |
+
# Please note that this stemmer is intended for demonstration and educational
|
| 15 |
+
# purposes only. Feel free to write me for any comments, including the
|
| 16 |
+
# development of a different and/or better stemmer for Portuguese. I also
|
| 17 |
+
# suggest using NLTK's mailing list for Portuguese for any discussion.
|
| 18 |
+
|
| 19 |
+
# Este código é baseado no algoritmo apresentado no artigo "A Stemming
|
| 20 |
+
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
|
| 21 |
+
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
|
| 22 |
+
# código é uma conversão para Python, com algumas pequenas modificações
|
| 23 |
+
# minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do
|
| 24 |
+
# código para linguagem C disponível em
|
| 25 |
+
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
|
| 26 |
+
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
|
| 27 |
+
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
|
| 28 |
+
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
|
| 29 |
+
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
|
| 30 |
+
# do NLTK para o português para qualquer debate.
|
| 31 |
+
|
| 32 |
+
from nltk.data import load
|
| 33 |
+
from nltk.stem.api import StemmerI
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    >>> from nltk.stem import RSLPStemmer
    >>> st = RSLPStemmer()
    >>> # opening lines of Erico Verissimo's "Música ao Longe"
    >>> text = '''
    ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
    ... devem copiar . Uma casinha de porta e janela , em cima duma
    ... coxilha .'''
    >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE
    ...     print(st.stem(token))
    clariss risc com giz no quadro-negr a pais que os alun dev copi .
    uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        # One rule table per algorithm step (0..6), loaded from the
        # packaged nltk data files.
        step_files = (
            "step0.pt",
            "step1.pt",
            "step2.pt",
            "step3.pt",
            "step4.pt",
            "step5.pt",
            "step6.pt",
        )
        self._model = [self.read_rule(name) for name in step_files]

    def read_rule(self, filename):
        """Parse one rule file into a list of
        ``[suffix, min_stem_size, replacement, exceptions]`` entries."""
        raw = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")

        parsed = []
        for line in raw.split("\n"):
            # Skip blank lines and comment lines.
            if line == "" or line[0] == "#":
                continue
            # NOTE: a simple but ugly hack to make this parser happy with
            # double '\t's in the data files.
            fields = line.replace("\t\t", "\t").split("\t")
            parsed.append(
                [
                    # suffix searched for at the end of the word (quotes stripped)
                    fields[0][1:-1],
                    # minimum stem size required to perform the replacement
                    int(fields[1]),
                    # replacement text (quotes stripped)
                    fields[2][1:-1],
                    # whole-word exceptions to this rule (quotes stripped)
                    [token[1:-1] for token in fields[3].split(",")],
                ]
            )
        return parsed

    def stem(self, word):
        """Return the RSLP stem of ``word`` (lower-cased first)."""
        word = word.lower()

        # Plural reduction: only for words ending in 's'.
        if word[-1] == "s":
            word = self.apply_rule(word, 0)

        # Feminine reduction: only for words ending in 'a'.
        if word[-1] == "a":
            word = self.apply_rule(word, 1)

        # Augmentative reduction (step 3 runs before step 2, matching the
        # reference implementation).
        word = self.apply_rule(word, 3)

        # Adverb reduction.
        word = self.apply_rule(word, 2)

        # Noun reduction; only fall through to verb reduction, and then to
        # vowel removal, while the previous step changed nothing.
        prev_word = word
        word = self.apply_rule(word, 4)
        if word == prev_word:
            prev_word = word
            word = self.apply_rule(word, 5)
            if word == prev_word:
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        """Apply the first matching rule of step ``rule_index`` to ``word``."""
        for suffix, min_stem_size, replacement, exceptions in self._model[rule_index]:
            suffix_length = len(suffix)
            # The suffix must match the end of the word...
            if word[-suffix_length:] != suffix:
                continue
            # ...the remaining stem must be long enough...
            if len(word) < suffix_length + min_stem_size:
                continue
            # ...and the word must not be listed as an exception.
            if word in exceptions:
                continue
            word = word[:-suffix_length] + replacement
            break

        return word
|