msj19 committed on
Commit
9b40ad5
·
verified ·
1 Parent(s): 234704f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py +186 -0
  2. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py +154 -0
  3. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py +516 -0
  4. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py +218 -0
  5. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py +265 -0
  6. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py +237 -0
  7. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py +168 -0
  8. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py +158 -0
  9. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py +174 -0
  10. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py +95 -0
  11. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py +375 -0
  12. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py +227 -0
  13. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py +95 -0
  14. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py +520 -0
  15. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py +133 -0
  16. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py +331 -0
  17. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py +146 -0
  18. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py +296 -0
  19. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py +196 -0
  20. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py +136 -0
  21. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py +75 -0
  22. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py +56 -0
  23. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py +125 -0
  24. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py +354 -0
  25. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py +510 -0
  26. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py +76 -0
  27. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py +136 -0
  28. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py +75 -0
  29. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py +867 -0
  30. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py +629 -0
  31. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py +166 -0
  32. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py +2489 -0
  33. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py +397 -0
  34. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py +256 -0
  35. .eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py +393 -0
  36. .eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py +772 -0
  37. .eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py +684 -0
  38. .eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py +479 -0
  39. .eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py +470 -0
  40. .eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py +794 -0
  41. .eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py +234 -0
  42. .eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py +453 -0
  43. .eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py +1605 -0
  44. .eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py +553 -0
  45. .eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py +835 -0
  46. .eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py +395 -0
  47. .eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py +34 -0
  48. .eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py +27 -0
  49. .eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py +343 -0
  50. .eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py +137 -0
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/__init__.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Corpus Readers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ NLTK corpus readers. The modules in this package provide functions
11
+ that can be used to read corpus fileids in a variety of formats. These
12
+ functions can be used to read both the corpus fileids that are
13
+ distributed in the NLTK corpus package, and corpus fileids that are part
14
+ of external corpora.
15
+
16
+ Corpus Reader Functions
17
+ =======================
18
+ Each corpus module defines one or more "corpus reader functions",
19
+ which can be used to read documents from that corpus. These functions
20
+ take an argument, ``item``, which is used to indicate which document
21
+ should be read from the corpus:
22
+
23
+ - If ``item`` is one of the unique identifiers listed in the corpus
24
+ module's ``items`` variable, then the corresponding document will
25
+ be loaded from the NLTK corpus package.
26
+ - If ``item`` is a fileid, then that file will be read.
27
+
28
+ Additionally, corpus reader functions can be given lists of item
29
+ names; in which case, they will return a concatenation of the
30
+ corresponding documents.
31
+
32
+ Corpus reader functions are named based on the type of information
33
+ they return. Some common examples, and their return types, are:
34
+
35
+ - words(): list of str
36
+ - sents(): list of (list of str)
37
+ - paras(): list of (list of (list of str))
38
+ - tagged_words(): list of (str,str) tuple
39
+ - tagged_sents(): list of (list of (str,str))
40
+ - tagged_paras(): list of (list of (list of (str,str)))
41
+ - chunked_sents(): list of (Tree w/ (str,str) leaves)
42
+ - parsed_sents(): list of (Tree with str leaves)
43
+ - parsed_paras(): list of (list of (Tree with str leaves))
44
+ - xml(): A single xml ElementTree
45
+ - raw(): unprocessed corpus contents
46
+
47
+ For example, to read a list of the words in the Brown Corpus, use
48
+ ``nltk.corpus.brown.words()``:
49
+
50
+ >>> from nltk.corpus import brown
51
+ >>> print(", ".join(brown.words()[:6])) # only first 6 words
52
+ The, Fulton, County, Grand, Jury, said
53
+
54
+ isort:skip_file
55
+ """
56
+
57
+ from nltk.corpus.reader.plaintext import *
58
+ from nltk.corpus.reader.util import *
59
+ from nltk.corpus.reader.api import *
60
+ from nltk.corpus.reader.tagged import *
61
+ from nltk.corpus.reader.cmudict import *
62
+ from nltk.corpus.reader.conll import *
63
+ from nltk.corpus.reader.chunked import *
64
+ from nltk.corpus.reader.wordlist import *
65
+ from nltk.corpus.reader.xmldocs import *
66
+ from nltk.corpus.reader.ppattach import *
67
+ from nltk.corpus.reader.senseval import *
68
+ from nltk.corpus.reader.ieer import *
69
+ from nltk.corpus.reader.sinica_treebank import *
70
+ from nltk.corpus.reader.bracket_parse import *
71
+ from nltk.corpus.reader.indian import *
72
+ from nltk.corpus.reader.toolbox import *
73
+ from nltk.corpus.reader.timit import *
74
+ from nltk.corpus.reader.ycoe import *
75
+ from nltk.corpus.reader.rte import *
76
+ from nltk.corpus.reader.string_category import *
77
+ from nltk.corpus.reader.propbank import *
78
+ from nltk.corpus.reader.verbnet import *
79
+ from nltk.corpus.reader.bnc import *
80
+ from nltk.corpus.reader.nps_chat import *
81
+ from nltk.corpus.reader.wordnet import *
82
+ from nltk.corpus.reader.switchboard import *
83
+ from nltk.corpus.reader.dependency import *
84
+ from nltk.corpus.reader.nombank import *
85
+ from nltk.corpus.reader.ipipan import *
86
+ from nltk.corpus.reader.pl196x import *
87
+ from nltk.corpus.reader.knbc import *
88
+ from nltk.corpus.reader.chasen import *
89
+ from nltk.corpus.reader.childes import *
90
+ from nltk.corpus.reader.aligned import *
91
+ from nltk.corpus.reader.lin import *
92
+ from nltk.corpus.reader.semcor import *
93
+ from nltk.corpus.reader.framenet import *
94
+ from nltk.corpus.reader.udhr import *
95
+ from nltk.corpus.reader.bnc import *
96
+ from nltk.corpus.reader.sentiwordnet import *
97
+ from nltk.corpus.reader.twitter import *
98
+ from nltk.corpus.reader.nkjp import *
99
+ from nltk.corpus.reader.crubadan import *
100
+ from nltk.corpus.reader.mte import *
101
+ from nltk.corpus.reader.reviews import *
102
+ from nltk.corpus.reader.opinion_lexicon import *
103
+ from nltk.corpus.reader.pros_cons import *
104
+ from nltk.corpus.reader.categorized_sents import *
105
+ from nltk.corpus.reader.comparative_sents import *
106
+ from nltk.corpus.reader.panlex_lite import *
107
+ from nltk.corpus.reader.panlex_swadesh import *
108
+ from nltk.corpus.reader.bcp47 import *
109
+
110
+ # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
111
+ # the function bracket_parse() defined in nltk.tree:
112
+ from nltk.corpus.reader import bracket_parse
113
+
114
+ __all__ = [
115
+ "CorpusReader",
116
+ "CategorizedCorpusReader",
117
+ "PlaintextCorpusReader",
118
+ "find_corpus_fileids",
119
+ "TaggedCorpusReader",
120
+ "CMUDictCorpusReader",
121
+ "ConllChunkCorpusReader",
122
+ "WordListCorpusReader",
123
+ "PPAttachmentCorpusReader",
124
+ "SensevalCorpusReader",
125
+ "IEERCorpusReader",
126
+ "ChunkedCorpusReader",
127
+ "SinicaTreebankCorpusReader",
128
+ "BracketParseCorpusReader",
129
+ "IndianCorpusReader",
130
+ "ToolboxCorpusReader",
131
+ "TimitCorpusReader",
132
+ "YCOECorpusReader",
133
+ "MacMorphoCorpusReader",
134
+ "SyntaxCorpusReader",
135
+ "AlpinoCorpusReader",
136
+ "RTECorpusReader",
137
+ "StringCategoryCorpusReader",
138
+ "EuroparlCorpusReader",
139
+ "CategorizedBracketParseCorpusReader",
140
+ "CategorizedTaggedCorpusReader",
141
+ "CategorizedPlaintextCorpusReader",
142
+ "PortugueseCategorizedPlaintextCorpusReader",
143
+ "tagged_treebank_para_block_reader",
144
+ "PropbankCorpusReader",
145
+ "VerbnetCorpusReader",
146
+ "BNCCorpusReader",
147
+ "ConllCorpusReader",
148
+ "XMLCorpusReader",
149
+ "NPSChatCorpusReader",
150
+ "SwadeshCorpusReader",
151
+ "WordNetCorpusReader",
152
+ "WordNetICCorpusReader",
153
+ "SwitchboardCorpusReader",
154
+ "DependencyCorpusReader",
155
+ "NombankCorpusReader",
156
+ "IPIPANCorpusReader",
157
+ "Pl196xCorpusReader",
158
+ "TEICorpusView",
159
+ "KNBCorpusReader",
160
+ "ChasenCorpusReader",
161
+ "CHILDESCorpusReader",
162
+ "AlignedCorpusReader",
163
+ "TimitTaggedCorpusReader",
164
+ "LinThesaurusCorpusReader",
165
+ "SemcorCorpusReader",
166
+ "FramenetCorpusReader",
167
+ "UdhrCorpusReader",
168
+ "BNCCorpusReader",
169
+ "SentiWordNetCorpusReader",
170
+ "SentiSynset",
171
+ "TwitterCorpusReader",
172
+ "NKJPCorpusReader",
173
+ "CrubadanCorpusReader",
174
+ "MTECorpusReader",
175
+ "ReviewsCorpusReader",
176
+ "OpinionLexiconCorpusReader",
177
+ "ProsConsCorpusReader",
178
+ "CategorizedSentencesCorpusReader",
179
+ "ComparativeSentencesCorpusReader",
180
+ "PanLexLiteCorpusReader",
181
+ "NonbreakingPrefixesCorpusReader",
182
+ "UnicharsCorpusReader",
183
+ "MWAPPDBCorpusReader",
184
+ "PanlexSwadeshCorpusReader",
185
+ "BCP47CorpusReader",
186
+ ]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/aligned.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Aligned Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # URL: <https://www.nltk.org/>
5
+ # Author: Steven Bird <stevenbird1@gmail.com>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from nltk.corpus.reader.api import CorpusReader
9
+ from nltk.corpus.reader.util import (
10
+ StreamBackedCorpusView,
11
+ concat,
12
+ read_alignedsent_block,
13
+ )
14
+ from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
15
+ from nltk.translate import AlignedSent, Alignment
16
+
17
+
18
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def _concat_views(self, fileids, aligned, group_by_sent):
        """Build one lazy ``AlignedSentCorpusView`` per requested file and
        concatenate them.  ``aligned`` / ``group_by_sent`` select the shape
        of the items the views yield."""
        views = []
        for path, enc in self.abspaths(fileids, True):
            views.append(
                AlignedSentCorpusView(
                    path,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
            )
        return concat(views)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._concat_views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._concat_views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._concat_views(fileids, aligned=True, group_by_sent=True)
112
+
113
+
114
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Split the next block into aligned-sentence records, then split
        # each record into sentence lines and word-tokenize every line.
        tokenized = []
        for record in self._alignedsent_block_reader(stream):
            for line in self._sent_tokenizer.tokenize(record):
                tokenized.append(self._word_tokenizer.tokenize(line))

        if self._aligned:
            # The third line of a record holds the alignment string; undo
            # the word tokenization (which we shouldn't have applied to it)
            # and parse it as an Alignment.  Assumes one record per block.
            tokenized[2] = Alignment.fromstring(" ".join(tokenized[2]))
            return [AlignedSent(*tokenized)]
        if self._group_by_sent:
            # Yield the first sentence as a single list-of-words item.
            return [tokenized[0]]
        # Flat mode: yield the words of the first sentence directly.
        return tokenized[0]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/api.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: API for Corpus Readers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ API for corpus readers.
11
+ """
12
+
13
+ import os
14
+ import re
15
+ from collections import defaultdict
16
+ from itertools import chain
17
+
18
+ from nltk.corpus.reader.util import *
19
+ from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer
20
+
21
+
22
class CorpusReader:
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            # The trailing "|" alternative makes this regex match every
            # string, so ``m`` is never None; for a plain (non-zip) path
            # both groups are None and we fall through to the filesystem
            # pointer below.
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # Default locations of the corpus metadata files, relative to
        # the corpus root; subclasses may override these attributes.
        self._readme = "README"
        self._license = "LICENSE"
        self._citation = "citation.bib"

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    # First matching (regexp, enc) pair wins for each file.
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
        this corpus.  If ``encoding`` is None, then the file
        contents are processed using byte strings."""
        self._tagset = tagset

    def __repr__(self):
        # Show the zip file + entry for zip-backed corpora, otherwise
        # the plain filesystem path.
        if isinstance(self._root, ZipFilePathPointer):
            path = f"{self._root.zipfile.filename}/{self._root.entry}"
        else:
            path = "%s" % self._root.path
        return f"<{self.__class__.__name__} in {path!r}>"

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded).  This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        with self.open(self._readme) as f:
            return f.read()

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        with self.open(self._license) as f:
            return f.read()

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        with self.open(self._citation) as f:
            return f.read()

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.

        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.

        :param include_fileid: If true, then also include the fileid in
            each returned tuple.

        :rtype: list(PathPointer)
        """
        # Normalize `fileids` to a list of identifiers.
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]

        # Tuple shape depends on which of the two include_* flags are set.
        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            with self.open(f) as fp:
                contents.append(fp.read())
        return concat(contents)

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        # A dict encoding is a per-file mapping (missing file -> None);
        # anything else (str or None) applies uniformly to all files.
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )
255
+
256
+
257
+ ######################################################################
258
+ # { Corpora containing categorized items
259
+ ######################################################################
260
+
261
+
262
class CategorizedCorpusReader:
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

    - Call ``__init__()`` to set up the mapping.

    - Override all view methods to accept a ``categories`` parameter,
      which can be used *instead* of the ``fileids`` parameter, to
      select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

        - cat_pattern: A regular expression pattern used to find the
          category for each file identifier.  The pattern will be
          applied to each file identifier, and the first matching
          group will be used as the category label for that file.

        - cat_map: A dictionary, mapping from file identifiers to
          category labels.

        - cat_file: The name of a file that contains the mapping
          from file identifiers to categories.  The argument
          ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.
        """
        # The two mapping dicts are built lazily by _init(), on the
        # first call to categories() or fileids(categories=...).
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        # Consume exactly one of the three cat_* specifications.  Note
        # the elif chain: only the first one found is removed from
        # kwargs here, so the check below can detect duplicates.
        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
            )

        # Any cat_* key still left means the caller passed more than one.
        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
            )

    def _init(self):
        """Build the file<->category mappings from whichever of
        cat_pattern / cat_map / cat_file was supplied.

        NOTE(review): relies on ``self._fileids`` and ``self.open`` being
        provided by the sibling ``CorpusReader`` class in the MRO — this
        mixin is not usable stand-alone.
        """
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            # First capture group of the pattern is the category label.
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Mapping file format: one line per file, fields separated
            # by self._delimiter: <file_id> <category> [<category> ...]
            with self.open(self._file) as f:
                for line in f.readlines():
                    line = line.strip()
                    file_id, categories = line.split(self._delimiter, 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In category mapping file %s: %s "
                            "not found" % (self._file, file_id)
                        )
                    for category in categories.split(self._delimiter):
                        self._add(file_id, category)

    def _add(self, file_id, category):
        # Record the association in both directions.
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        # NOTE(review): an empty fileids list would make set.union(*())
        # raise TypeError — presumably callers never pass [].
        return sorted(set.union(*(self._f2c[d] for d in fileids)))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.
        """
        if categories is None:
            # Delegate to CorpusReader.fileids() via the MRO.
            return super().fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*(self._c2f[c] for c in categories)))

    def _resolve(self, fileids, categories):
        # Translate a categories selector into a fileids selector;
        # the two arguments are mutually exclusive.
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    # Category-aware wrappers around the CorpusReader view methods.
    def raw(self, fileids=None, categories=None):
        return super().raw(self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        return super().words(self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        return super().sents(self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        return super().paras(self._resolve(fileids, categories))
414
+
415
+ ######################################################################
416
+ # { Treebank readers
417
+ ######################################################################
418
+
419
+ # [xx] is it worth it to factor this out?
420
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of list of words.
    - ``_tag``, which takes a block and returns a list of list of tagged
      words.
    - ``_parse``, which takes a block and returns a list of parsed
      sentences.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s):
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    def _concat_views(self, fileids, block_reader):
        """Build one lazy StreamBackedCorpusView per requested file,
        all driven by ``block_reader``, and concatenate them."""
        views = []
        for path, enc in self.abspaths(fileids, True):
            views.append(StreamBackedCorpusView(path, block_reader, encoding=enc))
        return concat(views)

    def parsed_sents(self, fileids=None):
        return self._concat_views(fileids, self._read_parsed_sent_block)

    def tagged_sents(self, fileids=None, tagset=None):
        # Close over `tagset` so the view's block reader takes only a stream.
        def block_reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return self._concat_views(fileids, block_reader)

    def sents(self, fileids=None):
        return self._concat_views(fileids, self._read_sent_block)

    def tagged_words(self, fileids=None, tagset=None):
        # Close over `tagset` so the view's block reader takes only a stream.
        def block_reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return self._concat_views(fileids, block_reader)

    def words(self, fileids=None):
        return self._concat_views(fileids, self._read_word_block)

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single word list.
        return list(chain.from_iterable(self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        # Flatten one block of tagged sentences into a single list.
        return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # Drop falsy results (sentences _word() could not produce).
        return [sent for sent in (self._word(t) for t in self._read_block(stream)) if sent]

    def _read_tagged_sent_block(self, stream, tagset=None):
        # Drop falsy results (sentences _tag() could not produce).
        return [
            sent
            for sent in (self._tag(t, tagset) for t in self._read_block(stream))
            if sent
        ]

    def _read_parsed_sent_block(self, stream):
        # Drop falsy results (blocks _parse() could not parse).
        return [tree for tree in (self._parse(t) for t in self._read_block(stream)) if tree]

    # } End of Block Readers
    # ------------------------------------------------------------
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bcp47.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: BCP-47 language tags
2
+ #
3
+ # Copyright (C) 2022 NLTK Project
4
+ # Author: Eric Kafe <kafe.eric@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ import re
9
+ from warnings import warn
10
+ from xml.etree import ElementTree as et
11
+
12
+ from nltk.corpus.reader import CorpusReader
13
+
14
+
15
class BCP47CorpusReader(CorpusReader):
    """
    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:
    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    """

    def __init__(self, root, fileids):
        """Read the BCP-47 database and build the lookup tables."""
        super().__init__(root, fileids)
        # Maps a language description string to its primary subtag.
        self.langcode = {}
        with self.open("iana/language-subtag-registry.txt") as fp:
            # Registry records are separated by "%%" lines.
            self.db = self.data_dict(fp.read().split("%%\n"))
        with self.open("cldr/common-subdivisions-en.xml") as fp:
            self.subdiv = self.subdiv_dict(
                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
            )
        self.morphology()

    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)."""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            # Skip the TSV header line.
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])

    def wiki_dict(self, lines):
        """Convert Wikidata list of Q-codes to a BCP-47 dictionary."""
        return {
            pair[1]: pair[0].split("/")[-1]
            for pair in [line.strip().split("\t") for line in lines]
        }

    def subdiv_dict(self, subdivs):
        """Convert the CLDR subdivisions list to a dictionary."""
        return {sub.attrib["type"]: sub.text for sub in subdivs}

    def morphology(self):
        """Build per-subtag-type casing rules and format regexes."""
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }

    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary.

        The first record is the registry's File-Date header; every other
        record is a block of "Key: value" fields, the first two of which
        are always Type and Subtag/Tag.
        """
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]
            tag = fields[1][1]
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field: continuation of the previous key
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic

    def val2str(self, val):
        """Return only the first value when a field holds several."""
        if isinstance(val, list):
            # val = "/".join(val) # Concatenate all values
            val = val[0]
        return val

    def lang2str(self, lg_record):
        """Concatenate subtag values into a colon-separated name."""
        name = f"{lg_record['language']}"
        for label in ["extlang", "script", "region", "variant", "extension"]:
            if label in lg_record:
                name += f": {lg_record[label]}"
        return name

    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags."""
        subtags = tag.split("-")
        lang = {}
        # "variant" appears twice because a tag may carry two variants.
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            # BUGFIX: dropped a stray leading quote; now matches
                            # the phrasing used by name() below.
                            note += f", prefer {self.val2str(prefer)!r}"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                # len() guard avoids an IndexError on a malformed trailing "-u".
                if subtag == "u" and len(subtags) >= 2 and subtags[0] == "sd":
                    # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        # BUGFIX: previously formatted the undefined name 'ext'
                        # here, raising NameError; the unknown code is 'sd'.
                        ext = f"<Unknown subdivision: {sd}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                        warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang

    def name(self, tag):
        """
        Convert a BCP-47 tag to a colon-separated string of subtag names

        >>> from nltk.corpus import bcp47
        >>> bcp47.name('ca-Latn-ES-valencia')
        'Catalan: Latin: Spain: Valencian'

        """
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except Exception:
            # Bare except would also swallow KeyboardInterrupt/SystemExit.
            warn(f"Tag {tag!r} was not recognized")
            return None
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bnc.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Plaintext Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """Corpus reader for the XML version of the British National Corpus."""
9
+
10
+ from nltk.corpus.reader.util import concat
11
+ from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView
12
+
13
+
14
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        # lazy=True streams via BNCWordView; otherwise files are parsed eagerly.
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags. Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tagset = "c5" if c5 else "pos"
        return self._views(fileids, False, tagset, strip_space, stem)

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags. Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tagset = "c5" if c5 else "pos"
        return self._views(
            fileids, sent=True, tag=tagset, strip_space=strip_space, stem=stem
        )

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """Instantiate one view (or eager list) per file and concatenate."""
        make_view = BNCWordView if self._lazy else self._words
        views = [
            make_view(path, sent, tag, strip_space, stem)
            for path in self.abspaths(fileids)
        ]
        return concat(views)

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """

        def decode(xmlword):
            # None text -> "" (fixes issue 337)
            token = xmlword.text or ""
            if strip_space or stem:
                token = token.strip()
            if stem:
                token = xmlword.get("hw", token)
            if tag == "c5":
                return (token, xmlword.get("c5"))
            if tag == "pos":
                return (token, xmlword.get("pos", xmlword.get("c5")))
            return token

        result = []
        root = ElementTree.parse(fileid).getroot()
        for xmlsent in root.findall(".//s"):
            tokens = [decode(w) for w in _all_xmlwords_in(xmlsent)]
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib["n"], tokens))
            else:
                result.extend(tokens)

        assert None not in result
        return result
139
+
140
+
141
+ def _all_xmlwords_in(elt, result=None):
142
+ if result is None:
143
+ result = []
144
+ for child in elt:
145
+ if child.tag in ("c", "w"):
146
+ result.append(child)
147
+ else:
148
+ _all_xmlwords_in(child, result)
149
+ return result
150
+
151
+
152
class BNCSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        # Store the sentence id, then initialize the list contents.
        self.num = num
        super().__init__(items)
161
+
162
+
163
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    # These tags are ignored while reading sentences. For their description
    # refer to the technical documentation, for example,
    # http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    tags_to_ignore = {
        "pb",
        "gap",
        "vocal",
        "event",
        "unclear",
        "shift",
        "pause",
        "align",
    }

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        tagspec = ".*/s" if sent else ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read the TEI header once, up front, to populate the metadata.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context so iteration starts from the document root.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        # Pull document metadata out of the TEI header.
        titles = elt.findall("titleStmt/title")
        if titles:
            self.title = "\n".join(node.text.strip() for node in titles)

        authors = elt.findall("titleStmt/author")
        if authors:
            self.author = "\n".join(node.text.strip() for node in authors)

        editors = elt.findall("titleStmt/editor")
        if editors:
            self.editor = "\n".join(node.text.strip() for node in editors)

        resps = elt.findall("titleStmt/respStmt")
        if resps:
            self.resps = "\n\n".join(
                "\n".join(part.text.strip() for part in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        # Dispatch on the mode chosen at construction time.
        return self.handle_sent(elt) if self._sent else self.handle_word(elt)

    def handle_word(self, elt):
        token = elt.text or ""  # missing text -> "" (fixes issue 337)
        if self._strip_space or self._stem:
            token = token.strip()
        if self._stem:
            token = elt.get("hw", token)
        if self._tag == "c5":
            return (token, elt.get("c5"))
        if self._tag == "pos":
            return (token, elt.get("pos", elt.get("c5")))
        return token

    def handle_sent(self, elt):
        tokens = []
        for child in elt:
            if child.tag in ("mw", "hi", "corr", "trunc"):
                # Wrapper elements: their children are the actual word nodes.
                tokens.extend(self.handle_word(w) for w in child)
            elif child.tag in ("w", "c"):
                tokens.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError("Unexpected element %s" % child.tag)
        return BNCSentence(elt.attrib["n"], tokens)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/bracket_parse.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Penn Treebank Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+ """
9
+ Corpus reader for corpora that consist of parenthesis-delineated parse trees.
10
+ """
11
+
12
+ import sys
13
+
14
+ from nltk.corpus.reader.api import *
15
+ from nltk.corpus.reader.util import *
16
+ from nltk.tag import map_tag
17
+ from nltk.tree import Tree
18
+
19
# We use [^\s()]+ instead of \S+? so a token can never swallow a parenthesis.
# Matches "(order tag word)" leaves, e.g. "(1 NN dog)" (Alpino ordered form).
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# Matches "(tag word)" leaves, capturing both parts.
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# Matches "(tag word)" leaves, capturing only the word.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# Detects a tree whose top node is empty: leading "((".
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
24
+
25
+
26
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block of raw parse text, per the ``detect_blocks`` mode."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # An explicit error is clearer than `assert 0`, and is not
            # stripped when Python runs with -O.
            raise ValueError("bad block type %r" % self._detect_blocks)

    def _normalize(self, t):
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # BUGFIX: was `Tree(...)` with a single string argument,
                        # which is not the string parser; `Tree.fromstring()`
                        # (used on the main path above) is the correct call.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close " "paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            # Convert from the corpus tagset to the requested one.
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        return WORD.findall(self._normalize(t))
123
+
124
+
125
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}. The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    # Each accessor maps (fileids, categories) to a plain fileid list and
    # then delegates to the corresponding BracketParseCorpusReader method.

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_words(selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_sents(selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_paras(selected, tagset)

    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))

    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))

    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))
163
+
164
+
165
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.

        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:

        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word

        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # BUGFIX: the pattern read "<node. *?begin=..." — "." consumes one
            # character and " *?" only matches spaces, so nodes whose `begin`
            # attribute is not first could never match. Use ".*?" as in the
            # unordered branch below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        # Use the ordered form so tokens can be restored to sentence order.
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/categorized_sents.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Categorized Sentences Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ CorpusReader structured for corpora that contain one instance on each row.
10
+ This CorpusReader is specifically used for the Subjectivity Dataset and the
11
+ Sentence Polarity Dataset.
12
+
13
+ - Subjectivity Dataset information -
14
+
15
+ Authors: Bo Pang and Lillian Lee.
16
+ Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
17
+
18
+ Distributed with permission.
19
+
20
+ Related papers:
21
+
22
+ - Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
23
+ Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
24
+ 2004.
25
+
26
+ - Sentence Polarity Dataset information -
27
+
28
+ Authors: Bo Pang and Lillian Lee.
29
+ Url: https://www.cs.cornell.edu/people/pabo/movie-review-data
30
+
31
+ Related papers:
32
+
33
+ - Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
34
+ sentiment categorization with respect to rating scales". Proceedings of the
35
+ ACL, 2005.
36
+ """
37
+
38
+ from nltk.corpus.reader.api import *
39
+ from nltk.tokenize import *
40
+
41
+
42
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file
    identifiers (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    than all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """Read up to 20 lines and tokenize each into one or more sentences."""
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # EOF reached: `continue` here only re-polled readline() for
                # the rest of the loop; stop immediately instead.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Flatten a block of sentences into a single token list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chasen.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (C) 2001-2022 NLTK Project
3
+ # Author: Masato Hagiwara <hagisan@gmail.com>
4
+ # URL: <https://www.nltk.org/>
5
+ # For license information, see LICENSE.TXT
6
+
7
+ import sys
8
+
9
+ from nltk.corpus.reader import util
10
+ from nltk.corpus.reader.api import *
11
+ from nltk.corpus.reader.util import *
12
+
13
+
14
class ChasenCorpusReader(CorpusReader):
    """
    Reader for ChaSen-format corpora: tab-separated morphological analyses,
    one token per line, with sentences terminated by an ``EOS`` line.
    """

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: the root directory of the corpus.
        :param fileids: a list or regexp specifying the corpus files.
        :param encoding: the file encoding (defaults to "utf8").
        :param sent_splitter: optional predicate applied to each (word, tag)
            pair; a true result ends the current sentence early.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        # Shared implementation for all six public accessors: build one
        # ChasenCorpusView per selected file and concatenate the views.
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return the given file(s) as a flat list of words."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return the given file(s) as a flat list of (word, tag) pairs."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return the given file(s) as a list of sentences (lists of words)."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return the given file(s) as sentences of (word, tag) pairs."""
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """Return the given file(s) as paragraphs of sentences of words."""
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Return the given file(s) as paragraphs of tagged sentences."""
        return self._views(fileids, True, True, True)
+ )
66
+
67
+
68
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # Flags controlling the shape of the returned tokens:
        #   tagged        -- keep (word, tag) pairs instead of bare words
        #   group_by_sent -- wrap each sentence in its own list
        #   group_by_para -- wrap each paragraph in its own list
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Optional predicate over (word, tag) pairs that forces a sentence
        # break even without an explicit EOS line.
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        # A paragraph is a run of non-empty lines terminated by an "EOS" line.
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):

            para = []

            sent = []
            for line in para_str.splitlines():

                _eos = line.strip() == "EOS"
                # ChaSen lines are tab-separated: surface form first, then
                # the analysis fields, re-joined here into a single tag string.
                _cells = line.split("\t")
                w = (_cells[0], "\t".join(_cells[1:]))
                if not _eos:
                    sent.append(w)

                # Close the current sentence on EOS, or when the caller's
                # sent_splitter fires on this token.
                if _eos or (self._sent_splitter and self._sent_splitter(w)):
                    if not self._tagged:
                        sent = [w for (w, t) in sent]
                    if self._group_by_sent:
                        para.append(sent)
                    else:
                        para.extend(sent)
                    sent = []

            # Flush trailing tokens that were not followed by an EOS line.
            if len(sent) > 0:
                if not self._tagged:
                    sent = [w for (w, t) in sent]

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
129
+
130
+
131
def demo():
    """Print a small sample of the JEITA corpus: a word slice and tagged sentences."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    sample = jeita.tagged_sents()[2170:2173]
    rendered = [
        "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent)
        for sent in sample
    ]
    print("\nEOS\n".join(rendered))
+ )
145
+
146
+
147
def test():
    """Smoke test: tags loaded from the JEITA corpus decode to ``str``."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")

    first_tag = jeita.tagged_words()[0][1]
    assert isinstance(first_tag, str)
154
+
155
+
156
+ if __name__ == "__main__":
157
+ demo()
158
+ test()
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_lite.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: PanLex Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: David Kamholz <kamholz@panlex.org>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
10
+ as an SQLite database. See the README.txt in the panlex_lite corpus directory
11
+ for more information on PanLex Lite.
12
+ """
13
+
14
+ import os
15
+ import sqlite3
16
+
17
+ from nltk.corpus.reader.api import CorpusReader
18
+
19
+
20
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader for PanLex Lite, a stripped-down version of PanLex distributed
    as an SQLite database (``db.sqlite`` under the corpus root).
    """

    # Finds every other denotation (dnx2/ex2) sharing a meaning with the
    # given expression text (ex.tt) in the given language variety (ex.lv),
    # ordered best-quality first.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # Aggregates translation candidates into a single quality score per
    # target text, best first.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        """
        :param root: directory containing the ``db.sqlite`` PanLex database.
        """
        # NOTE(review): CorpusReader.__init__ is not called here (as in the
        # original); this reader only wraps the SQLite database, so the
        # usual fileid machinery is unused.
        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()

        # Bidirectional maps between uniform identifiers (uid strings) and
        # the database's integer language-variety keys (lv).
        self._uid_lv = {}
        self._lv_uid = {}

        for row in self._c.execute("SELECT uid, lv FROM lv"):
            self._uid_lv[row[0]] = row[1]
            self._lv_uid[row[1]] = row[0]

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """

        if lc is None:
            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
        else:
            return self._c.execute(
                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
            ).fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """

        expr_lv = self._uid_lv[expr_uid]

        # Accumulate one attribute dict per meaning id, grouping translation
        # texts by their language variety uid.
        mn_info = {}

        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
            mn = i[0]
            uid = self._lv_uid[i[5]]

            if mn not in mn_info:
                mn_info[mn] = {
                    "uq": i[1],
                    "ap": i[2],
                    "ui": i[3],
                    "ex": {expr_uid: [expr_tt]},
                }

            if uid not in mn_info[mn]["ex"]:
                mn_info[mn]["ex"][uid] = []

            mn_info[mn]["ex"][uid].append(i[4])

        return [Meaning(mn, mn_info[mn]) for mn in mn_info]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """

        from_lv = self._uid_lv[from_uid]
        to_lv = self._uid_lv[to_uid]

        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
127
+
128
+
129
class Meaning(dict):
    """
    Represents a single PanLex meaning. A meaning is a translation set
    derived from a single source, stored as a dict of its attributes.
    """

    def __init__(self, mn, attr):
        """
        :param mn: the meaning id.
        :param attr: dict of meaning attributes ('uq', 'ap', 'ui', 'ex').
        """
        dict.__init__(self, attr)
        self["mn"] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self["mn"]

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self["uq"]

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self["ap"]

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self["ui"]

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are
            language variety uniform identifiers and whose values are lists
            of expression texts.
        :rtype: dict
        """
        return self["ex"]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/panlex_swadesh.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Word List Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+
10
+ import re
11
+ from collections import defaultdict, namedtuple
12
+
13
+ from nltk.corpus.reader.api import *
14
+ from nltk.corpus.reader.util import *
15
+ from nltk.corpus.reader.wordlist import WordListCorpusReader
16
+ from nltk.tokenize import line_tokenize
17
+
18
+ PanlexLanguage = namedtuple(
19
+ "PanlexLanguage",
20
+ [
21
+ "panlex_uid", # (1) PanLex UID
22
+ "iso639", # (2) ISO 639 language code
23
+ "iso639_type", # (3) ISO 639 language type, see README
24
+ "script", # (4) normal scripts of expressions
25
+ "name", # (5) PanLex default name
26
+ "langvar_uid", # (6) UID of the language variety in which the default name is an expression
27
+ ],
28
+ )
29
+
30
+
31
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # Maps ISO 639 macrolanguage codes to PanLex uids.
        # (Renamed from the original misspelled `_macro_langauges`.)
        self._macro_languages = self.get_macrolanguages()

    def license(self):
        """Return the license string for this corpus."""
        return "CC0 1.0 Universal"

    def language_codes(self):
        """Return the PanLex uids of all languages in the corpus."""
        return self._languages.keys()

    def get_languages(self):
        """Yield one PanlexLanguage record per line of the langs metadata file."""
        for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
            if not line.strip():  # Skip empty lines.
                continue
            yield PanlexLanguage(*line.strip().split("\t"))

    def get_macrolanguages(self):
        """Group PanLex uids by their ISO 639 code.

        :return: dict mapping ISO 639 code -> list of PanLex uids.
        """
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex uid, e.g. 'eng-000'.
        :return: a list of list(str)
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 macrolanguage code, e.g. 'eng'.
        :return: a list of list(str)
        """
        fileids = [
            f"swadesh{self.swadesh_size}/{lang_code}.txt"
            for lang_code in self._macro_languages[iso63_code]
        ]
        return [
            concept.split("\t") for fileid in fileids for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pl196x.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit:
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from nltk.corpus.reader.api import *
9
+ from nltk.corpus.reader.xmldocs import XMLCorpusReader
10
+
11
+ PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
12
+ SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
13
+
14
+ TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
15
+ WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
16
+
17
+ TYPE = re.compile(r'type="(.*?)"')
18
+ ANA = re.compile(r'ana="(.*?)"')
19
+
20
+ TEXTID = re.compile(r'text id="(.*?)"')
21
+
22
+
23
class TEICorpusView(StreamBackedCorpusView):
    """
    Corpus view that parses TEI-style markup (<text>, <p>, <s>, <w>, <c>)
    directly with the module-level regular expressions.
    """

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):

        # tagged   -- return (word, tag) pairs rather than bare words
        # textids  -- optional whitelist of <text id="..."> chunks to keep
        self._tagged = tagged
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    # Approximate number of bytes requested per readlines() call.
    _pagesize = 4096

    def read_block(self, stream):
        """Read roughly one page, extended so it contains only whole <text> elements."""
        block = stream.readlines(self._pagesize)
        block = concat(block)  # join the list of lines into one string
        # Keep reading until every opened <text id=...> is closed and at
        # least one has been seen, so the regexps below see complete chunks.
        while (block.count("<text id") > block.count("</text>")) or block.count(
            "<text id"
        ) == 0:
            tmp = stream.readline()
            if len(tmp) <= 0:
                break
            block += tmp

        block = block.replace("\n", "")

        # Cut out any <text> chunk whose id is not in the requested whitelist.
        textids = TEXTID.findall(block)
        if self._textids:
            for tid in textids:
                if tid not in self._textids:
                    beg = block.find(tid) - 1
                    end = block[beg:].find("</text>") + len("</text>")
                    block = block[:beg] + block[beg + end :]

        # Walk paragraphs, then sentences, flattening according to the
        # group_by_* flags set at construction time.
        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if not self._tagged:
                    sent = WORD.findall(sent_str)
                else:
                    sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _parse_tag(self, tag_word_tuple):
        """Extract (word, tag): the ana= attribute for <w>, type= for <c>."""
        (tag, word) = tag_word_tuple
        if tag.startswith("w"):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return word, tag
91
+
92
+
93
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus of TEI-encoded Polish text. Files may be
    selected by fileid, by category, or by text chunk id (``textids``) --
    at most one of the three per call.
    """

    # Number of bytes of TEI header to skip at the start of each file.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        """
        :param textid_file: optional keyword argument naming a file that
            maps each fileid to the text chunk ids it contains.
        """
        if "textid_file" in kwargs:
            self._textids = kwargs["textid_file"]
        else:
            self._textids = None

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        # Build the fileid<->textid maps from the textid mapping file,
        # one "fileid textid[,textid...]" record per line.
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is not None:
            with open(self._textids) as fp:
                for line in fp:
                    line = line.strip()
                    file_id, text_ids = line.split(" ", 1)
                    if file_id not in self.fileids():
                        raise ValueError(
                            "In text_id mapping file %s: %s not found"
                            % (self._textids, file_id)
                        )
                    # NOTE(review): self._delimiter is presumably set by
                    # CategorizedCorpusReader.__init__ -- confirm upstream.
                    for text_id in text_ids.split(self._delimiter):
                        self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        # Record the bidirectional association between a file and a chunk.
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Normalize a (fileids, categories, textids) selection.

        At most one of the three selectors may be given. Returns a pair
        (fileids, textid_map) where textid_map is None unless textids were
        requested, in which case it maps each fileid to the set of wanted
        chunk ids. Returns (None, None) when nothing was specified, which
        callers interpret as "all files".
        """
        # BUGFIX: the original raised unless *exactly one* selector was None
        # (i.e. two were specified), which rejected every documented call
        # pattern, and fell off the end returning a bare None when nothing
        # was specified. Accept zero-or-one selectors instead.
        specified = [s for s in (fileids, categories, textids) if s is not None]
        if len(specified) > 1:
            raise ValueError(
                "Specify exactly one of: fileids, " "categories or textids"
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, str):
                textids = [textids]
            files = sum((self._t2f[t] for t in textids), [])
            tdict = dict()
            for f in files:
                tdict[f] = set(self._f2t[f]) & set(textids)
            return files, tdict

        return None, None

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(sum((self._f2t[d] for d in fileids), []))

    def _views(self, fileids, categories, textids, tagged, group_by_sent, group_by_para):
        """Shared implementation for the six words/sents/paras accessors."""
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        views = []
        for fileid in fileids:
            if textids:
                views.append(
                    TEICorpusView(
                        self.abspath(fileid),
                        tagged,
                        group_by_sent,
                        group_by_para,
                        head_len=self.head_len,
                        textids=textids[fileid],
                    )
                )
            else:
                views.append(
                    TEICorpusView(
                        self.abspath(fileid),
                        tagged,
                        group_by_sent,
                        group_by_para,
                        head_len=self.head_len,
                    )
                )
        return concat(views)

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selection as a flat list of words."""
        return self._views(fileids, categories, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selection as a list of sentences (lists of words)."""
        return self._views(fileids, categories, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selection as paragraphs of sentences of words."""
        return self._views(fileids, categories, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selection as a flat list of (word, tag) pairs."""
        return self._views(fileids, categories, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selection as sentences of (word, tag) pairs."""
        return self._views(fileids, categories, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selection as paragraphs of tagged sentences."""
        return self._views(fileids, categories, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """Return the XML element tree for a single selected file."""
        fileids, _ = self._resolve(fileids, categories)
        if len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError("Expected a single file")
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/plaintext.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Plaintext Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # Nitin Madnani <nmadnani@umiacs.umd.edu>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ A reader for corpora that consist of plaintext documents.
12
+ """
13
+
14
+ import nltk.data
15
+ from nltk.corpus.reader.api import *
16
+ from nltk.corpus.reader.util import *
17
+ from nltk.tokenize import *
18
+
19
+
20
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents. Paragraphs
    are assumed to be split using blank lines. Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
    ``PlaintextCorpusReader`` may specify alternative corpus view
    classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into words.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")

        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")

        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        """Tokenize up to 20 lines from the stream into a flat word list."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: stop early instead of repeatedly
                # tokenizing the empty string.
                break
            words.extend(self._word_tokenizer.tokenize(line))
        return words

    def _read_sent_block(self, stream):
        """Read one paragraph block and return its tokenized sentences."""
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        """Read one paragraph block and return it as tokenized paragraphs."""
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
141
+
142
+
143
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # NOTE(review): CategorizedCorpusReader is initialized first,
        # presumably so it consumes its own kwargs before the remaining
        # ones reach PlaintextCorpusReader -- confirm upstream.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
158
+
159
+
160
+ # FIXME: Is there a better way? How to not hardcode this?
161
+ # Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
162
+ # override the `sent_tokenizer`.
163
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    Categorized plaintext reader that forces the Portuguese Punkt
    sentence tokenizer instead of the English default.
    """

    def __init__(self, *args, **kwargs):
        CategorizedCorpusReader.__init__(self, kwargs)
        # Override whatever sent_tokenizer was passed with the lazily
        # loaded Portuguese model before delegating to the base reader.
        kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
            "tokenizers/punkt/portuguese.pickle"
        )
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
170
+
171
+
172
class EuroparlCorpusReader(PlaintextCorpusReader):

    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters instead
      of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        # Pre-tokenized corpus: whitespace-split each of up to 20 lines.
        tokens = []
        for _ in range(20):  # Read 20 lines at a time.
            tokens += stream.readline().split()
        return tokens

    def _read_sent_block(self, stream):
        # Each line of a blank-line-delimited block is one sentence.
        return [
            sent.split()
            for para in self._para_block_reader(stream)
            for sent in para.splitlines()
        ]

    def _read_para_block(self, stream):
        # Each blank-line-delimited block is one chapter of sentences.
        return [
            [sent.split() for sent in para.splitlines()]
            for para in self._para_block_reader(stream)
        ]

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        views = [
            self.CorpusView(fileid, self._read_para_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def paras(self, fileids=None):
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ppattach.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: PP Attachment Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Read lines from the Prepositional Phrase Attachment Corpus.
11
+
12
+ The PP Attachment Corpus contains several files having the format:
13
+
14
+ sentence_id verb noun1 preposition noun2 attachment
15
+
16
+ For example:
17
+
18
+ 42960 gives authority to administration V
19
+ 46742 gives inventors of microchip N
20
+
21
+ The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
22
+
23
+ (VP gives (NP authority) (PP to administration))
24
+ (VP gives (NP inventors (PP of microchip)))
25
+
26
+ The corpus contains the following files:
27
+
28
+ training: training set
29
+ devset: development test set, used for algorithm development.
30
+ test: test set, used to report results
31
+ bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
32
+
33
+ Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
34
+ Phrase Attachment. Proceedings of the ARPA Human Language Technology
35
+ Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
36
+
37
+ The PP Attachment Corpus is distributed with NLTK with the permission
38
+ of the author.
39
+ """
40
+
41
+ from nltk.corpus.reader.api import *
42
+ from nltk.corpus.reader.util import *
43
+
44
+
45
+ class PPAttachment:
46
+ def __init__(self, sent, verb, noun1, prep, noun2, attachment):
47
+ self.sent = sent
48
+ self.verb = verb
49
+ self.noun1 = noun1
50
+ self.prep = prep
51
+ self.noun2 = noun2
52
+ self.attachment = attachment
53
+
54
+ def __repr__(self):
55
+ return (
56
+ "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
57
+ "noun2=%r, attachment=%r)"
58
+ % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
59
+ )
60
+
61
+
62
+ class PPAttachmentCorpusReader(CorpusReader):
63
+ """
64
+ sentence_id verb noun1 preposition noun2 attachment
65
+ """
66
+
67
+ def attachments(self, fileids):
68
+ return concat(
69
+ [
70
+ StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
71
+ for (fileid, enc) in self.abspaths(fileids, True)
72
+ ]
73
+ )
74
+
75
+ def tuples(self, fileids):
76
+ return concat(
77
+ [
78
+ StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
79
+ for (fileid, enc) in self.abspaths(fileids, True)
80
+ ]
81
+ )
82
+
83
+ def _read_tuple_block(self, stream):
84
+ line = stream.readline()
85
+ if line:
86
+ return [tuple(line.split())]
87
+ else:
88
+ return []
89
+
90
+ def _read_obj_block(self, stream):
91
+ line = stream.readline()
92
+ if line:
93
+ return [PPAttachment(*line.split())]
94
+ else:
95
+ return []
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/propbank.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: PropBank Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ import re
9
+ from functools import total_ordering
10
+ from xml.etree import ElementTree
11
+
12
+ from nltk.corpus.reader.api import *
13
+ from nltk.corpus.reader.util import *
14
+ from nltk.internals import raise_unorderable_types
15
+ from nltk.tree import Tree
16
+
17
+
18
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        """
        # A regexp is expanded to the matching fileids; anything else is
        # treated as an iterable of fileids and copied.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        frame_list = list(framefiles)
        CorpusReader.__init__(self, root, [propfile, verbsfile] + frame_list, encoding)

        # Remember where everything lives.
        self._propfile = propfile
        self._framefiles = frame_list
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :return: a corpus view that acts as a list of
            ``PropbankInstance`` objects, one for each annotated
            predicate instance in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        """
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            frame_root = ElementTree.parse(fp).getroot()
        for roleset in frame_root.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :return: list of xml descriptions for rolesets.
        """
        if baseform is None:
            framefiles = self._framefiles
        else:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                frame_root = ElementTree.parse(fp).getroot()
            rsets.append(frame_root.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        # Parse up to 100 annotation lines per block, keeping only the
        # instances accepted by instance_filter.
        block = []
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)
        return block
+
160
+
161
+ ######################################################################
162
+ # { Propbank Instance & related datatypes
163
+ ######################################################################
164
+
165
+
166
class PropbankInstance:
    """One propbank annotation: a predicate plus its labeled arguments."""

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        # Name of the file containing the parse tree for this
        # instance's sentence.
        self.fileid = fileid
        # Sentence number within ``fileid`` (zero-indexed).
        self.sentnum = sentnum
        # Word number of the predicate within its sentence (zero-indexed;
        # includes traces and other empty parse elements).
        self.wordnum = wordnum
        # Identifier of the tagger, or 'gold' for adjudicated instances.
        self.tagger = tagger
        # Name of the roleset used by this instance's predicate; look it
        # up with ``PropbankCorpusReader.roleset()``.
        self.roleset = roleset
        # A ``PropbankInflection`` describing the predicate's inflection.
        self.inflection = inflection
        # A ``PropbankTreePointer`` locating the predicate in the sentence.
        self.predicate = predicate
        # Tuple of (argloc, argid) pairs, e.g. argid 'ARG0' or 'ARGM-TMP';
        # the predicate itself is *not* included.
        self.arguments = tuple(arguments)
        # Corpus reader for the corresponding parse trees (may be None).
        self.parse_corpus = parse_corpus

    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid, self.sentnum, self.wordnum
        )

    def __str__(self):
        parts = [
            "{} {} {} {} {} {}".format(
                self.fileid,
                self.sentnum,
                self.wordnum,
                self.tagger,
                self.roleset,
                self.inflection,
            )
        ]
        # The predicate is serialized alongside the arguments, in order.
        items = self.arguments + ((self.predicate, "rel"),)
        parts.extend(f" {loc}-{label}" for (loc, label) in sorted(items))
        return "".join(parts)

    def _get_tree(self):
        # Only resolvable when a parse corpus is attached and it actually
        # contains this instance's file.
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """Parse one annotation line into a ``PropbankInstance``."""
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        # Exactly one trailing field must mark the predicate ("...-rel").
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection and the predicate location.
        inflection = PropbankInflection.parse(inflection)
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((PropbankTreePointer.parse(argloc), argid))

        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
+ )
316
+
317
+
318
class PropbankPointer:
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``PropbankTreePointer`` is used to point to single constituents.
    - ``PropbankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``PropbankTreePointer`` pointers.
    - ``PropbankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Abstract base: refuse direct instantiation, allow subclasses.
        if self.__class__ is PropbankPointer:
            raise NotImplementedError()
+ raise NotImplementedError()
336
+
337
+
338
class PropbankChainTreePointer(PropbankPointer):
    """Pointer to an entire trace chain; serialized with '*' separators."""

    def __init__(self, pieces):
        # Pieces may be ``PropbankSplitTreePointer`` or
        # ``PropbankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        return "*".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """Return the selected constituents, wrapped in a '*CHAIN*' tree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*CHAIN*", [piece.select(tree) for piece in self.pieces])
+ return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
355
+
356
+
357
class PropbankSplitTreePointer(PropbankPointer):
    """Pointer to a split constituent; serialized with ',' separators."""

    def __init__(self, pieces):
        # All pieces are ``PropbankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        return ",".join("%s" % piece for piece in self.pieces)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """Return the selected constituents, wrapped in a '*SPLIT*' tree."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*SPLIT*", [piece.select(tree) for piece in self.pieces])
+ return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
373
+
374
+
375
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer of the form ``wordnum:height``.  In the corpus, chains of
    pointers are joined with ``'*'`` and split constituents with ``','``;
    ``parse()`` dispatches to the appropriate pointer class.
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """Parse a pointer string into a chain, split, or plain pointer."""
        # Chains (xx*yy*zz) are checked before splits (xx,yy,zz).
        chain = s.split("*")
        if len(chain) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(piece) for piece in chain]
            )

        split = s.split(",")
        if len(split) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(piece) for piece in split]
            )

        # Plain pointer: "wordnum:height".
        fields = s.split(":")
        if len(fields) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(fields[0]), int(fields[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Chain/split pointers compare via their first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return id(self) < id(other)

        # At equal wordnum, the higher (taller) pointer sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """Return the constituent of ``tree`` that this pointer names."""
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        stack = [tree]
        treepos = []

        # Depth-first walk counting leaves until wordnum is reached.
        wordnum = 0
        while True:
            if isinstance(stack[-1], Tree):
                # Select the next child of the current node.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Descend, or pop up a level when children are exhausted.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    stack.pop()
                    treepos.pop()
            else:
                # Leaf (word) node.
                if wordnum == self.wordnum:
                    # Climb ``height`` levels above the leaf's parent.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                wordnum += 1
                stack.pop()
+
475
+
476
class PropbankInflection:
    """Five-character inflection code: form, tense, aspect, person, voice."""

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Serialize back to the five-character corpus code.
        return "".join((self.form, self.tense, self.aspect, self.person, self.voice))

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character class per field, in order; '-' means unspecified.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """Parse a five-character inflection string, validating each field."""
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)
+ return PropbankInflection(*s)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/pros_cons.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Pros and Cons Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ CorpusReader for the Pros and Cons dataset.
10
+
11
+ - Pros and Cons dataset information -
12
+
13
+ Contact: Bing Liu, liub@cs.uic.edu
14
+ https://www.cs.uic.edu/~liub
15
+
16
+ Distributed with permission.
17
+
18
+ Related papers:
19
+
20
+ - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
21
+ Proceedings of the 22nd International Conference on Computational Linguistics
22
+ (Coling-2008), Manchester, 18-22 August, 2008.
23
+
24
+ - Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
25
+ Opinions on the Web". Proceedings of the 14th international World Wide Web
26
+ conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
27
+ """
28
+ import re
29
+
30
+ from nltk.corpus.reader.api import *
31
+ from nltk.tokenize import *
32
+
33
+
34
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

    >>> from nltk.corpus import pros_cons
    >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
    ...]
    >>> pros_cons.words('IntegratedPros.txt')
    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        # Parse up to 20 lines per block; only <Pros>/<Cons> lines carry data.
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # EOF: stop instead of spinning through the remaining
                # iterations re-reading an empty stream.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        # Flatten one block of sentences into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
+ return words
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/reviews.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Product Reviews Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
10
+
11
+ Customer Review Corpus information
12
+ ==================================
13
+
14
+ Annotated by: Minqing Hu and Bing Liu, 2004.
15
+ Department of Computer Science
16
+ University of Illinois at Chicago
17
+
18
+ Contact: Bing Liu, liub@cs.uic.edu
19
+ https://www.cs.uic.edu/~liub
20
+
21
+ Distributed with permission.
22
+
23
+ The "product_reviews_1" and "product_reviews_2" datasets respectively contain
24
+ annotated customer reviews of 5 and 9 products from amazon.com.
25
+
26
+ Related papers:
27
+
28
+ - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
29
+ Proceedings of the ACM SIGKDD International Conference on Knowledge
30
+ Discovery & Data Mining (KDD-04), 2004.
31
+
32
+ - Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
33
+ Proceedings of the Nineteenth National Conference on Artificial Intelligence
34
+ (AAAI-2004), 2004.
35
+
36
+ - Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
37
+ Opinion Mining." Proceedings of First ACM International Conference on Web
38
+ Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
39
+ Stanford, California, USA.
40
+
41
+ Symbols used in the annotated reviews:
42
+
43
+ :[t]: the title of the review: Each [t] tag starts a review.
44
+ :xxxx[+|-n]: xxxx is a product feature.
45
+ :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
46
+ Note that the strength is quite subjective.
47
+ You may want ignore it, but only considering + and -
48
+ :[-n]: Negative opinion
49
+ :##: start of each sentence. Each line is a sentence.
50
+ :[u]: feature not appeared in the sentence.
51
+ :[p]: feature not appeared in the sentence. Pronoun resolution is needed.
52
+ :[s]: suggestion or recommendation.
53
+ :[cc]: comparison with a competing product from a different brand.
54
+ :[cs]: comparison with a competing product from the same brand.
55
+
56
+ Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
57
+ provide separation between different reviews. This is due to the fact that
58
+ the dataset was specifically designed for aspect/feature-based sentiment
59
+ analysis, for which sentence-level annotation is sufficient. For document-
60
+ level classification and analysis, this peculiarity should be taken into
61
+ consideration.
62
+ """
63
+
64
+ import re
65
+
66
+ from nltk.corpus.reader.api import *
67
+ from nltk.tokenize import *
68
+
69
# Patterns for the Customer Review annotation markup (see module docstring).
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
75
+
76
+
77
class Review:
    """
    A Review is the main block of a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        """
        self.title = title
        # Build a fresh list per instance; never share a mutable default.
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        collected = []
        for line in self.review_lines:
            collected.extend(line.features)
        return collected

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        return [line.sent for line in self.review_lines]

    def __repr__(self):
        return 'Review(title="{}", review_lines={})'.format(
            self.title, self.review_lines
        )
+ )
128
+
129
+
130
class ReviewLine:
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        self.sent = sent
        # Fresh lists per instance; never share a mutable default.
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        return "ReviewLine(features={}, notes={}, sent={})".format(
            self.features, self.notes, self.sent
        )
+ )
152
+
153
+
154
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

    >>> from nltk.corpus import product_reviews_1
    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
    >>> review = camera_reviews[0]
    >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
    >>> review.features() # doctest: +NORMALIZE_WHITESPACE
    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
    ('option', '+1')]

    We can also reach the same information directly from the stream:

    >>> product_reviews_1.features('Canon_G3.txt')
    [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
    >>> mean = tot / n_reviews
    >>> print(n_reviews, tot, mean)
    15 24 1.6
    """

    # Stream-backed view class used by features(), reviews(), sents(), words().
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._readme = "README.txt"

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        # Block reader: scans up to 20 lines per call, extracting feature
        # annotations with the module-level FEATURES regexp.
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        # Skip forward until a review title line (module-level TITLE regexp)
        # is found, then collect lines into a Review until the next title.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        # One review per block; flatten its sentences.
        sents = []
        for review in self._read_review_block(stream):
            sents.extend([sent for sent in review.sents()])
        return sents

    def _read_word_block(self, stream):
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/rte.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: RTE Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Ewan Klein <ewan@inf.ed.ac.uk>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
10
+
11
+ The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
12
+ were regularized.
13
+
14
+ Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
15
+ gold standard annotated files.
16
+
17
+ Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
18
+ example is taken from RTE3::
19
+
20
+ <pair id="1" entailment="YES" task="IE" length="short" >
21
+
22
+ <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
23
+ Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
24
+ company Baikalfinansgroup which was later bought by the Russian
25
+ state-owned oil company Rosneft .</t>
26
+
27
+ <h>Baikalfinansgroup was sold to Rosneft.</h>
28
+ </pair>
29
+
30
+ In order to provide globally unique IDs for each pair, a new attribute
31
+ ``challenge`` has been added to the root element ``entailment-corpus`` of each
32
+ file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
33
+ challenge number and 'n' is the pair ID.
34
+ """
35
+ from nltk.corpus.reader.api import *
36
+ from nltk.corpus.reader.util import *
37
+ from nltk.corpus.reader.xmldocs import *
38
+
39
+
40
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raises KeyError: if the label is not one of TRUE/FALSE/YES/NO
        (case-insensitive).
    """
    return {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}[value_string.upper()]
52
+
53
+
54
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        """
        self.challenge = challenge
        # NOTE(review): the ``id``, ``text`` and ``hyp`` keyword arguments are
        # never used -- these three fields are always taken from the XML
        # element: the "id" attribute and the first two child elements.
        self.id = pair.attrib["id"]
        self.gid = f"{self.challenge}-{self.id}"
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 uses "value"; RTE2/RTE3 use "entailment"; the keyword argument
        # is only a fallback when neither attribute is present.
        if "value" in pair.attrib:
            self.value = norm(pair.attrib["value"])
        elif "entailment" in pair.attrib:
            self.value = norm(pair.attrib["entailment"])
        else:
            self.value = value
        if "task" in pair.attrib:
            self.task = pair.attrib["task"]
        else:
            self.task = task
        if "length" in pair.attrib:
            self.length = pair.attrib["length"]
        else:
            self.length = length

    def __repr__(self):
        if self.challenge:
            return f"<RTEPair: gid={self.challenge}-{self.id}>"
        else:
            return "<RTEPair: id=%s>" % self.id
109
+
110
+
111
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring
    above for the expected structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPair objects, one per ``<pair>``
        element found anywhere under the document root.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # "challenge" may be absent on older files; None in that case.
        challenge = doc.attrib.get("challenge")
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids
        :type: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, str):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/semcor.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: SemCor Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Nathan Schneider <nschneid@cs.cmu.edu>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Corpus reader for the SemCor Corpus.
10
+ """
11
+
12
+ __docformat__ = "epytext en"
13
+
14
+ from nltk.corpus.reader.api import *
15
+ from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
16
+ from nltk.tree import Tree
17
+
18
+
19
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_chunks()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for the corpus.
        :param fileids: A list or regexp specifying the fileids in the corpus.
        :param wordnet: The wordnet corpus object, used to resolve sense keys.
        :param lazy: If true, view methods return lazy stream-backed views
            instead of fully materialized lists.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The default used to be written as the expression
           ``("pos" or "sem" or "both")``, which always evaluates to
           ``"pos"``; it is now spelled ``"pos"`` directly (same value).
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include. Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet. (Named entities of type 'other'
            have no lemma. Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)

        .. note:: The default used to be written as the expression
           ``("pos" or "sem" or "both")``, which always evaluates to
           ``"pos"``; it is now spelled ``"pos"`` directly (same value).
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        # Dispatch either to the lazy stream-backed view or to the eager
        # helper, wrapping per-file results with concat().
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            make_view = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            make_view = SemcorWordView if self._lazy else self._words
        return concat(
            [
                make_view(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    # A multiword token expands into several words.
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one ``<wf>``/``<punc>`` element into a token, a list of words,
        or a (possibly tagged) chunk, depending on *unit*.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        # (a) the wordnet corpus is not downloaded;
                        # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                        # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list
223
+
224
+
225
+ def _all_xmlwords_in(elt, result=None):
226
+ if result is None:
227
+ result = []
228
+ for child in elt:
229
+ if child.tag in ("wf", "punc"):
230
+ result.append(child)
231
+ else:
232
+ _all_xmlwords_in(child, result)
233
+ return result
234
+
235
+
236
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``snum`` attribute from the XML).
    """

    def __init__(self, num, items):
        super().__init__(items)
        self.num = num
245
+
246
+
247
class SemcorWordView(XMLCorpusView):
    """
    A stream-backed corpus view over SemCor XML files, yielding tokens,
    words, chunks, or whole sentences depending on its configuration.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        # Match whole sentences, or individual word/punctuation elements.
        tagspec = ".*/s" if bracket_sent else ".*/s/(punc|wf)"
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        return self.handle_sent(elt) if self._sent else self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        tokens = []
        for child in elt:
            if child.tag not in ("wf", "punc"):
                raise ValueError("Unexpected element %s" % child.tag)
            item = self.handle_word(child)
            if self._unit == "word":
                # Multiword tokens expand to several words.
                tokens.extend(item)
            else:
                tokens.append(item)
        return SemcorSentence(elt.attrib["snum"], tokens)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/senseval.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Senseval 2 Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # Steven Bird <stevenbird1@gmail.com> (modifications)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Read from the Senseval 2 Corpus.
11
+
12
+ SENSEVAL [http://www.senseval.org/]
13
+ Evaluation exercises for Word Sense Disambiguation.
14
+ Organized by ACL-SIGLEX [https://www.siglex.org/]
15
+
16
+ Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
17
+ https://www.d.umn.edu/~tpederse/data.html
18
+ Distributed with permission.
19
+
20
+ The NLTK version of the Senseval 2 files uses well-formed XML.
21
+ Each instance of the ambiguous words "hard", "interest", "line", and "serve"
22
+ is tagged with a sense identifier, and supplied with context.
23
+ """
24
+
25
+ import re
26
+ from xml.etree import ElementTree
27
+
28
+ from nltk.corpus.reader.api import *
29
+ from nltk.corpus.reader.util import *
30
+ from nltk.tokenize import *
31
+
32
+
33
class SensevalInstance:
    """
    One labeled occurrence of an ambiguous word: the lexical item, the
    index of the head word in the context, the tagged context tokens,
    and the annotated sense identifiers.
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        self.senses = tuple(senses)

    def __repr__(self):
        return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
            self.word,
            self.position,
            self.context,
            self.senses,
        )
47
+
48
+
49
class SensevalCorpusReader(CorpusReader):
    """Reader for the Senseval 2 word-sense-disambiguation corpus files."""

    def instances(self, fileids=None):
        """Return a corpus view of SensevalInstance objects for the files."""
        views = [
            SensevalCorpusView(path, enc)
            for (path, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _entry(self, tree):
        """Extract (senseid, tagged-context) pairs from a parsed lexical sample."""
        return [
            (inst[0].attrib["senseid"], [(w.text, w.attrib["pos"]) for w in inst[1]])
            for lexelt in tree.findall("lexelt")
            for inst in lexelt.findall("instance")
        ]
66
+
67
+
68
class SensevalCorpusView(StreamBackedCorpusView):
    """
    A stream-backed view over a Senseval pseudo-XML file that yields one
    SensevalInstance per block. Lexical-element start positions are cached
    so random re-reads can recover which ``<lexelt>`` a stream offset
    belongs to.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        # Decide which lexical element we're in.
        # bisect over the cached start offsets maps stream.tell() back to the
        # lexelt that encloses the current position.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                # EOF must not occur mid-instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Re-reading a lexelt we have seen: names must agree.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                # Repair the corpus's pseudo-XML before parsing.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        # Walk the <instance> element: collect <answer> sense ids, the
        # tokenized context, and the index of the <head> word.
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some santiy checks:
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)
159
+
160
+
161
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.

    The substitutions below are order-sensitive: several later patterns
    depend on the output of earlier ones (e.g. entity stripping runs after
    lone ``&`` escaping), so do not reorder them.
    """
    # <~> or <^> => ~ or ^
    text = re.sub(r"<([~\^])>", r"\1", text)
    # fix lone &
    text = re.sub(r"(\s+)\&(\s+)", r"\1&amp;\2", text)
    # fix """
    text = re.sub(r'"""', "'\"'", text)
    # fix <s snum=dd> => <s snum="dd"/>
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # fix foreign word tag
    text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
    # remove <&I .>
    text = re.sub(r"<\&I[^>]*>", "", text)
    # fix <{word}>
    text = re.sub(r"<{([^}]+)}>", r"\1", text)
    # remove <@>, <p>, </p>
    text = re.sub(r"<(@|/?p)>", r"", text)
    # remove <&M .> and <&T .> and <&Ms .>
    text = re.sub(r"<&\w+ \.>", r"", text)
    # remove <!DOCTYPE... > lines
    text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
    # remove <[hi]> and <[/p]> etc
    text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
    # take the thing out of the brackets: <&hellip;>
    text = re.sub(r"<(\&\w+;)>", r"\1", text)
    # and remove the & for those patterns that aren't regular XML
    text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
    text = re.sub(
        r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
    )
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
    return text
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sentiwordnet.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: SentiWordNet
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Christopher Potts <cgpotts@stanford.edu>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ An NLTK interface for SentiWordNet
10
+
11
+ SentiWordNet is a lexical resource for opinion mining.
12
+ SentiWordNet assigns to each synset of WordNet three
13
+ sentiment scores: positivity, negativity, and objectivity.
14
+
15
+ For details about SentiWordNet see:
16
+ http://sentiwordnet.isti.cnr.it/
17
+
18
+ >>> from nltk.corpus import sentiwordnet as swn
19
+ >>> print(swn.senti_synset('breakdown.n.03'))
20
+ <breakdown.n.03: PosScore=0.0 NegScore=0.25>
21
+ >>> list(swn.senti_synsets('slow'))
22
+ [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
23
+ SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
24
+ SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
25
+ SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
26
+ SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
27
+ SentiSynset('behind.r.03')]
28
+ >>> happy = swn.senti_synsets('happy', 'a')
29
+ >>> happy0 = list(happy)[0]
30
+ >>> happy0.pos_score()
31
+ 0.875
32
+ >>> happy0.neg_score()
33
+ 0.0
34
+ >>> happy0.obj_score()
35
+ 0.125
36
+ """
37
+
38
+ import re
39
+
40
+ from nltk.corpus.reader import CorpusReader
41
+
42
+
43
class SentiWordNetCorpusReader(CorpusReader):
    """
    Reader for the SentiWordNet data file, mapping each WordNet
    ``(pos, offset)`` synset key to its (positivity, negativity) scores.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :param root: The root directory for the corpus.
        :param fileids: a list containing the single SentiWordNet data file.
        :param encoding: the encoding used to read the corpus file.
        :raises ValueError: if more or fewer than one fileid is given.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        # Maps (pos, offset) -> (pos_score, neg_score); populated eagerly.
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """Parse the tab-separated source file into the ``_db`` score map."""
        fp = self.open(self._fileids[0])
        try:
            lines = fp.read().splitlines()
        finally:
            # The original left this stream open; close it explicitly.
            fp.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError as e:
                # Narrowed from BaseException: only a wrong field count is a
                # formatting error; KeyboardInterrupt etc. must propagate.
                raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Return the SentiSynset for either a ``(pos, offset)`` pair or a
        synset name string such as ``'breakdown.n.03'``; None if the synset
        has no SentiWordNet entry.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            # Satellite adjectives are stored under 'a' in WordNet lookups.
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            if pos == "s":
                pos = "a"
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        Return an iterator of SentiSynsets for every WordNet synset of
        *string* (optionally restricted to part of speech *pos*) that has
        a SentiWordNet entry.
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        # Drop the None entries for synsets without SentiWordNet scores.
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a SentiSynset for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
108
+
109
+
110
class SentiSynset:
    """
    Wraps a WordNet synset together with its SentiWordNet positivity,
    negativity, and derived objectivity scores.
    """

    def __init__(self, pos_score, neg_score, synset):
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is whatever score mass pos + neg leave over.
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
        self.synset = synset

    def pos_score(self):
        return self._pos_score

    def neg_score(self):
        return self._neg_score

    def obj_score(self):
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        return "<{}: PosScore={} NegScore={}>".format(
            self.synset.name(), self._pos_score, self._neg_score
        )

    def __repr__(self):
        return "Senti" + repr(self.synset)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/sinica_treebank.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Sinica Treebank Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Sinica Treebank Corpus Sample
10
+
11
+ http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
12
+
13
+ 10,000 parsed sentences, drawn from the Academia Sinica Balanced
14
+ Corpus of Modern Chinese. Parse tree notation is based on
15
+ Information-based Case Grammar. Tagset documentation is available
16
+ at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
17
+
18
+ Language and Knowledge Processing Group, Institute of Information
19
+ Science, Academia Sinica
20
+
21
+ The data is distributed with the Natural Language Toolkit under the terms of
22
+ the Creative Commons Attribution-NonCommercial-ShareAlike License
23
+ [https://creativecommons.org/licenses/by-nc-sa/2.5/].
24
+
25
+ References:
26
+
27
+ Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
28
+ The Construction of Sinica Treebank. Computational Linguistics and
29
+ Chinese Language Processing, 4, pp 87-104.
30
+
31
+ Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
32
+ Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
33
+ Annotation Guidelines, and On-line Interface. Proceedings of 2nd
34
+ Chinese Language Processing Workshop, Association for Computational
35
+ Linguistics.
36
+
37
+ Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
38
+ Extraction, Proceedings of IJCNLP-04, pp560-565.
39
+ """
40
+
41
+ from nltk.corpus.reader.api import *
42
+ from nltk.corpus.reader.util import *
43
+ from nltk.tag import map_tag
44
+ from nltk.tree import sinica_parse
45
+
46
# Precompiled patterns for carving up one treebank line:
IDENTIFIER = re.compile(r"^#\S+\s")  # leading "#<sent-id> " prefix
APPENDIX = re.compile(r"(?<=\))#.*$")  # trailing "#..." after the final ")"
TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")  # captures (tag, word) pairs
WORD = re.compile(r":[^:()|]+:([^:()|]+)")  # captures the word only
50
+
51
+
52
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        # One parsed sentence per line: strip the leading identifier
        # and the trailing "#..." appendix before returning it.
        line = stream.readline()
        line = APPENDIX.sub("", IDENTIFIER.sub("", line))
        return [line]

    def _parse(self, sent):
        # Delegate bracket parsing to the dedicated Sinica parser.
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        # TAGWORD captures (tag, word); flip each pair to (word, tag).
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            pairs = [(word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs]
        return pairs

    def _word(self, sent):
        return WORD.findall(sent)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/string_category.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: String Category Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Read tuples from a corpus consisting of categorized strings.
11
+ For example, from the question classification corpus:
12
+
13
+ NUM:dist How far is it from Denver to Aspen ?
14
+ LOC:city What county is Modesto , California in ?
15
+ HUM:desc Who was Galileo ?
16
+ DESC:def What is an atom ?
17
+ NUM:date When did Hawaii become a state ?
18
+ """
19
+
20
+ from nltk.corpus.reader.api import *
21
+
22
+ # based on PPAttachmentCorpusReader
23
+ from nltk.corpus.reader.util import *
24
+
25
+
26
+ # [xx] Should the order of the tuple be reversed -- in most other places
27
+ # in nltk, we use the form (data, tag) -- e.g., tagged words and
28
+ # labeled texts for classifiers.
29
class StringCategoryCorpusReader(CorpusReader):
    """Reader for corpora of categorized strings, one
    "<category><delimiter><string>" record per line."""

    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """Return the corpus contents as (category, string) tuples."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        views = [
            StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def _read_tuple_block(self, stream):
        # Split on the first delimiter only, so the string may itself
        # contain the delimiter character.
        line = stream.readline().strip()
        return [tuple(line.split(self._delimiter, 1))] if line else []
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/switchboard.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Switchboard Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+ import re
8
+
9
+ from nltk.corpus.reader.api import *
10
+ from nltk.corpus.reader.util import *
11
+ from nltk.tag import map_tag, str2tuple
12
+
13
+
14
class SwitchboardTurn(list):
    """
    A specialized list that stores one switchboard utterance.  The
    elements are the utterance's words (or ``(word, tag)`` pairs);
    the ``speaker`` and ``id`` attributes identify the speaker and
    the utterance.  Utterance ids are only unique within a single
    discourse.
    """

    def __init__(self, words, speaker, id):
        list.__init__(self, words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        if not self:
            text = ""
        elif isinstance(self[0], tuple):
            # Tagged turn: render each pair as "word/tag".
            text = " ".join("%s/%s" % pair for pair in self)
        else:
            text = " ".join(self)
        return f"<{self.speaker}.{self.id}: {text!r}>"
36
+
37
+
38
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the switchboard corpus.  Every accessor reads from the
    single "tagged" file: it is already tokenized, so it serves the
    untagged methods as well.
    """

    _FILES = ["tagged"]
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """:return: a corpus view of all words, without tags."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """:return: a corpus view of all (word, tag) pairs, optionally
        mapped to ``tagset``."""

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)

    def turns(self):
        """:return: a corpus view of untagged ``SwitchboardTurn`` objects."""
        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """:return: a corpus view of tagged ``SwitchboardTurn`` objects."""

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)

    def discourses(self):
        """:return: a corpus view of untagged discourses (lists of turns)."""
        return StreamBackedCorpusView(
            self.abspath("tagged"), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """:return: a corpus view of tagged discourses (lists of turns)."""
        # Default changed from ``False`` to ``None`` for consistency with
        # the other tagged_* methods; both are falsy, so behavior is
        # unchanged for callers that omit the argument.

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath("tagged"), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse. (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split("\n")
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the single discourse's turns into one word list.
        return sum(self._discourses_block_reader(stream)[0], [])

    def _tagged_words_block_reader(self, stream, tagset=None):
        return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])

    # One utterance line looks like "speaker.id: word/tag word/tag ...".
    _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)")
    _SEP = "/"

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """Parse one "speaker.id: text" line into a ``SwitchboardTurn``.

        :raises ValueError: if the line does not match the expected format.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError("Bad utterance %r" % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/tagged.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Tagged Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # Jacob Perkins <japerk@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ A reader for corpora whose documents contain part-of-speech-tagged words.
12
+ """
13
+
14
+ import os
15
+
16
+ from nltk.corpus.reader.api import *
17
+ from nltk.corpus.reader.timit import read_timit_block
18
+ from nltk.corpus.reader.util import *
19
+ from nltk.tag import map_tag, str2tuple
20
+ from nltk.tokenize import *
21
+
22
+
23
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

        word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def _tag_mapper(self, tagset):
        """Return a tag-conversion function for ``tagset``, or None when
        no conversion is needed."""
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _views(self, fileids, tagged, group_by_sent, group_by_para, tag_mapper=None):
        """Build one ``TaggedCorpusView`` per file and concatenate them.

        The public accessors below differ only in the flag combination
        they pass here, so the view construction lives in this single
        helper instead of being repeated six times.
        """
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapper,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, tagged=False, group_by_sent=False, group_by_para=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, tagged=False, group_by_sent=True, group_by_para=False)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, tagged=False, group_by_sent=True, group_by_para=True)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(fileids, True, False, False, self._tag_mapper(tagset))

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._views(fileids, True, True, False, self._tag_mapper(tagset))

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(fileids, True, True, True, self._tag_mapper(tagset))
227
+
228
+
229
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``TaggedCorpusReader``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_words(selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_sents(selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return super().tagged_paras(selected, tagset)
253
+
254
+
255
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        result = []
        for para_str in self._para_block_reader(stream):
            sentences = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tokens = self._word_tokenizer.tokenize(sent_str)
                sent = [str2tuple(token, self._sep) for token in tokens]
                # Optionally convert tags to the requested tagset.
                if self._tag_mapping_function:
                    sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
                # Drop the tags entirely for the untagged accessors.
                if not self._tagged:
                    sent = [w for (w, t) in sent]
                if self._group_by_sent:
                    sentences.append(sent)
                else:
                    sentences.extend(sent)
            if self._group_by_para:
                result.append(sentences)
            else:
                result.extend(sentences)
        return result
310
+
311
+
312
+ # needs to implement simplified tags
313
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        # One token per line -> LineTokenizer; one sentence per line
        # of the paragraph block -> ".*\n" regexp tokenizer.
        super().__init__(
            root,
            fileids,
            sep="_",
            word_tokenizer=LineTokenizer(),
            sent_tokenizer=RegexpTokenizer(".*\n"),
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # A "paragraph" runs up to the next end-of-sentence tag "_.".
        return read_regexp_block(stream, r".*", r".*_\.")
338
+
339
+
340
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    TIMIT has no paragraph structure, so the paragraph accessors are disabled.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(para_block_reader=read_timit_block, *args, **kwargs)

    def paras(self):
        raise NotImplementedError("use sents() instead")

    def tagged_paras(self):
        raise NotImplementedError("use tagged_sents() instead")
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/timit.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: TIMIT Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2007 NLTK Project
4
+ # Author: Haejoong Lee <haejoong@ldc.upenn.edu>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # Jacob Perkins <japerk@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ # [xx] this docstring is out-of-date:
11
+ """
12
+ Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
13
+
14
+ This corpus contains selected portion of the TIMIT corpus.
15
+
16
+ - 16 speakers from 8 dialect regions
17
+ - 1 male and 1 female from each dialect region
18
+ - total 130 sentences (10 sentences per speaker. Note that some
19
+ sentences are shared among other speakers, especially sa1 and sa2
20
+ are spoken by all speakers.)
21
+ - total 160 recording of sentences (10 recordings per speaker)
22
+ - audio format: NIST Sphere, single channel, 16kHz sampling,
23
+ 16 bit sample, PCM encoding
24
+
25
+
26
+ Module contents
27
+ ===============
28
+
29
+ The timit corpus reader provides 4 functions and 4 data items.
30
+
31
+ - utterances
32
+
33
+ List of utterances in the corpus. There are total 160 utterances,
34
+ each of which corresponds to a unique utterance of a speaker.
35
+ Here's an example of an utterance identifier in the list::
36
+
37
+ dr1-fvmh0/sx206
38
+ - _---- _---
39
+ | | | | |
40
+ | | | | |
41
+ | | | | `--- sentence number
42
+ | | | `----- sentence type (a:all, i:shared, x:exclusive)
43
+ | | `--------- speaker ID
44
+ | `------------ sex (m:male, f:female)
45
+ `-------------- dialect region (1..8)
46
+
47
+ - speakers
48
+
49
+ List of speaker IDs. An example of speaker ID::
50
+
51
+ dr1-fvmh0
52
+
53
+ Note that if you split an item ID with colon and take the first element of
54
+ the result, you will get a speaker ID.
55
+
56
+ >>> itemid = 'dr1-fvmh0/sx206'
57
+ >>> spkrid , sentid = itemid.split('/')
58
+ >>> spkrid
59
+ 'dr1-fvmh0'
60
+
61
+ The second element of the result is a sentence ID.
62
+
63
+ - dictionary()
64
+
65
+ Phonetic dictionary of words contained in this corpus. This is a Python
66
+ dictionary from words to phoneme lists.
67
+
68
+ - spkrinfo()
69
+
70
+ Speaker information table. It's a Python dictionary from speaker IDs to
71
+ records of 10 fields. Speaker IDs the same as the ones in timit.speakers.
72
+ Each record is a dictionary from field names to values, and the fields are
73
+ as follows::
74
+
75
+ id speaker ID as defined in the original TIMIT speaker info table
76
+ sex speaker gender (M:male, F:female)
77
+ dr speaker dialect region (1:new england, 2:northern,
78
+ 3:north midland, 4:south midland, 5:southern, 6:new york city,
79
+ 7:western, 8:army brat (moved around))
80
+ use corpus type (TRN:training, TST:test)
81
+ in this sample corpus only TRN is available
82
+ recdate recording date
83
+ birthdate speaker birth date
84
+ ht speaker height
85
+ race speaker race (WHT:white, BLK:black, AMR:american indian,
86
+ SPN:spanish-american, ORN:oriental,???:unknown)
87
+ edu speaker education level (HS:high school, AS:associate degree,
88
+ BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
89
+ PHD:doctorate degree (PhD,JD,MD), ??:unknown)
90
+ comments comments by the recorder
91
+
92
+ The 4 functions are as follows.
93
+
94
+ - tokenized(sentences=items, offset=False)
95
+
96
+ Given a list of items, returns an iterator of a list of word lists,
97
+ each of which corresponds to an item (sentence). If offset is set to True,
98
+ each element of the word list is a tuple of word(string), start offset and
99
+ end offset, where offset is represented as a number of 16kHz samples.
100
+
101
+ - phonetic(sentences=items, offset=False)
102
+
103
+ Given a list of items, returns an iterator of a list of phoneme lists,
104
+ each of which corresponds to an item (sentence). If offset is set to True,
105
+ each element of the phoneme list is a tuple of word(string), start offset
106
+ and end offset, where offset is represented as a number of 16kHz samples.
107
+
108
+ - audiodata(item, start=0, end=None)
109
+
110
+ Given an item, returns a chunk of audio samples formatted into a string.
111
+ When the function is called, if start and end are omitted, the entire
112
+ samples of the recording will be returned. If only end is omitted,
113
+ samples from the start offset to the end of the recording will be returned.
114
+
115
+ - play(data)
116
+
117
+ Play the given audio samples. The audio samples can be obtained from the
118
+ timit.audiodata function.
119
+
120
+ """
121
+ import sys
122
+ import time
123
+
124
+ from nltk.corpus.reader.api import *
125
+ from nltk.internals import import_from_stdlib
126
+ from nltk.tree import Tree
127
+
128
+
129
+ class TimitCorpusReader(CorpusReader):
130
+ """
131
+ Reader for the TIMIT corpus (or any other corpus with the same
132
+ file layout and use of file formats). The corpus root directory
133
+ should contain the following files:
134
+
135
+ - timitdic.txt: dictionary of standard transcriptions
136
+ - spkrinfo.txt: table of speaker information
137
+
138
+ In addition, the root directory should contain one subdirectory
139
+ for each speaker, containing three files for each utterance:
140
+
141
+ - <utterance-id>.txt: text content of utterances
142
+ - <utterance-id>.wrd: tokenized text content of utterances
143
+ - <utterance-id>.phn: phonetic transcription of utterances
144
+ - <utterance-id>.wav: utterance sound file
145
+ """
146
+
147
+ _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
148
+ """A regexp matching fileids that are used by this corpus reader."""
149
+ _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
150
+
151
def __init__(self, root, encoding="utf8"):
    """
    Construct a new TIMIT corpus reader in the given directory.
    :param root: The root directory for this corpus.
    """
    # Wave files are raw binary; make sure they bypass unicode decoding.
    if isinstance(encoding, str):
        encoding = [(r".*\.wav", None), (".*", encoding)]

    CorpusReader.__init__(
        self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
    )

    # Utterance identifiers: the ".txt" fileids with the extension cut off.
    self._utterances = [
        fileid[:-4] for fileid in find_corpus_fileids(root, self._UTTERANCE_RE)
    ]

    self._speakerinfo = None  # lazily populated by spkrinfo()
    self._root = root
    self.speakers = sorted({u.split("/")[0] for u in self._utterances})
173
+
174
def fileids(self, filetype=None):
    """
    Return a list of file identifiers for the files that make up
    this corpus.

    :param filetype: If specified, then ``filetype`` indicates that
        only the files that have the given type should be
        returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
        ``wav``, or ``metadata``,
    :raises ValueError: for any other ``filetype`` value.
    """
    if filetype is None:
        return CorpusReader.fileids(self)
    if filetype in ("txt", "wrd", "phn", "wav"):
        # One file of each type per utterance.
        return [f"{u}.{filetype}" for u in self._utterances]
    if filetype == "metadata":
        return ["timitdic.txt", "spkrinfo.txt"]
    raise ValueError("Bad value for filetype: %r" % filetype)
192
+
193
def utteranceids(
    self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
):
    """
    :return: A list of the utterance identifiers for all
        utterances in this corpus, or for the given speaker, dialect
        region, gender, sentence type, or sentence number, if
        specified.
    """
    # Normalize each filter so a bare string works like a 1-element list.
    if isinstance(dialect, str):
        dialect = [dialect]
    if isinstance(sex, str):
        sex = [sex]
    if isinstance(spkrid, str):
        spkrid = [spkrid]
    if isinstance(sent_type, str):
        sent_type = [sent_type]
    if isinstance(sentid, str):
        sentid = [sentid]

    # Utterance ids look like "dr1-fvmh0/sx206"; the filters below
    # select on fixed character positions within that pattern.
    utterances = self._utterances[:]
    if dialect is not None:
        utterances = [u for u in utterances if u[2] in dialect]
    if sex is not None:
        utterances = [u for u in utterances if u[4] in sex]
    if spkrid is not None:
        utterances = [u for u in utterances if u[:9] in spkrid]
    if sent_type is not None:
        utterances = [u for u in utterances if u[11] in sent_type]
    if sentid is not None:
        # Bug fix: this previously tested membership in ``spkrid``,
        # which made sentence-id filtering wrong and raised a
        # TypeError whenever ``spkrid`` was not also supplied.
        utterances = [u for u in utterances if u[10:] in sentid]
    return utterances
225
+
226
def transcription_dict(self):
    """
    :return: A dictionary giving the 'standard' transcription for
        each word.
    """
    transcriptions = {}
    with self.open("timitdic.txt") as fp:
        for line in fp:
            # Skip blank lines and ";"-prefixed comments.
            if not line.strip() or line[0] == ";":
                continue
            # Each entry is "<word>  /<phoneme phoneme .../>".
            m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
            if not m:
                raise ValueError("Bad line: %r" % line)
            transcriptions[m.group(1)] = m.group(2).split()
    return transcriptions
241
+
242
def spkrid(self, utterance):
    """Return the speaker portion (before "/") of an utterance id."""
    fields = utterance.split("/")
    return fields[0]
244
+
245
def sentid(self, utterance):
    """Return the sentence portion (after "/") of an utterance id."""
    fields = utterance.split("/")
    return fields[1]
247
+
248
def utterance(self, spkrid, sentid):
    """Combine a speaker id and sentence id into an utterance id."""
    return f"{spkrid}/{sentid}"
250
+
251
def spkrutteranceids(self, speaker):
    """
    :return: A list of all utterances associated with a given
        speaker.
    """
    prefix = speaker + "/"
    return [u for u in self._utterances if u.startswith(prefix)]
261
+
262
def spkrinfo(self, speaker):
    """
    :return: the ``SpeakerInfo`` record for the given speaker id
        (an utterance id is also accepted and reduced to its speaker).
    """
    # Accept an utterance id and reduce it to its speaker id.
    if speaker in self._utterances:
        speaker = self.spkrid(speaker)

    # Parse spkrinfo.txt once and cache the resulting table.
    if self._speakerinfo is None:
        self._speakerinfo = {}
        with self.open("spkrinfo.txt") as fp:
            for line in fp:
                # Skip blank lines and ";"-prefixed comments.
                if not line.strip() or line[0] == ";":
                    continue
                # First 9 fields are fixed; everything after is comments.
                fields = line.strip().split(None, 9)
                key = f"dr{fields[2]}-{fields[1].lower()}{fields[0].lower()}"
                self._speakerinfo[key] = SpeakerInfo(*fields)

    return self._speakerinfo[speaker]
280
+
281
def phones(self, utterances=None):
    """Return the phone labels (last field of every .phn line) for
    the given utterances (all utterances by default)."""
    labels = []
    for fileid in self._utterance_fileids(utterances, ".phn"):
        with self.open(fileid) as fp:
            labels.extend(line.split()[-1] for line in fp if line.strip())
    return labels
289
+
290
+ def phone_times(self, utterances=None):
291
+ """
292
+ offset is represented as a number of 16kHz samples!
293
+ """
294
+ results = []
295
+ for fileid in self._utterance_fileids(utterances, ".phn"):
296
+ with self.open(fileid) as fp:
297
+ for line in fp:
298
+ if line.strip():
299
+ results.append(
300
+ (
301
+ line.split()[2],
302
+ int(line.split()[0]),
303
+ int(line.split()[1]),
304
+ )
305
+ )
306
+ return results
307
+
308
+ def words(self, utterances=None):
309
+ results = []
310
+ for fileid in self._utterance_fileids(utterances, ".wrd"):
311
+ with self.open(fileid) as fp:
312
+ for line in fp:
313
+ if line.strip():
314
+ results.append(line.split()[-1])
315
+ return results
316
+
317
+ def word_times(self, utterances=None):
318
+ results = []
319
+ for fileid in self._utterance_fileids(utterances, ".wrd"):
320
+ with self.open(fileid) as fp:
321
+ for line in fp:
322
+ if line.strip():
323
+ results.append(
324
+ (
325
+ line.split()[2],
326
+ int(line.split()[0]),
327
+ int(line.split()[1]),
328
+ )
329
+ )
330
+ return results
331
+
332
+ def sents(self, utterances=None):
333
+ results = []
334
+ for fileid in self._utterance_fileids(utterances, ".wrd"):
335
+ with self.open(fileid) as fp:
336
+ results.append([line.split()[-1] for line in fp if line.strip()])
337
+ return results
338
+
339
+ def sent_times(self, utterances=None):
340
+ # TODO: Check this
341
+ return [
342
+ (
343
+ line.split(None, 2)[-1].strip(),
344
+ int(line.split()[0]),
345
+ int(line.split()[1]),
346
+ )
347
+ for fileid in self._utterance_fileids(utterances, ".txt")
348
+ for line in self.open(fileid)
349
+ if line.strip()
350
+ ]
351
+
352
+ def phone_trees(self, utterances=None):
353
+ if utterances is None:
354
+ utterances = self._utterances
355
+ if isinstance(utterances, str):
356
+ utterances = [utterances]
357
+
358
+ trees = []
359
+ for utterance in utterances:
360
+ word_times = self.word_times(utterance)
361
+ phone_times = self.phone_times(utterance)
362
+ sent_times = self.sent_times(utterance)
363
+
364
+ while sent_times:
365
+ (sent, sent_start, sent_end) = sent_times.pop(0)
366
+ trees.append(Tree("S", []))
367
+ while (
368
+ word_times and phone_times and phone_times[0][2] <= word_times[0][1]
369
+ ):
370
+ trees[-1].append(phone_times.pop(0)[0])
371
+ while word_times and word_times[0][2] <= sent_end:
372
+ (word, word_start, word_end) = word_times.pop(0)
373
+ trees[-1].append(Tree(word, []))
374
+ while phone_times and phone_times[0][2] <= word_end:
375
+ trees[-1][-1].append(phone_times.pop(0)[0])
376
+ while phone_times and phone_times[0][2] <= sent_end:
377
+ trees[-1].append(phone_times.pop(0)[0])
378
+ return trees
379
+
380
+ # [xx] NOTE: This is currently broken -- we're assuming that the
381
+ # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
382
+ # fileids.
383
+ def wav(self, utterance, start=0, end=None):
384
+ # nltk.chunk conflicts with the stdlib module 'chunk'
385
+ wave = import_from_stdlib("wave")
386
+
387
+ w = wave.open(self.open(utterance + ".wav"), "rb")
388
+
389
+ if end is None:
390
+ end = w.getnframes()
391
+
392
+ # Skip past frames before start, then read the frames we want
393
+ w.readframes(start)
394
+ frames = w.readframes(end - start)
395
+
396
+ # Open a new temporary file -- the wave module requires
397
+ # an actual file, and won't work w/ stringio. :(
398
+ tf = tempfile.TemporaryFile()
399
+ out = wave.open(tf, "w")
400
+
401
+ # Write the parameters & data to the new file.
402
+ out.setparams(w.getparams())
403
+ out.writeframes(frames)
404
+ out.close()
405
+
406
+ # Read the data back from the file, and return it. The
407
+ # file will automatically be deleted when we return.
408
+ tf.seek(0)
409
+ return tf.read()
410
+
411
+ def audiodata(self, utterance, start=0, end=None):
412
+ assert end is None or end > start
413
+ headersize = 44
414
+ with self.open(utterance + ".wav") as fp:
415
+ if end is None:
416
+ data = fp.read()
417
+ else:
418
+ data = fp.read(headersize + end * 2)
419
+ return data[headersize + start * 2 :]
420
+
421
+ def _utterance_fileids(self, utterances, extension):
422
+ if utterances is None:
423
+ utterances = self._utterances
424
+ if isinstance(utterances, str):
425
+ utterances = [utterances]
426
+ return [f"{u}{extension}" for u in utterances]
427
+
428
+ def play(self, utterance, start=0, end=None):
429
+ """
430
+ Play the given audio sample.
431
+
432
+ :param utterance: The utterance id of the sample to play
433
+ """
434
+ # Method 1: os audio dev.
435
+ try:
436
+ import ossaudiodev
437
+
438
+ try:
439
+ dsp = ossaudiodev.open("w")
440
+ dsp.setfmt(ossaudiodev.AFMT_S16_LE)
441
+ dsp.channels(1)
442
+ dsp.speed(16000)
443
+ dsp.write(self.audiodata(utterance, start, end))
444
+ dsp.close()
445
+ except OSError as e:
446
+ print(
447
+ (
448
+ "can't acquire the audio device; please "
449
+ "activate your audio device."
450
+ ),
451
+ file=sys.stderr,
452
+ )
453
+ print("system error message:", str(e), file=sys.stderr)
454
+ return
455
+ except ImportError:
456
+ pass
457
+
458
+ # Method 2: pygame
459
+ try:
460
+ # FIXME: this won't work under python 3
461
+ import pygame.mixer
462
+ import StringIO
463
+
464
+ pygame.mixer.init(16000)
465
+ f = StringIO.StringIO(self.wav(utterance, start, end))
466
+ pygame.mixer.Sound(f).play()
467
+ while pygame.mixer.get_busy():
468
+ time.sleep(0.01)
469
+ return
470
+ except ImportError:
471
+ pass
472
+
473
+ # Method 3: complain. :)
474
+ print(
475
+ ("you must install pygame or ossaudiodev " "for audio playback."),
476
+ file=sys.stderr,
477
+ )
478
+
479
+
480
+ class SpeakerInfo:
481
+ def __init__(
482
+ self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
483
+ ):
484
+ self.id = id
485
+ self.sex = sex
486
+ self.dr = dr
487
+ self.use = use
488
+ self.recdate = recdate
489
+ self.birthdate = birthdate
490
+ self.ht = ht
491
+ self.race = race
492
+ self.edu = edu
493
+ self.comments = comments
494
+
495
+ def __repr__(self):
496
+ attribs = "id sex dr use recdate birthdate ht race edu comments"
497
+ args = [f"{attr}={getattr(self, attr)!r}" for attr in attribs.split()]
498
+ return "SpeakerInfo(%s)" % (", ".join(args))
499
+
500
+
501
+ def read_timit_block(stream):
502
+ """
503
+ Block reader for timit tagged sentences, which are preceded by a sentence
504
+ number that will be ignored.
505
+ """
506
+ line = stream.readline()
507
+ if not line:
508
+ return []
509
+ n, sent = line.split(" ", 1)
510
+ return [sent]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/toolbox.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Toolbox Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Greg Aumann <greg_aumann@sil.org>
5
+ # Stuart Robinson <Stuart.Robinson@mpi.nl>
6
+ # Steven Bird <stevenbird1@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ Module for reading, writing and manipulating
12
+ Toolbox databases and settings fileids.
13
+ """
14
+
15
+ from nltk.corpus.reader.api import *
16
+ from nltk.corpus.reader.util import *
17
+ from nltk.toolbox import ToolboxData
18
+
19
+
20
+ class ToolboxCorpusReader(CorpusReader):
21
+ def xml(self, fileids, key=None):
22
+ return concat(
23
+ [
24
+ ToolboxData(path, enc).parse(key=key)
25
+ for (path, enc) in self.abspaths(fileids, True)
26
+ ]
27
+ )
28
+
29
+ def fields(
30
+ self,
31
+ fileids,
32
+ strip=True,
33
+ unwrap=True,
34
+ encoding="utf8",
35
+ errors="strict",
36
+ unicode_fields=None,
37
+ ):
38
+ return concat(
39
+ [
40
+ list(
41
+ ToolboxData(fileid, enc).fields(
42
+ strip, unwrap, encoding, errors, unicode_fields
43
+ )
44
+ )
45
+ for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
46
+ ]
47
+ )
48
+
49
+ # should probably be done lazily:
50
+ def entries(self, fileids, **kwargs):
51
+ if "key" in kwargs:
52
+ key = kwargs["key"]
53
+ del kwargs["key"]
54
+ else:
55
+ key = "lx" # the default key in MDF
56
+ entries = []
57
+ for marker, contents in self.fields(fileids, **kwargs):
58
+ if marker == key:
59
+ entries.append((contents, []))
60
+ else:
61
+ try:
62
+ entries[-1][-1].append((marker, contents))
63
+ except IndexError:
64
+ pass
65
+ return entries
66
+
67
+ def words(self, fileids, key="lx"):
68
+ return [contents for marker, contents in self.fields(fileids) if marker == key]
69
+
70
+
71
+ def demo():
72
+ pass
73
+
74
+
75
+ if __name__ == "__main__":
76
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/twitter.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Twitter Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Ewan Klein <ewan@inf.ed.ac.uk>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A reader for corpora that consist of Tweets. It is assumed that the Tweets
10
+ have been serialised into line-delimited JSON.
11
+ """
12
+
13
+ import json
14
+ import os
15
+
16
+ from nltk.corpus.reader.api import CorpusReader
17
+ from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat
18
+ from nltk.tokenize import TweetTokenizer
19
+
20
+
21
+ class TwitterCorpusReader(CorpusReader):
22
+ r"""
23
+ Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.
24
+
25
+ Individual Tweets can be tokenized using the default tokenizer, or by a
26
+ custom tokenizer specified as a parameter to the constructor.
27
+
28
+ Construct a new Tweet corpus reader for a set of documents
29
+ located at the given root directory.
30
+
31
+ If you made your own tweet collection in a directory called
32
+ `twitter-files`, then you can initialise the reader as::
33
+
34
+ from nltk.corpus import TwitterCorpusReader
35
+ reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')
36
+
37
+ However, the recommended approach is to set the relevant directory as the
38
+ value of the environmental variable `TWITTER`, and then invoke the reader
39
+ as follows::
40
+
41
+ root = os.environ['TWITTER']
42
+ reader = TwitterCorpusReader(root, '.*\.json')
43
+
44
+ If you want to work directly with the raw Tweets, the `json` library can
45
+ be used::
46
+
47
+ import json
48
+ for tweet in reader.docs():
49
+ print(json.dumps(tweet, indent=1, sort_keys=True))
50
+
51
+ """
52
+
53
+ CorpusView = StreamBackedCorpusView
54
+ """
55
+ The corpus view class used by this reader.
56
+ """
57
+
58
+ def __init__(
59
+ self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
60
+ ):
61
+ """
62
+ :param root: The root directory for this corpus.
63
+ :param fileids: A list or regexp specifying the fileids in this corpus.
64
+ :param word_tokenizer: Tokenizer for breaking the text of Tweets into
65
+ smaller units, including but not limited to words.
66
+ """
67
+ CorpusReader.__init__(self, root, fileids, encoding)
68
+
69
+ for path in self.abspaths(self._fileids):
70
+ if isinstance(path, ZipFilePathPointer):
71
+ pass
72
+ elif os.path.getsize(path) == 0:
73
+ raise ValueError(f"File {path} is empty")
74
+ """Check that all user-created corpus files are non-empty."""
75
+
76
+ self._word_tokenizer = word_tokenizer
77
+
78
+ def docs(self, fileids=None):
79
+ """
80
+ Returns the full Tweet objects, as specified by `Twitter
81
+ documentation on Tweets
82
+ <https://dev.twitter.com/docs/platform-objects/tweets>`_
83
+
84
+ :return: the given file(s) as a list of dictionaries deserialised
85
+ from JSON.
86
+ :rtype: list(dict)
87
+ """
88
+ return concat(
89
+ [
90
+ self.CorpusView(path, self._read_tweets, encoding=enc)
91
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)
92
+ ]
93
+ )
94
+
95
+ def strings(self, fileids=None):
96
+ """
97
+ Returns only the text content of Tweets in the file(s)
98
+
99
+ :return: the given file(s) as a list of Tweets.
100
+ :rtype: list(str)
101
+ """
102
+ fulltweets = self.docs(fileids)
103
+ tweets = []
104
+ for jsono in fulltweets:
105
+ try:
106
+ text = jsono["text"]
107
+ if isinstance(text, bytes):
108
+ text = text.decode(self.encoding)
109
+ tweets.append(text)
110
+ except KeyError:
111
+ pass
112
+ return tweets
113
+
114
+ def tokenized(self, fileids=None):
115
+ """
116
+ :return: the given file(s) as a list of the text content of Tweets as
117
+ as a list of words, screenanames, hashtags, URLs and punctuation symbols.
118
+
119
+ :rtype: list(list(str))
120
+ """
121
+ tweets = self.strings(fileids)
122
+ tokenizer = self._word_tokenizer
123
+ return [tokenizer.tokenize(t) for t in tweets]
124
+
125
+ def _read_tweets(self, stream):
126
+ """
127
+ Assumes that each line in ``stream`` is a JSON-serialised object.
128
+ """
129
+ tweets = []
130
+ for i in range(10):
131
+ line = stream.readline()
132
+ if not line:
133
+ return tweets
134
+ tweet = json.loads(line)
135
+ tweets.append(tweet)
136
+ return tweets
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/udhr.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UDHR corpus reader. It mostly deals with encodings.
3
+ """
4
+
5
+ from nltk.corpus.reader.plaintext import PlaintextCorpusReader
6
+ from nltk.corpus.reader.util import find_corpus_fileids
7
+
8
+
9
+ class UdhrCorpusReader(PlaintextCorpusReader):
10
+
11
+ ENCODINGS = [
12
+ (".*-Latin1$", "latin-1"),
13
+ (".*-Hebrew$", "hebrew"),
14
+ (".*-Arabic$", "cp1256"),
15
+ ("Czech_Cesky-UTF8", "cp1250"), # yeah
16
+ ("Polish-Latin2", "cp1250"),
17
+ ("Polish_Polski-Latin2", "cp1250"),
18
+ (".*-Cyrillic$", "cyrillic"),
19
+ (".*-SJIS$", "SJIS"),
20
+ (".*-GB2312$", "GB2312"),
21
+ (".*-Latin2$", "ISO-8859-2"),
22
+ (".*-Greek$", "greek"),
23
+ (".*-UTF8$", "utf-8"),
24
+ ("Hungarian_Magyar-Unicode", "utf-16-le"),
25
+ ("Amahuaca", "latin1"),
26
+ ("Turkish_Turkce-Turkish", "latin5"),
27
+ ("Lithuanian_Lietuviskai-Baltic", "latin4"),
28
+ ("Japanese_Nihongo-EUC", "EUC-JP"),
29
+ ("Japanese_Nihongo-JIS", "iso2022_jp"),
30
+ ("Chinese_Mandarin-HZ", "hz"),
31
+ (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
32
+ ]
33
+
34
+ SKIP = {
35
+ # The following files are not fully decodable because they
36
+ # were truncated at wrong bytes:
37
+ "Burmese_Myanmar-UTF8",
38
+ "Japanese_Nihongo-JIS",
39
+ "Chinese_Mandarin-HZ",
40
+ "Chinese_Mandarin-UTF8",
41
+ "Gujarati-UTF8",
42
+ "Hungarian_Magyar-Unicode",
43
+ "Lao-UTF8",
44
+ "Magahi-UTF8",
45
+ "Marathi-UTF8",
46
+ "Tamil-UTF8",
47
+ # Unfortunately, encodings required for reading
48
+ # the following files are not supported by Python:
49
+ "Vietnamese-VPS",
50
+ "Vietnamese-VIQR",
51
+ "Vietnamese-TCVN",
52
+ "Magahi-Agra",
53
+ "Bhojpuri-Agra",
54
+ "Esperanto-T61", # latin3 raises an exception
55
+ # The following files are encoded for specific fonts:
56
+ "Burmese_Myanmar-WinResearcher",
57
+ "Armenian-DallakHelv",
58
+ "Tigrinya_Tigrigna-VG2Main",
59
+ "Amharic-Afenegus6..60375", # ?
60
+ "Navaho_Dine-Navajo-Navaho-font",
61
+ # What are these?
62
+ "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
63
+ "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
64
+ # The following files are unintended:
65
+ "Czech-Latin2-err",
66
+ "Russian_Russky-UTF8~",
67
+ }
68
+
69
+ def __init__(self, root="udhr"):
70
+ fileids = find_corpus_fileids(root, r"(?!README|\.).*")
71
+ super().__init__(
72
+ root,
73
+ [fileid for fileid in fileids if fileid not in self.SKIP],
74
+ encoding=self.ENCODINGS,
75
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/util.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Corpus Reader Utilities
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import bisect
10
+ import os
11
+ import pickle
12
+ import re
13
+ import tempfile
14
+ from functools import reduce
15
+ from xml.etree import ElementTree
16
+
17
+ from nltk.data import (
18
+ FileSystemPathPointer,
19
+ PathPointer,
20
+ SeekableUnicodeStreamReader,
21
+ ZipFilePathPointer,
22
+ )
23
+ from nltk.internals import slice_bounds
24
+ from nltk.tokenize import wordpunct_tokenize
25
+ from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence
26
+
27
+ ######################################################################
28
+ # { Corpus View
29
+ ######################################################################
30
+
31
+
32
+ class StreamBackedCorpusView(AbstractLazySequence):
33
+ """
34
+ A 'view' of a corpus file, which acts like a sequence of tokens:
35
+ it can be accessed by index, iterated over, etc. However, the
36
+ tokens are only constructed as-needed -- the entire corpus is
37
+ never stored in memory at once.
38
+
39
+ The constructor to ``StreamBackedCorpusView`` takes two arguments:
40
+ a corpus fileid (specified as a string or as a ``PathPointer``);
41
+ and a block reader. A "block reader" is a function that reads
42
+ zero or more tokens from a stream, and returns them as a list. A
43
+ very simple example of a block reader is:
44
+
45
+ >>> def simple_block_reader(stream):
46
+ ... return stream.readline().split()
47
+
48
+ This simple block reader reads a single line at a time, and
49
+ returns a single token (consisting of a string) for each
50
+ whitespace-separated substring on the line.
51
+
52
+ When deciding how to define the block reader for a given
53
+ corpus, careful consideration should be given to the size of
54
+ blocks handled by the block reader. Smaller block sizes will
55
+ increase the memory requirements of the corpus view's internal
56
+ data structures (by 2 integers per block). On the other hand,
57
+ larger block sizes may decrease performance for random access to
58
+ the corpus. (But note that larger block sizes will *not*
59
+ decrease performance for iteration.)
60
+
61
+ Internally, ``CorpusView`` maintains a partial mapping from token
62
+ index to file position, with one entry per block. When a token
63
+ with a given index *i* is requested, the ``CorpusView`` constructs
64
+ it as follows:
65
+
66
+ 1. First, it searches the toknum/filepos mapping for the token
67
+ index closest to (but less than or equal to) *i*.
68
+
69
+ 2. Then, starting at the file position corresponding to that
70
+ index, it reads one block at a time using the block reader
71
+ until it reaches the requested token.
72
+
73
+ The toknum/filepos mapping is created lazily: it is initially
74
+ empty, but every time a new block is read, the block's
75
+ initial token is added to the mapping. (Thus, the toknum/filepos
76
+ map has one entry per block.)
77
+
78
+ In order to increase efficiency for random access patterns that
79
+ have high degrees of locality, the corpus view may cache one or
80
+ more blocks.
81
+
82
+ :note: Each ``CorpusView`` object internally maintains an open file
83
+ object for its underlying corpus file. This file should be
84
+ automatically closed when the ``CorpusView`` is garbage collected,
85
+ but if you wish to close it manually, use the ``close()``
86
+ method. If you access a ``CorpusView``'s items after it has been
87
+ closed, the file object will be automatically re-opened.
88
+
89
+ :warning: If the contents of the file are modified during the
90
+ lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
91
+ is undefined.
92
+
93
+ :warning: If a unicode encoding is specified when constructing a
94
+ ``CorpusView``, then the block reader may only call
95
+ ``stream.seek()`` with offsets that have been returned by
96
+ ``stream.tell()``; in particular, calling ``stream.seek()`` with
97
+ relative offsets, or with offsets based on string lengths, may
98
+ lead to incorrect behavior.
99
+
100
+ :ivar _block_reader: The function used to read
101
+ a single block from the underlying file stream.
102
+ :ivar _toknum: A list containing the token index of each block
103
+ that has been processed. In particular, ``_toknum[i]`` is the
104
+ token index of the first token in block ``i``. Together
105
+ with ``_filepos``, this forms a partial mapping between token
106
+ indices and file positions.
107
+ :ivar _filepos: A list containing the file position of each block
108
+ that has been processed. In particular, ``_toknum[i]`` is the
109
+ file position of the first character in block ``i``. Together
110
+ with ``_toknum``, this forms a partial mapping between token
111
+ indices and file positions.
112
+ :ivar _stream: The stream used to access the underlying corpus file.
113
+ :ivar _len: The total number of tokens in the corpus, if known;
114
+ or None, if the number of tokens is not yet known.
115
+ :ivar _eofpos: The character position of the last character in the
116
+ file. This is calculated when the corpus view is initialized,
117
+ and is used to decide when the end of file has been reached.
118
+ :ivar _cache: A cache of the most recently read block. It
119
+ is encoded as a tuple (start_toknum, end_toknum, tokens), where
120
+ start_toknum is the token index of the first token in the block;
121
+ end_toknum is the token index of the first token not in the
122
+ block; and tokens is a list of the tokens in the block.
123
+ """
124
+
125
+ def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
126
+ """
127
+ Create a new corpus view, based on the file ``fileid``, and
128
+ read with ``block_reader``. See the class documentation
129
+ for more information.
130
+
131
+ :param fileid: The path to the file that is read by this
132
+ corpus view. ``fileid`` can either be a string or a
133
+ ``PathPointer``.
134
+
135
+ :param startpos: The file position at which the view will
136
+ start reading. This can be used to skip over preface
137
+ sections.
138
+
139
+ :param encoding: The unicode encoding that should be used to
140
+ read the file's contents. If no encoding is specified,
141
+ then the file's contents will be read as a non-unicode
142
+ string (i.e., a str).
143
+ """
144
+ if block_reader:
145
+ self.read_block = block_reader
146
+ # Initialize our toknum/filepos mapping.
147
+ self._toknum = [0]
148
+ self._filepos = [startpos]
149
+ self._encoding = encoding
150
+ # We don't know our length (number of tokens) yet.
151
+ self._len = None
152
+
153
+ self._fileid = fileid
154
+ self._stream = None
155
+
156
+ self._current_toknum = None
157
+ """This variable is set to the index of the next token that
158
+ will be read, immediately before ``self.read_block()`` is
159
+ called. This is provided for the benefit of the block
160
+ reader, which under rare circumstances may need to know
161
+ the current token number."""
162
+
163
+ self._current_blocknum = None
164
+ """This variable is set to the index of the next block that
165
+ will be read, immediately before ``self.read_block()`` is
166
+ called. This is provided for the benefit of the block
167
+ reader, which under rare circumstances may need to know
168
+ the current block number."""
169
+
170
+ # Find the length of the file.
171
+ try:
172
+ if isinstance(self._fileid, PathPointer):
173
+ self._eofpos = self._fileid.file_size()
174
+ else:
175
+ self._eofpos = os.stat(self._fileid).st_size
176
+ except Exception as exc:
177
+ raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc
178
+
179
+ # Maintain a cache of the most recently read block, to
180
+ # increase efficiency of random access.
181
+ self._cache = (-1, -1, None)
182
+
183
+ fileid = property(
184
+ lambda self: self._fileid,
185
+ doc="""
186
+ The fileid of the file that is accessed by this view.
187
+
188
+ :type: str or PathPointer""",
189
+ )
190
+
191
+ def read_block(self, stream):
192
+ """
193
+ Read a block from the input stream.
194
+
195
+ :return: a block of tokens from the input stream
196
+ :rtype: list(any)
197
+ :param stream: an input stream
198
+ :type stream: stream
199
+ """
200
+ raise NotImplementedError("Abstract Method")
201
+
202
+ def _open(self):
203
+ """
204
+ Open the file stream associated with this corpus view. This
205
+ will be called performed if any value is read from the view
206
+ while its file stream is closed.
207
+ """
208
+ if isinstance(self._fileid, PathPointer):
209
+ self._stream = self._fileid.open(self._encoding)
210
+ elif self._encoding:
211
+ self._stream = SeekableUnicodeStreamReader(
212
+ open(self._fileid, "rb"), self._encoding
213
+ )
214
+ else:
215
+ self._stream = open(self._fileid, "rb")
216
+
217
+ def close(self):
218
+ """
219
+ Close the file stream associated with this corpus view. This
220
+ can be useful if you are worried about running out of file
221
+ handles (although the stream should automatically be closed
222
+ upon garbage collection of the corpus view). If the corpus
223
+ view is accessed after it is closed, it will be automatically
224
+ re-opened.
225
+ """
226
+ if self._stream is not None:
227
+ self._stream.close()
228
+ self._stream = None
229
+
230
+ def __enter__(self):
231
+ return self
232
+
233
+ def __exit__(self, type, value, traceback):
234
+ self.close()
235
+
236
+ def __len__(self):
237
+ if self._len is None:
238
+ # iterate_from() sets self._len when it reaches the end
239
+ # of the file:
240
+ for tok in self.iterate_from(self._toknum[-1]):
241
+ pass
242
+ return self._len
243
+
244
+ def __getitem__(self, i):
245
+ if isinstance(i, slice):
246
+ start, stop = slice_bounds(self, i)
247
+ # Check if it's in the cache.
248
+ offset = self._cache[0]
249
+ if offset <= start and stop <= self._cache[1]:
250
+ return self._cache[2][start - offset : stop - offset]
251
+ # Construct & return the result.
252
+ return LazySubsequence(self, start, stop)
253
+ else:
254
+ # Handle negative indices
255
+ if i < 0:
256
+ i += len(self)
257
+ if i < 0:
258
+ raise IndexError("index out of range")
259
+ # Check if it's in the cache.
260
+ offset = self._cache[0]
261
+ if offset <= i < self._cache[1]:
262
+ return self._cache[2][i - offset]
263
+ # Use iterate_from to extract it.
264
+ try:
265
+ return next(self.iterate_from(i))
266
+ except StopIteration as e:
267
+ raise IndexError("index out of range") from e
268
+
269
+ # If we wanted to be thread-safe, then this method would need to
270
+ # do some locking.
271
+ def iterate_from(self, start_tok):
272
+ # Start by feeding from the cache, if possible.
273
+ if self._cache[0] <= start_tok < self._cache[1]:
274
+ for tok in self._cache[2][start_tok - self._cache[0] :]:
275
+ yield tok
276
+ start_tok += 1
277
+
278
+ # Decide where in the file we should start. If `start` is in
279
+ # our mapping, then we can jump straight to the correct block;
280
+ # otherwise, start at the last block we've processed.
281
+ if start_tok < self._toknum[-1]:
282
+ block_index = bisect.bisect_right(self._toknum, start_tok) - 1
283
+ toknum = self._toknum[block_index]
284
+ filepos = self._filepos[block_index]
285
+ else:
286
+ block_index = len(self._toknum) - 1
287
+ toknum = self._toknum[-1]
288
+ filepos = self._filepos[-1]
289
+
290
+ # Open the stream, if it's not open already.
291
+ if self._stream is None:
292
+ self._open()
293
+
294
+ # If the file is empty, the while loop will never run.
295
+ # This *seems* to be all the state we need to set:
296
+ if self._eofpos == 0:
297
+ self._len = 0
298
+
299
+ # Each iteration through this loop, we read a single block
300
+ # from the stream.
301
+ while filepos < self._eofpos:
302
+ # Read the next block.
303
+ self._stream.seek(filepos)
304
+ self._current_toknum = toknum
305
+ self._current_blocknum = block_index
306
+ tokens = self.read_block(self._stream)
307
+ assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
308
+ "block reader %s() should return list or tuple."
309
+ % self.read_block.__name__
310
+ )
311
+ num_toks = len(tokens)
312
+ new_filepos = self._stream.tell()
313
+ assert (
314
+ new_filepos > filepos
315
+ ), "block reader %s() should consume at least 1 byte (filepos=%d)" % (
316
+ self.read_block.__name__,
317
+ filepos,
318
+ )
319
+
320
+ # Update our cache.
321
+ self._cache = (toknum, toknum + num_toks, list(tokens))
322
+
323
+ # Update our mapping.
324
+ assert toknum <= self._toknum[-1]
325
+ if num_toks > 0:
326
+ block_index += 1
327
+ if toknum == self._toknum[-1]:
328
+ assert new_filepos > self._filepos[-1] # monotonic!
329
+ self._filepos.append(new_filepos)
330
+ self._toknum.append(toknum + num_toks)
331
+ else:
332
+ # Check for consistency:
333
+ assert (
334
+ new_filepos == self._filepos[block_index]
335
+ ), "inconsistent block reader (num chars read)"
336
+ assert (
337
+ toknum + num_toks == self._toknum[block_index]
338
+ ), "inconsistent block reader (num tokens returned)"
339
+
340
+ # If we reached the end of the file, then update self._len
341
+ if new_filepos == self._eofpos:
342
+ self._len = toknum + num_toks
343
+ # Generate the tokens in this block (but skip any tokens
344
+ # before start_tok). Note that between yields, our state
345
+ # may be modified.
346
+ for tok in tokens[max(0, start_tok - toknum) :]:
347
+ yield tok
348
+ # If we're at the end of the file, then we're done.
349
+ assert new_filepos <= self._eofpos
350
+ if new_filepos == self._eofpos:
351
+ break
352
+ # Update our indices
353
+ toknum += num_toks
354
+ filepos = new_filepos
355
+
356
+ # If we reach this point, then we should know our length.
357
+ assert self._len is not None
358
+ # Enforce closing of stream once we reached end of file
359
+ # We should have reached EOF once we're out of the while loop.
360
+ self.close()
361
+
362
    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        """Return ``concat([self, other])`` (lazy when both are corpus views)."""
        return concat([self, other])
366
+
367
    def __radd__(self, other):
        """Support ``other + view`` by concatenating ``other`` first."""
        return concat([other, self])
369
+
370
    def __mul__(self, count):
        """Return the concatenation of ``count`` copies of this view."""
        return concat([self] * count)
372
+
373
    def __rmul__(self, count):
        """Support ``count * view``; same as ``view * count``."""
        return concat([self] * count)
375
+
376
+
377
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.

    The offset table ``_offsets`` is built lazily: entries are appended
    by ``iterate_from()`` as each subview is exhausted for the first
    time, so the statement order in that generator matters.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        # Once the offset table has one more entry than there are
        # pieces, its last entry is the total length; otherwise force a
        # full iteration (from the last known offset) to complete it.
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        # Close every subview, not just the currently-open one.
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        # Locate the subview containing start_tok, as far as the
        # (possibly still incomplete) offset table lets us tell.
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            yield from piece.iterate_from(max(0, start_tok - offset))

            # Update the offset table (only when this piece was iterated
            # to its end for the first time).
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1
432
+
433
+
434
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.

    :param docs: a nonempty list of documents, assumed to all come
        from the same corpus (and therefore share one type).
    :raises ValueError: if ``docs`` is empty, or if no concatenation
        strategy is known for the documents' type(s).
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    types = {d.__class__ for d in docs}

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        # BUG FIX: iselement() must be asked about an *instance*, not
        # the class object.  With the C ElementTree implementation,
        # iselement(ElementClass) is False, so XML documents could
        # never be concatenated by the old `iselement(typ)` check.
        if ElementTree.iselement(docs[0]):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
484
+
485
+
486
+ ######################################################################
487
+ # { Corpus View for Pickled Sequences
488
+ ######################################################################
489
+
490
+
491
class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

    >>> from nltk.corpus.reader.util import PickleCorpusView
    >>> from nltk.util import LazyMap
    >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
    >>> PickleCorpusView.write(feature_corpus, some_fileid)  # doctest: +SKIP
    >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
    """

    BLOCK_SIZE = 100  # number of pickled objects read per block
    PROTOCOL = -1  # pickle protocol (-1 = highest available)

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        """Unpickle and return up to ``BLOCK_SIZE`` objects from ``stream``."""
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        ``PickledCorpusView`` is garbage-collected.)
        """
        # BUG FIX: supply a default so a partially-constructed instance
        # (e.g. one whose __init__ raised before setting the attribute)
        # can still be collected without an AttributeError.
        if getattr(self, "_delete_on_gc", False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except OSError:
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        """
        Pickle each item of ``sequence`` to ``output_file``.

        :param output_file: a file object opened for binary writing, or
            a filename.  A file opened here is closed before returning
            (BUG FIX: it used to be leaked); a caller-supplied file
            object is left open, since the caller owns it.
        """
        if isinstance(output_file, str):
            with open(output_file, "wb") as stream:
                for item in sequence:
                    pickle.dump(item, stream, cls.PROTOCOL)
        else:
            for item in sequence:
                pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        """
        try:
            fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
            output_file = os.fdopen(fd, "wb")
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except OSError as e:
            raise ValueError("Error while creating temp file: %s" % e) from e
571
+
572
+
573
+ ######################################################################
574
+ # { Block Readers
575
+ ######################################################################
576
+
577
+
578
def read_whitespace_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    whitespace-delimited tokens as a flat list."""
    chunk = (stream.readline() for _ in range(20))
    return [token for line in chunk for token in line.split()]
583
+
584
+
585
def read_wordpunct_block(stream):
    """Read up to 20 lines from ``stream`` and return their
    word/punctuation tokens (via ``wordpunct_tokenize``) as a flat list."""
    chunk = (stream.readline() for _ in range(20))
    return [token for line in chunk for token in wordpunct_tokenize(line)]
590
+
591
+
592
def read_line_block(stream):
    """Read up to 20 lines from ``stream``, stripping trailing newlines.

    Stops early (returning what was collected) at end of file.
    """
    collected = []
    for _ in range(20):
        line = stream.readline()
        if not line:  # EOF
            break
        collected.append(line.rstrip("\n"))
    return collected
600
+
601
+
602
def read_blankline_block(stream):
    """Read one blank-line-delimited paragraph from ``stream``.

    Returns a one-element list containing the paragraph (with its
    internal newlines), or ``[]`` at end of file.  Leading blank lines
    are skipped rather than producing empty paragraphs.
    """
    paragraph = ""
    while True:
        line = stream.readline()
        if not line:  # end of file
            return [paragraph] if paragraph else []
        if not line.strip():  # blank line: ends the paragraph, if any
            if paragraph:
                return [paragraph]
        else:  # content line: accumulate
            paragraph += line
619
+
620
+
621
def read_alignedsent_block(stream):
    """Read one aligned-sentence block from ``stream``.

    A block accumulates content lines and ends with a line that starts
    with a word-alignment pair (``\\d+-\\d+``).  Separator lines
    (starting with ``=``) and blank lines are skipped.

    Returns a one-element list containing the block text, or ``[]`` at
    end of file.
    """
    s = ""
    while True:
        line = stream.readline()
        # End of file.  BUG FIX: this check must come before indexing
        # line[0] -- at EOF readline() returns "" and line[0] used to
        # raise IndexError, making the EOF branch unreachable.
        if not line:
            return [s] if s else []
        # Separator / blank lines are skipped.
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Content line.
        s += line
        # An alignment line ("0-0 1-1 ...") terminates the block.
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]
638
+
639
+
640
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Skip ahead until a line matches the start regexp (or EOF).
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file
        if re.match(start_re, line):
            break

    # Collect lines until the token's end is found.
    collected = [line]
    while True:
        checkpoint = stream.tell()
        line = stream.readline()
        if not line:  # end of file
            return ["".join(collected)]
        if end_re is not None and re.match(end_re, line):
            # Explicit terminator: consumed, token complete.
            return ["".join(collected)]
        if end_re is None and re.match(start_re, line):
            # Next token starts here: rewind so it can be read later.
            stream.seek(checkpoint)
            return ["".join(collected)]
        collected.append(line)
673
+
674
+
675
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, str)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.  For encoded streams, seek by
            # *byte* length of the consumed prefix -- character offsets
            # and byte offsets differ for multi-byte encodings.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                # The block ended mid-expression: read more and retry
                # the parse on the longer text.
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
743
+
744
+
745
+ def _sub_space(m):
746
+ """Helper function: given a regexp match, return a string of
747
+ spaces that's the same length as the matched string."""
748
+ return " " * (m.end() - m.start())
749
+
750
+
751
+ def _parse_sexpr_block(block):
752
+ tokens = []
753
+ start = end = 0
754
+
755
+ while end < len(block):
756
+ m = re.compile(r"\S").search(block, end)
757
+ if not m:
758
+ return tokens, end
759
+
760
+ start = m.start()
761
+
762
+ # Case 1: sexpr is not parenthesized.
763
+ if m.group() != "(":
764
+ m2 = re.compile(r"[\s(]").search(block, start)
765
+ if m2:
766
+ end = m2.start()
767
+ else:
768
+ if tokens:
769
+ return tokens, end
770
+ raise ValueError("Block too small")
771
+
772
+ # Case 2: parenthesized sexpr.
773
+ else:
774
+ nesting = 0
775
+ for m in re.compile(r"[()]").finditer(block, start):
776
+ if m.group() == "(":
777
+ nesting += 1
778
+ else:
779
+ nesting -= 1
780
+ if nesting == 0:
781
+ end = m.end()
782
+ break
783
+ else:
784
+ if tokens:
785
+ return tokens, end
786
+ raise ValueError("Block too small")
787
+
788
+ tokens.append(block[start:end])
789
+
790
+ return tokens, end
791
+
792
+
793
+ ######################################################################
794
+ # { Finding Corpus Items
795
+ ######################################################################
796
+
797
+
798
def find_corpus_fileids(root, regexp):
    """Return a sorted list of fileids under ``root`` whose relative
    path fully matches ``regexp``.

    ``root`` must be a ``PathPointer`` (zipfile entry or filesystem
    directory).
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    regexp += "$"

    # Zipfile roots: scan the namelist, skipping directory entries
    # (names that end in '/').
    if isinstance(root, ZipFilePathPointer):
        entry_len = len(root.entry)
        candidates = (
            name[entry_len:]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        )
        return sorted(name for name in candidates if re.match(regexp, name))

    # Filesystem roots: walk all (proper or symlinked) subdirectories
    # and match each relative path against the regexp.
    if isinstance(root, FileSystemPathPointer):
        matches = []
        for dirname, subdirs, filenames in os.walk(root.path):
            prefix = "".join(f"{part}/" for part in _path_from(root.path, dirname))
            matches.extend(
                prefix + filename
                for filename in filenames
                if re.match(regexp, prefix + filename)
            )
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(matches)

    raise AssertionError("Don't know how to handle %r" % root)
832
+
833
+
834
+ def _path_from(parent, child):
835
+ if os.path.split(parent)[1] == "":
836
+ parent = os.path.split(parent)[0]
837
+ path = []
838
+ while parent != child:
839
+ child, dirname = os.path.split(child)
840
+ path.insert(0, dirname)
841
+ assert os.path.split(child)[0] != child
842
+ return path
843
+
844
+
845
+ ######################################################################
846
+ # { Paragraph structure in Treebank files
847
+ ######################################################################
848
+
849
+
850
def tagged_treebank_para_block_reader(stream):
    """Read one paragraph from a tagged treebank file.

    Paragraphs are delimited by separator lines of six or more ``=``
    characters.  Returns a one-element list containing the paragraph
    text, or ``[]`` at end of file.
    """
    para = ""
    while True:
        line = stream.readline()
        if line == "":  # end of file
            return [para] if para.strip() else []
        if re.match(r"======+\s*$", line):
            # Separator: emit the paragraph if we have a non-blank one;
            # otherwise keep scanning (e.g. a leading separator).
            if para.strip():
                return [para]
        else:
            para += line
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/verbnet.py ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Verbnet Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ An NLTK interface to the VerbNet verb lexicon
10
+
11
+ For details about VerbNet see:
12
+ https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
13
+ """
14
+
15
+ import re
16
+ import textwrap
17
+ from collections import defaultdict
18
+
19
+ from nltk.corpus.reader.xmldocs import XMLCorpusReader
20
+
21
+
22
+ class VerbnetCorpusReader(XMLCorpusReader):
23
+ """
24
+ An NLTK interface to the VerbNet verb lexicon.
25
+
26
+ From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
27
+ on-line verb lexicon currently available for English. It is a hierarchical
28
+ domain-independent, broad-coverage verb lexicon with mappings to other
29
+ lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
30
+ (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
31
+
32
+ For details about VerbNet see:
33
+ https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
34
+ """
35
+
36
+ # No unicode encoding param, since the data files are all XML.
37
+ def __init__(self, root, fileids, wrap_etree=False):
38
+ XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
39
+
40
+ self._lemma_to_class = defaultdict(list)
41
+ """A dictionary mapping from verb lemma strings to lists of
42
+ VerbNet class identifiers."""
43
+
44
+ self._wordnet_to_class = defaultdict(list)
45
+ """A dictionary mapping from wordnet identifier strings to
46
+ lists of VerbNet class identifiers."""
47
+
48
+ self._class_to_fileid = {}
49
+ """A dictionary mapping from class identifiers to
50
+ corresponding file identifiers. The keys of this dictionary
51
+ provide a complete list of all classes and subclasses."""
52
+
53
+ self._shortid_to_longid = {}
54
+
55
+ # Initialize the dictionaries. Use the quick (regexp-based)
56
+ # method instead of the slow (xml-based) method, because it
57
+ # runs 2-30 times faster.
58
+ self._quick_index()
59
+
60
    # Long ids look like 'confess-37.10': a lemma, a dash, then digits
    # with optional '.'/'-' separators.
    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
    """Regular expression that matches (and decomposes) longids"""

    # Short ids are just the numeric tail, e.g. '37.10'.
    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
    """Regular expression that matches shortids"""

    # Matches either a <MEMBER name=... wn=...> tag (groups 1-2) or a
    # <VNSUBCLASS ID=...> tag (group 3) in the raw XML text.
    _INDEX_RE = re.compile(
        r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
    )
    """Regular expression used by ``_index()`` to quickly scan the corpus
    for basic information."""
71
+
72
+ def lemmas(self, vnclass=None):
73
+ """
74
+ Return a list of all verb lemmas that appear in any class, or
75
+ in the ``classid`` if specified.
76
+ """
77
+ if vnclass is None:
78
+ return sorted(self._lemma_to_class.keys())
79
+ else:
80
+ # [xx] should this include subclass members?
81
+ if isinstance(vnclass, str):
82
+ vnclass = self.vnclass(vnclass)
83
+ return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
84
+
85
+ def wordnetids(self, vnclass=None):
86
+ """
87
+ Return a list of all wordnet identifiers that appear in any
88
+ class, or in ``classid`` if specified.
89
+ """
90
+ if vnclass is None:
91
+ return sorted(self._wordnet_to_class.keys())
92
+ else:
93
+ # [xx] should this include subclass members?
94
+ if isinstance(vnclass, str):
95
+ vnclass = self.vnclass(vnclass)
96
+ return sum(
97
+ (
98
+ member.get("wn", "").split()
99
+ for member in vnclass.findall("MEMBERS/MEMBER")
100
+ ),
101
+ [],
102
+ )
103
+
104
+ def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
105
+ """
106
+ Return a list of the VerbNet class identifiers. If a file
107
+ identifier is specified, then return only the VerbNet class
108
+ identifiers for classes (and subclasses) defined by that file.
109
+ If a lemma is specified, then return only VerbNet class
110
+ identifiers for classes that contain that lemma as a member.
111
+ If a wordnetid is specified, then return only identifiers for
112
+ classes that contain that wordnetid as a member. If a classid
113
+ is specified, then return only identifiers for subclasses of
114
+ the specified VerbNet class.
115
+ If nothing is specified, return all classids within VerbNet
116
+ """
117
+ if fileid is not None:
118
+ return [c for (c, f) in self._class_to_fileid.items() if f == fileid]
119
+ elif lemma is not None:
120
+ return self._lemma_to_class[lemma]
121
+ elif wordnetid is not None:
122
+ return self._wordnet_to_class[wordnetid]
123
+ elif classid is not None:
124
+ xmltree = self.vnclass(classid)
125
+ return [
126
+ subclass.get("ID")
127
+ for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
128
+ ]
129
+ else:
130
+ return sorted(self._class_to_fileid.keys())
131
+
132
+ def vnclass(self, fileid_or_classid):
133
+ """Returns VerbNet class ElementTree
134
+
135
+ Return an ElementTree containing the xml for the specified
136
+ VerbNet class.
137
+
138
+ :param fileid_or_classid: An identifier specifying which class
139
+ should be returned. Can be a file identifier (such as
140
+ ``'put-9.1.xml'``), or a VerbNet class identifier (such as
141
+ ``'put-9.1'``) or a short VerbNet class identifier (such as
142
+ ``'9.1'``).
143
+ """
144
+ # File identifier: just return the xml.
145
+ if fileid_or_classid in self._fileids:
146
+ return self.xml(fileid_or_classid)
147
+
148
+ # Class identifier: get the xml, and find the right elt.
149
+ classid = self.longid(fileid_or_classid)
150
+ if classid in self._class_to_fileid:
151
+ fileid = self._class_to_fileid[self.longid(classid)]
152
+ tree = self.xml(fileid)
153
+ if classid == tree.get("ID"):
154
+ return tree
155
+ else:
156
+ for subclass in tree.findall(".//VNSUBCLASS"):
157
+ if classid == subclass.get("ID"):
158
+ return subclass
159
+ else:
160
+ assert False # we saw it during _index()!
161
+
162
+ else:
163
+ raise ValueError(f"Unknown identifier {fileid_or_classid}")
164
+
165
+ def fileids(self, vnclass_ids=None):
166
+ """
167
+ Return a list of fileids that make up this corpus. If
168
+ ``vnclass_ids`` is specified, then return the fileids that make
169
+ up the specified VerbNet class(es).
170
+ """
171
+ if vnclass_ids is None:
172
+ return self._fileids
173
+ elif isinstance(vnclass_ids, str):
174
+ return [self._class_to_fileid[self.longid(vnclass_ids)]]
175
+ else:
176
+ return [
177
+ self._class_to_fileid[self.longid(vnclass_id)]
178
+ for vnclass_id in vnclass_ids
179
+ ]
180
+
181
+ def frames(self, vnclass):
182
+ """Given a VerbNet class, this method returns VerbNet frames
183
+
184
+ The members returned are:
185
+ 1) Example
186
+ 2) Description
187
+ 3) Syntax
188
+ 4) Semantics
189
+
190
+ :param vnclass: A VerbNet class identifier; or an ElementTree
191
+ containing the xml contents of a VerbNet class.
192
+ :return: frames - a list of frame dictionaries
193
+ """
194
+ if isinstance(vnclass, str):
195
+ vnclass = self.vnclass(vnclass)
196
+ frames = []
197
+ vnframes = vnclass.findall("FRAMES/FRAME")
198
+ for vnframe in vnframes:
199
+ frames.append(
200
+ {
201
+ "example": self._get_example_within_frame(vnframe),
202
+ "description": self._get_description_within_frame(vnframe),
203
+ "syntax": self._get_syntactic_list_within_frame(vnframe),
204
+ "semantics": self._get_semantics_within_frame(vnframe),
205
+ }
206
+ )
207
+ return frames
208
+
209
+ def subclasses(self, vnclass):
210
+ """Returns subclass ids, if any exist
211
+
212
+ Given a VerbNet class, this method returns subclass ids (if they exist)
213
+ in a list of strings.
214
+
215
+ :param vnclass: A VerbNet class identifier; or an ElementTree
216
+ containing the xml contents of a VerbNet class.
217
+ :return: list of subclasses
218
+ """
219
+ if isinstance(vnclass, str):
220
+ vnclass = self.vnclass(vnclass)
221
+
222
+ subclasses = [
223
+ subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
224
+ ]
225
+ return subclasses
226
+
227
+ def themroles(self, vnclass):
228
+ """Returns thematic roles participating in a VerbNet class
229
+
230
+ Members returned as part of roles are-
231
+ 1) Type
232
+ 2) Modifiers
233
+
234
+ :param vnclass: A VerbNet class identifier; or an ElementTree
235
+ containing the xml contents of a VerbNet class.
236
+ :return: themroles: A list of thematic roles in the VerbNet class
237
+ """
238
+ if isinstance(vnclass, str):
239
+ vnclass = self.vnclass(vnclass)
240
+
241
+ themroles = []
242
+ for trole in vnclass.findall("THEMROLES/THEMROLE"):
243
+ themroles.append(
244
+ {
245
+ "type": trole.get("type"),
246
+ "modifiers": [
247
+ {"value": restr.get("Value"), "type": restr.get("type")}
248
+ for restr in trole.findall("SELRESTRS/SELRESTR")
249
+ ],
250
+ }
251
+ )
252
+ return themroles
253
+
254
+ ######################################################################
255
+ # { Index Initialization
256
+ ######################################################################
257
+
258
    def _index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This is fast if ElementTree
        uses the C implementation (<0.1 secs), but quite slow (>10 secs)
        if only the python implementation is available.
        """
        # Parse every file and recursively index its class hierarchy.
        for fileid in self._fileids:
            self._index_helper(self.xml(fileid), fileid)
268
+
269
+ def _index_helper(self, xmltree, fileid):
270
+ """Helper for ``_index()``"""
271
+ vnclass = xmltree.get("ID")
272
+ self._class_to_fileid[vnclass] = fileid
273
+ self._shortid_to_longid[self.shortid(vnclass)] = vnclass
274
+ for member in xmltree.findall("MEMBERS/MEMBER"):
275
+ self._lemma_to_class[member.get("name")].append(vnclass)
276
+ for wn in member.get("wn", "").split():
277
+ self._wordnet_to_class[wn].append(vnclass)
278
+ for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
279
+ self._index_helper(subclass, fileid)
280
+
281
    def _quick_index(self):
        """
        Initialize the indexes ``_lemma_to_class``,
        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
        through the corpus fileids.  This doesn't do proper xml parsing,
        but is good enough to find everything in the standard VerbNet
        corpus -- and it runs about 30 times faster than xml parsing
        (with the python ElementTree; only 2-3 times faster
        if ElementTree uses the C implementation).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._fileids:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass
            with self.open(fileid) as fp:
                for m in self._INDEX_RE.finditer(fp.read()):
                    groups = m.groups()
                    if groups[0] is not None:
                        # A <MEMBER> tag: groups[0] is the lemma and
                        # groups[1] the space-separated wordnet ids.
                        self._lemma_to_class[groups[0]].append(vnclass)
                        for wn in groups[1].split():
                            self._wordnet_to_class[wn].append(vnclass)
                    elif groups[2] is not None:
                        # A <VNSUBCLASS> tag: register the subclass, and
                        # rebind vnclass so subsequent <MEMBER> matches
                        # in this file attach to the subclass.
                        self._class_to_fileid[groups[2]] = fileid
                        vnclass = groups[2]  # for <MEMBER> elts.
                        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                    else:
                        assert False, "unexpected match condition"
310
+
311
+ ######################################################################
312
+ # { Identifier conversion
313
+ ######################################################################
314
+
315
+ def longid(self, shortid):
316
+ """Returns longid of a VerbNet class
317
+
318
+ Given a short VerbNet class identifier (eg '37.10'), map it
319
+ to a long id (eg 'confess-37.10'). If ``shortid`` is already a
320
+ long id, then return it as-is"""
321
+ if self._LONGID_RE.match(shortid):
322
+ return shortid # it's already a longid.
323
+ elif not self._SHORTID_RE.match(shortid):
324
+ raise ValueError("vnclass identifier %r not found" % shortid)
325
+ try:
326
+ return self._shortid_to_longid[shortid]
327
+ except KeyError as e:
328
+ raise ValueError("vnclass identifier %r not found" % shortid) from e
329
+
330
+ def shortid(self, longid):
331
+ """Returns shortid of a VerbNet class
332
+
333
+ Given a long VerbNet class identifier (eg 'confess-37.10'),
334
+ map it to a short id (eg '37.10'). If ``longid`` is already a
335
+ short id, then return it as-is."""
336
+ if self._SHORTID_RE.match(longid):
337
+ return longid # it's already a shortid.
338
+ m = self._LONGID_RE.match(longid)
339
+ if m:
340
+ return m.group(2)
341
+ else:
342
+ raise ValueError("vnclass identifier %r not found" % longid)
343
+
344
+ ######################################################################
345
+ # { Frame access utility functions
346
+ ######################################################################
347
+
348
+ def _get_semantics_within_frame(self, vnframe):
349
+ """Returns semantics within a single frame
350
+
351
+ A utility function to retrieve semantics within a frame in VerbNet
352
+ Members of the semantics dictionary:
353
+ 1) Predicate value
354
+ 2) Arguments
355
+
356
+ :param vnframe: An ElementTree containing the xml contents of
357
+ a VerbNet frame.
358
+ :return: semantics: semantics dictionary
359
+ """
360
+ semantics_within_single_frame = []
361
+ for pred in vnframe.findall("SEMANTICS/PRED"):
362
+ arguments = [
363
+ {"type": arg.get("type"), "value": arg.get("value")}
364
+ for arg in pred.findall("ARGS/ARG")
365
+ ]
366
+ semantics_within_single_frame.append(
367
+ {
368
+ "predicate_value": pred.get("value"),
369
+ "arguments": arguments,
370
+ "negated": pred.get("bool") == "!",
371
+ }
372
+ )
373
+ return semantics_within_single_frame
374
+
375
+ def _get_example_within_frame(self, vnframe):
376
+ """Returns example within a frame
377
+
378
+ A utility function to retrieve an example within a frame in VerbNet.
379
+
380
+ :param vnframe: An ElementTree containing the xml contents of
381
+ a VerbNet frame.
382
+ :return: example_text: The example sentence for this particular frame
383
+ """
384
+ example_element = vnframe.find("EXAMPLES/EXAMPLE")
385
+ if example_element is not None:
386
+ example_text = example_element.text
387
+ else:
388
+ example_text = ""
389
+ return example_text
390
+
391
+ def _get_description_within_frame(self, vnframe):
392
+ """Returns member description within frame
393
+
394
+ A utility function to retrieve a description of participating members
395
+ within a frame in VerbNet.
396
+
397
+ :param vnframe: An ElementTree containing the xml contents of
398
+ a VerbNet frame.
399
+ :return: description: a description dictionary with members - primary and secondary
400
+ """
401
+ description_element = vnframe.find("DESCRIPTION")
402
+ return {
403
+ "primary": description_element.attrib["primary"],
404
+ "secondary": description_element.get("secondary", ""),
405
+ }
406
+
407
+ def _get_syntactic_list_within_frame(self, vnframe):
408
+ """Returns semantics within a frame
409
+
410
+ A utility function to retrieve semantics within a frame in VerbNet.
411
+ Members of the syntactic dictionary:
412
+ 1) POS Tag
413
+ 2) Modifiers
414
+
415
+ :param vnframe: An ElementTree containing the xml contents of
416
+ a VerbNet frame.
417
+ :return: syntax_within_single_frame
418
+ """
419
+ syntax_within_single_frame = []
420
+ for elt in vnframe.find("SYNTAX"):
421
+ pos_tag = elt.tag
422
+ modifiers = dict()
423
+ modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
424
+ modifiers["selrestrs"] = [
425
+ {"value": restr.get("Value"), "type": restr.get("type")}
426
+ for restr in elt.findall("SELRESTRS/SELRESTR")
427
+ ]
428
+ modifiers["synrestrs"] = [
429
+ {"value": restr.get("Value"), "type": restr.get("type")}
430
+ for restr in elt.findall("SYNRESTRS/SYNRESTR")
431
+ ]
432
+ syntax_within_single_frame.append(
433
+ {"pos_tag": pos_tag, "modifiers": modifiers}
434
+ )
435
+ return syntax_within_single_frame
436
+
437
+ ######################################################################
438
+ # { Pretty Printing
439
+ ######################################################################
440
+
441
+ def pprint(self, vnclass):
442
+ """Returns pretty printed version of a VerbNet class
443
+
444
+ Return a string containing a pretty-printed representation of
445
+ the given VerbNet class.
446
+
447
+ :param vnclass: A VerbNet class identifier; or an ElementTree
448
+ containing the xml contents of a VerbNet class.
449
+ """
450
+ if isinstance(vnclass, str):
451
+ vnclass = self.vnclass(vnclass)
452
+
453
+ s = vnclass.get("ID") + "\n"
454
+ s += self.pprint_subclasses(vnclass, indent=" ") + "\n"
455
+ s += self.pprint_members(vnclass, indent=" ") + "\n"
456
+ s += " Thematic roles:\n"
457
+ s += self.pprint_themroles(vnclass, indent=" ") + "\n"
458
+ s += " Frames:\n"
459
+ s += self.pprint_frames(vnclass, indent=" ")
460
+ return s
461
+
462
+ def pprint_subclasses(self, vnclass, indent=""):
463
+ """Returns pretty printed version of subclasses of VerbNet class
464
+
465
+ Return a string containing a pretty-printed representation of
466
+ the given VerbNet class's subclasses.
467
+
468
+ :param vnclass: A VerbNet class identifier; or an ElementTree
469
+ containing the xml contents of a VerbNet class.
470
+ """
471
+ if isinstance(vnclass, str):
472
+ vnclass = self.vnclass(vnclass)
473
+
474
+ subclasses = self.subclasses(vnclass)
475
+ if not subclasses:
476
+ subclasses = ["(none)"]
477
+ s = "Subclasses: " + " ".join(subclasses)
478
+ return textwrap.fill(
479
+ s, 70, initial_indent=indent, subsequent_indent=indent + " "
480
+ )
481
+
482
+ def pprint_members(self, vnclass, indent=""):
483
+ """Returns pretty printed version of members in a VerbNet class
484
+
485
+ Return a string containing a pretty-printed representation of
486
+ the given VerbNet class's member verbs.
487
+
488
+ :param vnclass: A VerbNet class identifier; or an ElementTree
489
+ containing the xml contents of a VerbNet class.
490
+ """
491
+ if isinstance(vnclass, str):
492
+ vnclass = self.vnclass(vnclass)
493
+
494
+ members = self.lemmas(vnclass)
495
+ if not members:
496
+ members = ["(none)"]
497
+ s = "Members: " + " ".join(members)
498
+ return textwrap.fill(
499
+ s, 70, initial_indent=indent, subsequent_indent=indent + " "
500
+ )
501
+
502
+ def pprint_themroles(self, vnclass, indent=""):
503
+ """Returns pretty printed version of thematic roles in a VerbNet class
504
+
505
+ Return a string containing a pretty-printed representation of
506
+ the given VerbNet class's thematic roles.
507
+
508
+ :param vnclass: A VerbNet class identifier; or an ElementTree
509
+ containing the xml contents of a VerbNet class.
510
+ """
511
+ if isinstance(vnclass, str):
512
+ vnclass = self.vnclass(vnclass)
513
+
514
+ pieces = []
515
+ for themrole in self.themroles(vnclass):
516
+ piece = indent + "* " + themrole.get("type")
517
+ modifiers = [
518
+ modifier["value"] + modifier["type"]
519
+ for modifier in themrole["modifiers"]
520
+ ]
521
+ if modifiers:
522
+ piece += "[{}]".format(" ".join(modifiers))
523
+ pieces.append(piece)
524
+ return "\n".join(pieces)
525
+
526
+ def pprint_frames(self, vnclass, indent=""):
527
+ """Returns pretty version of all frames in a VerbNet class
528
+
529
+ Return a string containing a pretty-printed representation of
530
+ the list of frames within the VerbNet class.
531
+
532
+ :param vnclass: A VerbNet class identifier; or an ElementTree
533
+ containing the xml contents of a VerbNet class.
534
+ """
535
+ if isinstance(vnclass, str):
536
+ vnclass = self.vnclass(vnclass)
537
+ pieces = []
538
+ for vnframe in self.frames(vnclass):
539
+ pieces.append(self._pprint_single_frame(vnframe, indent))
540
+ return "\n".join(pieces)
541
+
542
+ def _pprint_single_frame(self, vnframe, indent=""):
543
+ """Returns pretty printed version of a single frame in a VerbNet class
544
+
545
+ Returns a string containing a pretty-printed representation of
546
+ the given frame.
547
+
548
+ :param vnframe: An ElementTree containing the xml contents of
549
+ a VerbNet frame.
550
+ """
551
+ frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
552
+ frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
553
+ frame_string += (
554
+ self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n"
555
+ )
556
+ frame_string += indent + " Semantics:\n"
557
+ frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ")
558
+ return frame_string
559
+
560
+ def _pprint_example_within_frame(self, vnframe, indent=""):
561
+ """Returns pretty printed version of example within frame in a VerbNet class
562
+
563
+ Return a string containing a pretty-printed representation of
564
+ the given VerbNet frame example.
565
+
566
+ :param vnframe: An ElementTree containing the xml contents of
567
+ a Verbnet frame.
568
+ """
569
+ if vnframe["example"]:
570
+ return indent + " Example: " + vnframe["example"]
571
+
572
+ def _pprint_description_within_frame(self, vnframe, indent=""):
573
+ """Returns pretty printed version of a VerbNet frame description
574
+
575
+ Return a string containing a pretty-printed representation of
576
+ the given VerbNet frame description.
577
+
578
+ :param vnframe: An ElementTree containing the xml contents of
579
+ a VerbNet frame.
580
+ """
581
+ description = indent + vnframe["description"]["primary"]
582
+ if vnframe["description"]["secondary"]:
583
+ description += " ({})".format(vnframe["description"]["secondary"])
584
+ return description
585
+
586
+ def _pprint_syntax_within_frame(self, vnframe, indent=""):
587
+ """Returns pretty printed version of syntax within a frame in a VerbNet class
588
+
589
+ Return a string containing a pretty-printed representation of
590
+ the given VerbNet frame syntax.
591
+
592
+ :param vnframe: An ElementTree containing the xml contents of
593
+ a VerbNet frame.
594
+ """
595
+ pieces = []
596
+ for element in vnframe["syntax"]:
597
+ piece = element["pos_tag"]
598
+ modifier_list = []
599
+ if "value" in element["modifiers"] and element["modifiers"]["value"]:
600
+ modifier_list.append(element["modifiers"]["value"])
601
+ modifier_list += [
602
+ "{}{}".format(restr["value"], restr["type"])
603
+ for restr in (
604
+ element["modifiers"]["selrestrs"]
605
+ + element["modifiers"]["synrestrs"]
606
+ )
607
+ ]
608
+ if modifier_list:
609
+ piece += "[{}]".format(" ".join(modifier_list))
610
+ pieces.append(piece)
611
+
612
+ return indent + " ".join(pieces)
613
+
614
+ def _pprint_semantics_within_frame(self, vnframe, indent=""):
615
+ """Returns a pretty printed version of semantics within frame in a VerbNet class
616
+
617
+ Return a string containing a pretty-printed representation of
618
+ the given VerbNet frame semantics.
619
+
620
+ :param vnframe: An ElementTree containing the xml contents of
621
+ a VerbNet frame.
622
+ """
623
+ pieces = []
624
+ for predicate in vnframe["semantics"]:
625
+ arguments = [argument["value"] for argument in predicate["arguments"]]
626
+ pieces.append(
627
+ f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})"
628
+ )
629
+ return "\n".join(f"{indent}* {piece}" for piece in pieces)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordlist.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Word List Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+ from nltk.corpus.reader.api import *
9
+ from nltk.corpus.reader.util import *
10
+ from nltk.tokenize import line_tokenize
11
+
12
+
13
+ class WordListCorpusReader(CorpusReader):
14
+ """
15
+ List of words, one per line. Blank lines are ignored.
16
+ """
17
+
18
+ def words(self, fileids=None, ignore_lines_startswith="\n"):
19
+ return [
20
+ line
21
+ for line in line_tokenize(self.raw(fileids))
22
+ if not line.startswith(ignore_lines_startswith)
23
+ ]
24
+
25
+
26
+ class SwadeshCorpusReader(WordListCorpusReader):
27
+ def entries(self, fileids=None):
28
+ """
29
+ :return: a tuple of words for the specified fileids.
30
+ """
31
+ if not fileids:
32
+ fileids = self.fileids()
33
+
34
+ wordlists = [self.words(f) for f in fileids]
35
+ return list(zip(*wordlists))
36
+
37
+
38
+ class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
39
+ """
40
+ This is a class to read the nonbreaking prefixes textfiles from the
41
+ Moses Machine Translation toolkit. These lists are used in the Python port
42
+ of the Moses' word tokenizer.
43
+ """
44
+
45
+ available_langs = {
46
+ "catalan": "ca",
47
+ "czech": "cs",
48
+ "german": "de",
49
+ "greek": "el",
50
+ "english": "en",
51
+ "spanish": "es",
52
+ "finnish": "fi",
53
+ "french": "fr",
54
+ "hungarian": "hu",
55
+ "icelandic": "is",
56
+ "italian": "it",
57
+ "latvian": "lv",
58
+ "dutch": "nl",
59
+ "polish": "pl",
60
+ "portuguese": "pt",
61
+ "romanian": "ro",
62
+ "russian": "ru",
63
+ "slovak": "sk",
64
+ "slovenian": "sl",
65
+ "swedish": "sv",
66
+ "tamil": "ta",
67
+ }
68
+ # Also, add the lang IDs as the keys.
69
+ available_langs.update({v: v for v in available_langs.values()})
70
+
71
+ def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
72
+ """
73
+ This module returns a list of nonbreaking prefixes for the specified
74
+ language(s).
75
+
76
+ >>> from nltk.corpus import nonbreaking_prefixes as nbp
77
+ >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
78
+ True
79
+ >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
80
+ True
81
+
82
+ :return: a list words for the specified language(s).
83
+ """
84
+ # If *lang* in list of languages available, allocate apt fileid.
85
+ # Otherwise, the function returns non-breaking prefixes for
86
+ # all languages when fileids==None.
87
+ if lang in self.available_langs:
88
+ lang = self.available_langs[lang]
89
+ fileids = ["nonbreaking_prefix." + lang]
90
+ return [
91
+ line
92
+ for line in line_tokenize(self.raw(fileids))
93
+ if not line.startswith(ignore_lines_startswith)
94
+ ]
95
+
96
+
97
+ class UnicharsCorpusReader(WordListCorpusReader):
98
+ """
99
+ This class is used to read lists of characters from the Perl Unicode
100
+ Properties (see https://perldoc.perl.org/perluniprops.html).
101
+ The files in the perluniprop.zip are extracted using the Unicode::Tussle
102
+ module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
103
+ """
104
+
105
+ # These are categories similar to the Perl Unicode Properties
106
+ available_categories = [
107
+ "Close_Punctuation",
108
+ "Currency_Symbol",
109
+ "IsAlnum",
110
+ "IsAlpha",
111
+ "IsLower",
112
+ "IsN",
113
+ "IsSc",
114
+ "IsSo",
115
+ "IsUpper",
116
+ "Line_Separator",
117
+ "Number",
118
+ "Open_Punctuation",
119
+ "Punctuation",
120
+ "Separator",
121
+ "Symbol",
122
+ ]
123
+
124
+ def chars(self, category=None, fileids=None):
125
+ """
126
+ This module returns a list of characters from the Perl Unicode Properties.
127
+ They are very useful when porting Perl tokenizers to Python.
128
+
129
+ >>> from nltk.corpus import perluniprops as pup
130
+ >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
131
+ True
132
+ >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
133
+ True
134
+ >>> pup.available_categories
135
+ ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']
136
+
137
+ :return: a list of characters given the specific unicode character category
138
+ """
139
+ if category in self.available_categories:
140
+ fileids = [category + ".txt"]
141
+ return list(self.raw(fileids).strip())
142
+
143
+
144
+ class MWAPPDBCorpusReader(WordListCorpusReader):
145
+ """
146
+ This class is used to read the list of word pairs from the subset of lexical
147
+ pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
148
+ Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):
149
+
150
+ - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
151
+ - https://www.aclweb.org/anthology/S14-2039
152
+ - https://www.aclweb.org/anthology/S15-2027
153
+
154
+ The original source of the full PPDB corpus can be found on
155
+ https://www.cis.upenn.edu/~ccb/ppdb/
156
+
157
+ :return: a list of tuples of similar lexical terms.
158
+ """
159
+
160
+ mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
161
+
162
+ def entries(self, fileids=mwa_ppdb_xxxl_file):
163
+ """
164
+ :return: a tuple of synonym word pairs.
165
+ """
166
+ return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/wordnet.py ADDED
@@ -0,0 +1,2489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: WordNet
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bethard <Steven.Bethard@colorado.edu>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # Edward Loper <edloper@gmail.com>
7
+ # Nitin Madnani <nmadnani@ets.org>
8
+ # Nasruddin A’aidil Shari
9
+ # Sim Wei Ying Geraldine
10
+ # Soe Lynn
11
+ # Francis Bond <bond@ieee.org>
12
+ # Eric Kafe <kafe.eric@gmail.com>
13
+
14
+ # URL: <https://www.nltk.org/>
15
+ # For license information, see LICENSE.TXT
16
+
17
+ """
18
+ An NLTK interface for WordNet
19
+
20
+ WordNet is a lexical database of English.
21
+ Using synsets, helps find conceptual relationships between words
22
+ such as hypernyms, hyponyms, synonyms, antonyms etc.
23
+
24
+ For details about WordNet see:
25
+ https://wordnet.princeton.edu/
26
+
27
+ This module also allows you to find lemmas in languages
28
+ other than English from the Open Multilingual Wordnet
29
+ http://compling.hss.ntu.edu.sg/omw/
30
+
31
+ """
32
+
33
+ import math
34
+ import os
35
+ import re
36
+ import warnings
37
+ from collections import defaultdict, deque
38
+ from functools import total_ordering
39
+ from itertools import chain, islice
40
+ from operator import itemgetter
41
+
42
+ from nltk.corpus.reader import CorpusReader
43
+ from nltk.internals import deprecated
44
+ from nltk.probability import FreqDist
45
+ from nltk.util import binary_search_file as _binary_search_file
46
+
47
+ ######################################################################
48
+ # Table of Contents
49
+ ######################################################################
50
+ # - Constants
51
+ # - Data Classes
52
+ # - WordNetError
53
+ # - Lemma
54
+ # - Synset
55
+ # - WordNet Corpus Reader
56
+ # - WordNet Information Content Corpus Reader
57
+ # - Similarity Metrics
58
+ # - Demo
59
+
60
+ ######################################################################
61
+ # Constants
62
+ ######################################################################
63
+
64
+ #: Positive infinity (for similarity functions)
65
+ _INF = 1e300
66
+
67
+ # { Part-of-speech constants
68
+ ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
69
+ # }
70
+
71
+ POS_LIST = [NOUN, VERB, ADJ, ADV]
72
+
73
+ # A table of strings that are used to express verb frames.
74
+ VERB_FRAME_STRINGS = (
75
+ None,
76
+ "Something %s",
77
+ "Somebody %s",
78
+ "It is %sing",
79
+ "Something is %sing PP",
80
+ "Something %s something Adjective/Noun",
81
+ "Something %s Adjective/Noun",
82
+ "Somebody %s Adjective",
83
+ "Somebody %s something",
84
+ "Somebody %s somebody",
85
+ "Something %s somebody",
86
+ "Something %s something",
87
+ "Something %s to somebody",
88
+ "Somebody %s on something",
89
+ "Somebody %s somebody something",
90
+ "Somebody %s something to somebody",
91
+ "Somebody %s something from somebody",
92
+ "Somebody %s somebody with something",
93
+ "Somebody %s somebody of something",
94
+ "Somebody %s something on somebody",
95
+ "Somebody %s somebody PP",
96
+ "Somebody %s something PP",
97
+ "Somebody %s PP",
98
+ "Somebody's (body part) %s",
99
+ "Somebody %s somebody to INFINITIVE",
100
+ "Somebody %s somebody INFINITIVE",
101
+ "Somebody %s that CLAUSE",
102
+ "Somebody %s to somebody",
103
+ "Somebody %s to INFINITIVE",
104
+ "Somebody %s whether INFINITIVE",
105
+ "Somebody %s somebody into V-ing something",
106
+ "Somebody %s something with something",
107
+ "Somebody %s INFINITIVE",
108
+ "Somebody %s VERB-ing",
109
+ "It %s that CLAUSE",
110
+ "Something %s INFINITIVE",
111
+ # OEWN additions:
112
+ "Somebody %s at something",
113
+ "Somebody %s for something",
114
+ "Somebody %s on somebody",
115
+ "Somebody %s out of somebody",
116
+ )
117
+
118
+ SENSENUM_RE = re.compile(r"\.[\d]+\.")
119
+
120
+
121
+ ######################################################################
122
+ # Data Classes
123
+ ######################################################################
124
+
125
+
126
+ class WordNetError(Exception):
127
+ """An exception class for wordnet-related errors."""
128
+
129
+
130
+ @total_ordering
131
+ class _WordNetObject:
132
+ """A common base class for lemmas and synsets."""
133
+
134
+ def hypernyms(self):
135
+ return self._related("@")
136
+
137
+ def _hypernyms(self):
138
+ return self._related("@")
139
+
140
+ def instance_hypernyms(self):
141
+ return self._related("@i")
142
+
143
+ def _instance_hypernyms(self):
144
+ return self._related("@i")
145
+
146
+ def hyponyms(self):
147
+ return self._related("~")
148
+
149
+ def instance_hyponyms(self):
150
+ return self._related("~i")
151
+
152
+ def member_holonyms(self):
153
+ return self._related("#m")
154
+
155
+ def substance_holonyms(self):
156
+ return self._related("#s")
157
+
158
+ def part_holonyms(self):
159
+ return self._related("#p")
160
+
161
+ def member_meronyms(self):
162
+ return self._related("%m")
163
+
164
+ def substance_meronyms(self):
165
+ return self._related("%s")
166
+
167
+ def part_meronyms(self):
168
+ return self._related("%p")
169
+
170
+ def topic_domains(self):
171
+ return self._related(";c")
172
+
173
+ def in_topic_domains(self):
174
+ return self._related("-c")
175
+
176
+ def region_domains(self):
177
+ return self._related(";r")
178
+
179
+ def in_region_domains(self):
180
+ return self._related("-r")
181
+
182
+ def usage_domains(self):
183
+ return self._related(";u")
184
+
185
+ def in_usage_domains(self):
186
+ return self._related("-u")
187
+
188
+ def attributes(self):
189
+ return self._related("=")
190
+
191
+ def entailments(self):
192
+ return self._related("*")
193
+
194
+ def causes(self):
195
+ return self._related(">")
196
+
197
+ def also_sees(self):
198
+ return self._related("^")
199
+
200
+ def verb_groups(self):
201
+ return self._related("$")
202
+
203
+ def similar_tos(self):
204
+ return self._related("&")
205
+
206
+ def __hash__(self):
207
+ return hash(self._name)
208
+
209
+ def __eq__(self, other):
210
+ return self._name == other._name
211
+
212
+ def __ne__(self, other):
213
+ return self._name != other._name
214
+
215
+ def __lt__(self, other):
216
+ return self._name < other._name
217
+
218
+
219
+ class Lemma(_WordNetObject):
220
+ """
221
+ The lexical entry for a single morphological form of a
222
+ sense-disambiguated word.
223
+
224
+ Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
225
+ <word> is the morphological stem identifying the synset
226
+ <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
227
+ <number> is the sense number, counting from 0.
228
+ <lemma> is the morphological form of interest
229
+
230
+ Note that <word> and <lemma> can be different, e.g. the Synset
231
+ 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
232
+ 'salt.n.03.salinity'.
233
+
234
+ Lemma attributes, accessible via methods with the same name:
235
+
236
+ - name: The canonical name of this lemma.
237
+ - synset: The synset that this lemma belongs to.
238
+ - syntactic_marker: For adjectives, the WordNet string identifying the
239
+ syntactic position relative modified noun. See:
240
+ https://wordnet.princeton.edu/documentation/wninput5wn
241
+ For all other parts of speech, this attribute is None.
242
+ - count: The frequency of this lemma in wordnet.
243
+
244
+ Lemma methods:
245
+
246
+ Lemmas have the following methods for retrieving related Lemmas. They
247
+ correspond to the names for the pointer symbols defined here:
248
+ https://wordnet.princeton.edu/documentation/wninput5wn
249
+ These methods all return lists of Lemmas:
250
+
251
+ - antonyms
252
+ - hypernyms, instance_hypernyms
253
+ - hyponyms, instance_hyponyms
254
+ - member_holonyms, substance_holonyms, part_holonyms
255
+ - member_meronyms, substance_meronyms, part_meronyms
256
+ - topic_domains, region_domains, usage_domains
257
+ - attributes
258
+ - derivationally_related_forms
259
+ - entailments
260
+ - causes
261
+ - also_sees
262
+ - verb_groups
263
+ - similar_tos
264
+ - pertainyms
265
+ """
266
+
267
+ __slots__ = [
268
+ "_wordnet_corpus_reader",
269
+ "_name",
270
+ "_syntactic_marker",
271
+ "_synset",
272
+ "_frame_strings",
273
+ "_frame_ids",
274
+ "_lexname_index",
275
+ "_lex_id",
276
+ "_lang",
277
+ "_key",
278
+ ]
279
+
280
+ def __init__(
281
+ self,
282
+ wordnet_corpus_reader,
283
+ synset,
284
+ name,
285
+ lexname_index,
286
+ lex_id,
287
+ syntactic_marker,
288
+ ):
289
+ self._wordnet_corpus_reader = wordnet_corpus_reader
290
+ self._name = name
291
+ self._syntactic_marker = syntactic_marker
292
+ self._synset = synset
293
+ self._frame_strings = []
294
+ self._frame_ids = []
295
+ self._lexname_index = lexname_index
296
+ self._lex_id = lex_id
297
+ self._lang = "eng"
298
+
299
+ self._key = None # gets set later.
300
+
301
+ def name(self):
302
+ return self._name
303
+
304
+ def syntactic_marker(self):
305
+ return self._syntactic_marker
306
+
307
+ def synset(self):
308
+ return self._synset
309
+
310
+ def frame_strings(self):
311
+ return self._frame_strings
312
+
313
+ def frame_ids(self):
314
+ return self._frame_ids
315
+
316
+ def lang(self):
317
+ return self._lang
318
+
319
+ def key(self):
320
+ return self._key
321
+
322
+ def __repr__(self):
323
+ tup = type(self).__name__, self._synset._name, self._name
324
+ return "%s('%s.%s')" % tup
325
+
326
+ def _related(self, relation_symbol):
327
+ get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
328
+ if (self._name, relation_symbol) not in self._synset._lemma_pointers:
329
+ return []
330
+ return [
331
+ get_synset(pos, offset)._lemmas[lemma_index]
332
+ for pos, offset, lemma_index in self._synset._lemma_pointers[
333
+ self._name, relation_symbol
334
+ ]
335
+ ]
336
+
337
+ def count(self):
338
+ """Return the frequency count for this Lemma"""
339
+ return self._wordnet_corpus_reader.lemma_count(self)
340
+
341
+ def antonyms(self):
342
+ return self._related("!")
343
+
344
+ def derivationally_related_forms(self):
345
+ return self._related("+")
346
+
347
+ def pertainyms(self):
348
+ return self._related("\\")
349
+
350
+
351
+ class Synset(_WordNetObject):
352
+ """Create a Synset from a "<lemma>.<pos>.<number>" string where:
353
+ <lemma> is the word's morphological stem
354
+ <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
355
+ <number> is the sense number, counting from 0.
356
+
357
+ Synset attributes, accessible via methods with the same name:
358
+
359
+ - name: The canonical name of this synset, formed using the first lemma
360
+ of this synset. Note that this may be different from the name
361
+ passed to the constructor if that string used a different lemma to
362
+ identify the synset.
363
+ - pos: The synset's part of speech, matching one of the module level
364
+ attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
365
+ - lemmas: A list of the Lemma objects for this synset.
366
+ - definition: The definition for this synset.
367
+ - examples: A list of example strings for this synset.
368
+ - offset: The offset in the WordNet dict file of this synset.
369
+ - lexname: The name of the lexicographer file containing this synset.
370
+
371
+ Synset methods:
372
+
373
+ Synsets have the following methods for retrieving related Synsets.
374
+ They correspond to the names for the pointer symbols defined here:
375
+ https://wordnet.princeton.edu/documentation/wninput5wn
376
+ These methods all return lists of Synsets.
377
+
378
+ - hypernyms, instance_hypernyms
379
+ - hyponyms, instance_hyponyms
380
+ - member_holonyms, substance_holonyms, part_holonyms
381
+ - member_meronyms, substance_meronyms, part_meronyms
382
+ - attributes
383
+ - entailments
384
+ - causes
385
+ - also_sees
386
+ - verb_groups
387
+ - similar_tos
388
+
389
+ Additionally, Synsets support the following methods specific to the
390
+ hypernym relation:
391
+
392
+ - root_hypernyms
393
+ - common_hypernyms
394
+ - lowest_common_hypernyms
395
+
396
+ Note that Synsets do not support the following relations because
397
+ these are defined by WordNet as lexical relations:
398
+
399
+ - antonyms
400
+ - derivationally_related_forms
401
+ - pertainyms
402
+ """
403
+
404
    # Restrict per-instance storage for the attributes cached on each Synset.
    # NOTE(review): ``_wordnet_corpus_reader`` and ``_all_hypernyms`` are
    # assigned in __init__ but are not listed here -- presumably a base class
    # without __slots__ supplies a __dict__ for them; confirm against the
    # class hierarchy.
    __slots__ = [
        "_pos",
        "_offset",
        "_name",
        "_frame_ids",
        "_lemmas",
        "_lemma_names",
        "_definition",
        "_examples",
        "_lexname",
        "_pointers",
        "_lemma_pointers",
        "_max_depth",
        "_min_depth",
    ]
419
+
420
    def __init__(self, wordnet_corpus_reader):
        """Create an empty Synset bound to ``wordnet_corpus_reader``.

        The attributes below are placeholders; they are populated by
        ``WordNetCorpusReader._synset_from_pos_and_line()``.
        """
        self._wordnet_corpus_reader = wordnet_corpus_reader
        # All of these attributes get initialized by
        # WordNetCorpusReader._synset_from_pos_and_line()

        self._pos = None
        self._offset = None
        self._name = None
        self._frame_ids = []
        self._lemmas = []
        self._lemma_names = []
        self._definition = None
        self._examples = []
        self._lexname = None  # lexicographer name
        self._all_hypernyms = None  # lazy cache used by common_hypernyms()

        # relation symbol -> set of (pos, offset) targets
        self._pointers = defaultdict(set)
        # relation symbol -> list of lemma-level pointer tuples
        self._lemma_pointers = defaultdict(list)
438
+
439
    def pos(self):
        """Return the synset's part of speech (one of ADJ, ADJ_SAT, ADV, NOUN, VERB)."""
        return self._pos
441
+
442
    def offset(self):
        """Return the offset of this synset in the WordNet dict file."""
        return self._offset
444
+
445
    def name(self):
        """Return the canonical name of this synset (formed from its first lemma)."""
        return self._name
447
+
448
    def frame_ids(self):
        """Return the verb-frame ids associated with this synset."""
        return self._frame_ids
450
+
451
+ def _doc(self, doc_type, default, lang="eng"):
452
+ """Helper method for Synset.definition and Synset.examples"""
453
+ corpus = self._wordnet_corpus_reader
454
+ if lang not in corpus.langs():
455
+ return None
456
+ elif lang == "eng":
457
+ return default
458
+ else:
459
+ corpus._load_lang_data(lang)
460
+ of = corpus.ss2of(self)
461
+ i = corpus.lg_attrs.index(doc_type)
462
+ if of in corpus._lang_data[lang][i]:
463
+ return corpus._lang_data[lang][i][of]
464
+ else:
465
+ return None
466
+
467
    def definition(self, lang="eng"):
        """Return the definition of this synset in the specified language,
        or None when unavailable."""
        return self._doc("def", self._definition, lang=lang)
470
+
471
    def examples(self, lang="eng"):
        """Return the example strings for this synset in the specified
        language, or None when unavailable."""
        return self._doc("exe", self._examples, lang=lang)
474
+
475
    def lexname(self):
        """Return the name of the lexicographer file containing this synset."""
        return self._lexname
477
+
478
+ def _needs_root(self):
479
+ if self._pos == NOUN and self._wordnet_corpus_reader.get_version() != "1.6":
480
+ return False
481
+ else:
482
+ return True
483
+
484
+ def lemma_names(self, lang="eng"):
485
+ """Return all the lemma_names associated with the synset"""
486
+ if lang == "eng":
487
+ return self._lemma_names
488
+ else:
489
+ reader = self._wordnet_corpus_reader
490
+ reader._load_lang_data(lang)
491
+ i = reader.ss2of(self)
492
+ if i in reader._lang_data[lang][0]:
493
+ return reader._lang_data[lang][0][i]
494
+ else:
495
+ return []
496
+
497
+ def lemmas(self, lang="eng"):
498
+ """Return all the lemma objects associated with the synset"""
499
+ if lang == "eng":
500
+ return self._lemmas
501
+ elif self._name:
502
+ self._wordnet_corpus_reader._load_lang_data(lang)
503
+ lemmark = []
504
+ lemmy = self.lemma_names(lang)
505
+ for lem in lemmy:
506
+ temp = Lemma(
507
+ self._wordnet_corpus_reader,
508
+ self,
509
+ lem,
510
+ self._wordnet_corpus_reader._lexnames.index(self.lexname()),
511
+ 0,
512
+ None,
513
+ )
514
+ temp._lang = lang
515
+ lemmark.append(temp)
516
+ return lemmark
517
+
518
+ def root_hypernyms(self):
519
+ """Get the topmost hypernyms of this synset in WordNet."""
520
+
521
+ result = []
522
+ seen = set()
523
+ todo = [self]
524
+ while todo:
525
+ next_synset = todo.pop()
526
+ if next_synset not in seen:
527
+ seen.add(next_synset)
528
+ next_hypernyms = (
529
+ next_synset.hypernyms() + next_synset.instance_hypernyms()
530
+ )
531
+ if not next_hypernyms:
532
+ result.append(next_synset)
533
+ else:
534
+ todo.extend(next_hypernyms)
535
+ return result
536
+
537
+ # Simpler implementation which makes incorrect assumption that
538
+ # hypernym hierarchy is acyclic:
539
+ #
540
+ # if not self.hypernyms():
541
+ # return [self]
542
+ # else:
543
+ # return list(set(root for h in self.hypernyms()
544
+ # for root in h.root_hypernyms()))
545
    def max_depth(self):
        """
        :return: The length of the longest hypernym path from this
            synset to the root.
        """

        # Memoization check against the instance __dict__.
        # NOTE(review): "_max_depth" is declared in __slots__, so assignments
        # go to the slot descriptor and presumably never appear in __dict__;
        # if so this cache check never hits and the value is recomputed on
        # every call -- confirm against the class hierarchy.
        if "_max_depth" not in self.__dict__:
            hypernyms = self.hypernyms() + self.instance_hypernyms()
            if not hypernyms:
                self._max_depth = 0
            else:
                self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
        return self._max_depth
558
+
559
    def min_depth(self):
        """
        :return: The length of the shortest hypernym path from this
            synset to the root.
        """

        # Memoization check against the instance __dict__; same caveat as in
        # max_depth(): "_min_depth" is slotted, so this check presumably
        # always recomputes -- confirm.
        if "_min_depth" not in self.__dict__:
            hypernyms = self.hypernyms() + self.instance_hypernyms()
            if not hypernyms:
                self._min_depth = 0
            else:
                self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
        return self._min_depth
572
+
573
    def closure(self, rel, depth=-1):
        """
        Return the transitive closure of source under the rel
        relationship, breadth-first, discarding cycles:

        >>> from nltk.corpus import wordnet as wn
        >>> computer = wn.synset('computer.n.01')
        >>> topic = lambda s:s.topic_domains()
        >>> print(list(computer.closure(topic)))
        [Synset('computer_science.n.01')]

        UserWarning: Discarded redundant search for Synset('computer.n.01') at depth 2


        Include redundant paths (but only once), avoiding duplicate searches
        (from 'animal.n.01' to 'entity.n.01'):

        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s:s.hypernyms()
        >>> print(list(dog.closure(hyp)))
        [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'),\
 Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'),\
 Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'),\
 Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),\
 Synset('physical_entity.n.01'), Synset('entity.n.01')]

        UserWarning: Discarded redundant search for Synset('animal.n.01') at depth 7

        :param rel: a callable mapping a Synset to its related Synsets,
            e.g. ``lambda s: s.hypernyms()``.
        :param depth: maximum traversal depth; -1 means unlimited.
        """

        # Local import keeps module load light; the traversal itself lives
        # in nltk.util.
        from nltk.util import acyclic_breadth_first

        # Yield every reachable synset except the starting point itself.
        for synset in acyclic_breadth_first(self, rel, depth):
            if synset != self:
                yield synset
607
+
608
+ from nltk.util import acyclic_depth_first as acyclic_tree
609
+ from nltk.util import unweighted_minimum_spanning_tree as mst
610
+
611
+ # Also add this shortcut?
612
+ # from nltk.util import unweighted_minimum_spanning_digraph as umsd
613
+
614
    def tree(self, rel, depth=-1, cut_mark=None):
        """
        Return the full relation tree, including self,
        discarding cycles:

        >>> from nltk.corpus import wordnet as wn
        >>> from pprint import pprint
        >>> computer = wn.synset('computer.n.01')
        >>> topic = lambda s:s.topic_domains()
        >>> pprint(computer.tree(topic))
        [Synset('computer.n.01'), [Synset('computer_science.n.01')]]

        UserWarning: Discarded redundant search for Synset('computer.n.01') at depth -3


        But keep duplicate branches (from 'animal.n.01' to 'entity.n.01'):

        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s:s.hypernyms()
        >>> pprint(dog.tree(hyp))
        [Synset('dog.n.01'),
         [Synset('canine.n.02'),
          [Synset('carnivore.n.01'),
           [Synset('placental.n.01'),
            [Synset('mammal.n.01'),
             [Synset('vertebrate.n.01'),
              [Synset('chordate.n.01'),
               [Synset('animal.n.01'),
                [Synset('organism.n.01'),
                 [Synset('living_thing.n.01'),
                  [Synset('whole.n.02'),
                   [Synset('object.n.01'),
                    [Synset('physical_entity.n.01'),
                     [Synset('entity.n.01')]]]]]]]]]]]]],
         [Synset('domestic_animal.n.01'),
          [Synset('animal.n.01'),
           [Synset('organism.n.01'),
            [Synset('living_thing.n.01'),
             [Synset('whole.n.02'),
              [Synset('object.n.01'),
               [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]

        :param rel: a callable mapping a Synset to its related Synsets.
        :param depth: maximum tree depth; -1 means unlimited.
        :param cut_mark: object inserted to mark where a branch was cut off.
        """

        # Local import keeps module load light.
        from nltk.util import acyclic_branches_depth_first

        return acyclic_branches_depth_first(self, rel, depth, cut_mark)
660
+
661
+ def hypernym_paths(self):
662
+ """
663
+ Get the path(s) from this synset to the root, where each path is a
664
+ list of the synset nodes traversed on the way to the root.
665
+
666
+ :return: A list of lists, where each list gives the node sequence
667
+ connecting the initial ``Synset`` node and a root node.
668
+ """
669
+ paths = []
670
+
671
+ hypernyms = self.hypernyms() + self.instance_hypernyms()
672
+ if len(hypernyms) == 0:
673
+ paths = [[self]]
674
+
675
+ for hypernym in hypernyms:
676
+ for ancestor_list in hypernym.hypernym_paths():
677
+ ancestor_list.append(self)
678
+ paths.append(ancestor_list)
679
+ return paths
680
+
681
+ def common_hypernyms(self, other):
682
+ """
683
+ Find all synsets that are hypernyms of this synset and the
684
+ other synset.
685
+
686
+ :type other: Synset
687
+ :param other: other input synset.
688
+ :return: The synsets that are hypernyms of both synsets.
689
+ """
690
+ if not self._all_hypernyms:
691
+ self._all_hypernyms = {
692
+ self_synset
693
+ for self_synsets in self._iter_hypernym_lists()
694
+ for self_synset in self_synsets
695
+ }
696
+ if not other._all_hypernyms:
697
+ other._all_hypernyms = {
698
+ other_synset
699
+ for other_synsets in other._iter_hypernym_lists()
700
+ for other_synset in other_synsets
701
+ }
702
+ return list(self._all_hypernyms.intersection(other._all_hypernyms))
703
+
704
    def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
        """
        Get a list of lowest synset(s) that both synsets have as a hypernym.
        When `use_min_depth == False` this means that the synset which appears
        as a hypernym of both `self` and `other` with the lowest maximum depth
        is returned or if there are multiple such synsets at the same depth
        they are all returned

        However, if `use_min_depth == True` then the synset(s) which has/have
        the lowest minimum depth and appear(s) in both paths is/are returned.

        By setting the use_min_depth flag to True, the behavior of NLTK2 can be
        preserved. This was changed in NLTK3 to give more accurate results in a
        small set of cases, generally with synsets concerning people. (eg:
        'chef.n.01', 'fireman.n.01', etc.)

        This method is an implementation of Ted Pedersen's "Lowest Common
        Subsumer" method from the Perl Wordnet module. It can return either
        "self" or "other" if they are a hypernym of the other.

        :type other: Synset
        :param other: other input synset
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (False by default)
            creates a fake root that connects all the taxonomies. Set it
            to True to enable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will need to be added
            for nouns as well.
        :type use_min_depth: bool
        :param use_min_depth: This setting mimics older (v2) behavior of NLTK
            wordnet If True, will use the min_depth function to calculate the
            lowest common hypernyms. This is known to give strange results for
            some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
            for backwards compatibility
        :return: The synsets that are the lowest common hypernyms of both
            synsets
        """
        synsets = self.common_hypernyms(other)
        if simulate_root:
            # A virtual root that joins the otherwise disconnected
            # taxonomies; its hypernym accessors are stubbed out so the
            # depth computations below terminate at it.
            fake_synset = Synset(None)
            fake_synset._name = "*ROOT*"
            fake_synset.hypernyms = lambda: []
            fake_synset.instance_hypernyms = lambda: []
            synsets.append(fake_synset)

        try:
            # Depth of the deepest common hypernym; max() raises ValueError
            # when there are no common hypernyms at all.
            if use_min_depth:
                max_depth = max(s.min_depth() for s in synsets)
                unsorted_lch = [s for s in synsets if s.min_depth() == max_depth]
            else:
                max_depth = max(s.max_depth() for s in synsets)
                unsorted_lch = [s for s in synsets if s.max_depth() == max_depth]
            return sorted(unsorted_lch)
        except ValueError:
            return []
762
+
763
+ def hypernym_distances(self, distance=0, simulate_root=False):
764
+ """
765
+ Get the path(s) from this synset to the root, counting the distance
766
+ of each node from the initial node on the way. A set of
767
+ (synset, distance) tuples is returned.
768
+
769
+ :type distance: int
770
+ :param distance: the distance (number of edges) from this hypernym to
771
+ the original hypernym ``Synset`` on which this method was called.
772
+ :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
773
+ a hypernym of the first ``Synset``.
774
+ """
775
+ distances = {(self, distance)}
776
+ for hypernym in self._hypernyms() + self._instance_hypernyms():
777
+ distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
778
+ if simulate_root:
779
+ fake_synset = Synset(None)
780
+ fake_synset._name = "*ROOT*"
781
+ fake_synset_distance = max(distances, key=itemgetter(1))[1]
782
+ distances.add((fake_synset, fake_synset_distance + 1))
783
+ return distances
784
+
785
+ def _shortest_hypernym_paths(self, simulate_root):
786
+ if self._name == "*ROOT*":
787
+ return {self: 0}
788
+
789
+ queue = deque([(self, 0)])
790
+ path = {}
791
+
792
+ while queue:
793
+ s, depth = queue.popleft()
794
+ if s in path:
795
+ continue
796
+ path[s] = depth
797
+
798
+ depth += 1
799
+ queue.extend((hyp, depth) for hyp in s._hypernyms())
800
+ queue.extend((hyp, depth) for hyp in s._instance_hypernyms())
801
+
802
+ if simulate_root:
803
+ fake_synset = Synset(None)
804
+ fake_synset._name = "*ROOT*"
805
+ path[fake_synset] = max(path.values()) + 1
806
+
807
+ return path
808
+
809
+ def shortest_path_distance(self, other, simulate_root=False):
810
+ """
811
+ Returns the distance of the shortest path linking the two synsets (if
812
+ one exists). For each synset, all the ancestor nodes and their
813
+ distances are recorded and compared. The ancestor node common to both
814
+ synsets that can be reached with the minimum number of traversals is
815
+ used. If no ancestor nodes are common, None is returned. If a node is
816
+ compared with itself 0 is returned.
817
+
818
+ :type other: Synset
819
+ :param other: The Synset to which the shortest path will be found.
820
+ :return: The number of edges in the shortest path connecting the two
821
+ nodes, or None if no path exists.
822
+ """
823
+
824
+ if self == other:
825
+ return 0
826
+
827
+ dist_dict1 = self._shortest_hypernym_paths(simulate_root)
828
+ dist_dict2 = other._shortest_hypernym_paths(simulate_root)
829
+
830
+ # For each ancestor synset common to both subject synsets, find the
831
+ # connecting path length. Return the shortest of these.
832
+
833
+ inf = float("inf")
834
+ path_distance = inf
835
+ for synset, d1 in dist_dict1.items():
836
+ d2 = dist_dict2.get(synset, inf)
837
+ path_distance = min(path_distance, d1 + d2)
838
+
839
+ return None if math.isinf(path_distance) else path_distance
840
+
841
+ # interface to similarity methods
842
+ def path_similarity(self, other, verbose=False, simulate_root=True):
843
+ """
844
+ Path Distance Similarity:
845
+ Return a score denoting how similar two word senses are, based on the
846
+ shortest path that connects the senses in the is-a (hypernym/hypnoym)
847
+ taxonomy. The score is in the range 0 to 1, except in those cases where
848
+ a path cannot be found (will only be true for verbs as there are many
849
+ distinct verb taxonomies), in which case None is returned. A score of
850
+ 1 represents identity i.e. comparing a sense with itself will return 1.
851
+
852
+ :type other: Synset
853
+ :param other: The ``Synset`` that this ``Synset`` is being compared to.
854
+ :type simulate_root: bool
855
+ :param simulate_root: The various verb taxonomies do not
856
+ share a single root which disallows this metric from working for
857
+ synsets that are not connected. This flag (True by default)
858
+ creates a fake root that connects all the taxonomies. Set it
859
+ to false to disable this behavior. For the noun taxonomy,
860
+ there is usually a default root except for WordNet version 1.6.
861
+ If you are using wordnet 1.6, a fake root will be added for nouns
862
+ as well.
863
+ :return: A score denoting the similarity of the two ``Synset`` objects,
864
+ normally between 0 and 1. None is returned if no connecting path
865
+ could be found. 1 is returned if a ``Synset`` is compared with
866
+ itself.
867
+ """
868
+
869
+ distance = self.shortest_path_distance(
870
+ other,
871
+ simulate_root=simulate_root and (self._needs_root() or other._needs_root()),
872
+ )
873
+ if distance is None or distance < 0:
874
+ return None
875
+ return 1.0 / (distance + 1)
876
+
877
    def lch_similarity(self, other, verbose=False, simulate_root=True):
        """
        Leacock Chodorow Similarity:
        Return a score denoting how similar two word senses are, based on the
        shortest path that connects the senses (as above) and the maximum depth
        of the taxonomy in which the senses occur. The relationship is given as
        -log(p/2d) where p is the shortest path length and d is the taxonomy
        depth.

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A score denoting the similarity of the two ``Synset`` objects,
            normally greater than 0. None is returned if no connecting path
            could be found. If a ``Synset`` is compared with itself, the
            maximum score is returned, which varies depending on the taxonomy
            depth.
        :raises WordNetError: if the two synsets have different parts of
            speech (the metric is only defined within one taxonomy).
        """

        if self._pos != other._pos:
            raise WordNetError(
                "Computing the lch similarity requires "
                "%s and %s to have the same part of speech." % (self, other)
            )

        need_root = self._needs_root()

        # Lazily compute and cache the taxonomy depth for this POS on the
        # reader; this is the "d" in -log(p / 2d).
        if self._pos not in self._wordnet_corpus_reader._max_depth:
            self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root)

        depth = self._wordnet_corpus_reader._max_depth[self._pos]

        distance = self.shortest_path_distance(
            other, simulate_root=simulate_root and need_root
        )

        # depth == 0 would make the ratio below degenerate, so bail out.
        if distance is None or distance < 0 or depth == 0:
            return None
        return -math.log((distance + 1) / (2.0 * depth))
924
+
925
    def wup_similarity(self, other, verbose=False, simulate_root=True):
        """
        Wu-Palmer Similarity:
        Return a score denoting how similar two word senses are, based on the
        depth of the two senses in the taxonomy and that of their Least Common
        Subsumer (most specific ancestor node). Previously, the scores computed
        by this implementation did _not_ always agree with those given by
        Pedersen's Perl implementation of WordNet Similarity. However, with
        the addition of the simulate_root flag (see below), the score for
        verbs now almost always agree but not always for nouns.

        The LCS does not necessarily feature in the shortest path connecting
        the two senses, as it is by definition the common ancestor deepest in
        the taxonomy, not closest to the two senses. Typically, however, it
        will so feature. Where multiple candidates for the LCS exist, that
        whose shortest path to the root node is the longest will be selected.
        Where the LCS has multiple paths to the root, the longer path is used
        for the purposes of the calculation.

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A float score denoting the similarity of the two ``Synset``
            objects, normally greater than zero. If no connecting path between
            the two senses can be found, None is returned.

        """
        need_root = self._needs_root() or other._needs_root()

        # Note that to preserve behavior from NLTK2 we set use_min_depth=True
        # It is possible that more accurate results could be obtained by
        # removing this setting and it should be tested later on
        subsumers = self.lowest_common_hypernyms(
            other, simulate_root=simulate_root and need_root, use_min_depth=True
        )

        # If no LCS was found return None
        if len(subsumers) == 0:
            return None

        # Prefer self as the subsumer when it is one of the candidates.
        subsumer = self if self in subsumers else subsumers[0]

        # Get the longest path from the LCS to the root,
        # including a correction:
        # - add one because the calculations include both the start and end
        #   nodes
        depth = subsumer.max_depth() + 1

        # Note: No need for an additional add-one correction for non-nouns
        # to account for an imaginary root node because that is now
        # automatically handled by simulate_root
        # if subsumer._pos != NOUN:
        #     depth += 1

        # Get the shortest path from the LCS to each of the synsets it is
        # subsuming.  Add this to the LCS path length to get the path
        # length from each synset to the root.
        len1 = self.shortest_path_distance(
            subsumer, simulate_root=simulate_root and need_root
        )
        len2 = other.shortest_path_distance(
            subsumer, simulate_root=simulate_root and need_root
        )
        if len1 is None or len2 is None:
            return None
        len1 += depth
        len2 += depth
        return (2.0 * depth) / (len1 + len2)
1001
+
1002
+ def res_similarity(self, other, ic, verbose=False):
1003
+ """
1004
+ Resnik Similarity:
1005
+ Return a score denoting how similar two word senses are, based on the
1006
+ Information Content (IC) of the Least Common Subsumer (most specific
1007
+ ancestor node).
1008
+
1009
+ :type other: Synset
1010
+ :param other: The ``Synset`` that this ``Synset`` is being compared to.
1011
+ :type ic: dict
1012
+ :param ic: an information content object (as returned by
1013
+ ``nltk.corpus.wordnet_ic.ic()``).
1014
+ :return: A float score denoting the similarity of the two ``Synset``
1015
+ objects. Synsets whose LCS is the root node of the taxonomy will
1016
+ have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
1017
+ """
1018
+
1019
+ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
1020
+ return lcs_ic
1021
+
1022
+ def jcn_similarity(self, other, ic, verbose=False):
1023
+ """
1024
+ Jiang-Conrath Similarity:
1025
+ Return a score denoting how similar two word senses are, based on the
1026
+ Information Content (IC) of the Least Common Subsumer (most specific
1027
+ ancestor node) and that of the two input Synsets. The relationship is
1028
+ given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
1029
+
1030
+ :type other: Synset
1031
+ :param other: The ``Synset`` that this ``Synset`` is being compared to.
1032
+ :type ic: dict
1033
+ :param ic: an information content object (as returned by
1034
+ ``nltk.corpus.wordnet_ic.ic()``).
1035
+ :return: A float score denoting the similarity of the two ``Synset``
1036
+ objects.
1037
+ """
1038
+
1039
+ if self == other:
1040
+ return _INF
1041
+
1042
+ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
1043
+
1044
+ # If either of the input synsets are the root synset, or have a
1045
+ # frequency of 0 (sparse data problem), return 0.
1046
+ if ic1 == 0 or ic2 == 0:
1047
+ return 0
1048
+
1049
+ ic_difference = ic1 + ic2 - 2 * lcs_ic
1050
+
1051
+ if ic_difference == 0:
1052
+ return _INF
1053
+
1054
+ return 1 / ic_difference
1055
+
1056
+ def lin_similarity(self, other, ic, verbose=False):
1057
+ """
1058
+ Lin Similarity:
1059
+ Return a score denoting how similar two word senses are, based on the
1060
+ Information Content (IC) of the Least Common Subsumer (most specific
1061
+ ancestor node) and that of the two input Synsets. The relationship is
1062
+ given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
1063
+
1064
+ :type other: Synset
1065
+ :param other: The ``Synset`` that this ``Synset`` is being compared to.
1066
+ :type ic: dict
1067
+ :param ic: an information content object (as returned by
1068
+ ``nltk.corpus.wordnet_ic.ic()``).
1069
+ :return: A float score denoting the similarity of the two ``Synset``
1070
+ objects, in the range 0 to 1.
1071
+ """
1072
+
1073
+ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
1074
+ return (2.0 * lcs_ic) / (ic1 + ic2)
1075
+
1076
+ def _iter_hypernym_lists(self):
1077
+ """
1078
+ :return: An iterator over ``Synset`` objects that are either proper
1079
+ hypernyms or instance of hypernyms of the synset.
1080
+ """
1081
+ todo = [self]
1082
+ seen = set()
1083
+ while todo:
1084
+ for synset in todo:
1085
+ seen.add(synset)
1086
+ yield todo
1087
+ todo = [
1088
+ hypernym
1089
+ for synset in todo
1090
+ for hypernym in (synset.hypernyms() + synset.instance_hypernyms())
1091
+ if hypernym not in seen
1092
+ ]
1093
+
1094
    def __repr__(self):
        # e.g. Synset('dog.n.01')
        return f"{type(self).__name__}('{self._name}')"
1096
+
1097
+ def _related(self, relation_symbol, sort=True):
1098
+ get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
1099
+ if relation_symbol not in self._pointers:
1100
+ return []
1101
+ pointer_tuples = self._pointers[relation_symbol]
1102
+ r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
1103
+ if sort:
1104
+ r.sort()
1105
+ return r
1106
+
1107
+
1108
+ ######################################################################
1109
+ # WordNet Corpus Reader
1110
+ ######################################################################
1111
+
1112
+
1113
class WordNetCorpusReader(CorpusReader):
    """
    A corpus reader used to access wordnet or its variants.
    """

    # Encoding used to read every data file.
    _ENCODING = "utf8"

    # { Part-of-speech constants
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    # }

    # { Filename constants
    # NOTE(review): ADJ_SAT has no entry here -- presumably satellite
    # adjectives are stored in the "adj" files; confirm before relying on it.
    _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
    # }

    # { Part of speech constants
    # Numeric POS codes as used in sense keys, and the reverse mapping.
    _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
    _pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
    # }

    #: A list of file identifiers for all the fileids used by this
    #: corpus reader.
    _FILES = (
        "cntlist.rev",
        "lexnames",
        "index.sense",
        "index.adj",
        "index.adv",
        "index.noun",
        "index.verb",
        "data.adj",
        "data.adv",
        "data.noun",
        "data.verb",
        "adj.exc",
        "adv.exc",
        "noun.exc",
        "verb.exc",
    )
1152
+
1153
    def __init__(self, root, omw_reader):
        """
        Construct a new wordnet corpus reader, with the given root
        directory.

        :param root: path (or path pointer) of the directory holding the
            WordNet data files listed in ``_FILES``.
        :param omw_reader: corpus reader for the Open Multilingual Wordnet
            data, or None to disable the multilingual functions.
        """

        super().__init__(root, self._FILES, encoding=self._ENCODING)

        # An index that provides the file offset
        # Map from lemma -> pos -> synset_index -> offset
        self._lemma_pos_offset_map = defaultdict(dict)

        # A cache so we don't have to reconstruct synsets
        # Map from pos -> offset -> synset
        self._synset_offset_cache = defaultdict(dict)

        # A lookup for the maximum depth of each part of speech.  Useful for
        # the lch similarity metric.
        self._max_depth = defaultdict(dict)

        # Corpus reader containing omw data.
        self._omw_reader = omw_reader

        # Corpus reader containing extended_omw data.
        self._exomw_reader = None

        # Map language id -> provenance (sub-directory) of its data;
        # English is built in, so its provenance is empty.
        self.provenances = defaultdict(str)
        self.provenances["eng"] = ""

        if self._omw_reader is None:
            warnings.warn(
                "The multilingual functions are not available with this Wordnet version"
            )

        # Languages registered from OMW proper (filled by add_omw()).
        self.omw_langs = set()

        # A cache to store the wordnet data of multiple languages
        self._lang_data = defaultdict(list)

        self._data_file_map = {}
        self._exception_map = {}
        self._lexnames = []
        self._key_count_file = None
        self._key_synset_file = None

        # Load the lexnames; each line is "<index> <lexname> <pos>", and the
        # assert checks the file is densely numbered from 0.
        with self.open("lexnames") as fp:
            for i, line in enumerate(fp):
                index, lexname, _ = line.split()
                assert int(index) == i
                self._lexnames.append(lexname)

        # Load the indices for lemmas and synset offsets
        self._load_lemma_pos_offset_map()

        # load the exception file data into memory
        self._load_exception_map()

        # Diagnostics filled by map_to_one(): synsets with no mapping, and
        # synsets whose mapping was ambiguous.
        self.nomap = []
        self.splits = {}

        # map from WordNet 3.0 for OMW data
        self.map30 = self.map_wn30()

        # Language data attributes
        self.lg_attrs = ["lemma", "none", "def", "exe"]
1219
+
1220
+ def index_sense(self, version=None):
1221
+ """Read sense key to synset id mapping from index.sense file in corpus directory"""
1222
+ fn = "index.sense"
1223
+ if version:
1224
+ from nltk.corpus import CorpusReader, LazyCorpusLoader
1225
+
1226
+ ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn)
1227
+ else:
1228
+ ixreader = self
1229
+ with ixreader.open(fn) as fp:
1230
+ sensekey_map = {}
1231
+ for line in fp:
1232
+ fields = line.strip().split()
1233
+ sensekey = fields[0]
1234
+ pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])]
1235
+ sensekey_map[sensekey] = f"{fields[1]}-{pos}"
1236
+ return sensekey_map
1237
+
1238
+ def map_to_many(self):
1239
+ sensekey_map1 = self.index_sense("wordnet")
1240
+ sensekey_map2 = self.index_sense()
1241
+ synset_to_many = {}
1242
+ for synsetid in set(sensekey_map1.values()):
1243
+ synset_to_many[synsetid] = []
1244
+ for sensekey in set(sensekey_map1.keys()).intersection(
1245
+ set(sensekey_map2.keys())
1246
+ ):
1247
+ source = sensekey_map1[sensekey]
1248
+ target = sensekey_map2[sensekey]
1249
+ synset_to_many[source].append(target)
1250
+ return synset_to_many
1251
+
1252
+ def map_to_one(self):
1253
+ synset_to_many = self.map_to_many()
1254
+ synset_to_one = {}
1255
+ for source in synset_to_many:
1256
+ candidates_bag = synset_to_many[source]
1257
+ if candidates_bag:
1258
+ candidates_set = set(candidates_bag)
1259
+ if len(candidates_set) == 1:
1260
+ target = candidates_bag[0]
1261
+ else:
1262
+ counts = []
1263
+ for candidate in candidates_set:
1264
+ counts.append((candidates_bag.count(candidate), candidate))
1265
+ self.splits[source] = counts
1266
+ target = max(counts)[1]
1267
+ synset_to_one[source] = target
1268
+ if source[-1] == "s":
1269
+ # Add a mapping from "a" to target for applications like omw,
1270
+ # where only Lithuanian and Slovak use the "s" ss_type.
1271
+ synset_to_one[f"{source[:-1]}a"] = target
1272
+ else:
1273
+ self.nomap.append(source)
1274
+ return synset_to_one
1275
+
1276
+ def map_wn30(self):
1277
+ """Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
1278
+ if self.get_version() == "3.0":
1279
+ return None
1280
+ else:
1281
+ return self.map_to_one()
1282
+
1283
+ # Open Multilingual WordNet functions, contributed by
1284
+ # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
1285
+
1286
    def of2ss(self, of):
        """Return the synset for an offset-pos id such as "01234567-n"
        (first 8 chars are the zero-padded offset, last char the POS)."""
        return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
1289
+
1290
def ss2of(self, ss):
    """Return the OMW id ('<8-digit offset>-<pos>') of synset ``ss``,
    or None when ``ss`` is falsy."""
    if not ss:
        return None
    return f"{ss.offset():08d}-{ss.pos()}"
1294
+
1295
def _load_lang_data(self, lang):
    """load the wordnet data of the requested language from the file to
    the cache, _lang_data"""

    # Already loaded: nothing to do.
    if lang in self._lang_data:
        return

    # Lazily register OMW languages on first use.
    if self._omw_reader and not self.omw_langs:
        self.add_omw()

    if lang not in self.langs():
        raise WordNetError("Language is not supported.")

    # Languages not shipped by OMW come from the Extended OMW reader.
    if self._exomw_reader and lang not in self.omw_langs:
        reader = self._exomw_reader
    else:
        reader = self._omw_reader

    # Tab files are named wn-<kind>-<lang>.tab; cldr/wikt provenances
    # use their provenance name as <kind>, everything else uses "data".
    prov = self.provenances[lang]
    if prov in ["cldr", "wikt"]:
        prov2 = prov
    else:
        prov2 = "data"

    with reader.open(f"{prov}/wn-{prov2}-{lang.split('_')[0]}.tab") as fp:
        self.custom_lemmas(fp, lang)
    self.disable_custom_lemmas(lang)
1322
+
1323
def add_provs(self, reader):
    """Add languages from Multilingual Wordnet to the provenance dictionary"""
    for fileid in reader.fileids():
        prov, langfile = os.path.split(fileid)
        stem, extension = os.path.splitext(langfile)
        if extension != ".tab":
            continue
        lang = stem.split("-")[-1]
        if lang in self.provenances or prov in ["cldr", "wikt"]:
            # Another resource already claims this lang id, so
            # disambiguate it with the provenance suffix.
            lang = f"{lang}_{prov}"
        self.provenances[lang] = prov
1336
+
1337
def add_omw(self):
    """Register every OMW language, remembering which ids came from OMW
    so later additions (e.g. Extended OMW) can be told apart."""
    self.add_provs(self._omw_reader)
    self.omw_langs = set(self.provenances)
1340
+
1341
def add_exomw(self):
    """
    Add languages from Extended OMW

    >>> import nltk
    >>> from nltk.corpus import wordnet as wn
    >>> wn.add_exomw()
    >>> print(wn.synset('intrinsically.r.01').lemmas(lang="eng_wikt"))
    [Lemma('intrinsically.r.01.per_se'), Lemma('intrinsically.r.01.as_such')]
    """
    # Imported here (not at module level) to avoid a circular import
    # through nltk.corpus at load time.
    from nltk.corpus import extended_omw

    # Plain OMW must be registered first so its language ids win the
    # unsuffixed names (see add_provs disambiguation).
    self.add_omw()
    self._exomw_reader = extended_omw
    self.add_provs(self._exomw_reader)
1356
+
1357
def langs(self):
    """Return the list of language ids supported by Multilingual Wordnet."""
    return list(self.provenances)
1360
+
1361
def _load_lemma_pos_offset_map(self):
    """Parse each index.<pos> file and populate
    self._lemma_pos_offset_map[lemma][pos] with the synset offsets
    listed for that lemma."""
    for suffix in self._FILEMAP.values():

        # parse each line of the file (ignoring comment lines)
        with self.open("index.%s" % suffix) as fp:
            for i, line in enumerate(fp):
                # comment lines in the index files start with a space
                if line.startswith(" "):
                    continue

                _iter = iter(line.split())

                def _next_token():
                    return next(_iter)

                try:

                    # get the lemma and part-of-speech
                    lemma = _next_token()
                    pos = _next_token()

                    # get the number of synsets for this lemma
                    n_synsets = int(_next_token())
                    assert n_synsets > 0

                    # get and ignore the pointer symbols for all synsets of
                    # this lemma
                    n_pointers = int(_next_token())
                    [_next_token() for _ in range(n_pointers)]

                    # same as number of synsets
                    n_senses = int(_next_token())
                    assert n_synsets == n_senses

                    # get and ignore number of senses ranked according to
                    # frequency
                    _next_token()

                    # get synset offsets
                    synset_offsets = [int(_next_token()) for _ in range(n_synsets)]

                # raise more informative error with file name and line number
                except (AssertionError, ValueError) as e:
                    tup = ("index.%s" % suffix), (i + 1), e
                    raise WordNetError("file %s, line %i: %s" % tup) from e

                # map lemmas and parts of speech to synsets
                self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                if pos == ADJ:
                    # satellite adjectives share the adjective index entries
                    self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
1410
+
1411
def _load_exception_map(self):
    """Read the morphological exception files (one <pos>.exc per POS)
    into self._exception_map: irregular form -> list of base forms."""
    for pos, suffix in self._FILEMAP.items():
        exc = self._exception_map[pos] = {}
        with self.open("%s.exc" % suffix) as fp:
            for line in fp:
                fields = line.split()
                exc[fields[0]] = fields[1:]
    # Satellite adjectives share the plain-adjective exceptions.
    self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
1420
+
1421
def _compute_max_depth(self, pos, simulate_root):
    """
    Compute the max depth for the given part of speech. This is
    used by the lch similarity metric.
    """
    depth = 0
    for ii in self.all_synsets(pos):
        try:
            depth = max(depth, ii.max_depth())
        except RuntimeError:
            # NOTE(review): prints the offending synset (likely a
            # hypernym cycle blowing the recursion limit) and carries
            # on; consider logging instead — kept as-is for behavior.
            print(ii)
    if simulate_root:
        # A fake common root adds one level above every real synset.
        depth += 1
    self._max_depth[pos] = depth
1435
+
1436
def get_version(self):
    """Return the WordNet version string parsed from the copyright
    banner of the adjective data file (None if no banner matches)."""
    fh = self._data_file(ADJ)
    fh.seek(0)
    for line in fh:
        match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line)
        if match:
            # Rewind so later readers of this shared handle start clean.
            fh.seek(0)
            return match.group(1)
1445
+
1446
+ #############################################################
1447
+ # Loading Lemmas
1448
+ #############################################################
1449
+
1450
def lemma(self, name, lang="eng"):
    """Return the Lemma object whose fully-qualified name matches ``name``."""
    # The lemma part may itself contain dots
    # (e.g. '.45_caliber.a.01..45_caliber'), so locate the sense-number
    # boundary with a regex instead of splitting on '.'.
    boundary = SENSENUM_RE.search(name).end()
    synset_name = name[: boundary - 1]
    lemma_name = name[boundary:]

    for candidate in self.synset(synset_name).lemmas(lang):
        if candidate._name == lemma_name:
            return candidate
    raise WordNetError(f"No lemma {lemma_name!r} in {synset_name!r}")
1463
+
1464
def lemma_from_key(self, key):
    """Return the Lemma object for a sense key such as 'dog%1:18:01::'.

    :param key: a WordNet sense key (see senseidx(5WN)).
    :raises WordNetError: if no synset or lemma matches the key.
    """
    # Keys are case sensitive and always lower-case
    key = key.lower()

    lemma_name, lex_sense = key.split("%")
    pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
    pos = self._pos_names[int(pos_number)]

    # open the key -> synset file if necessary
    if self._key_synset_file is None:
        self._key_synset_file = self.open("index.sense")

    # Find the synset for the lemma.
    synset_line = _binary_search_file(self._key_synset_file, key)
    if not synset_line:
        raise WordNetError("No synset found for key %r" % key)
    offset = int(synset_line.split()[1])
    synset = self.synset_from_pos_and_offset(pos, offset)
    # return the corresponding lemma
    for lemma in synset._lemmas:
        if lemma._key == key:
            return lemma
    # Fix: error message previously read "found for for key".
    raise WordNetError("No lemma found for key %r" % key)
1487
+
1488
+ #############################################################
1489
+ # Loading Synsets
1490
+ #############################################################
1491
def synset(self, name):
    """Return the Synset for a dotted name like 'dog.n.01'
    (lemma.pos.1-based-sense-number).

    :raises WordNetError: if the lemma/pos pair is unknown or the
        sense number is out of range.
    """
    # split name into lemma, part of speech and synset number
    lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
    synset_index = int(synset_index_str) - 1

    # get the offset for this synset
    try:
        offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
    except KeyError as e:
        raise WordNetError(f"No lemma {lemma!r} with part of speech {pos!r}") from e
    except IndexError as e:
        n_senses = len(self._lemma_pos_offset_map[lemma][pos])
        raise WordNetError(
            f"Lemma {lemma!r} with part of speech {pos!r} only "
            f"has {n_senses} {'sense' if n_senses == 1 else 'senses'}"
        ) from e

    # load synset information from the appropriate file
    synset = self.synset_from_pos_and_offset(pos, offset)

    # some basic sanity checks on loaded attributes
    if pos == "s" and synset._pos == "a":
        message = (
            "Adjective satellite requested but only plain "
            "adjective found for lemma %r"
        )
        raise WordNetError(message % lemma)
    # A plain-adjective request may legitimately resolve to a satellite.
    assert synset._pos == pos or (pos == "a" and synset._pos == "s")

    # Return the synset object.
    return synset
1522
+
1523
def _data_file(self, pos):
    """Return a (cached) open file handle for the data file of ``pos``."""
    # Adjective satellites live in the plain-adjective data file.
    key = ADJ if pos == ADJ_SAT else pos
    handle = self._data_file_map.get(key)
    if handle is None:
        handle = self.open("data.%s" % self._FILEMAP[key])
        self._data_file_map[key] = handle
    return handle
1534
+
1535
def synset_from_pos_and_offset(self, pos, offset):
    """
    - pos: The synset's part of speech, matching one of the module level
      attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v').
    - offset: The byte offset of this synset in the WordNet dict file
      for this pos.

    Returns None (with a warning) if no synset starts at that offset.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.synset_from_pos_and_offset('n', 1740))
    Synset('entity.n.01')
    """
    # Check to see if the synset is in the cache
    if offset in self._synset_offset_cache[pos]:
        return self._synset_offset_cache[pos][offset]

    data_file = self._data_file(pos)
    data_file.seek(offset)
    data_file_line = data_file.readline()
    # A valid line starts with the offset itself as an 8-digit,
    # zero-padded integer. (Replaces the former hand-rolled
    # '0'*(8-len(...)) padding plus redundant isalnum() check.)
    if data_file_line[:8] == f"{offset:08d}":
        synset = self._synset_from_pos_and_line(pos, data_file_line)
        assert synset._offset == offset
        self._synset_offset_cache[pos][offset] = synset
    else:
        synset = None
        warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.")
    # Leave the shared handle rewound for other callers.
    data_file.seek(0)
    return synset
1567
+
1568
@deprecated("Use public method synset_from_pos_and_offset() instead")
def _synset_from_pos_and_offset(self, *args, **kwargs):
    """
    Hack to help people like the readers of
    https://stackoverflow.com/a/27145655/1709587
    who were using this function before it was officially a public method
    """
    # Deprecated alias: delegates unchanged to the public method.
    return self.synset_from_pos_and_offset(*args, **kwargs)
1576
+
1577
def _synset_from_pos_and_line(self, pos, data_file_line):
    """Parse one line of a WordNet data.<pos> file into a Synset,
    including its lemmas, pointers, verb frames and sense keys.

    :raises WordNetError: if the line cannot be parsed.
    """
    # Construct a new (empty) synset.
    synset = Synset(self)

    # parse the entry for this synset
    try:

        # parse out the definitions and examples from the gloss
        columns_str, gloss = data_file_line.strip().split("|")
        definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
        examples = re.findall(r'"([^"]*)"', gloss)
        for example in examples:
            synset._examples.append(example)

        synset._definition = definition.strip("; ")

        # split the other info into fields
        _iter = iter(columns_str.split())

        def _next_token():
            return next(_iter)

        # get the offset
        synset._offset = int(_next_token())

        # determine the lexicographer file name
        lexname_index = int(_next_token())
        synset._lexname = self._lexnames[lexname_index]

        # get the part of speech
        synset._pos = _next_token()

        # create Lemma objects for each lemma
        # (lemma count is a hexadecimal field)
        n_lemmas = int(_next_token(), 16)
        for _ in range(n_lemmas):
            # get the lemma name
            lemma_name = _next_token()
            # get the lex_id (used for sense_keys)
            lex_id = int(_next_token(), 16)
            # If the lemma has a syntactic marker, extract it.
            m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
            lemma_name, syn_mark = m.groups()
            # create the lemma object
            lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
            synset._lemmas.append(lemma)
            synset._lemma_names.append(lemma._name)

        # collect the pointer tuples
        n_pointers = int(_next_token())
        for _ in range(n_pointers):
            symbol = _next_token()
            offset = int(_next_token())
            # NOTE: this rebinds the ``pos`` parameter to the pointer
            # target's POS for the rest of the loop body.
            pos = _next_token()
            lemma_ids_str = _next_token()
            # "0000" marks a synset-level pointer; anything else encodes
            # source/target lemma indices (hex, 1-based) for a
            # lemma-level pointer.
            if lemma_ids_str == "0000":
                synset._pointers[symbol].add((pos, offset))
            else:
                source_index = int(lemma_ids_str[:2], 16) - 1
                target_index = int(lemma_ids_str[2:], 16) - 1
                source_lemma_name = synset._lemmas[source_index]._name
                lemma_pointers = synset._lemma_pointers
                tups = lemma_pointers[source_lemma_name, symbol]
                tups.append((pos, offset, target_index))

        # read the verb frames (field is absent for non-verbs)
        try:
            frame_count = int(_next_token())
        except StopIteration:
            pass
        else:
            for _ in range(frame_count):
                # read the plus sign
                plus = _next_token()
                assert plus == "+"
                # read the frame and lemma number
                frame_number = int(_next_token())
                frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
                lemma_number = int(_next_token(), 16)
                # lemma number of 00 means all words in the synset
                if lemma_number == 0:
                    synset._frame_ids.append(frame_number)
                    for lemma in synset._lemmas:
                        lemma._frame_ids.append(frame_number)
                        lemma._frame_strings.append(frame_string_fmt % lemma._name)
                # only a specific word in the synset
                else:
                    lemma = synset._lemmas[lemma_number - 1]
                    lemma._frame_ids.append(frame_number)
                    lemma._frame_strings.append(frame_string_fmt % lemma._name)

    # raise a more informative error with line text
    except ValueError as e:
        raise WordNetError(f"line {data_file_line!r}: {e}") from e

    # set sense keys for Lemma objects - note that this has to be
    # done afterwards so that the relations are available
    for lemma in synset._lemmas:
        if synset._pos == ADJ_SAT:
            # Satellites key on the first lemma of their head synset.
            head_lemma = synset.similar_tos()[0]._lemmas[0]
            head_name = head_lemma._name
            head_id = "%02d" % head_lemma._lex_id
        else:
            head_name = head_id = ""
        tup = (
            lemma._name,
            WordNetCorpusReader._pos_numbers[synset._pos],
            lemma._lexname_index,
            lemma._lex_id,
            head_name,
            head_id,
        )
        lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()

    # the canonical name is based on the first lemma
    lemma_name = synset._lemmas[0]._name.lower()
    offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
    sense_index = offsets.index(synset._offset)
    tup = lemma_name, synset._pos, sense_index + 1
    synset._name = "%s.%s.%02i" % tup

    return synset
1698
+
1699
def synset_from_sense_key(self, sense_key):
    """
    Retrieves synset based on a given sense_key. Sense keys can be
    obtained from lemma.key()

    From https://wordnet.princeton.edu/documentation/senseidx5wn:
    A sense_key is represented as::

        lemma % lex_sense (e.g. 'dog%1:18:01::')

    where lex_sense is encoded as::

        ss_type:lex_filenum:lex_id:head_word:head_id

    :lemma: ASCII text of word/collocation, in lower case
    :ss_type: synset type for the sense (1 digit int)
        The synset type is encoded as follows::

            1    NOUN
            2    VERB
            3    ADJECTIVE
            4    ADVERB
            5    ADJECTIVE SATELLITE
    :lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
    :lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
    :head_word: lemma of the first word in satellite's head synset
        Only used if sense is in an adjective satellite synset
    :head_id: uniquely identifies sense in a lexicographer file when paired with head_word
        Only used if head_word is present (2 digit int)

    >>> import nltk
    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.synset_from_sense_key("drive%1:04:03::"))
    Synset('drive.n.06')

    >>> print(wn.synset_from_sense_key("driving%1:04:03::"))
    Synset('drive.n.06')
    """
    # All key parsing/lookup lives in lemma_from_key; just take that
    # lemma's synset.
    return self.lemma_from_key(sense_key).synset()
1738
+
1739
+ #############################################################
1740
+ # Retrieve synsets and lemmas.
1741
+ #############################################################
1742
+
1743
def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
    """Load all synsets with a given lemma and part of speech tag.
    If no pos is specified, all synsets for all parts of speech
    will be loaded.
    If lang is specified, all the synsets associated with the lemma name
    of that language will be returned.
    """
    lemma = lemma.lower()

    if lang == "eng":
        # Morphy maps the surface form to candidate base forms; gather
        # every synset offset recorded for any of them.
        pos_tags = POS_LIST if pos is None else pos
        index = self._lemma_pos_offset_map
        return [
            self.synset_from_pos_and_offset(p, offset)
            for p in pos_tags
            for form in self._morphy(lemma, p, check_exceptions)
            for offset in index[form].get(p, [])
        ]

    self._load_lang_data(lang)
    if lemma not in self._lang_data[lang][1]:
        return []
    return [
        self.of2ss(of)
        for of in self._lang_data[lang][1][lemma]
        if pos is None or of[-1] == pos
    ]
1773
+
1774
def lemmas(self, lemma, pos=None, lang="eng"):
    """Return all Lemma objects with a name matching the specified lemma
    name and part of speech tag. Matches any part of speech tag if none is
    specified."""
    lemma = lemma.lower()

    if lang == "eng":
        return [
            lem
            for synset in self.synsets(lemma, pos)
            for lem in synset.lemmas()
            if lem.name().lower() == lemma
        ]

    self._load_lang_data(lang)
    matches = []
    for synset in self.synsets(lemma, lang=lang):
        if pos is not None and synset.pos() != pos:
            continue
        matches.extend(
            lem
            for lem in synset.lemmas(lang=lang)
            if lem.name().lower() == lemma
        )
    return matches
1799
+
1800
def all_lemma_names(self, pos=None, lang="eng"):
    """Return an iterator over all lemma names for all synsets for the
    given part of speech tag and language. If pos is not specified, all
    synsets for all parts of speech will be used."""
    if lang == "eng":
        if pos is None:
            return iter(self._lemma_pos_offset_map)
        return (
            lemma_name
            for lemma_name, by_pos in self._lemma_pos_offset_map.items()
            if pos in by_pos
        )

    self._load_lang_data(lang)
    names = set()
    for of, of_lemmas in self._lang_data[lang][0].items():
        if pos is None or of[-1] == pos:
            names.update(of_lemmas)
    return iter(names)
1825
+
1826
def all_omw_synsets(self, pos=None, lang=None):
    """Yield every synset that has a lemma in the given OMW language.

    NOTE(review): because this function body contains ``yield``, the
    early ``return None`` for an unsupported language produces an
    *empty generator* when iterated, not None.
    """
    if lang not in self.langs():
        return None
    self._load_lang_data(lang)
    for of in self._lang_data[lang][0]:
        if not pos or of[-1] == pos:
            ss = self.of2ss(of)
            if ss:
                yield ss

    # else:
    # A few OMW offsets don't exist in Wordnet 3.0.
    # warnings.warn(f"Language {lang}: no synset found for {of}")
1839
+
1840
def all_synsets(self, pos=None, lang="eng"):
    """Iterate over all synsets with a given part of speech tag.
    If no pos is specified, all synsets for all parts of speech
    will be loaded.
    """
    # Dispatch to the English or the multilingual iterator.
    if lang == "eng":
        return self.all_eng_synsets(pos=pos)
    return self.all_omw_synsets(pos=pos, lang=lang)
1849
+
1850
def all_eng_synsets(self, pos=None):
    """Yield every English synset for ``pos`` (all POS tags if None).

    Satellite adjectives share the adjective data file: requesting
    ADJ_SAT yields only true satellites, while ADJ (and every other
    tag) yields everything in its file.
    """
    pos_tags = self._FILEMAP.keys() if pos is None else [pos]

    cache = self._synset_offset_cache
    from_pos_and_line = self._synset_from_pos_and_line

    # generate all synsets for each part of speech
    for pos_tag in pos_tags:
        # Open a private file handle. We can not re-use the handles in
        # self._data_file_map here: this is a generator, and those
        # shared handles might be seek()ed while we're suspended.
        pos_file = ADJ if pos_tag == ADJ_SAT else pos_tag
        fileid = "data.%s" % self._FILEMAP[pos_file]
        data_file = self.open(fileid)

        # try/finally (instead of the previous bare except/else pair)
        # guarantees the extra handle is closed on normal exhaustion,
        # on error, and on early generator close alike.
        try:
            offset = data_file.tell()
            line = data_file.readline()
            while line:
                # header/comment lines start with whitespace
                if not line[0].isspace():
                    if offset in cache[pos_tag]:
                        # See if the synset is cached
                        synset = cache[pos_tag][offset]
                    else:
                        # Otherwise, parse the line
                        synset = from_pos_and_line(pos_tag, line)
                        cache[pos_tag][offset] = synset

                    if pos_tag == ADJ_SAT:
                        # only yield real satellites for ADJ_SAT
                        if synset._pos == ADJ_SAT:
                            yield synset
                    else:
                        # all other tags yield everything in the file
                        yield synset
                offset = data_file.tell()
                line = data_file.readline()
        finally:
            data_file.close()
1904
+
1905
def words(self, lang="eng"):
    """return lemmas of the given language as list of words"""
    # NOTE(review): despite the docstring, this returns whatever
    # all_lemma_names() returns — an iterator, not a list.
    return self.all_lemma_names(lang=lang)
1908
+
1909
def synonyms(self, word, lang="eng"):
    """return nested list with the synonyms of the different senses of word in the given language"""
    result = []
    for ss in self.synsets(word, lang=lang):
        # The word's own name is excluded from its synonym set.
        others = set(ss.lemma_names(lang=lang))
        others.discard(word)
        result.append(sorted(others))
    return result
1915
+
1916
def doc(self, file="README", lang="eng"):
    """Return the contents of readme, license or citation file
    use lang=lang to get the file for an individual language"""
    if lang == "eng":
        reader = self
    else:
        reader = self._omw_reader
        if lang in self.langs():
            # Language files live under their provenance directory.
            file = os.path.join(self.provenances[lang], file)
    try:
        with reader.open(file) as fp:
            return fp.read()
    # The file may simply be missing for this provenance; report a
    # message instead of raising. (Narrowed from a bare ``except:``,
    # which also swallowed SystemExit/KeyboardInterrupt.)
    except Exception:
        if lang in self._lang_data:
            return f"Cannot determine {file} for {lang}"
        else:
            return f"Language {lang} is not supported."
1933
+
1934
def license(self, lang="eng"):
    """Return the contents of LICENSE (for omw)
    use lang=lang to get the license for an individual language"""
    # Thin wrapper around doc() with a fixed file name.
    return self.doc(file="LICENSE", lang=lang)
1938
+
1939
def readme(self, lang="eng"):
    """Return the contents of README (for omw)
    use lang=lang to get the readme for an individual language"""
    # Thin wrapper around doc() with a fixed file name.
    return self.doc(file="README", lang=lang)
1943
+
1944
def citation(self, lang="eng"):
    """Return the contents of citation.bib file (for omw)
    use lang=lang to get the citation for an individual language"""
    # Thin wrapper around doc() with a fixed file name.
    return self.doc(file="citation.bib", lang=lang)
1948
+
1949
+ #############################################################
1950
+ # Misc
1951
+ #############################################################
1952
def lemma_count(self, lemma):
    """Return the frequency count for this Lemma"""
    # Counts are currently only available for English lemmas.
    if lemma._lang != "eng":
        return 0
    # open the count file if we haven't already
    if self._key_count_file is None:
        self._key_count_file = self.open("cntlist.rev")
    # find the key in the counts file and return the count
    line = _binary_search_file(self._key_count_file, lemma._key)
    if line:
        # the count is the last whitespace-separated field of the line
        return int(line.rsplit(" ", 1)[-1])
    else:
        return 0
1966
+
1967
# Reader-level conveniences: each wrapper delegates to the synset-level
# method of the same name, and borrows its docstring so help() matches.
def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(synset2, verbose, simulate_root)

path_similarity.__doc__ = Synset.path_similarity.__doc__

def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(synset2, verbose, simulate_root)

lch_similarity.__doc__ = Synset.lch_similarity.__doc__

def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(synset2, verbose, simulate_root)

wup_similarity.__doc__ = Synset.wup_similarity.__doc__

def res_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose)

res_similarity.__doc__ = Synset.res_similarity.__doc__

def jcn_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose)

jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__

def lin_similarity(self, synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose)

lin_similarity.__doc__ = Synset.lin_similarity.__doc__
1996
+
1997
+ #############################################################
1998
+ # Morphy
1999
+ #############################################################
2000
+ # Morphy, adapted from Oliver Steele's pywordnet
2001
def morphy(self, form, pos=None, check_exceptions=True):
    """
    Find a possible base form for the given form, with the given
    part of speech, by checking WordNet's list of exceptional
    forms, and by recursively stripping affixes for this part of
    speech until a form in WordNet is found.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.morphy('dogs'))
    dog
    >>> print(wn.morphy('churches'))
    church
    >>> print(wn.morphy('aardwolves'))
    aardwolf
    >>> print(wn.morphy('abaci'))
    abacus
    >>> wn.morphy('hardrock', wn.ADV)
    >>> print(wn.morphy('book', wn.NOUN))
    book
    >>> wn.morphy('book', wn.ADJ)
    """
    if pos is None:
        # Try each POS in canonical order, lazily; the former
        # ``chain(...)`` around this single generator was a no-op.
        analyses = (a for p in POS_LIST for a in self._morphy(form, p))
    else:
        analyses = iter(self._morphy(form, pos, check_exceptions))

    # Return the first analysis found, or None when there is none
    # (clearer than the former list(islice(analyses, 1)) dance).
    return next(analyses, None)
2035
+
2036
# Suffix-stripping rules used by _morphy(): per POS, a list of
# (inflected_suffix, replacement) pairs tried in order.
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN: [
        ("s", ""),
        ("ses", "s"),
        ("ves", "f"),
        ("xes", "x"),
        ("zes", "z"),
        ("ches", "ch"),
        ("shes", "sh"),
        ("men", "man"),
        ("ies", "y"),
    ],
    VERB: [
        ("s", ""),
        ("ies", "y"),
        ("es", "e"),
        ("es", ""),
        ("ed", "e"),
        ("ed", ""),
        ("ing", "e"),
        ("ing", ""),
    ],
    ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
    ADV: [],
}

# Satellite adjectives use the same rules as plain adjectives.
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
2063
+
2064
def _morphy(self, form, pos, check_exceptions=True):
    """Return all base forms of ``form`` for ``pos`` found in WordNet.

    Algorithm (from jordanbg):
      1. Apply the suffix rules once to the input.
      2. Return every candidate that is in the database.
      3. Otherwise keep applying rules until a match is found or no
         rule applies any more.
    """
    exceptions = self._exception_map[pos]
    substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]

    def single_pass(candidates):
        # One application of every matching suffix rule to every candidate.
        return [
            word[: -len(suffix)] + replacement
            for word in candidates
            for suffix, replacement in substitutions
            if word.endswith(suffix)
        ]

    def known(candidates):
        # Keep only forms WordNet lists for this POS, preserving order
        # and dropping duplicates.
        seen = set()
        kept = []
        for word in candidates:
            if word in seen:
                continue
            if word in self._lemma_pos_offset_map:
                if pos in self._lemma_pos_offset_map[word]:
                    kept.append(word)
                    seen.add(word)
        return kept

    # 0. Exceptional (irregular) forms take precedence.
    if check_exceptions and form in exceptions:
        return known([form] + exceptions[form])

    # 1./2. One round of rules; the untouched form is also a candidate.
    candidates = single_pass([form])
    matches = known([form] + candidates)
    if matches:
        return matches

    # 3. Keep stripping until something matches or nothing is left.
    while candidates:
        candidates = single_pass(candidates)
        matches = known(candidates)
        if matches:
            return matches

    # Nothing found.
    return []
2116
+
2117
+ #############################################################
2118
+ # Create information content from corpus
2119
+ #############################################################
2120
def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
    """
    Creates an information content lookup dictionary from a corpus.

    :type corpus: CorpusReader
    :param corpus: The corpus from which we create an information
        content dictionary.
    :type weight_senses_equally: bool
    :param weight_senses_equally: If this is True, gives all
        possible senses equal weight rather than dividing by the
        number of possible senses.  (If a word has 3 synses, each
        sense gets 0.3333 per appearance when this is False, 1.0 when
        it is true.)
    :param smoothing: How much do we smooth synset counts (default is 1.0)
    :type smoothing: float
    :return: An information content dictionary
    """
    # Raw word frequencies over the whole corpus.
    counts = FreqDist()
    for ww in corpus.words():
        counts[ww] += 1

    # ic[pos][offset] accumulates weight; offset 0 acts as the root.
    ic = {}
    for pp in POS_LIST:
        ic[pp] = defaultdict(float)

    # Initialize the counts with the smoothing value
    if smoothing > 0.0:
        for pp in POS_LIST:
            ic[pp][0] = smoothing
        for ss in self.all_synsets():
            pos = ss._pos
            if pos == ADJ_SAT:
                pos = ADJ
            ic[pos][ss._offset] = smoothing

    for ww in counts:
        possible_synsets = self.synsets(ww)
        if len(possible_synsets) == 0:
            continue

        # Distribute weight among possible synsets
        weight = float(counts[ww])
        if not weight_senses_equally:
            weight /= float(len(possible_synsets))

        for ss in possible_synsets:
            pos = ss._pos
            if pos == ADJ_SAT:
                pos = ADJ
            # Each synset's weight also counts toward every hypernym.
            for level in ss._iter_hypernym_lists():
                for hh in level:
                    ic[pos][hh._offset] += weight
            # Add the weight to the root
            ic[pos][0] += weight
    return ic
2175
+
2176
def custom_lemmas(self, tab_file, lang):
    """
    Reads a custom tab file containing mappings of lemmas in the given
    language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
    WordNet functions to then be used with that language.

    See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
    documentation on the Multilingual WordNet tab file format.

    :param tab_file: Tab file as a file or file-like object
    :type: lang str
    :param: lang ISO 639-3 code of the language of the tab file
    """
    # A provenance-suffixed id like "eng_wikt" still validates on the
    # bare 3-letter code.
    lg = lang.split("_")[0]
    if len(lg) != 3:
        raise ValueError("lang should be a (3 character) ISO 639-3 code")
    # One defaultdict per attribute in self.lg_attrs; index 1 is the
    # reverse (lemma -> offsets) table filled alongside "lemma".
    self._lang_data[lang] = [
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
        defaultdict(list),
    ]
    for line in tab_file.readlines():
        if isinstance(line, bytes):
            # Support byte-stream files (e.g. as returned by Python 2's
            # open() function) as well as text-stream ones
            line = line.decode("utf-8")
        if not line.startswith("#"):
            triple = line.strip().split("\t")
            if len(triple) < 3:
                continue
            offset_pos, label = triple[:2]
            val = triple[-1]
            if self.map30:
                if offset_pos in self.map30:
                    # Map offset_pos to current Wordnet version:
                    offset_pos = self.map30[offset_pos]
                else:
                    # Some OMW offsets were never in Wordnet:
                    if (
                        offset_pos not in self.nomap
                        and offset_pos.replace("a", "s") not in self.nomap
                    ):
                        warnings.warn(
                            f"{lang}: invalid offset {offset_pos} in '{line}'"
                        )
                    continue
            elif offset_pos[-1] == "a":
                wnss = self.of2ss(offset_pos)
                if wnss and wnss.pos() == "s":  # Wordnet pos is "s"
                    # Label OMW adjective satellites back to their Wordnet pos ("s")
                    offset_pos = self.ss2of(wnss)
            # Labels may be qualified as "<lang>:<attr>"; only accept
            # unqualified labels or ones matching this language.
            pair = label.split(":")
            attr = pair[-1]
            if len(pair) == 1 or pair[0] == lg:
                if attr == "lemma":
                    val = val.strip().replace(" ", "_")
                    self._lang_data[lang][1][val.lower()].append(offset_pos)
                if attr in self.lg_attrs:
                    self._lang_data[lang][self.lg_attrs.index(attr)][
                        offset_pos
                    ].append(val)
2238
+
2239
+ def disable_custom_lemmas(self, lang):
2240
+ """prevent synsets from being mistakenly added"""
2241
+ for n in range(len(self.lg_attrs)):
2242
+ self._lang_data[lang][n].default_factory = None
2243
+
2244
+ ######################################################################
2245
+ # Visualize WordNet relation graphs using Graphviz
2246
+ ######################################################################
2247
+
2248
    def digraph(
        self,
        inputs,
        rel=lambda s: s.hypernyms(),
        pos=None,
        maxdepth=-1,
        shapes=None,
        attr=None,
        verbose=False,
    ):
        """
        Produce a graphical representation from 'inputs' (a list of
        start nodes, which can be a mix of Synsets, Lemmas and/or words),
        and a synset relation, for drawing with the 'dot' graph visualisation
        program from the Graphviz package.

        Return a string in the DOT graph file language, which can then be
        converted to an image by nltk.parse.dependencygraph.dot2img(dot_string).

        Optional Parameters:
        :rel: Wordnet synset relation
        :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r'
        :maxdepth: limit the longest path
        :shapes: dictionary of strings that trigger a specified shape
        :attr: dictionary with global graph attributes
        :verbose: warn about cycles

        >>> from nltk.corpus import wordnet as wn
        >>> print(wn.digraph([wn.synset('dog.n.01')]))
        digraph G {
        "Synset('animal.n.01')" -> "Synset('organism.n.01')";
        "Synset('canine.n.02')" -> "Synset('carnivore.n.01')";
        "Synset('carnivore.n.01')" -> "Synset('placental.n.01')";
        "Synset('chordate.n.01')" -> "Synset('animal.n.01')";
        "Synset('dog.n.01')" -> "Synset('canine.n.02')";
        "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')";
        "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')";
        "Synset('living_thing.n.01')" -> "Synset('whole.n.02')";
        "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')";
        "Synset('object.n.01')" -> "Synset('physical_entity.n.01')";
        "Synset('organism.n.01')" -> "Synset('living_thing.n.01')";
        "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')";
        "Synset('placental.n.01')" -> "Synset('mammal.n.01')";
        "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')";
        "Synset('whole.n.02')" -> "Synset('object.n.01')";
        }
        <BLANKLINE>
        """
        # Imported locally to avoid a circular import at module load time.
        from nltk.util import edge_closure, edges2dot

        synsets = set()
        edges = set()
        # Defaulting to None and replacing here avoids the mutable-default
        # argument pitfall.
        if not shapes:
            shapes = dict()
        if not attr:
            attr = dict()

        def add_lemma(lem):
            # A lemma contributes its synset as a start node plus an explicit
            # lemma -> synset edge.
            ss = lem.synset()
            synsets.add(ss)
            edges.add((lem, ss))

        # Normalize heterogeneous inputs (Synset / Lemma / word string) into
        # a set of start synsets.
        for node in inputs:
            typ = type(node)
            if typ == Synset:
                synsets.add(node)
            elif typ == Lemma:
                add_lemma(node)
            elif typ == str:
                for lemma in self.lemmas(node, pos):
                    add_lemma(lemma)

        # Accumulate the closure of ``rel`` edges from every start synset.
        for ss in synsets:
            edges = edges.union(edge_closure(ss, rel, maxdepth, verbose))
        # Sort for deterministic output before rendering to DOT.
        dot_string = edges2dot(sorted(list(edges)), shapes=shapes, attr=attr)
        return dot_string
2324
+
2325
+
2326
+ ######################################################################
2327
+ # WordNet Information Content Corpus Reader
2328
+ ######################################################################
2329
+
2330
+
2331
class WordNetICCorpusReader(CorpusReader):
    """
    A corpus reader for the WordNet information content corpus.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, encoding="utf8")

    # This load function would be more efficient if the data was pickled.
    # Note that we can't use NLTK's frequency distributions because
    # synsets are overlapping (each instance of a synset also counts
    # as an instance of its hypernyms).
    def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary.  This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        synsets to information content values.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {NOUN: defaultdict(float), VERB: defaultdict(float)}
        with self.open(icfile) as fp:
            lines = iter(fp)
            # The first line is a header; discard it.
            next(lines, None)
            for line in lines:
                fields = line.split()
                key = fields[0]
                offset = int(key[:-1])
                value = float(fields[1])
                pos = _get_pos(key)
                if len(fields) == 3 and fields[2] == "ROOT":
                    # Root entries accumulate under the synthetic offset 0.
                    ic[pos][0] += value
                if value != 0:
                    ic[pos][offset] = value
        return ic
2371
+
2372
+
2373
+ ######################################################################
2374
+ # Similarity metrics
2375
+ ######################################################################
2376
+
2377
+ # TODO: Add in the option to manually add a new root node; this will be
2378
+ # useful for verb similarity as there exist multiple verb taxonomies.
2379
+
2380
+ # More information about the metrics is available at
2381
+ # http://marimba.d.umn.edu/similarity/measures.html
2382
+
2383
+
2384
def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience wrapper: delegate to the method on synset1.
    options = {"verbose": verbose, "simulate_root": simulate_root}
    return synset1.path_similarity(synset2, **options)
2388
+
2389
+
2390
def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience wrapper: delegate to the method on synset1.
    options = {"verbose": verbose, "simulate_root": simulate_root}
    return synset1.lch_similarity(synset2, **options)
2392
+
2393
+
2394
def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience wrapper: delegate to the method on synset1.
    options = {"verbose": verbose, "simulate_root": simulate_root}
    return synset1.wup_similarity(synset2, **options)
2396
+
2397
+
2398
def res_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience wrapper: delegate to the method on synset1.
    method = synset1.res_similarity
    return method(synset2, ic, verbose=verbose)
2400
+
2401
+
2402
def jcn_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience wrapper: delegate to the method on synset1.
    method = synset1.jcn_similarity
    return method(synset2, ic, verbose=verbose)
2404
+
2405
+
2406
def lin_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience wrapper: delegate to the method on synset1.
    method = synset1.lin_similarity
    return method(synset2, ic, verbose=verbose)
2408
+
2409
+
2410
# Copy the docstrings from the corresponding Synset methods onto the
# module-level wrapper functions, so help() shows the full contract.
for _fn in (
    path_similarity,
    lch_similarity,
    wup_similarity,
    res_similarity,
    jcn_similarity,
    lin_similarity,
):
    _fn.__doc__ = getattr(Synset, _fn.__name__).__doc__
del _fn
2416
+
2417
+
2418
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.  If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content object (as returned by ``load_ic()``).
    :return: The information content of the two synsets and their most
        informative subsumer
    :raises WordNetError: if the two synsets differ in part of speech
    """
    if synset1._pos != synset2._pos:
        raise WordNetError(
            "Computing the least common subsumer requires "
            "%s and %s to have the same part of speech." % (synset1, synset2)
        )

    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    # The most informative common subsumer; 0 stands for the artificial
    # shared root when no explicit common hypernym exists.
    subsumers = synset1.common_hypernyms(synset2)
    if subsumers:
        subsumer_ic = max(information_content(s, ic) for s in subsumers)
    else:
        subsumer_ic = 0

    if verbose:
        print("> LCS Subsumer by content:", subsumer_ic)

    return ic1, ic2, subsumer_ic
2453
+
2454
+
2455
+ # Utility functions
2456
+
2457
+
2458
def information_content(synset, ic):
    """
    Return the information content (negative log probability) of
    ``synset`` according to the frequency table ``ic``.

    :raises WordNetError: if ``ic`` has no table for the synset's
        part of speech
    """
    # Adjective satellites are folded into the plain adjective table.
    pos = ADJ if synset._pos == ADJ_SAT else synset._pos
    try:
        icpos = ic[pos]
    except KeyError as e:
        msg = "Information content file has no entries for part-of-speech: %s"
        raise WordNetError(msg % pos) from e

    count = icpos[synset._offset]
    # A zero count means the synset was never observed: infinite surprisal.
    # icpos[0] holds the total (root) count used for normalization.
    return _INF if count == 0 else -math.log(count / icpos[0])
2473
+
2474
+
2475
+ # get the part of speech (NOUN or VERB) from the information content record
2476
+ # (each identifier has a 'n' or 'v' suffix)
2477
+
2478
+
2479
+ def _get_pos(field):
2480
+ if field[-1] == "n":
2481
+ return NOUN
2482
+ elif field[-1] == "v":
2483
+ return VERB
2484
+ else:
2485
+ msg = (
2486
+ "Unidentified part of speech in WordNet Information Content file "
2487
+ "for field %s" % field
2488
+ )
2489
+ raise ValueError(msg)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/xmldocs.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: XML Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Corpus reader for corpora whose documents are xml files.
10
+
11
+ (note -- not named 'xml' to avoid conflicting w/ standard xml package)
12
+ """
13
+
14
+ import codecs
15
+ from xml.etree import ElementTree
16
+
17
+ from nltk.corpus.reader.api import CorpusReader
18
+ from nltk.corpus.reader.util import *
19
+ from nltk.data import SeekableUnicodeStreamReader
20
+ from nltk.internals import ElementWrapper
21
+ from nltk.tokenize import WordPunctTokenizer
22
+
23
+
24
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        """
        :param root: corpus root directory
        :param fileids: list (or regexp) of file identifiers
        :param wrap_etree: if True, elements returned by ``xml()`` are
            wrapped in ``ElementWrapper``
        """
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Return the parsed XML root element for the given file.

        :param fileid: a single file identifier; may be omitted only when
            the corpus contains exactly one file.
        :raises TypeError: if ``fileid`` does not resolve to a single
            file identifier string
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, str):
            raise TypeError("Expected a single file identifier string")
        # Read the XML in using ElementTree.
        with self.abspath(fileid).open() as fp:
            elt = ElementTree.parse(fp).getroot()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        # Element.getiterator() was removed in Python 3.9; fall back to
        # Element.iter().  Bug fix: catch only AttributeError -- the previous
        # bare ``except:`` could silently mask unrelated failures (including
        # KeyboardInterrupt).
        try:
            iterator = elt.getiterator()
        except AttributeError:
            iterator = elt.iter()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out
79
+
80
+
81
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

    - ``'foo'``: A top-level element whose tag is ``foo``.
    - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
      is a top-level element whose tag is ``foo``.
    - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
      in the xml tree.
    - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
      appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        """
        if elt_handler:
            self.handle_elt = elt_handler

        # \Z anchors the pattern so the tagspec must match the whole path.
        self._tagspec = re.compile(tagspec + r"\Z")
        """The tag specification for this corpus view."""

        self._tag_context = {0: ()}
        """A dictionary mapping from file positions (as returned by
           ``stream.seek()`` to XML contexts.  An XML context is a
           tuple of XML tag names, indicating which tags have not yet
           been closed."""

        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        # Detect the file's encoding from a BOM or the XML declaration in
        # its first line; default to utf-8 when neither is present.
        if isinstance(fileid, PathPointer):
            try:
                infile = fileid.open()
                s = infile.readline()
            finally:
                # NOTE(review): if fileid.open() itself raises, ``infile``
                # is unbound here and this close would fail -- confirm.
                infile.close()
        else:
            with open(fileid, "rb") as infile:
                s = infile.readline()
        if s.startswith(codecs.BOM_UTF16_BE):
            return "utf-16-be"
        if s.startswith(codecs.BOM_UTF16_LE):
            return "utf-16-le"
        if s.startswith(codecs.BOM_UTF32_BE):
            return "utf-32-be"
        if s.startswith(codecs.BOM_UTF32_LE):
            return "utf-32-le"
        if s.startswith(codecs.BOM_UTF8):
            return "utf-8"
        m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m:
            return m.group(1).decode()
        m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m:
            return m.group(1).decode()
        # No encoding found -- what should the default be?
        return "utf-8"

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    _VALID_XML_RE = re.compile(
        r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL | re.VERBOSE,
    )

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.
    _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)")

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(
        r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL | re.VERBOSE,
    )

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        """
        fragment = ""

        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (A '>' appearing before any '<' cannot be part of a tag.)
            if re.search("[<>]", fragment).group(0) == ">":
                pos = stream.tell() - (
                    len(fragment) - re.search("[<>]", fragment).end()
                )
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError("Unexpected end of file: tag not closed")

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind("<")
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment) - last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        """
        if tagspec is None:
            tagspec = self._tagspec
        if elt_handler is None:
            elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context:
        context = list(self._tag_context.get(stream.tell()))
        assert context is not None  # check this -- could it ever happen?

        elts = []

        elt_start = None  # where does the elt start
        elt_depth = None  # what context depth
        elt_text = ""

        # Keep reading fragments until at least one matching element has
        # been collected and we are not in the middle of an element.
        while elts == [] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None:
                    break
                else:
                    raise ValueError("Unexpected end of file")

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print("{:>25} {}".format("/".join(context)[-20:], piece.group()))

                if piece.group("START_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group("END_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError("Unmatched tag </%s>" % name)
                    if name != context[-1]:
                        raise ValueError(f"Unmatched tag <{context[-1]}>...</{name}>")
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start : piece.end()]
                        elts.append((elt_text, "/".join(context)))
                        elt_start = elt_depth = None
                        elt_text = ""
                    # Keep context up-to-date
                    context.pop()

                elif piece.group("EMPTY_ELT_TAG"):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    if elt_start is None:
                        if re.match(tagspec, "/".join(context) + "/" + name):
                            elts.append((piece.group(), "/".join(context) + "/" + name))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(" " * 36 + "(backtrack)")
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment) - elt_start), 1)
                    context = context[: elt_depth - 1]
                    elt_start = elt_depth = None
                    elt_text = ""

        # Update the _tag_context dict so later reads resuming at this
        # stream position know which tags are still open.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        return [
            elt_handler(
                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
                context,
            )
            for (elt, context) in elts
        ]
+ ]
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/ycoe.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
2
+ #
3
+ # Copyright (C) 2001-2015 NLTK Project
4
+ # Author: Selina Dennis <selina@tranzfusion.net>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
10
+ English Prose (YCOE), a 1.5 million word syntactically-annotated
11
+ corpus of Old English prose texts. The corpus is distributed by the
12
+ Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
13
+ with NLTK.
14
+
15
+ The YCOE corpus is divided into 100 files, each representing
16
+ an Old English prose text. Tags used within each text complies
17
+ to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
18
+ """
19
+
20
+ import os
21
+ import re
22
+
23
+ from nltk.corpus.reader.api import *
24
+ from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
25
+ from nltk.corpus.reader.tagged import TaggedCorpusReader
26
+ from nltk.corpus.reader.util import *
27
+ from nltk.tokenize import RegexpTokenizer
28
+
29
+
30
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.
    """

    def __init__(self, root, encoding="utf8"):
        CorpusReader.__init__(self, root, [], encoding)

        # Sub-readers for the parsed (.psd) and tagged (.pos) halves of
        # the corpus.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Re-initialize now that the full fileid list is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :raises KeyError: if any given file identifier is not in the corpus
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Bug fix: report the offending file id, not the whole
                # list that was passed in.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in (".pos", ".psd"):
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers."
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return [f"{d}.{subcorpus}" for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
132
+
133
+
134
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        # Remove metadata nodes before handing the string to the parser.
        stripped = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # Nothing left but an empty bracket pair: no tree to build.
        if re.match(r"\s*\(\s*\)\s*$", stripped):
            return None
        return BracketParseCorpusReader._parse(self, stripped)
143
+
144
+
145
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Reader for the tagged (.pos) half of the YCOE corpus: word_TAG
    tokens, with sentences ending after "/." tokens."""

    def __init__(self, root, items, encoding="utf8"):
        # Sentence boundaries follow "/." tokens; *_CODE and *_ID metadata
        # tokens are treated as gaps (i.e. discarded) by the tokenizer.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        # Bug fix: forward ``encoding`` to the base reader -- it was
        # previously accepted but silently ignored (the default happens to
        # match, so existing callers are unaffected).
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep="_",
            sent_tokenizer=sent_tokenizer,
            encoding=encoding,
        )
152
+
153
+
154
#: Mapping from YCOE document identifier to its human-readable title.
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    "coeluc2": "Honorius of Autun, Elucidarium 1",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}
.eggs/nltk-3.8-py3.10.egg/nltk/parse/malt.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Interface to MaltParser
2
+ #
3
+ # Author: Dan Garrette <dhgarrette@gmail.com>
4
+ # Contributor: Liling Tan, Mustufain, osamamukhtar11
5
+ #
6
+ # Copyright (C) 2001-2022 NLTK Project
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ import inspect
11
+ import os
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+
16
+ from nltk.data import ZipFilePathPointer
17
+ from nltk.internals import find_dir, find_file, find_jars_within_path
18
+ from nltk.parse.api import ParserI
19
+ from nltk.parse.dependencygraph import DependencyGraph
20
+ from nltk.parse.util import taggedsents_to_conll
21
+
22
+
23
def malt_regex_tagger():
    """Return a crude regular-expression POS-tagging function.

    Used as the default tagger when a ``MaltParser`` is built without one.
    Patterns are tried in order; the final catch-all makes ``NN`` the
    fallback tag for anything unmatched.
    """
    from nltk.tag import RegexpTagger

    patterns = [
        (r"\.$", "."),
        (r"\,$", ","),
        (r"\?$", "?"),  # fullstop, comma, Qmark
        (r"\($", "("),
        (r"\)$", ")"),  # round brackets
        (r"\[$", "["),
        (r"\]$", "]"),  # square brackets
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "DT"),  # articles
        (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
        (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possessive
        (r"(my|Your|your|Yours|yours)$", "PRP$"),  # possessive
        (r"(on|On|in|In|at|At|since|Since)$", "IN"),  # time prepopsitions
        (r"(for|For|ago|Ago|before|Before)$", "IN"),  # time prepopsitions
        (r"(till|Till|until|Until)$", "IN"),  # time prepopsitions
        (r"(by|By|beside|Beside)$", "IN"),  # space prepopsitions
        (r"(under|Under|below|Below)$", "IN"),  # space prepopsitions
        (r"(over|Over|above|Above)$", "IN"),  # space prepopsitions
        (r"(across|Across|through|Through)$", "IN"),  # space prepopsitions
        (r"(into|Into|towards|Towards)$", "IN"),  # space prepopsitions
        (r"(onto|Onto|from|From)$", "IN"),  # space prepopsitions
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]
    return RegexpTagger(patterns).tag
59
+
60
+
61
def find_maltparser(parser_dirname):
    """
    Find the MaltParser jar and its dependency jars.

    :param parser_dirname: either a full path to the MaltParser directory,
        or a directory name to be resolved via the ``MALT_PARSER``
        environment variable.
    :type parser_dirname: str
    :return: list of paths to all jar files found under the directory.
    :raises AssertionError: if a required dependency jar or the
        ``maltparser-*.jar`` itself is missing.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
    # Check that the found directory contains all the necessary .jar files.
    # (Removed a dead `malt_dependencies = ["", "", ""]` assignment that was
    # immediately overwritten below.)
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}

    assert malt_dependencies.issubset(_jars), (
        "Missing MaltParser dependency jars: %s" % sorted(malt_dependencies - _jars)
    )
    assert any(
        filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
    ), "Could not find a maltparser-*.jar in %s" % _malt_dir
    return list(_malt_jars)
80
+
81
+
82
def find_malt_model(model_filename):
    """
    Locate a pre-trained MaltParser model (``.mco``) file.

    Returns the sentinel name ``"malt_temp.mco"`` when no model was given
    (training will then be required), the path itself when it exists, or a
    path resolved through the ``MALT_MODEL`` environment variable otherwise.
    """
    if model_filename is None:
        # No model supplied: use the temporary-model name shared with train().
        return "malt_temp.mco"
    if os.path.exists(model_filename):  # A full path was given.
        return model_filename
    # Fall back to the MALT_MODEL environment variable.
    return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
92
+
93
+
94
class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - (optionally) a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
        >>> from nltk.parse import malt
        >>> # With MALT_PARSER and MALT_MODEL environment set.
        >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
        >>> # Without MALT_PARSER and MALT_MODEL environment.
        >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname="",
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
            one can use when calling Maltparser, usually this is the heapsize
            limits, e.g. `additional_java_args=['-Xmx1024m']`
            (see https://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """

        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model; "malt_temp.mco" is the sentinel meaning
        # "no pre-trained model supplied, training required".
        self.model = find_malt_model(model_filename)
        self._trained = self.model != "malt_temp.mco"
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger (crude regexp tagger by default).
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentence: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        :raises Exception: if the parser has not been trained, or if the
            external MaltParser process exits with a non-zero status.
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        with tempfile.NamedTemporaryFile(
            prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            with tempfile.NamedTemporaryFile(
                prefix="malt_output.conll.",
                dir=self.working_dir,
                mode="w",
                delete=False,
            ) as output_file:
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(
                    input_file.name, output_file.name, mode="parse"
                )

                # This is a maltparser quirk, it needs to be run
                # where the model file is. otherwise it goes into an awkward
                # missing .jars or strange -w working_dir problem.
                _current_path = os.getcwd()  # Remembers the current path.
                try:  # Change to modelfile path
                    os.chdir(os.path.split(self.model)[0])
                except OSError:
                    # Bug fix: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.  Only a failed chdir
                    # (e.g. model path has no directory part) is ignorable.
                    pass
                ret = self._execute(cmd, verbose)  # Run command.
                os.chdir(_current_path)  # Change back to current path.

                if ret != 0:
                    raise Exception(
                        "MaltParser parsing (%s) failed with exit "
                        "code %d" % (" ".join(cmd), ret)
                    )

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split("\n\n"):
                        yield (
                            iter(
                                [
                                    DependencyGraph(
                                        tree_str, top_relation_label=top_relation_label
                                    )
                                ]
                            )
                        )

        # NOTE: as this is a generator, the cleanup below only runs once the
        # caller has exhausted (or closed) it.
        os.remove(input_file.name)
        os.remove(output_file.name)

    def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentence: list(list(str))
        :return: iter(DependencyGraph)
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file (only used when
            ``mode == "parse"``)
        :type outputfilename: str
        :param mode: MaltParser flow-chart mode, e.g. "parse" or "learn"
        :type mode: str
        :return: the command as a list of arguments for subprocess
        :rtype: list(str)
        """

        cmd = ["java"]
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ";" if sys.platform.startswith("win") else ":"
        cmd += [
            "-cp",
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ["org.maltparser.Malt"]  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            cmd += ["-c", os.path.split(self.model)[-1]]
        else:  # when learning
            cmd += ["-c", self.model]

        cmd += ["-i", inputfilename]
        if mode == "parse":
            cmd += ["-o", outputfilename]
        cmd += ["-m", mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        # Run the java command, suppressing output unless verbose is set,
        # and return the process exit code.
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        """

        # Write the conll_str to malt_train.conll file in /tmp/
        with tempfile.NamedTemporaryFile(
            prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
            input_file.write(str(input_str))
        # Trains the model with the malt_train.conll
        self.train_from_file(input_file.name, verbose=verbose)
        # Removes the malt_train.conll once training finishes.
        os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        :type conll_file: str
        :raises Exception: if the external MaltParser process exits with a
            non-zero status.
        """

        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            with tempfile.NamedTemporaryFile(
                prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
            ) as input_file:
                with conll_file.open() as conll_input_file:
                    conll_str = conll_input_file.read()
                    input_file.write(str(conll_str))
                return self.train_from_file(input_file.name, verbose=verbose)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (" ".join(cmd), ret)
            )
        self._trained = True
322
+
323
+
324
# Demonstration / self-test entry point.  Running the module directly
# executes the module doctests; the illustrative session below requires
# MALT_PARSER and MALT_MODEL to be configured in the environment.
if __name__ == "__main__":
    """
    A demonstration function to show how NLTK users can use the malt parser API.

    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> mp = MaltParser()
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print(parsed_sent1.tree())
    (sees John Mary .)
    >>> print(parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.9.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    """

    import doctest

    doctest.testmod()
.eggs/nltk-3.8-py3.10.egg/nltk/parse/nonprojectivedependencyparser.py ADDED
@@ -0,0 +1,772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Dependency Grammars
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Jason Narad <jason.narad@gmail.com>
5
+ #
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+ #
9
+
10
+ import logging
11
+ import math
12
+
13
+ from nltk.parse.dependencygraph import DependencyGraph
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ #################################################################
18
+ # DependencyScorerI - Interface for Graph-Edge Weight Calculation
19
+ #################################################################
20
+
21
+
22
class DependencyScorerI:
    """
    Interface for assigning weights to the edges of a weighted dependency
    graph.  A ``ProbabilisticNonprojectiveParser`` uses an implementation of
    this interface to initialize the edge weights of a ``DependencyGraph``.
    The weights would typically come from a trained binary classifier, but
    any class able to return a multidimensional list of edge scores may
    implement the interface; it requires no particular fields.
    """

    def __init__(self):
        # Guard against direct instantiation of the abstract interface.
        if type(self) is DependencyScorerI:
            raise TypeError("DependencyScorerI is an abstract interface")

    def train(self, graphs):
        """
        :type graphs: list(DependencyGraph)
        :param graphs: Dependency graphs to train the scorer on.  The edges
            present in the graphs can typically serve as positive training
            examples, and the absent edges as negative ones.
        """
        raise NotImplementedError()

    def score(self, graph):
        """
        :type graph: DependencyGraph
        :param graph: A dependency graph whose set of edges needs scoring.
        :rtype: A three-dimensional list of numbers.
        :return: ``scores[head][child]`` is the list of scores for arcs from
            node ``head`` to node ``child``; a node's number is its
            'address' field.  For illustration, the score list matching
            Fig.2 of Keith Hall's 'K-best Spanning Tree Parsing' paper::

                scores = [[[], [5], [1], [1]],
                          [[], [], [11], [4]],
                          [[], [10], [], [5]],
                          [[], [8], [8], []]]

            With a MaxEnt classifier, each score would correspond to the
            classifier's confidence in the positive label for that edge.
        """
        raise NotImplementedError()
+ raise NotImplementedError()
74
+
75
+
76
+ #################################################################
77
+ # NaiveBayesDependencyScorer
78
+ #################################################################
79
+
80
+
81
class NaiveBayesDependencyScorer(DependencyScorerI):
    """
    A dependency-edge scorer backed by an NLTK ``NaiveBayesClassifier``.
    Each candidate head -> child arc is described by four features:
    head word, head tag, child word and child tag.

    >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2

    >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
    >>> npp = ProbabilisticNonprojectiveParser()
    >>> npp.train(graphs, NaiveBayesDependencyScorer())
    >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
    >>> len(list(parses))
    1

    """

    def __init__(self):
        # Nothing to set up here; the classifier is built in train().
        pass

    def train(self, graphs):
        """
        Train a ``NaiveBayesClassifier``: arcs present in the gold graphs
        become positive ("T") examples, every other head/child pair a
        negative ("F") example.  Features are head word/tag and child
        word/tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """

        from nltk.classify import NaiveBayesClassifier

        labeled_examples = [
            (
                dict(
                    a=head_node["word"],
                    b=head_node["tag"],
                    c=child_node["word"],
                    d=child_node["tag"],
                ),
                "T" if child_index in head_node["deps"] else "F",
            )
            for graph in graphs
            for head_node in graph.nodes.values()
            for child_index, child_node in graph.nodes.items()
        ]

        self.classifier = NaiveBayesClassifier.train(labeled_examples)

    def score(self, graph):
        """
        Featurize every possible edge of ``graph`` and score each one by
        the classifier's (log) confidence in the positive label, returned
        as a head-by-child matrix.

        :type graph: DependencyGraph
        :param graph: A dependency graph to score.
        :rtype: 3 dimensional list
        :return: Edge scores for the graph parameter.
        """
        # One feature dict per (head, child) pair, heads outermost.
        edges = [
            dict(
                a=head_node["word"],
                b=head_node["tag"],
                c=child_node["word"],
                d=child_node["tag"],
            )
            for head_node in graph.nodes.values()
            for child_node in graph.nodes.values()
        ]

        # Collect one log-probability cell per edge, then reshape the flat
        # list into an n-by-n matrix (one row of n cells per head node).
        flat_scores = []
        for pdist in self.classifier.prob_classify_many(edges):
            logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
            # smoothing in case the probability = 0
            flat_scores.append([math.log(pdist.prob("T") + 0.00000000001)])

        n = len(graph.nodes)
        return [flat_scores[i : i + n] for i in range(0, len(flat_scores), n)]
+ return edge_scores
177
+
178
+
179
+ #################################################################
180
+ # A Scorer for Demo Purposes
181
+ #################################################################
182
+ # A short class necessary to show parsing example from paper
183
+ class DemoScorer(DependencyScorerI):
184
+ def train(self, graphs):
185
+ print("Training...")
186
+
187
+ def score(self, graph):
188
+ # scores for Keith Hall 'K-best Spanning Tree Parsing' paper
189
+ return [
190
+ [[], [5], [1], [1]],
191
+ [[], [], [11], [4]],
192
+ [[], [10], [], [5]],
193
+ [[], [8], [8], []],
194
+ ]
195
+
196
+
197
+ #################################################################
198
+ # Non-Projective Probabilistic Parsing
199
+ #################################################################
200
+
201
+
202
+ class ProbabilisticNonprojectiveParser:
203
+ """A probabilistic non-projective dependency parser.
204
+
205
+ Nonprojective dependencies allows for "crossing branches" in the parse tree
206
+ which is necessary for representing particular linguistic phenomena, or even
207
+ typical parses in some languages. This parser follows the MST parsing
208
+ algorithm, outlined in McDonald(2005), which likens the search for the best
209
+ non-projective parse to finding the maximum spanning tree in a weighted
210
+ directed graph.
211
+
212
+ >>> class Scorer(DependencyScorerI):
213
+ ... def train(self, graphs):
214
+ ... pass
215
+ ...
216
+ ... def score(self, graph):
217
+ ... return [
218
+ ... [[], [5], [1], [1]],
219
+ ... [[], [], [11], [4]],
220
+ ... [[], [10], [], [5]],
221
+ ... [[], [8], [8], []],
222
+ ... ]
223
+
224
+
225
+ >>> npp = ProbabilisticNonprojectiveParser()
226
+ >>> npp.train([], Scorer())
227
+
228
+ >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
229
+ >>> len(list(parses))
230
+ 1
231
+
232
+ Rule based example
233
+
234
+ >>> from nltk.grammar import DependencyGrammar
235
+
236
+ >>> grammar = DependencyGrammar.fromstring('''
237
+ ... 'taught' -> 'play' | 'man'
238
+ ... 'man' -> 'the' | 'in'
239
+ ... 'in' -> 'corner'
240
+ ... 'corner' -> 'the'
241
+ ... 'play' -> 'golf' | 'dachshund' | 'to'
242
+ ... 'dachshund' -> 'his'
243
+ ... ''')
244
+
245
+ >>> ndp = NonprojectiveDependencyParser(grammar)
246
+ >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
247
+ >>> len(list(parses))
248
+ 4
249
+
250
+ """
251
+
252
+ def __init__(self):
253
+ """
254
+ Creates a new non-projective parser.
255
+ """
256
+ logging.debug("initializing prob. nonprojective...")
257
+
258
+ def train(self, graphs, dependency_scorer):
259
+ """
260
+ Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
261
+ and establishes this as the parser's scorer. This is used to
262
+ initialize the scores on a ``DependencyGraph`` during the parsing
263
+ procedure.
264
+
265
+ :type graphs: list(DependencyGraph)
266
+ :param graphs: A list of dependency graphs to train the scorer.
267
+ :type dependency_scorer: DependencyScorerI
268
+ :param dependency_scorer: A scorer which implements the
269
+ ``DependencyScorerI`` interface.
270
+ """
271
+ self._scorer = dependency_scorer
272
+ self._scorer.train(graphs)
273
+
274
+ def initialize_edge_scores(self, graph):
275
+ """
276
+ Assigns a score to every edge in the ``DependencyGraph`` graph.
277
+ These scores are generated via the parser's scorer which
278
+ was assigned during the training process.
279
+
280
+ :type graph: DependencyGraph
281
+ :param graph: A dependency graph to assign scores to.
282
+ """
283
+ self.scores = self._scorer.score(graph)
284
+
285
+ def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
286
+ """
287
+ Takes a list of nodes that have been identified to belong to a cycle,
288
+ and collapses them into on larger node. The arcs of all nodes in
289
+ the graph must be updated to account for this.
290
+
291
+ :type new_node: Node.
292
+ :param new_node: A Node (Dictionary) to collapse the cycle nodes into.
293
+ :type cycle_path: A list of integers.
294
+ :param cycle_path: A list of node addresses, each of which is in the cycle.
295
+ :type g_graph, b_graph, c_graph: DependencyGraph
296
+ :param g_graph, b_graph, c_graph: Graphs which need to be updated.
297
+ """
298
+ logger.debug("Collapsing nodes...")
299
+ # Collapse all cycle nodes into v_n+1 in G_Graph
300
+ for cycle_node_index in cycle_path:
301
+ g_graph.remove_by_address(cycle_node_index)
302
+ g_graph.add_node(new_node)
303
+ g_graph.redirect_arcs(cycle_path, new_node["address"])
304
+
305
+ def update_edge_scores(self, new_node, cycle_path):
306
+ """
307
+ Updates the edge scores to reflect a collapse operation into
308
+ new_node.
309
+
310
+ :type new_node: A Node.
311
+ :param new_node: The node which cycle nodes are collapsed into.
312
+ :type cycle_path: A list of integers.
313
+ :param cycle_path: A list of node addresses that belong to the cycle.
314
+ """
315
+ logger.debug("cycle %s", cycle_path)
316
+
317
+ cycle_path = self.compute_original_indexes(cycle_path)
318
+
319
+ logger.debug("old cycle %s", cycle_path)
320
+ logger.debug("Prior to update: %s", self.scores)
321
+
322
+ for i, row in enumerate(self.scores):
323
+ for j, column in enumerate(self.scores[i]):
324
+ logger.debug(self.scores[i][j])
325
+ if j in cycle_path and i not in cycle_path and self.scores[i][j]:
326
+ subtract_val = self.compute_max_subtract_score(j, cycle_path)
327
+
328
+ logger.debug("%s - %s", self.scores[i][j], subtract_val)
329
+
330
+ new_vals = []
331
+ for cur_val in self.scores[i][j]:
332
+ new_vals.append(cur_val - subtract_val)
333
+
334
+ self.scores[i][j] = new_vals
335
+
336
+ for i, row in enumerate(self.scores):
337
+ for j, cell in enumerate(self.scores[i]):
338
+ if i in cycle_path and j in cycle_path:
339
+ self.scores[i][j] = []
340
+
341
+ logger.debug("After update: %s", self.scores)
342
+
343
+ def compute_original_indexes(self, new_indexes):
344
+ """
345
+ As nodes are collapsed into others, they are replaced
346
+ by the new node in the graph, but it's still necessary
347
+ to keep track of what these original nodes were. This
348
+ takes a list of node addresses and replaces any collapsed
349
+ node addresses with their original addresses.
350
+
351
+ :type new_indexes: A list of integers.
352
+ :param new_indexes: A list of node addresses to check for
353
+ subsumed nodes.
354
+ """
355
+ swapped = True
356
+ while swapped:
357
+ originals = []
358
+ swapped = False
359
+ for new_index in new_indexes:
360
+ if new_index in self.inner_nodes:
361
+ for old_val in self.inner_nodes[new_index]:
362
+ if old_val not in originals:
363
+ originals.append(old_val)
364
+ swapped = True
365
+ else:
366
+ originals.append(new_index)
367
+ new_indexes = originals
368
+ return new_indexes
369
+
370
+ def compute_max_subtract_score(self, column_index, cycle_indexes):
371
+ """
372
+ When updating scores the score of the highest-weighted incoming
373
+ arc is subtracted upon collapse. This returns the correct
374
+ amount to subtract from that edge.
375
+
376
+ :type column_index: integer.
377
+ :param column_index: A index representing the column of incoming arcs
378
+ to a particular node being updated
379
+ :type cycle_indexes: A list of integers.
380
+ :param cycle_indexes: Only arcs from cycle nodes are considered. This
381
+ is a list of such nodes addresses.
382
+ """
383
+ max_score = -100000
384
+ for row_index in cycle_indexes:
385
+ for subtract_val in self.scores[row_index][column_index]:
386
+ if subtract_val > max_score:
387
+ max_score = subtract_val
388
+ return max_score
389
+
390
+ def best_incoming_arc(self, node_index):
391
+ """
392
+ Returns the source of the best incoming arc to the
393
+ node with address: node_index
394
+
395
+ :type node_index: integer.
396
+ :param node_index: The address of the 'destination' node,
397
+ the node that is arced to.
398
+ """
399
+ originals = self.compute_original_indexes([node_index])
400
+ logger.debug("originals: %s", originals)
401
+
402
+ max_arc = None
403
+ max_score = None
404
+ for row_index in range(len(self.scores)):
405
+ for col_index in range(len(self.scores[row_index])):
406
+ if col_index in originals and (
407
+ max_score is None or self.scores[row_index][col_index] > max_score
408
+ ):
409
+ max_score = self.scores[row_index][col_index]
410
+ max_arc = row_index
411
+ logger.debug("%s, %s", row_index, col_index)
412
+
413
+ logger.debug(max_score)
414
+
415
+ for key in self.inner_nodes:
416
+ replaced_nodes = self.inner_nodes[key]
417
+ if max_arc in replaced_nodes:
418
+ return key
419
+
420
+ return max_arc
421
+
422
+ def original_best_arc(self, node_index):
423
+ originals = self.compute_original_indexes([node_index])
424
+ max_arc = None
425
+ max_score = None
426
+ max_orig = None
427
+ for row_index in range(len(self.scores)):
428
+ for col_index in range(len(self.scores[row_index])):
429
+ if col_index in originals and (
430
+ max_score is None or self.scores[row_index][col_index] > max_score
431
+ ):
432
+ max_score = self.scores[row_index][col_index]
433
+ max_arc = row_index
434
+ max_orig = col_index
435
+ return [max_arc, max_orig]
436
+
437
+ def parse(self, tokens, tags):
438
+ """
439
+ Parses a list of tokens in accordance to the MST parsing algorithm
440
+ for non-projective dependency parses. Assumes that the tokens to
441
+ be parsed have already been tagged and those tags are provided. Various
442
+ scoring methods can be used by implementing the ``DependencyScorerI``
443
+ interface and passing it to the training algorithm.
444
+
445
+ :type tokens: list(str)
446
+ :param tokens: A list of words or punctuation to be parsed.
447
+ :type tags: list(str)
448
+ :param tags: A list of tags corresponding by index to the words in the tokens list.
449
+ :return: An iterator of non-projective parses.
450
+ :rtype: iter(DependencyGraph)
451
+ """
452
+ self.inner_nodes = {}
453
+
454
+ # Initialize g_graph
455
+ g_graph = DependencyGraph()
456
+ for index, token in enumerate(tokens):
457
+ g_graph.nodes[index + 1].update(
458
+ {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
459
+ )
460
+
461
+ # Fully connect non-root nodes in g_graph
462
+ g_graph.connect_graph()
463
+ original_graph = DependencyGraph()
464
+ for index, token in enumerate(tokens):
465
+ original_graph.nodes[index + 1].update(
466
+ {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
467
+ )
468
+
469
+ b_graph = DependencyGraph()
470
+ c_graph = DependencyGraph()
471
+
472
+ for index, token in enumerate(tokens):
473
+ c_graph.nodes[index + 1].update(
474
+ {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
475
+ )
476
+
477
+ # Assign initial scores to g_graph edges
478
+ self.initialize_edge_scores(g_graph)
479
+ logger.debug(self.scores)
480
+ # Initialize a list of unvisited vertices (by node address)
481
+ unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
482
+ # Iterate over unvisited vertices
483
+ nr_vertices = len(tokens)
484
+ betas = {}
485
+ while unvisited_vertices:
486
+ # Mark current node as visited
487
+ current_vertex = unvisited_vertices.pop(0)
488
+ logger.debug("current_vertex: %s", current_vertex)
489
+ # Get corresponding node n_i to vertex v_i
490
+ current_node = g_graph.get_by_address(current_vertex)
491
+ logger.debug("current_node: %s", current_node)
492
+ # Get best in-edge node b for current node
493
+ best_in_edge = self.best_incoming_arc(current_vertex)
494
+ betas[current_vertex] = self.original_best_arc(current_vertex)
495
+ logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
496
+ # b_graph = Union(b_graph, b)
497
+ for new_vertex in [current_vertex, best_in_edge]:
498
+ b_graph.nodes[new_vertex].update(
499
+ {"word": "TEMP", "rel": "NTOP", "address": new_vertex}
500
+ )
501
+ b_graph.add_arc(best_in_edge, current_vertex)
502
+ # Beta(current node) = b - stored for parse recovery
503
+ # If b_graph contains a cycle, collapse it
504
+ cycle_path = b_graph.contains_cycle()
505
+ if cycle_path:
506
+ # Create a new node v_n+1 with address = len(nodes) + 1
507
+ new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
508
+ # c_graph = Union(c_graph, v_n+1)
509
+ c_graph.add_node(new_node)
510
+ # Collapse all nodes in cycle C into v_n+1
511
+ self.update_edge_scores(new_node, cycle_path)
512
+ self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
513
+ for cycle_index in cycle_path:
514
+ c_graph.add_arc(new_node["address"], cycle_index)
515
+ # self.replaced_by[cycle_index] = new_node['address']
516
+
517
+ self.inner_nodes[new_node["address"]] = cycle_path
518
+
519
+ # Add v_n+1 to list of unvisited vertices
520
+ unvisited_vertices.insert(0, nr_vertices + 1)
521
+
522
+ # increment # of nodes counter
523
+ nr_vertices += 1
524
+
525
+ # Remove cycle nodes from b_graph; B = B - cycle c
526
+ for cycle_node_address in cycle_path:
527
+ b_graph.remove_by_address(cycle_node_address)
528
+
529
+ logger.debug("g_graph: %s", g_graph)
530
+ logger.debug("b_graph: %s", b_graph)
531
+ logger.debug("c_graph: %s", c_graph)
532
+ logger.debug("Betas: %s", betas)
533
+ logger.debug("replaced nodes %s", self.inner_nodes)
534
+
535
+ # Recover parse tree
536
+ logger.debug("Final scores: %s", self.scores)
537
+
538
+ logger.debug("Recovering parse...")
539
+ for i in range(len(tokens) + 1, nr_vertices + 1):
540
+ betas[betas[i][1]] = betas[i]
541
+
542
+ logger.debug("Betas: %s", betas)
543
+ for node in original_graph.nodes.values():
544
+ # TODO: It's dangerous to assume that deps it a dictionary
545
+ # because it's a default dictionary. Ideally, here we should not
546
+ # be concerned how dependencies are stored inside of a dependency
547
+ # graph.
548
+ node["deps"] = {}
549
+ for i in range(1, len(tokens) + 1):
550
+ original_graph.add_arc(betas[i][0], betas[i][1])
551
+
552
+ logger.debug("Done.")
553
+ yield original_graph
554
+
555
+
556
+ #################################################################
557
+ # Rule-based Non-Projective Parser
558
+ #################################################################
559
+
560
+
561
class NonprojectiveDependencyParser:
    """
    A rule-based, non-projective dependency parser.

    Returns every non-projective parse licensed by the word-to-word
    relations of the supplied dependency grammar.  Parse-tree branches
    are allowed to cross, capturing linguistic phenomena a projective
    parser cannot.
    """

    def __init__(self, dependency_grammar):
        """
        Create a new ``NonprojectiveDependencyParser``.

        :param dependency_grammar: a grammar of word-to-word relations.
        :type dependency_grammar: DependencyGrammar
        """
        self._grammar = dependency_grammar

    def parse(self, tokens):
        """
        Parse the input tokens with respect to the parser's grammar.

        The search space is modelled as a fully-connected directed graph;
        arcs leading to ungrammatical parses are pruned and the surviving
        per-word head choices form a lattice of length n (n = number of
        tokens), whose paths are enumerated to produce the set of
        non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        # Graph representation of the tokens.
        self._graph = DependencyGraph()

        for address, word in enumerate(tokens):
            self._graph.nodes[address] = {
                "word": word,
                "deps": [],
                "rel": "NTOP",
                "address": address,
            }

        # Record, for each node, which other nodes it may govern.
        for head_node in self._graph.nodes.values():
            head_node["deps"] = [
                dep_node["address"]
                for dep_node in self._graph.nodes.values()
                if self._grammar.contains(head_node["word"], dep_node["word"])
                and head_node["word"] != dep_node["word"]
            ]

        # Build the lattice of candidate heads for every word; words with
        # no candidate head are potential roots.
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = [
                j
                for j, head in enumerate(tokens)
                if i != j and self._grammar.contains(head, word)
            ]
            if not heads:
                roots.append(i)
            possible_heads.append(heads)

        # With no root candidate at all, try every position as the root.
        if not roots:
            roots = list(range(len(tokens)))

        # Depth-first enumeration of all head assignments in the lattice.
        analyses = []
        for _ in roots:
            stack = []
            analysis = [[] for _ in possible_heads]
            pos = 0
            forward = True
            while pos >= 0:
                if forward:
                    if len(possible_heads[pos]) == 1:
                        analysis[pos] = possible_heads[pos][0]
                    elif len(possible_heads[pos]) == 0:
                        analysis[pos] = -1
                    else:
                        head = possible_heads[pos].pop()
                        analysis[pos] = head
                        stack.append([pos, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        if stack_item[0] == pos:
                            index_on_stack = True
                    orig_length = len(possible_heads[pos])

                    if index_on_stack and orig_length == 0:
                        # Restore the consumed choices before backing up.
                        for j in range(len(stack) - 1, -1, -1):
                            if stack[j][0] == pos:
                                possible_heads[pos].append(stack.pop(j)[1])
                    elif index_on_stack and orig_length > 0:
                        head = possible_heads[pos].pop()
                        analysis[pos] = head
                        stack.append([pos, head])
                        forward = True

                if pos + 1 == len(possible_heads):
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    pos += 1
                else:
                    pos -= 1

        # Keep only analyses with a single root; build a graph for each.
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            graph.root = graph.nodes[analysis.index(-1) + 1]

            for address, (word, head_index) in enumerate(
                zip(tokens, analysis), start=1
            ):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update({"word": word, "address": address})

                rel = "ROOT" if head_address == 0 else ""
                graph.nodes[head_index + 1]["deps"][rel].append(address)

            # TODO: check for cycles
            yield graph
704
+
705
+
706
+ #################################################################
707
+ # Demos
708
+ #################################################################
709
+
710
+
711
def demo():
    """Run the available demonstrations (``hall_demo`` is disabled)."""
    # hall_demo()
    nonprojective_conll_parse_demo()
    rule_based_demo()
715
+
716
+
717
def hall_demo():
    """Demonstrate the probabilistic parser with the toy ``DemoScorer``."""
    parser = ProbabilisticNonprojectiveParser()
    parser.train([], DemoScorer())
    for parse_graph in parser.parse(["v1", "v2", "v3"], [None, None, None]):
        print(parse_graph)
722
+
723
+
724
def nonprojective_conll_parse_demo():
    """Train on the CoNLL sample data and parse a short Dutch sentence."""
    from nltk.parse.dependencygraph import conll_data2

    graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
    npp = ProbabilisticNonprojectiveParser()
    npp.train(graphs, NaiveBayesDependencyScorer())
    # The token list previously held 5 items against 6 tags, silently
    # leaving the final "Punc" tag unused; the missing token ("wild",
    # matching the "Adj" tag) is restored so tokens and tags align
    # one-to-one.
    for parse_graph in npp.parse(
        ["Cathy", "zag", "hen", "wild", "zwaaien", "."],
        ["N", "V", "Pron", "Adj", "N", "Punc"],
    ):
        print(parse_graph)
734
+
735
+
736
def rule_based_demo():
    """Demonstrate the rule-based non-projective parser on a toy grammar."""
    from nltk.grammar import DependencyGrammar

    grammar = DependencyGrammar.fromstring(
        """
    'taught' -> 'play' | 'man'
    'man' -> 'the' | 'in'
    'in' -> 'corner'
    'corner' -> 'the'
    'play' -> 'golf' | 'dachshund' | 'to'
    'dachshund' -> 'his'
    """
    )
    print(grammar)
    ndp = NonprojectiveDependencyParser(grammar)
    sentence = "the man in the corner taught his dachshund to play golf".split()
    graphs = ndp.parse(sentence)
    print("Graphs:")
    for graph in graphs:
        print(graph)
769
+
770
+
771
# Allow running this module directly for a quick demonstration.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/parse/recursivedescent.py ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Recursive Descent Parser
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ from nltk.grammar import Nonterminal
10
+ from nltk.parse.api import ParserI
11
+ from nltk.tree import ImmutableTree, Tree
12
+
13
+
14
+ ##//////////////////////////////////////////////////////
15
+ ## Recursive Descent Parser
16
+ ##//////////////////////////////////////////////////////
17
class RecursiveDescentParser(ParserI):
    """
    A simple top-down CFG parser.

    Parsing proceeds by recursively expanding the fringe of a candidate
    Tree and matching it against the input text.  A "frontier" -- a list
    of tree locations (paths of child indices from the root) -- records
    which subtrees still need expansion and which leaves still need
    matching.  Starting from a tree holding only the start symbol and a
    frontier pointing at its root, the parser repeats:

    - empty frontier, text covered: the tree is a complete parse;
    - empty frontier, text not covered: dead end, no parses;
    - frontier head is a subtree: "expand" it with every applicable CFG
      production, pushing the new children onto the frontier, and recurse;
    - frontier head is a token: "match" it against the next input token,
      drop it from the frontier, and recurse.

    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``RecursiveDescentParser`` that parses with ``grammar``.

        :type grammar: CFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: Tracing verbosity while parsing; ``0`` is silent
            and larger values produce progressively more output.
        """
        self._grammar = grammar
        self._trace = trace

    def grammar(self):
        """Return the grammar used by this parser."""
        return self._grammar

    def parse(self, tokens):
        # Inherit docs from ParserI
        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # Seed the search with a tree holding only the start symbol and
        # a frontier pointing at its root.
        root_tree = Tree(self._grammar.start().symbol(), [])
        frontier = [()]
        if self._trace:
            self._trace_start(root_tree, frontier, tokens)
        return self._parse(tokens, root_tree, frontier)

    def _parse(self, remaining_text, tree, frontier):
        """
        Recursively expand and match the elements of ``tree`` named by
        ``frontier`` so as to cover ``remaining_text``, yielding every
        complete parse found.

        :type tree: Tree
        :param tree: A partial structure for the text being parsed; the
            elements named by ``frontier`` are not yet expanded/matched.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text not yet covered.
        :type frontier: list(tuple(int))
        :param frontier: Locations (in left-to-right order) of all
            unexpanded subtrees and unmatched leaves within ``tree``.
        :rtype: iter(Tree)
        """
        if not frontier:
            if not remaining_text:
                # Tree covers the text and nothing is left to expand:
                # this is a complete parse.
                if self._trace:
                    self._trace_succeed(tree, frontier)
                yield tree
            else:
                # Leftover text with nowhere to put it: dead end.
                if self._trace:
                    self._trace_backtrack(tree, frontier)
        elif isinstance(tree[frontier[0]], Tree):
            # Frontier head is a subtree: expand it.
            yield from self._expand(remaining_text, tree, frontier)
        else:
            # Frontier head is a leaf: match it against the input.
            yield from self._match(remaining_text, tree, frontier)

    def _match(self, rtext, tree, frontier):
        """
        Try to match the frontier's first leaf against ``rtext[0]``.

        On success the token is substituted into a copy of ``tree`` and
        parsing continues with the rest of the frontier; on a mismatch
        nothing is yielded.

        :type tree: Tree
        :param tree: A partial structure for the text being parsed.
        :type rtext: list(str)
        :param rtext: The portion of the text not yet covered.
        :type frontier: list of tuple of int
        :param frontier: Locations of all unexpanded subtrees and
            unmatched leaves within ``tree``.
        :rtype: iter(Tree)
        """
        leaf = tree[frontier[0]]
        if rtext and leaf == rtext[0]:
            # Terminal matches: substitute the token and keep parsing.
            newtree = tree.copy(deep=True)
            newtree[frontier[0]] = rtext[0]
            if self._trace:
                self._trace_match(newtree, frontier[1:], rtext[0])
            yield from self._parse(rtext[1:], newtree, frontier[1:])
        elif self._trace:
            # Non-matching terminal: fail (trace output only).
            self._trace_backtrack(tree, frontier, rtext[:1])

    def _expand(self, remaining_text, tree, frontier, production=None):
        """
        Expand the frontier's first subtree with CFG productions.

        If ``production`` is given only that production is tried;
        otherwise every grammar production whose left-hand side matches
        the subtree's label is applied, and parsing continues from each
        resulting tree with the production's right-hand-side elements
        added to the frontier.

        :type tree: Tree
        :param tree: A partial structure for the text being parsed.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text not yet covered.
        :type frontier: list(tuple(int))
        :param frontier: Locations of all unexpanded subtrees and
            unmatched leaves within ``tree``.
        :rtype: iter(Tree)
        """
        if production is None:
            productions = self._grammar.productions()
        else:
            productions = [production]

        for prod in productions:
            if prod.lhs().symbol() != tree[frontier[0]].label():
                continue
            subtree = self._production_to_tree(prod)
            if frontier[0] == ():
                newtree = subtree
            else:
                newtree = tree.copy(deep=True)
                newtree[frontier[0]] = subtree
            # One new frontier location per RHS element, followed by the
            # remainder of the old frontier.
            new_frontier = [frontier[0] + (i,) for i in range(len(prod.rhs()))]
            if self._trace:
                self._trace_expand(newtree, new_frontier, prod)
            yield from self._parse(
                remaining_text, newtree, new_frontier + frontier[1:]
            )

    def _production_to_tree(self, production):
        """
        Build the childless Tree licensed by ``production``: the node is
        the LHS symbol, each nonterminal on the RHS becomes an empty
        subtree, and each terminal becomes a leaf (to be matched later).

        :param production: The CFG production that licenses the tree.
        :type production: Production
        :rtype: Tree
        """
        children = [
            Tree(elt.symbol(), []) if isinstance(elt, Nonterminal) else elt
            for elt in production.rhs()
        ]
        return Tree(production.lhs().symbol(), children)

    def trace(self, trace=2):
        """
        Set the tracing verbosity used while parsing; ``0`` silences
        tracing and higher values are more verbose.

        :type trace: int
        :rtype: None
        """
        self._trace = trace

    def _trace_fringe(self, tree, treeloc=None):
        """
        Print the fringe (leaves plus childless subtrees) of ``tree``,
        marking the location ``treeloc`` with an asterisk.

        :rtype: None
        """
        if treeloc == ():
            print("*", end=" ")
        if isinstance(tree, Tree):
            if len(tree) == 0:
                print(repr(Nonterminal(tree.label())), end=" ")
            for i in range(len(tree)):
                if treeloc is not None and i == treeloc[0]:
                    self._trace_fringe(tree[i], treeloc[1:])
                else:
                    self._trace_fringe(tree[i])
        else:
            print(repr(tree), end=" ")

    def _trace_tree(self, tree, frontier, operation):
        """
        Print the parser's current state; ``operation`` is a single
        character naming the operation that produced it.

        :rtype: None
        """
        if self._trace == 2:
            print(" %c [" % operation, end=" ")
        else:
            print(" [", end=" ")
        if len(frontier) > 0:
            self._trace_fringe(tree, frontier[0])
        else:
            self._trace_fringe(tree)
        print("]")

    def _trace_start(self, tree, frontier, text):
        print("Parsing %r" % " ".join(text))
        if self._trace > 2:
            print("Start:")
        if self._trace > 1:
            self._trace_tree(tree, frontier, " ")

    def _trace_expand(self, tree, frontier, production):
        if self._trace > 2:
            print("Expand: %s" % production)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "E")

    def _trace_match(self, tree, frontier, tok):
        if self._trace > 2:
            print("Match: %r" % tok)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "M")

    def _trace_succeed(self, tree, frontier):
        if self._trace > 2:
            print("GOOD PARSE:")
        if self._trace == 1:
            print("Found a parse:\n%s" % tree)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "+")

    def _trace_backtrack(self, tree, frontier, toks=None):
        if self._trace > 2:
            if toks:
                print("Backtrack: %r match failed" % toks[0])
            else:
                print("Backtrack")
340
+
341
+
342
+ ##//////////////////////////////////////////////////////
343
+ ## Stepping Recursive Descent Parser
344
+ ##//////////////////////////////////////////////////////
345
class SteppingRecursiveDescentParser(RecursiveDescentParser):
    """
    A ``RecursiveDescentParser`` that allows you to step through the
    parsing process, performing a single operation at a time.

    The ``initialize`` method is used to start parsing a text.
    ``expand`` expands the first element on the frontier using a single
    CFG production, and ``match`` matches the first element on the
    frontier against the next text token. ``backtrack`` undoes the most
    recent expand or match operation. ``step`` performs a single
    expand, match, or backtrack operation. ``parses`` returns the set
    of parses that have been found by the parser.

    :ivar _history: A list of ``(rtext, tree, frontier)`` triples,
        containing the previous states of the parser. This history is
        used to implement the ``backtrack`` operation.
    :ivar _tried_e: A record of all productions that have been tried
        for a given tree. This record is used by ``expand`` to perform
        the next untried production.
    :ivar _tried_m: A record of what tokens have been matched for a
        given tree. This record is used by ``step`` to decide whether
        or not to match a token.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super().__init__(grammar, trace)
        # Current parser state: remaining text, partial tree, and the
        # frontier of tree positions not yet expanded or matched.
        self._rtext = None
        self._tree = None
        self._frontier = [()]
        # Bookkeeping: productions/tokens already tried per (frozen)
        # tree, the state history for backtracking, and the complete
        # parses found so far.
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []

    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
    # something nicer when we get the chance.
    def _freeze(self, tree):
        # Freeze the (mutable) tree into a hashable key for the
        # _tried_e / _tried_m bookkeeping dictionaries.
        c = tree.copy()
        # for pos in c.treepositions('leaves'):
        #    c[pos] = c[pos].freeze()
        return ImmutableTree.convert(c)

    def parse(self, tokens):
        """
        Parse ``tokens`` by repeatedly calling ``step`` until no
        operation remains, then return the parses found.
        """
        tokens = list(tokens)
        self.initialize(tokens)
        while self.step() is not None:
            pass
        return self.parses()

    def initialize(self, tokens):
        """
        Start parsing a given text. This sets the parser's tree to
        the start symbol, its frontier to the root node, and its
        remaining text to ``tokens``.
        """
        self._rtext = tokens
        start = self._grammar.start().symbol()
        self._tree = Tree(start, [])
        self._frontier = [()]
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []
        if self._trace:
            self._trace_start(self._tree, self._frontier, self._rtext)

    def remaining_text(self):
        """
        :return: The portion of the text that is not yet covered by the
            tree.
        :rtype: list(str)
        """
        return self._rtext

    def frontier(self):
        """
        :return: A list of the tree locations of all subtrees that
            have not yet been expanded, and all leaves that have not
            yet been matched.
        :rtype: list(tuple(int))
        """
        return self._frontier

    def tree(self):
        """
        :return: A partial structure for the text that is
            currently being parsed. The elements specified by the
            frontier have not yet been expanded or matched.
        :rtype: Tree
        """
        return self._tree

    def step(self):
        """
        Perform a single parsing operation. If an untried match is
        possible, then perform the match, and return the matched
        token. If an untried expansion is possible, then perform the
        expansion, and return the production that it is based on. If
        backtracking is possible, then backtrack, and return True.
        Otherwise, return None.

        :return: None if no operation was performed; a token if a match
            was performed; a production if an expansion was performed;
            and True if a backtrack operation was performed.
        :rtype: Production or String or bool
        """
        # Try matching (if we haven't already)
        if self.untried_match():
            token = self.match()
            if token is not None:
                return token

        # Try expanding.
        production = self.expand()
        if production is not None:
            return production

        # Try backtracking
        if self.backtrack():
            self._trace_backtrack(self._tree, self._frontier)
            return True

        # Nothing left to do.
        return None

    def expand(self, production=None):
        """
        Expand the first element of the frontier. In particular, if
        the first element of the frontier is a subtree whose node type
        is equal to ``production``'s left hand side, then add a child
        to that subtree for each element of ``production``'s right hand
        side. If ``production`` is not specified, then use the first
        untried expandable production. If all expandable productions
        have been tried, do nothing.

        :return: The production used to expand the frontier, if an
            expansion was performed. If no expansion was performed,
            return None.
        :rtype: Production or None
        """
        # Make sure we *can* expand.
        if len(self._frontier) == 0:
            return None
        if not isinstance(self._tree[self._frontier[0]], Tree):
            return None

        # If they didn't specify a production, check all untried ones.
        if production is None:
            productions = self.untried_expandable_productions()
        else:
            productions = [production]

        for prod in productions:
            # Record that we've tried this production now.
            self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)

            # Try expanding.  _expand (via the overridden _parse stub)
            # yields [1] on success, so the loop body runs at most once.
            for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
                return prod

        # We didn't expand anything.
        return None

    def match(self):
        """
        Match the first element of the frontier. In particular, if
        the first element of the frontier has the same type as the
        next text token, then substitute the text token into the tree.

        :return: The token matched, if a match operation was
            performed. If no match was performed, return None
        :rtype: str or None
        """
        # Nothing to match if the text is exhausted.  (Previously this
        # method indexed self._rtext[0] unconditionally and raised
        # IndexError instead of returning None as documented.)
        if len(self._rtext) == 0:
            return None

        # Record that we've tried matching this token.
        tok = self._rtext[0]
        self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)

        # Make sure we *can* match.
        if len(self._frontier) == 0:
            return None
        if isinstance(self._tree[self._frontier[0]], Tree):
            return None

        for _result in self._match(self._rtext, self._tree, self._frontier):
            # Return the token we just matched.
            return self._history[-1][0][0]
        return None

    def backtrack(self):
        """
        Return the parser to its state before the most recent
        match or expand operation. Calling ``undo`` repeatedly return
        the parser to successively earlier states. If no match or
        expand operations have been performed, ``undo`` will make no
        changes.

        :return: true if an operation was successfully undone.
        :rtype: bool
        """
        if len(self._history) == 0:
            return False
        (self._rtext, self._tree, self._frontier) = self._history.pop()
        return True

    def expandable_productions(self):
        """
        :return: A list of all the productions for which expansions
            are available for the current parser state.
        :rtype: list(Production)
        """
        # Make sure we *can* expand.
        if len(self._frontier) == 0:
            return []
        frontier_child = self._tree[self._frontier[0]]
        if not isinstance(frontier_child, Tree):
            return []

        return [
            p
            for p in self._grammar.productions()
            if p.lhs().symbol() == frontier_child.label()
        ]

    def untried_expandable_productions(self):
        """
        :return: A list of all the untried productions for which
            expansions are available for the current parser state.
        :rtype: list(Production)
        """
        tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
        return [p for p in self.expandable_productions() if p not in tried_expansions]

    def untried_match(self):
        """
        :return: Whether the first element of the frontier is a token
            that has not yet been matched.
        :rtype: bool
        """
        if len(self._rtext) == 0:
            return False
        tried_matches = self._tried_m.get(self._freeze(self._tree), [])
        return self._rtext[0] not in tried_matches

    def currently_complete(self):
        """
        :return: Whether the parser's current state represents a
            complete parse.
        :rtype: bool
        """
        return len(self._frontier) == 0 and len(self._rtext) == 0

    def _parse(self, remaining_text, tree, frontier):
        """
        A stub version of ``_parse`` that sets the parsers current
        state to the given arguments. In ``RecursiveDescentParser``,
        the ``_parse`` method is used to recursively continue parsing a
        text. ``SteppingRecursiveDescentParser`` overrides it to
        capture these recursive calls. It records the parser's old
        state in the history (to allow for backtracking), and updates
        the parser's new state using the given arguments. Finally, it
        returns ``[1]``, which is used by ``match`` and ``expand`` to
        detect whether their operations were successful.

        :return: ``[1]``
        :rtype: list of int
        """
        self._history.append((self._rtext, self._tree, self._frontier))
        self._rtext = remaining_text
        self._tree = tree
        self._frontier = frontier

        # Is it a good parse? If so, record it.
        if len(frontier) == 0 and len(remaining_text) == 0:
            self._parses.append(tree)
            self._trace_succeed(self._tree, self._frontier)

        return [1]

    def parses(self):
        """
        :return: An iterator of the parses that have been found by this
            parser so far.
        :rtype: list of Tree
        """
        return iter(self._parses)

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar
646
+
647
+
648
+ ##//////////////////////////////////////////////////////
649
+ ## Demonstration Code
650
+ ##//////////////////////////////////////////////////////
651
+
652
+
653
def demo():
    """Demonstrate the recursive descent parser on a toy grammar."""

    from nltk import CFG, parse

    demo_grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    # Show the grammar's productions before parsing.
    for production in demo_grammar.productions():
        print(production)

    tokens = "I saw a man in the park".split()
    rd_parser = parse.RecursiveDescentParser(demo_grammar, trace=2)
    for parse_tree in rd_parser.parse(tokens):
        print(parse_tree)
681
+
682
+
683
# Run the demonstration when this module is executed directly.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/parse/shiftreduce.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Shift-Reduce Parser
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ from nltk.grammar import Nonterminal
10
+ from nltk.parse.api import ParserI
11
+ from nltk.tree import Tree
12
+
13
+
14
+ ##//////////////////////////////////////////////////////
15
+ ## Shift/Reduce Parser
16
+ ##//////////////////////////////////////////////////////
17
class ShiftReduceParser(ParserI):
    """
    A simple bottom-up CFG parser that uses two operations, "shift"
    and "reduce", to find a single parse for a text.

    ``ShiftReduceParser`` maintains a stack, which records the
    structure of a portion of the text. This stack is a list of
    strings and Trees that collectively cover a portion of
    the text. For example, while parsing the sentence "the dog saw
    the man" with a typical grammar, ``ShiftReduceParser`` will produce
    the following stack, which covers "the dog saw"::

        [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]

    ``ShiftReduceParser`` attempts to extend the stack to cover the
    entire text, and to combine the stack elements into a single tree,
    producing a complete parse for the sentence.

    Initially, the stack is empty. It is extended to cover the text,
    from left to right, by repeatedly applying two operations:

    - "shift" moves a token from the beginning of the text to the
      end of the stack.
    - "reduce" uses a CFG production to combine the rightmost stack
      elements into a single Tree.

    Often, more than one operation can be performed on a given stack.
    In this case, ``ShiftReduceParser`` uses the following heuristics
    to decide which operation to perform:

    - Only shift if no reductions are available.
    - If multiple reductions are available, then apply the reduction
      whose CFG production is listed earliest in the grammar.

    Note that these heuristics are not guaranteed to choose an
    operation that leads to a parse of the text. Also, if multiple
    parses exists, ``ShiftReduceParser`` will return at most one of
    them.

    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``ShiftReduceParser``, that uses ``grammar`` to
        parse texts.

        :type grammar: Grammar
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text. ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar
        self._trace = trace
        self._check_grammar()

    def grammar(self):
        """:return: The grammar used to parse texts."""
        return self._grammar

    def parse(self, tokens):
        """
        Shift-reduce over ``tokens`` and yield the single parse found,
        if any.

        :rtype: iter(Tree)
        """
        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # initialize the stack.
        stack = []
        remaining_text = tokens

        # Trace output.
        if self._trace:
            print("Parsing %r" % " ".join(tokens))
            self._trace_stack(stack, remaining_text)

        # iterate through the text, pushing the token onto
        # the stack, then reducing the stack.
        while len(remaining_text) > 0:
            self._shift(stack, remaining_text)
            while self._reduce(stack, remaining_text):
                pass

        # Did we reduce everything?
        if len(stack) == 1:
            # Did we end up with the right category?  Guard against a
            # lone unreduced token (a plain string), which has no
            # .label() and would raise AttributeError.
            if isinstance(stack[0], Tree) and stack[0].label() == self._grammar.start().symbol():
                yield stack[0]

    def _shift(self, stack, remaining_text):
        """
        Move a token from the beginning of ``remaining_text`` to the
        end of ``stack``.

        :type stack: list(str and Tree)
        :param stack: A list of strings and Trees, encoding
            the structure of the text that has been parsed so far.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``stack``.
        :rtype: None
        """
        stack.append(remaining_text[0])
        # Delete by index: equivalent to .remove(remaining_text[0])
        # (the first element always equals itself) without the O(n)
        # equality scan.
        del remaining_text[0]
        if self._trace:
            self._trace_shift(stack, remaining_text)

    def _match_rhs(self, rhs, rightmost_stack):
        """
        :rtype: bool
        :return: true if the right hand side of a CFG production
            matches the rightmost elements of the stack. ``rhs``
            matches ``rightmost_stack`` if they are the same length,
            and each element of ``rhs`` matches the corresponding
            element of ``rightmost_stack``. A nonterminal element of
            ``rhs`` matches any Tree whose node value is equal
            to the nonterminal's symbol. A terminal element of ``rhs``
            matches any string whose type is equal to the terminal.
        :type rhs: list(terminal and Nonterminal)
        :param rhs: The right hand side of a CFG production.
        :type rightmost_stack: list(string and Tree)
        :param rightmost_stack: The rightmost elements of the parser's
            stack.
        """
        if len(rightmost_stack) != len(rhs):
            return False
        for i in range(len(rightmost_stack)):
            if isinstance(rightmost_stack[i], Tree):
                # A subtree must line up with a nonterminal of the same symbol.
                if not isinstance(rhs[i], Nonterminal):
                    return False
                if rightmost_stack[i].label() != rhs[i].symbol():
                    return False
            else:
                # A token must line up with an equal terminal.
                if isinstance(rhs[i], Nonterminal):
                    return False
                if rightmost_stack[i] != rhs[i]:
                    return False
        return True

    def _reduce(self, stack, remaining_text, production=None):
        """
        Find a CFG production whose right hand side matches the
        rightmost stack elements; and combine those stack elements
        into a single Tree, with the node specified by the
        production's left-hand side. If more than one CFG production
        matches the stack, then use the production that is listed
        earliest in the grammar. The new Tree replaces the
        elements in the stack.

        :rtype: Production or None
        :return: If a reduction is performed, then return the CFG
            production that the reduction is based on; otherwise,
            return None.
        :type stack: list(string and Tree)
        :param stack: A list of strings and Trees, encoding
            the structure of the text that has been parsed so far.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``stack``.
        """
        if production is None:
            productions = self._grammar.productions()
        else:
            productions = [production]

        # Try each production, in order.
        for production in productions:
            rhslen = len(production.rhs())

            # check if the RHS of a production matches the top of the stack
            if self._match_rhs(production.rhs(), stack[-rhslen:]):

                # combine the tree to reflect the reduction
                tree = Tree(production.lhs().symbol(), stack[-rhslen:])
                stack[-rhslen:] = [tree]

                # We reduced something
                if self._trace:
                    self._trace_reduce(stack, production, remaining_text)
                return production

        # We didn't reduce anything
        return None

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level. A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        # 1: just show shifts.
        # 2: show shifts & reduces
        # 3: display which tokens & productions are shifted/reduced
        self._trace = trace

    def _trace_stack(self, stack, remaining_text, marker=" "):
        """
        Print trace output displaying the given stack and text.

        :rtype: None
        :param marker: A character that is printed to the left of the
            stack. This is used with trace level 2 to print 'S'
            before shifted stacks and 'R' before reduced stacks.
        """
        s = "  " + marker + " [ "
        for elt in stack:
            if isinstance(elt, Tree):
                s += repr(Nonterminal(elt.label())) + " "
            else:
                s += repr(elt) + " "
        s += "* " + " ".join(remaining_text) + "]"
        print(s)

    def _trace_shift(self, stack, remaining_text):
        """
        Print trace output displaying that a token has been shifted.

        :rtype: None
        """
        if self._trace > 2:
            print("Shift %r:" % stack[-1])
        if self._trace == 2:
            self._trace_stack(stack, remaining_text, "S")
        elif self._trace > 0:
            self._trace_stack(stack, remaining_text)

    def _trace_reduce(self, stack, production, remaining_text):
        """
        Print trace output displaying that ``production`` was used to
        reduce ``stack``.

        :rtype: None
        """
        if self._trace > 2:
            # The RHS may contain Nonterminal objects, which str.join
            # rejects; stringify each element first.  (Previously
            # " ".join(production.rhs()) raised TypeError for any
            # non-lexical production.)
            rhs = " ".join(str(elt) for elt in production.rhs())
            print(f"Reduce {production.lhs()!r} <- {rhs}")
        if self._trace == 2:
            self._trace_stack(stack, remaining_text, "R")
        elif self._trace > 1:
            self._trace_stack(stack, remaining_text)

    def _check_grammar(self):
        """
        Check to make sure that all of the CFG productions are
        potentially useful. If any productions can never be used,
        then print a warning.

        :rtype: None
        """
        productions = self._grammar.productions()

        # Any production whose RHS is an extension of another production's RHS
        # will never be used.
        # NOTE(review): only pairs (i, j) with i < j are checked in one
        # direction, so a later production extending an earlier one is
        # not warned about -- confirm whether that is intentional.
        for i in range(len(productions)):
            for j in range(i + 1, len(productions)):
                rhs1 = productions[i].rhs()
                rhs2 = productions[j].rhs()
                if rhs1[: len(rhs2)] == rhs2:
                    print("Warning: %r will never be used" % productions[i])
281
+
282
+
283
+ ##//////////////////////////////////////////////////////
284
+ ## Stepping Shift/Reduce Parser
285
+ ##//////////////////////////////////////////////////////
286
class SteppingShiftReduceParser(ShiftReduceParser):
    """
    A ``ShiftReduceParser`` that allows you to step through the parsing
    process, performing a single operation at a time. It also allows
    you to change the parser's grammar midway through parsing a text.

    The ``initialize`` method is used to start parsing a text.
    ``shift`` performs a single shift operation, and ``reduce`` performs
    a single reduce operation. ``step`` will perform a single reduce
    operation if possible; otherwise, it will perform a single shift
    operation. ``parses`` returns the set of parses that have been
    found by the parser.

    :ivar _history: A list of ``(stack, remaining_text)`` pairs,
        containing all of the previous states of the parser. This
        history is used to implement the ``undo`` operation.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super().__init__(grammar, trace)
        # Current parser state; populated by initialize().
        self._stack = None
        self._remaining_text = None
        self._history = []

    def parse(self, tokens):
        """
        Parse ``tokens`` by repeatedly calling ``step`` until no
        operation remains, then return the parses found.
        """
        tokens = list(tokens)
        self.initialize(tokens)
        while self.step():
            pass
        return self.parses()

    def stack(self):
        """
        :return: The parser's stack.
        :rtype: list(str and Tree)
        """
        return self._stack

    def remaining_text(self):
        """
        :return: The portion of the text that is not yet covered by the
            stack.
        :rtype: list(str)
        """
        return self._remaining_text

    def initialize(self, tokens):
        """
        Start parsing a given text. This sets the parser's stack to
        ``[]`` and sets its remaining text to ``tokens``.
        """
        self._stack = []
        self._remaining_text = tokens
        self._history = []

    def step(self):
        """
        Perform a single parsing operation. If a reduction is
        possible, then perform that reduction, and return the
        production that it is based on. Otherwise, if a shift is
        possible, then perform it, and return True. Otherwise,
        return False.

        :return: False if no operation was performed; True if a shift was
            performed; and the CFG production used to reduce if a
            reduction was performed.
        :rtype: Production or bool
        """
        return self.reduce() or self.shift()

    def shift(self):
        """
        Move a token from the beginning of the remaining text to the
        end of the stack. If there are no more tokens in the
        remaining text, then do nothing.

        :return: True if the shift operation was successful.
        :rtype: bool
        """
        if len(self._remaining_text) == 0:
            return False
        # Snapshot the state (copies!) so undo() can restore it.
        self._history.append((self._stack[:], self._remaining_text[:]))
        self._shift(self._stack, self._remaining_text)
        return True

    def reduce(self, production=None):
        """
        Use ``production`` to combine the rightmost stack elements into
        a single Tree. If ``production`` does not match the
        rightmost stack elements, then do nothing.

        :return: The production used to reduce the stack, if a
            reduction was performed. If no reduction was performed,
            return None.

        :rtype: Production or None
        """
        # Snapshot first; discard the snapshot if nothing was reduced.
        self._history.append((self._stack[:], self._remaining_text[:]))
        return_val = self._reduce(self._stack, self._remaining_text, production)

        if not return_val:
            self._history.pop()
        return return_val

    def undo(self):
        """
        Return the parser to its state before the most recent
        shift or reduce operation. Calling ``undo`` repeatedly return
        the parser to successively earlier states. If no shift or
        reduce operations have been performed, ``undo`` will make no
        changes.

        :return: true if an operation was successfully undone.
        :rtype: bool
        """
        if len(self._history) == 0:
            return False
        (self._stack, self._remaining_text) = self._history.pop()
        return True

    def reducible_productions(self):
        """
        :return: A list of the productions for which reductions are
            available for the current parser state.
        :rtype: list(Production)
        """
        productions = []
        for production in self._grammar.productions():
            rhslen = len(production.rhs())
            if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
                productions.append(production)
        return productions

    def parses(self):
        """
        :return: An iterator of the parses that have been found by this
            parser so far.
        :rtype: iter(Tree)
        """
        # The isinstance check guards against a lone unreduced token (a
        # plain string), which has no .label() and previously raised
        # AttributeError here.
        if (
            len(self._remaining_text) == 0
            and len(self._stack) == 1
            and isinstance(self._stack[0], Tree)
            and self._stack[0].label() == self._grammar.start().symbol()
        ):
            yield self._stack[0]

    # copied from nltk.parser

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar
443
+
444
+
445
+ ##//////////////////////////////////////////////////////
446
+ ## Demonstration Code
447
+ ##//////////////////////////////////////////////////////
448
+
449
+
450
def demo():
    """Demonstrate the shift-reduce parser on a toy grammar."""

    from nltk import CFG, parse

    demo_grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    tokens = "I saw a man in the park".split()

    sr_parser = parse.ShiftReduceParser(demo_grammar, trace=2)
    for parse_tree in sr_parser.parse(tokens):
        print(parse_tree)
476
+
477
+
478
# Run the demonstration when this module is executed directly.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/parse/stanford.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Interface to the Stanford Parser
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Xu <xxu@student.unimelb.edu.au>
5
+ #
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import os
10
+ import tempfile
11
+ import warnings
12
+ from subprocess import PIPE
13
+
14
+ from nltk.internals import (
15
+ _java_options,
16
+ config_java,
17
+ find_jar_iter,
18
+ find_jars_within_path,
19
+ java,
20
+ )
21
+ from nltk.parse.api import ParserI
22
+ from nltk.parse.dependencygraph import DependencyGraph
23
+ from nltk.tree import Tree
24
+
25
+ _stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
26
+
27
+
28
class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser"""

    # Regex for locating a versioned models jar on disk,
    # e.g. ``stanford-parser-3.9.2-models.jar``.
    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
    # Regex for the main parser code jar.
    _JAR = r"stanford-parser\.jar"
    # Java entry point invoked for every parse request.
    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

    # Subclass switches: whether input is fed via stdin (usage not
    # visible in this class -- confirm in subclasses), and whether the
    # parser's output separates individual trees with blank lines
    # (consumed by _parse_trees_output).
    _USE_STDIN = False
    _DOUBLE_SPACED_OUTPUT = False
37
+
38
+ def __init__(
39
+ self,
40
+ path_to_jar=None,
41
+ path_to_models_jar=None,
42
+ model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
43
+ encoding="utf8",
44
+ verbose=False,
45
+ java_options="-mx4g",
46
+ corenlp_options="",
47
+ ):
48
+
49
+ # find the most recent code and model jar
50
+ stanford_jar = max(
51
+ find_jar_iter(
52
+ self._JAR,
53
+ path_to_jar,
54
+ env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
55
+ searchpath=(),
56
+ url=_stanford_url,
57
+ verbose=verbose,
58
+ is_regex=True,
59
+ ),
60
+ key=lambda model_path: os.path.dirname(model_path),
61
+ )
62
+
63
+ model_jar = max(
64
+ find_jar_iter(
65
+ self._MODEL_JAR_PATTERN,
66
+ path_to_models_jar,
67
+ env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
68
+ searchpath=(),
69
+ url=_stanford_url,
70
+ verbose=verbose,
71
+ is_regex=True,
72
+ ),
73
+ key=lambda model_path: os.path.dirname(model_path),
74
+ )
75
+
76
+ # self._classpath = (stanford_jar, model_jar)
77
+
78
+ # Adding logging jar files to classpath
79
+ stanford_dir = os.path.split(stanford_jar)[0]
80
+ self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
81
+
82
+ self.model_path = model_path
83
+ self._encoding = encoding
84
+ self.corenlp_options = corenlp_options
85
+ self.java_options = java_options
86
+
87
+ def _parse_trees_output(self, output_):
88
+ res = []
89
+ cur_lines = []
90
+ cur_trees = []
91
+ blank = False
92
+ for line in output_.splitlines(False):
93
+ if line == "":
94
+ if blank:
95
+ res.append(iter(cur_trees))
96
+ cur_trees = []
97
+ blank = False
98
+ elif self._DOUBLE_SPACED_OUTPUT:
99
+ cur_trees.append(self._make_tree("\n".join(cur_lines)))
100
+ cur_lines = []
101
+ blank = True
102
+ else:
103
+ res.append(iter([self._make_tree("\n".join(cur_lines))]))
104
+ cur_lines = []
105
+ else:
106
+ cur_lines.append(line)
107
+ blank = False
108
+ return iter(res)
109
+
110
+ def parse_sents(self, sentences, verbose=False):
111
+ """
112
+ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
113
+ list where each sentence is a list of words.
114
+ Each sentence will be automatically tagged with this StanfordParser instance's
115
+ tagger.
116
+ If whitespaces exists inside a token, then the token will be treated as
117
+ separate tokens.
118
+
119
+ :param sentences: Input sentences to parse
120
+ :type sentences: list(list(str))
121
+ :rtype: iter(iter(Tree))
122
+ """
123
+ cmd = [
124
+ self._MAIN_CLASS,
125
+ "-model",
126
+ self.model_path,
127
+ "-sentences",
128
+ "newline",
129
+ "-outputFormat",
130
+ self._OUTPUT_FORMAT,
131
+ "-tokenized",
132
+ "-escaper",
133
+ "edu.stanford.nlp.process.PTBEscapingProcessor",
134
+ ]
135
+ return self._parse_trees_output(
136
+ self._execute(
137
+ cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
138
+ )
139
+ )
140
+
141
+ def raw_parse(self, sentence, verbose=False):
142
+ """
143
+ Use StanfordParser to parse a sentence. Takes a sentence as a string;
144
+ before parsing, it will be automatically tokenized and tagged by
145
+ the Stanford Parser.
146
+
147
+ :param sentence: Input sentence to parse
148
+ :type sentence: str
149
+ :rtype: iter(Tree)
150
+ """
151
+ return next(self.raw_parse_sents([sentence], verbose))
152
+
153
+ def raw_parse_sents(self, sentences, verbose=False):
154
+ """
155
+ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
156
+ list of strings.
157
+ Each sentence will be automatically tokenized and tagged by the Stanford Parser.
158
+
159
+ :param sentences: Input sentences to parse
160
+ :type sentences: list(str)
161
+ :rtype: iter(iter(Tree))
162
+ """
163
+ cmd = [
164
+ self._MAIN_CLASS,
165
+ "-model",
166
+ self.model_path,
167
+ "-sentences",
168
+ "newline",
169
+ "-outputFormat",
170
+ self._OUTPUT_FORMAT,
171
+ ]
172
+ return self._parse_trees_output(
173
+ self._execute(cmd, "\n".join(sentences), verbose)
174
+ )
175
+
176
+ def tagged_parse(self, sentence, verbose=False):
177
+ """
178
+ Use StanfordParser to parse a sentence. Takes a sentence as a list of
179
+ (word, tag) tuples; the sentence must have already been tokenized and
180
+ tagged.
181
+
182
+ :param sentence: Input sentence to parse
183
+ :type sentence: list(tuple(str, str))
184
+ :rtype: iter(Tree)
185
+ """
186
+ return next(self.tagged_parse_sents([sentence], verbose))
187
+
188
+ def tagged_parse_sents(self, sentences, verbose=False):
189
+ """
190
+ Use StanfordParser to parse multiple sentences. Takes multiple sentences
191
+ where each sentence is a list of (word, tag) tuples.
192
+ The sentences must have already been tokenized and tagged.
193
+
194
+ :param sentences: Input sentences to parse
195
+ :type sentences: list(list(tuple(str, str)))
196
+ :rtype: iter(iter(Tree))
197
+ """
198
+ tag_separator = "/"
199
+ cmd = [
200
+ self._MAIN_CLASS,
201
+ "-model",
202
+ self.model_path,
203
+ "-sentences",
204
+ "newline",
205
+ "-outputFormat",
206
+ self._OUTPUT_FORMAT,
207
+ "-tokenized",
208
+ "-tagSeparator",
209
+ tag_separator,
210
+ "-tokenizerFactory",
211
+ "edu.stanford.nlp.process.WhitespaceTokenizer",
212
+ "-tokenizerMethod",
213
+ "newCoreLabelTokenizerFactory",
214
+ ]
215
+ # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
216
+ return self._parse_trees_output(
217
+ self._execute(
218
+ cmd,
219
+ "\n".join(
220
+ " ".join(tag_separator.join(tagged) for tagged in sentence)
221
+ for sentence in sentences
222
+ ),
223
+ verbose,
224
+ )
225
+ )
226
+
227
+ def _execute(self, cmd, input_, verbose=False):
228
+ encoding = self._encoding
229
+ cmd.extend(["-encoding", encoding])
230
+ if self.corenlp_options:
231
+ cmd.extend(self.corenlp_options.split())
232
+
233
+ default_options = " ".join(_java_options)
234
+
235
+ # Configure java.
236
+ config_java(options=self.java_options, verbose=verbose)
237
+
238
+ # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
239
+ with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
240
+ # Write the actual sentences to the temporary input file
241
+ if isinstance(input_, str) and encoding:
242
+ input_ = input_.encode(encoding)
243
+ input_file.write(input_)
244
+ input_file.flush()
245
+
246
+ # Run the tagger and get the output.
247
+ if self._USE_STDIN:
248
+ input_file.seek(0)
249
+ stdout, stderr = java(
250
+ cmd,
251
+ classpath=self._classpath,
252
+ stdin=input_file,
253
+ stdout=PIPE,
254
+ stderr=PIPE,
255
+ )
256
+ else:
257
+ cmd.append(input_file.name)
258
+ stdout, stderr = java(
259
+ cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
260
+ )
261
+
262
+ stdout = stdout.replace(b"\xc2\xa0", b" ")
263
+ stdout = stdout.replace(b"\x00\xa0", b" ")
264
+ stdout = stdout.decode(encoding)
265
+
266
+ os.unlink(input_file.name)
267
+
268
+ # Return java configurations to their default values.
269
+ config_java(options=default_options, verbose=False)
270
+
271
+ return stdout
272
+
273
+
274
class StanfordParser(GenericStanfordParser):
    """
    Interface to the Stanford constituency parser.  Deprecated in favour of
    ``nltk.parse.corenlp.CoreNLPParser``.

    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
    """

    # Penn-bracketed trees are the constituency output format.
    _OUTPUT_FORMAT = "penn"

    def __init__(self, *args, **kwargs):
        # Warn at the caller's frame (stacklevel=2) so users see their own line.
        warnings.warn(
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # Penn-bracketed output parses directly into an nltk Tree.
        return Tree.fromstring(result)
339
+
340
+
341
class StanfordDependencyParser(GenericStanfordParser):
    """
    Interface to the Stanford dependency parser.  Deprecated in favour of
    ``nltk.parse.corenlp.CoreNLPDependencyParser``.

    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
    """

    # CoNLL 2007 columns carry the dependency structure.
    _OUTPUT_FORMAT = "conll2007"

    def __init__(self, *args, **kwargs):
        # Warn at the caller's frame (stacklevel=2) so users see their own line.
        warnings.warn(
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        # CoNLL output parses into a DependencyGraph rooted at "root".
        return DependencyGraph(result, top_relation_label="root")
405
+
406
+
407
class StanfordNeuralDependencyParser(GenericStanfordParser):
    """
    Interface to the Stanford neural-network dependency parser, driven
    through the CoreNLP pipeline.  Deprecated in favour of
    ``nltk.parse.corenlp.CoreNLPDependencyParser``.

    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
    """

    _OUTPUT_FORMAT = "conll"
    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
    # The CoreNLP pipeline reads its input from stdin and double-spaces
    # its per-sentence output.
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)
        # Fix: join with a separating space.  The old bare ``+=`` glued the
        # annotator flag onto any user-supplied corenlp_options (e.g.
        # "-foo" became "-foo-annotators ..."), corrupting the command line
        # built later by ``self.corenlp_options.split()``.
        annotator_options = "-annotators tokenize,ssplit,pos,depparse"
        if self.corenlp_options:
            self.corenlp_options += " " + annotator_options
        else:
            self.corenlp_options = annotator_options

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        """
        raise NotImplementedError(
            "tagged_parse[_sents] is not supported by "
            "StanfordNeuralDependencyParser; use "
            "parse[_sents] or raw_parse[_sents] instead."
        )

    def _make_tree(self, result):
        # CoreNLP's CoNLL output uses the upper-case "ROOT" relation label.
        return DependencyGraph(result, top_relation_label="ROOT")
.eggs/nltk-3.8-py3.10.egg/nltk/parse/transitionparser.py ADDED
@@ -0,0 +1,794 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
2
+ #
3
+ # Author: Long Duong <longdt219@gmail.com>
4
+ #
5
+ # Copyright (C) 2001-2022 NLTK Project
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import pickle
10
+ import tempfile
11
+ from copy import deepcopy
12
+ from operator import itemgetter
13
+ from os import remove
14
+
15
+ try:
16
+ from numpy import array
17
+ from scipy import sparse
18
+ from sklearn import svm
19
+ from sklearn.datasets import load_svmlight_file
20
+ except ImportError:
21
+ pass
22
+
23
+ from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
24
+
25
+
26
class Configuration:
    """
    A parser configuration: the partial analysis of one input sentence.

    A transition-based parser searches for the sequence of transitions that
    maps the initial configuration to a terminal one.  A configuration
    holds:

    - ``stack``: indices of partially processed words (0 is the root)
    - ``buffer``: indices of the remaining input words
    - ``arcs``: dependency arcs built so far, as (head, relation, child)

    It can also render itself as a list of feature strings.
    """

    def __init__(self, dep_graph):
        """
        :param dep_graph: the input sentence as a dependency graph whose
            dependencies are not yet specified.
        :type dep_graph: DependencyGraph
        """
        # dep_graph.nodes maps addresses to token dicts for one sentence.
        self.stack = [0]  # start with only the artificial root
        self.buffer = list(range(1, len(dep_graph.nodes)))  # all real words
        self.arcs = []  # no arcs built yet
        self._tokens = dep_graph.nodes
        self._max_address = len(self.buffer)

    def __str__(self):
        parts = (
            "Stack : ",
            str(self.stack),
            " Buffer : ",
            str(self.buffer),
            " Arcs : ",
            str(self.arcs),
        )
        return "".join(parts)

    def _check_informative(self, feat, flag=False):
        """
        Return True when *feat* carries usable information.

        ``flag`` controls whether the placeholder "_" counts as informative.
        """
        if feat is None:
            return False
        if feat == "":
            return False
        if flag is False and feat == "_":
            return False
        return True

    def _dependents(self, head_idx):
        """Return the (left-most, right-most) dependency relations of
        *head_idx* among the arcs built so far ("" when absent)."""
        left_most = 1000000
        right_most = -1
        dep_left = ""
        dep_right = ""
        for (head, rel, child) in self.arcs:
            if head == head_idx:
                if child > head and child > right_most:
                    right_most = child
                    dep_right = rel
                if child < head and child < left_most:
                    left_most = child
                    dep_left = rel
        return dep_left, dep_right

    def extract_features(self):
        """
        Extract the feature strings for the current configuration.

        Implements the standard feature model of Table 3.2 (page 31) in
        "Dependency Parsing" by Kubler, McDonald and Nivre.  These features
        are deliberately basic.

        :return: list(str)
        """
        # TODO: a richer feature set would improve accuracy.
        features = []

        if self.stack:
            top = self.stack[-1]
            token = self._tokens[top]
            if self._check_informative(token["word"], True):
                features.append("STK_0_FORM_" + token["word"])
            if "lemma" in token and self._check_informative(token["lemma"]):
                features.append("STK_0_LEMMA_" + token["lemma"])
            if self._check_informative(token["tag"]):
                features.append("STK_0_POS_" + token["tag"])
            if "feats" in token and self._check_informative(token["feats"]):
                features.extend(
                    "STK_0_FEATS_" + feat for feat in token["feats"].split("|")
                )
            # Second item on the stack, when present.
            if len(self.stack) > 1:
                second_token = self._tokens[self.stack[-2]]
                if self._check_informative(second_token["tag"]):
                    features.append("STK_1_POS_" + second_token["tag"])

            # Left-most / right-most dependents of the stack top.
            dep_left, dep_right = self._dependents(top)
            if self._check_informative(dep_left):
                features.append("STK_0_LDEP_" + dep_left)
            if self._check_informative(dep_right):
                features.append("STK_0_RDEP_" + dep_right)

        if self.buffer:
            front = self.buffer[0]
            token = self._tokens[front]
            if self._check_informative(token["word"], True):
                features.append("BUF_0_FORM_" + token["word"])
            if "lemma" in token and self._check_informative(token["lemma"]):
                features.append("BUF_0_LEMMA_" + token["lemma"])
            if self._check_informative(token["tag"]):
                features.append("BUF_0_POS_" + token["tag"])
            if "feats" in token and self._check_informative(token["feats"]):
                features.extend(
                    "BUF_0_FEATS_" + feat for feat in token["feats"].split("|")
                )
            # Look-ahead tokens in the buffer.
            if len(self.buffer) > 1:
                tok1 = self._tokens[self.buffer[1]]
                if self._check_informative(tok1["word"], True):
                    features.append("BUF_1_FORM_" + tok1["word"])
                if self._check_informative(tok1["tag"]):
                    features.append("BUF_1_POS_" + tok1["tag"])
            if len(self.buffer) > 2:
                tok2 = self._tokens[self.buffer[2]]
                if self._check_informative(tok2["tag"]):
                    features.append("BUF_2_POS_" + tok2["tag"])
            if len(self.buffer) > 3:
                tok3 = self._tokens[self.buffer[3]]
                if self._check_informative(tok3["tag"]):
                    features.append("BUF_3_POS_" + tok3["tag"])

            # Left-most / right-most dependents of the buffer front.
            dep_left, dep_right = self._dependents(front)
            if self._check_informative(dep_left):
                features.append("BUF_0_LDEP_" + dep_left)
            if self._check_informative(dep_right):
                features.append("BUF_0_RDEP_" + dep_right)

        return features
177
+
178
+
179
class Transition:
    """
    The set of transitions that map one parser configuration to another.

    Which transitions are legal (and what RIGHTARC does) depends on the
    parsing algorithm: arc-standard or arc-eager.
    """

    # Transition names, used as (prefixes of) classifier labels.
    LEFT_ARC = "LEFTARC"
    RIGHT_ARC = "RIGHTARC"
    SHIFT = "SHIFT"
    REDUCE = "REDUCE"

    def __init__(self, alg_option):
        """
        :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
        :type alg_option: str
        """
        self._algo = alg_option
        supported = (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
        if alg_option not in supported:
            raise ValueError(" Currently we only support %s and %s " % supported)

    def left_arc(self, conf, relation):
        """
        Add an arc from the buffer front to the stack top and pop the stack.

        The precondition differs between algorithms: arc-eager additionally
        requires that the stack top does not already have a head.

        :param conf: the current configuration, modified in place
        :return: None on success, or -1 if a precondition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if conf.buffer[0] == 0:
            # Never attach the artificial root as a dependent.
            return -1

        top = conf.stack[-1]

        allowed = True
        if self._algo == TransitionParser.ARC_EAGER:
            # Arc-eager: the stack top must be head-less.
            for (_head, _rel, child) in conf.arcs:
                if child == top:
                    allowed = False

        if not allowed:
            return -1
        conf.stack.pop()
        conf.arcs.append((conf.buffer[0], relation, top))

    def right_arc(self, conf, relation):
        """
        Add an arc from the stack top to the buffer front.

        The mechanics DIFFER between algorithms: arc-standard pops the
        stack top and puts it back at the front of the buffer; arc-eager
        shifts the buffer front onto the stack.

        :param conf: the current configuration, modified in place
        :return: None on success, or -1 if a precondition is not satisfied
        """
        if not conf.buffer or not conf.stack:
            return -1
        if self._algo == TransitionParser.ARC_STANDARD:
            head = conf.stack.pop()
            child = conf.buffer[0]
            conf.buffer[0] = head
            conf.arcs.append((head, relation, child))
        else:  # arc-eager
            head = conf.stack[-1]
            child = conf.buffer.pop(0)
            conf.stack.append(child)
            conf.arcs.append((head, relation, child))

    def reduce(self, conf):
        """
        Pop the stack top; only available for arc-eager, and only when the
        stack top already has a head.

        :param conf: the current configuration, modified in place
        :return: None on success, or -1 if a precondition is not satisfied
        """
        if self._algo != TransitionParser.ARC_EAGER:
            return -1
        if not conf.stack:
            return -1

        top = conf.stack[-1]
        has_head = False
        for (_head, _rel, child) in conf.arcs:
            if child == top:
                has_head = True
        if not has_head:
            return -1
        conf.stack.pop()  # reduce it

    def shift(self, conf):
        """
        Move the buffer front onto the stack; the SAME for both algorithms.

        :param conf: the current configuration, modified in place
        :return: None on success, or -1 if the buffer is empty
        """
        if not conf.buffer:
            return -1
        conf.stack.append(conf.buffer.pop(0))
+
289
+
290
+ class TransitionParser(ParserI):
291
+
292
+ """
293
+ Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager"
294
+ """
295
+
296
+ ARC_STANDARD = "arc-standard"
297
+ ARC_EAGER = "arc-eager"
298
+
299
+ def __init__(self, algorithm):
300
+ """
301
+ :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm
302
+ :type algorithm: str
303
+ """
304
+ if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
305
+ raise ValueError(
306
+ " Currently we only support %s and %s "
307
+ % (self.ARC_STANDARD, self.ARC_EAGER)
308
+ )
309
+ self._algorithm = algorithm
310
+
311
+ self._dictionary = {}
312
+ self._transition = {}
313
+ self._match_transition = {}
314
+
315
+ def _get_dep_relation(self, idx_parent, idx_child, depgraph):
316
+ p_node = depgraph.nodes[idx_parent]
317
+ c_node = depgraph.nodes[idx_child]
318
+
319
+ if c_node["word"] is None:
320
+ return None # Root word
321
+
322
+ if c_node["head"] == p_node["address"]:
323
+ return c_node["rel"]
324
+ else:
325
+ return None
326
+
327
+ def _convert_to_binary_features(self, features):
328
+ """
329
+ :param features: list of feature string which is needed to convert to binary features
330
+ :type features: list(str)
331
+ :return : string of binary features in libsvm format which is 'featureID:value' pairs
332
+ """
333
+ unsorted_result = []
334
+ for feature in features:
335
+ self._dictionary.setdefault(feature, len(self._dictionary))
336
+ unsorted_result.append(self._dictionary[feature])
337
+
338
+ # Default value of each feature is 1.0
339
+ return " ".join(
340
+ str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
341
+ )
342
+
343
+ def _is_projective(self, depgraph):
344
+ arc_list = []
345
+ for key in depgraph.nodes:
346
+ node = depgraph.nodes[key]
347
+
348
+ if "head" in node:
349
+ childIdx = node["address"]
350
+ parentIdx = node["head"]
351
+ if parentIdx is not None:
352
+ arc_list.append((parentIdx, childIdx))
353
+
354
+ for (parentIdx, childIdx) in arc_list:
355
+ # Ensure that childIdx < parentIdx
356
+ if childIdx > parentIdx:
357
+ temp = childIdx
358
+ childIdx = parentIdx
359
+ parentIdx = temp
360
+ for k in range(childIdx + 1, parentIdx):
361
+ for m in range(len(depgraph.nodes)):
362
+ if (m < childIdx) or (m > parentIdx):
363
+ if (k, m) in arc_list:
364
+ return False
365
+ if (m, k) in arc_list:
366
+ return False
367
+ return True
368
+
369
+ def _write_to_file(self, key, binary_features, input_file):
370
+ """
371
+ write the binary features to input file and update the transition dictionary
372
+ """
373
+ self._transition.setdefault(key, len(self._transition) + 1)
374
+ self._match_transition[self._transition[key]] = key
375
+
376
+ input_str = str(self._transition[key]) + " " + binary_features + "\n"
377
+ input_file.write(input_str.encode("utf-8"))
378
+
379
+ def _create_training_examples_arc_std(self, depgraphs, input_file):
380
+ """
381
+ Create the training example in the libsvm format and write it to the input_file.
382
+ Reference : Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009)
383
+ """
384
+ operation = Transition(self.ARC_STANDARD)
385
+ count_proj = 0
386
+ training_seq = []
387
+
388
+ for depgraph in depgraphs:
389
+ if not self._is_projective(depgraph):
390
+ continue
391
+
392
+ count_proj += 1
393
+ conf = Configuration(depgraph)
394
+ while len(conf.buffer) > 0:
395
+ b0 = conf.buffer[0]
396
+ features = conf.extract_features()
397
+ binary_features = self._convert_to_binary_features(features)
398
+
399
+ if len(conf.stack) > 0:
400
+ s0 = conf.stack[len(conf.stack) - 1]
401
+ # Left-arc operation
402
+ rel = self._get_dep_relation(b0, s0, depgraph)
403
+ if rel is not None:
404
+ key = Transition.LEFT_ARC + ":" + rel
405
+ self._write_to_file(key, binary_features, input_file)
406
+ operation.left_arc(conf, rel)
407
+ training_seq.append(key)
408
+ continue
409
+
410
+ # Right-arc operation
411
+ rel = self._get_dep_relation(s0, b0, depgraph)
412
+ if rel is not None:
413
+ precondition = True
414
+ # Get the max-index of buffer
415
+ maxID = conf._max_address
416
+
417
+ for w in range(maxID + 1):
418
+ if w != b0:
419
+ relw = self._get_dep_relation(b0, w, depgraph)
420
+ if relw is not None:
421
+ if (b0, relw, w) not in conf.arcs:
422
+ precondition = False
423
+
424
+ if precondition:
425
+ key = Transition.RIGHT_ARC + ":" + rel
426
+ self._write_to_file(key, binary_features, input_file)
427
+ operation.right_arc(conf, rel)
428
+ training_seq.append(key)
429
+ continue
430
+
431
+ # Shift operation as the default
432
+ key = Transition.SHIFT
433
+ self._write_to_file(key, binary_features, input_file)
434
+ operation.shift(conf)
435
+ training_seq.append(key)
436
+
437
+ print(" Number of training examples : " + str(len(depgraphs)))
438
+ print(" Number of valid (projective) examples : " + str(count_proj))
439
+ return training_seq
440
+
441
    def _create_training_examples_arc_eager(self, depgraphs, input_file):
        """
        Create the training examples in the libsvm format and write them to ``input_file``.

        Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre

        :param depgraphs: gold-standard dependency graphs; non-projective ones are skipped
        :param input_file: open, writable file object that receives the libsvm rows
        :return: list of oracle transition names, in the order they were taken
        """
        operation = Transition(self.ARC_EAGER)
        countProj = 0  # number of projective (i.e. usable) training graphs
        training_seq = []

        for depgraph in depgraphs:
            # The static oracle below is only valid for projective trees.
            if not self._is_projective(depgraph):
                continue

            countProj += 1
            conf = Configuration(depgraph)
            while len(conf.buffer) > 0:
                b0 = conf.buffer[0]  # first token of the buffer
                features = conf.extract_features()
                binary_features = self._convert_to_binary_features(features)

                if len(conf.stack) > 0:
                    s0 = conf.stack[len(conf.stack) - 1]  # top of the stack
                    # Left-arc operation: the gold head of s0 is b0
                    rel = self._get_dep_relation(b0, s0, depgraph)
                    if rel is not None:
                        key = Transition.LEFT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.left_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Right-arc operation: the gold head of b0 is s0
                    rel = self._get_dep_relation(s0, b0, depgraph)
                    if rel is not None:
                        key = Transition.RIGHT_ARC + ":" + rel
                        self._write_to_file(key, binary_features, input_file)
                        operation.right_arc(conf, rel)
                        training_seq.append(key)
                        continue

                    # Reduce operation: pop s0 when b0 still has a gold
                    # dependency (in either direction) with some token
                    # strictly to the left of s0.
                    flag = False
                    for k in range(s0):
                        if self._get_dep_relation(k, b0, depgraph) is not None:
                            flag = True
                        if self._get_dep_relation(b0, k, depgraph) is not None:
                            flag = True
                    if flag:
                        key = Transition.REDUCE
                        self._write_to_file(key, binary_features, input_file)
                        operation.reduce(conf)
                        training_seq.append(key)
                        continue

                # Shift operation as the default
                key = Transition.SHIFT
                self._write_to_file(key, binary_features, input_file)
                operation.shift(conf)
                training_seq.append(key)

        print(" Number of training examples : " + str(len(depgraphs)))
        print(" Number of valid (projective) examples : " + str(countProj))
        return training_seq
504
+
505
+ def train(self, depgraphs, modelfile, verbose=True):
506
+ """
507
+ :param depgraphs : list of DependencyGraph as the training data
508
+ :type depgraphs : DependencyGraph
509
+ :param modelfile : file name to save the trained model
510
+ :type modelfile : str
511
+ """
512
+
513
+ try:
514
+ input_file = tempfile.NamedTemporaryFile(
515
+ prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
516
+ )
517
+
518
+ if self._algorithm == self.ARC_STANDARD:
519
+ self._create_training_examples_arc_std(depgraphs, input_file)
520
+ else:
521
+ self._create_training_examples_arc_eager(depgraphs, input_file)
522
+
523
+ input_file.close()
524
+ # Using the temporary file to train the libsvm classifier
525
+ x_train, y_train = load_svmlight_file(input_file.name)
526
+ # The parameter is set according to the paper:
527
+ # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
528
+ # Todo : because of probability = True => very slow due to
529
+ # cross-validation. Need to improve the speed here
530
+ model = svm.SVC(
531
+ kernel="poly",
532
+ degree=2,
533
+ coef0=0,
534
+ gamma=0.2,
535
+ C=0.5,
536
+ verbose=verbose,
537
+ probability=True,
538
+ )
539
+
540
+ model.fit(x_train, y_train)
541
+ # Save the model to file name (as pickle)
542
+ pickle.dump(model, open(modelfile, "wb"))
543
+ finally:
544
+ remove(input_file.name)
545
+
546
+ def parse(self, depgraphs, modelFile):
547
+ """
548
+ :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy
549
+ :type depgraphs: list(DependencyGraph)
550
+ :param modelfile: the model file
551
+ :type modelfile: str
552
+ :return: list (DependencyGraph) with the 'head' and 'rel' information
553
+ """
554
+ result = []
555
+ # First load the model
556
+ model = pickle.load(open(modelFile, "rb"))
557
+ operation = Transition(self._algorithm)
558
+
559
+ for depgraph in depgraphs:
560
+ conf = Configuration(depgraph)
561
+ while len(conf.buffer) > 0:
562
+ features = conf.extract_features()
563
+ col = []
564
+ row = []
565
+ data = []
566
+ for feature in features:
567
+ if feature in self._dictionary:
568
+ col.append(self._dictionary[feature])
569
+ row.append(0)
570
+ data.append(1.0)
571
+ np_col = array(sorted(col)) # NB : index must be sorted
572
+ np_row = array(row)
573
+ np_data = array(data)
574
+
575
+ x_test = sparse.csr_matrix(
576
+ (np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
577
+ )
578
+
579
+ # It's best to use decision function as follow BUT it's not supported yet for sparse SVM
580
+ # Using decision function to build the votes array
581
+ # dec_func = model.decision_function(x_test)[0]
582
+ # votes = {}
583
+ # k = 0
584
+ # for i in range(len(model.classes_)):
585
+ # for j in range(i+1, len(model.classes_)):
586
+ # #if dec_func[k] > 0:
587
+ # votes.setdefault(i,0)
588
+ # votes[i] +=1
589
+ # else:
590
+ # votes.setdefault(j,0)
591
+ # votes[j] +=1
592
+ # k +=1
593
+ # Sort votes according to the values
594
+ # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
595
+
596
+ # We will use predict_proba instead of decision_function
597
+ prob_dict = {}
598
+ pred_prob = model.predict_proba(x_test)[0]
599
+ for i in range(len(pred_prob)):
600
+ prob_dict[i] = pred_prob[i]
601
+ sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)
602
+
603
+ # Note that SHIFT is always a valid operation
604
+ for (y_pred_idx, confidence) in sorted_Prob:
605
+ # y_pred = model.predict(x_test)[0]
606
+ # From the prediction match to the operation
607
+ y_pred = model.classes_[y_pred_idx]
608
+
609
+ if y_pred in self._match_transition:
610
+ strTransition = self._match_transition[y_pred]
611
+ baseTransition = strTransition.split(":")[0]
612
+
613
+ if baseTransition == Transition.LEFT_ARC:
614
+ if (
615
+ operation.left_arc(conf, strTransition.split(":")[1])
616
+ != -1
617
+ ):
618
+ break
619
+ elif baseTransition == Transition.RIGHT_ARC:
620
+ if (
621
+ operation.right_arc(conf, strTransition.split(":")[1])
622
+ != -1
623
+ ):
624
+ break
625
+ elif baseTransition == Transition.REDUCE:
626
+ if operation.reduce(conf) != -1:
627
+ break
628
+ elif baseTransition == Transition.SHIFT:
629
+ if operation.shift(conf) != -1:
630
+ break
631
+ else:
632
+ raise ValueError(
633
+ "The predicted transition is not recognized, expected errors"
634
+ )
635
+
636
+ # Finish with operations build the dependency graph from Conf.arcs
637
+
638
+ new_depgraph = deepcopy(depgraph)
639
+ for key in new_depgraph.nodes:
640
+ node = new_depgraph.nodes[key]
641
+ node["rel"] = ""
642
+ # With the default, all the token depend on the Root
643
+ node["head"] = 0
644
+ for (head, rel, child) in conf.arcs:
645
+ c_node = new_depgraph.nodes[child]
646
+ c_node["head"] = head
647
+ c_node["rel"] = rel
648
+ result.append(new_depgraph)
649
+
650
+ return result
651
+
652
+
653
def demo():
    """
    A doctest demonstration of ``TransitionParser`` covering feature
    extraction, arc-standard and arc-eager transitions, training and parsing.

    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
    >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
    >>> gold_sent = DependencyGraph(\"""
    ... Economic JJ 2 ATT
    ... news NN 3 SBJ
    ... has VBD 0 ROOT
    ... little JJ 5 ATT
    ... effect NN 3 OBJ
    ... on IN 5 ATT
    ... financial JJ 8 ATT
    ... markets NNS 6 PC
    ... . . 3 PU
    ... \""")

    >>> conf = Configuration(gold_sent)

    ###################### Check the Initial Feature ########################

    >>> print(', '.join(conf.extract_features()))
    STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ

    ###################### Check The Transition #######################
    Check the Initialized Configuration
    >>> print(conf)
    Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []

    A. Do some transition checks for ARC-STANDARD

    >>> operation = Transition('arc-standard')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,"SBJ")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.shift(conf)
    >>> operation.left_arc(conf, "ATT")

    Middle Configuration and Features Check
    >>> print(conf)
    Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]

    >>> print(', '.join(conf.extract_features()))
    STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT

    >>> operation.right_arc(conf, "PC")
    >>> operation.right_arc(conf, "ATT")
    >>> operation.right_arc(conf, "OBJ")
    >>> operation.shift(conf)
    >>> operation.right_arc(conf, "PU")
    >>> operation.right_arc(conf, "ROOT")
    >>> operation.shift(conf)

    Terminated Configuration Check
    >>> print(conf)
    Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]


    B. Do some transition checks for ARC-EAGER

    >>> conf = Configuration(gold_sent)
    >>> operation = Transition('arc-eager')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'SBJ')
    >>> operation.right_arc(conf,'ROOT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'OBJ')
    >>> operation.right_arc(conf,'ATT')
    >>> operation.shift(conf)
    >>> operation.left_arc(conf,'ATT')
    >>> operation.right_arc(conf,'PC')
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.reduce(conf)
    >>> operation.right_arc(conf,'PU')
    >>> print(conf)
    Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]

    ###################### Check The Training Function #######################

    A. Check the ARC-STANDARD training
    >>> import tempfile
    >>> import os
    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)

    >>> parser_std = TransitionParser('arc-standard')
    >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
     Number of training examples : 1
     Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT

    >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
     Number of training examples : 1
     Number of valid (projective) examples : 1
    >>> input_file.close()
    >>> remove(input_file.name)

    B. Check the ARC-EAGER training

    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
    >>> parser_eager = TransitionParser('arc-eager')
    >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
     Number of training examples : 1
     Number of valid (projective) examples : 1
    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU

    >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
     Number of training examples : 1
     Number of valid (projective) examples : 1

    >>> input_file.close()
    >>> remove(input_file.name)

    ###################### Check The Parsing Function ########################

    A. Check the ARC-STANDARD parser

    >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    B. Check the ARC-EAGER parser
    >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
    >>> de = DependencyEvaluator(result, [gold_sent])
    >>> de.eval() >= (0, 0)
    True

    Remove test temporary files
    >>> remove('temp.arceager.model')
    >>> remove('temp.arcstd.model')

    Note that result is very poor because of only one training example.
    """
.eggs/nltk-3.8-py3.10.egg/nltk/parse/util.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Parser Utility Functions
2
+ #
3
+ # Author: Ewan Klein <ewan@inf.ed.ac.uk>
4
+ # Tom Aarsen <>
5
+ #
6
+ # Copyright (C) 2001-2022 NLTK Project
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+
11
+ """
12
+ Utility functions for parsers.
13
+ """
14
+
15
+ from nltk.data import load
16
+ from nltk.grammar import CFG, PCFG, FeatureGrammar
17
+ from nltk.parse.chart import Chart, ChartParser
18
+ from nltk.parse.featurechart import FeatureChart, FeatureChartParser
19
+ from nltk.parse.pchart import InsideChartParser
20
+
21
+
22
def load_parser(
    grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
):
    """
    Load a grammar from a file and build a parser appropriate for it.
    Which parser is constructed depends on the grammar format and, for
    some formats, on properties of the grammar itself.

    The following grammar formats are currently supported:

    - ``'cfg'`` (CFGs: ``CFG``)
    - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
    - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: Tracing level used when parsing a text; ``0`` generates
        no tracing output, higher numbers are more verbose.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart; should be
        ``Chart`` or a subclass.  Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, or a subclass thereof.")

    if isinstance(grammar, PCFG):
        # Probabilistic grammars ignore chart_class but honour beam_size.
        parser_cls = parser if parser is not None else InsideChartParser
        return parser_cls(grammar, trace=trace, beam_size=beam_size)

    if isinstance(grammar, FeatureGrammar):
        parser_cls = parser if parser is not None else FeatureChartParser
        chart_cls = chart_class if chart_class is not None else FeatureChart
        return parser_cls(grammar, trace=trace, chart_class=chart_cls)

    # Plain CFG.
    parser_cls = parser if parser is not None else ChartParser
    chart_cls = chart_class if chart_class is not None else Chart
    return parser_cls(grammar, trace=trace, chart_class=chart_cls)
77
+
78
+
79
def taggedsent_to_conll(sentence):
    """
    Yield the words of a single POS-tagged sentence as CONLL-formatted lines.

    Each line carries the ten tab-separated CONLL fields (1-based index,
    form, lemma placeholder, coarse tag, fine tag, feats, head, deprel,
    phead, pdeprel) and ends with a newline.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
    ...     print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _

    :param sentence: A single input sentence to parse
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding a single sentence in CONLL format.
    """
    for position, (form, postag) in enumerate(sentence, start=1):
        fields = (str(position), form, "_", postag, postag, "_", "0", "a", "_", "_")
        yield "\t".join(fields) + "\n"
103
+
104
+
105
def taggedsents_to_conll(sentences):
    """
    Yield a POS-tagged document stream (a list of tagged sentences) as
    lines in CONLL format: one line per word, followed by two newline
    strings to mark the end of each sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
    ...     if line:
    ...         print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>
    1 Is _ VBZ VBZ _ 0 a _ _
    2 that _ IN IN _ 0 a _ _
    3 right _ NN NN _ 0 a _ _
    4 ? _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to parse
    :type sentences: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    for tagged_sentence in sentences:
        # One CONLL row per (word, tag) pair, numbered from 1.
        for position, (form, postag) in enumerate(tagged_sentence, start=1):
            row = (str(position), form, "_", postag, postag, "_", "0", "a", "_", "_")
            yield "\t".join(row) + "\n"
        # Sentence separator: two newlines, emitted as a single string.
        yield "\n\n"
141
+
142
+
143
+ ######################################################################
144
+ # { Test Suites
145
+ ######################################################################
146
+
147
+
148
class TestGrammar:
    """
    Unit tests for CFG.

    :param grammar: a grammar (or grammar URL) accepted by ``load_parser``
    :param suite: a list of test dicts, each with keys ``"doc"`` (a short
        description), ``"accept"`` (grammatical sentences) and ``"reject"``
        (ungrammatical sentences)
    """

    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar

        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.

        :raises ValueError: if an ``accept`` sentence fails to parse, or a
            ``reject`` sentence receives a parse.
        """
        for test in self.suite:
            print(test["doc"] + ":", end=" ")
            # Bug fix: these flags were previously never initialised, so a
            # test with an empty "accept" or "reject" list raised
            # UnboundLocalError, and results leaked from one test to the next.
            accepted = False
            rejected = False
            for key in ["accept", "reject"]:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == "accept":
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse" % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse" % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")
195
+
196
+
197
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees it should have,

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuple of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for sentence in string.split("\n"):
        if sentence == "" or sentence[0] in comment_chars:
            continue
        split_info = sentence.split(":", 1)
        result = None
        if len(split_info) == 2:
            if split_info[0] in ["True", "true", "False", "false"]:
                result = split_info[0] in ["True", "true"]
                sentence = split_info[1]
            else:
                # Robustness fix: a plain sentence that merely contains a
                # colon (e.g. "He said: hi") used to crash with ValueError
                # here; treat a non-numeric prefix as part of the sentence.
                try:
                    result = int(split_info[0])
                except ValueError:
                    result = None
                else:
                    sentence = split_info[1]
        tokens = sentence.split()
        if tokens == []:
            continue
        sentences.append((tokens, result))
    return sentences
.eggs/nltk-3.8-py3.10.egg/nltk/parse/viterbi.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Viterbi Probabilistic Parser
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ from functools import reduce
10
+
11
+ from nltk.parse.api import ParserI
12
+ from nltk.tree import ProbabilisticTree, Tree
13
+
14
+ ##//////////////////////////////////////////////////////
15
+ ## Viterbi PCFG Parser
16
+ ##//////////////////////////////////////////////////////
17
+
18
+
19
+ class ViterbiParser(ParserI):
20
+ """
21
+ A bottom-up ``PCFG`` parser that uses dynamic programming to find
22
+ the single most likely parse for a text. The ``ViterbiParser`` parser
23
+ parses texts by filling in a "most likely constituent table".
24
+ This table records the most probable tree representation for any
25
+ given span and node value. In particular, it has an entry for
26
+ every start index, end index, and node value, recording the most
27
+ likely subtree that spans from the start index to the end index,
28
+ and has the given node value.
29
+
30
+ The ``ViterbiParser`` parser fills in this table incrementally. It starts
31
+ by filling in all entries for constituents that span one element
32
+ of text (i.e., entries where the end index is one greater than the
33
+ start index). After it has filled in all table entries for
34
+ constituents that span one element of text, it fills in the
35
+ entries for constitutants that span two elements of text. It
36
+ continues filling in the entries for constituents spanning larger
37
+ and larger portions of the text, until the entire table has been
38
+ filled. Finally, it returns the table entry for a constituent
39
+ spanning the entire text, whose node value is the grammar's start
40
+ symbol.
41
+
42
+ In order to find the most likely constituent with a given span and
43
+ node value, the ``ViterbiParser`` parser considers all productions that
44
+ could produce that node value. For each production, it finds all
45
+ children that collectively cover the span and have the node values
46
+ specified by the production's right hand side. If the probability
47
+ of the tree formed by applying the production to the children is
48
+ greater than the probability of the current entry in the table,
49
+ then the table is updated with this new tree.
50
+
51
+ A pseudo-code description of the algorithm used by
52
+ ``ViterbiParser`` is:
53
+
54
+ | Create an empty most likely constituent table, *MLC*.
55
+ | For width in 1...len(text):
56
+ | For start in 1...len(text)-width:
57
+ | For prod in grammar.productions:
58
+ | For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
59
+ | where t[i].label()==prod.rhs[i],
60
+ | and the sequence covers [start:start+width]:
61
+ | old_p = MLC[start, start+width, prod.lhs]
62
+ | new_p = P(t[1])P(t[1])...P(t[n])P(prod)
63
+ | if new_p > old_p:
64
+ | new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
65
+ | MLC[start, start+width, prod.lhs] = new_tree
66
+ | Return MLC[0, len(text), start_symbol]
67
+
68
+ :type _grammar: PCFG
69
+ :ivar _grammar: The grammar used to parse sentences.
70
+ :type _trace: int
71
+ :ivar _trace: The level of tracing output that should be generated
72
+ when parsing a text.
73
+ """
74
+
75
    def __init__(self, grammar, trace=0):
        """
        Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
        parse texts.

        :type grammar: PCFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text. ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar  # the PCFG used by parse()
        self._trace = trace  # verbosity level; adjustable later via trace()
90
+
91
    def grammar(self):
        """Return the grammar used to parse texts (see ``ParserI``)."""
        return self._grammar
93
+
94
    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        Calling ``trace()`` with no argument enables moderately verbose
        tracing (level 2).

        :type trace: int
        :param trace: The trace level. A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace
106
+
107
    def parse(self, tokens):
        # Inherit docs from ParserI
        """
        Fill in the most-likely-constituents table bottom-up and yield the
        best tree (if any) that spans all of ``tokens`` and is rooted at the
        grammar's start symbol.  Yields at most one tree.
        """

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # The most likely constituent table. This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens. For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value. For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text: each token covers the width-1 span at its own index.
        if self._trace:
            print("Inserting tokens into the most likely" + " constituents table...")
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index + 1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        # Shorter spans must be completed first, since longer spans are
        # built from their entries.
        for length in range(1, len(tokens) + 1):
            if self._trace:
                print(
                    "Finding the most likely constituents"
                    + " spanning %d text elements..." % length
                )
            for start in range(len(tokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, tokens)

        # Return the tree that spans the entire text & have the right cat
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is not None:
            yield tree
147
+
148
+ def _add_constituents_spanning(self, span, constituents, tokens):
149
+ """
150
+ Find any constituents that might cover ``span``, and add them
151
+ to the most likely constituents table.
152
+
153
+ :rtype: None
154
+ :type span: tuple(int, int)
155
+ :param span: The section of the text for which we are
156
+ trying to find possible constituents. The span is
157
+ specified as a pair of integers, where the first integer
158
+ is the index of the first token that should be included in
159
+ the constituent; and the second integer is the index of
160
+ the first token that should not be included in the
161
+ constituent. I.e., the constituent should cover
162
+ ``text[span[0]:span[1]]``, where ``text`` is the text
163
+ that we are parsing.
164
+
165
+ :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
166
+ :param constituents: The most likely constituents table. This
167
+ table records the most probable tree representation for
168
+ any given span and node value. In particular,
169
+ ``constituents(s,e,nv)`` is the most likely
170
+ ``ProbabilisticTree`` that covers ``text[s:e]``
171
+ and has a node value ``nv.symbol()``, where ``text``
172
+ is the text that we are parsing. When
173
+ ``_add_constituents_spanning`` is called, ``constituents``
174
+ should contain all possible constituents that are shorter
175
+ than ``span``.
176
+
177
+ :type tokens: list of tokens
178
+ :param tokens: The text we are parsing. This is only used for
179
+ trace output.
180
+ """
181
+ # Since some of the grammar productions may be unary, we need to
182
+ # repeatedly try all of the productions until none of them add any
183
+ # new constituents.
184
+ changed = True
185
+ while changed:
186
+ changed = False
187
+
188
+ # Find all ways instantiations of the grammar productions that
189
+ # cover the span.
190
+ instantiations = self._find_instantiations(span, constituents)
191
+
192
+ # For each production instantiation, add a new
193
+ # ProbabilisticTree whose probability is the product
194
+ # of the childrens' probabilities and the production's
195
+ # probability.
196
+ for (production, children) in instantiations:
197
+ subtrees = [c for c in children if isinstance(c, Tree)]
198
+ p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
199
+ node = production.lhs().symbol()
200
+ tree = ProbabilisticTree(node, children, prob=p)
201
+
202
+ # If it's new a constituent, then add it to the
203
+ # constituents dictionary.
204
+ c = constituents.get((span[0], span[1], production.lhs()))
205
+ if self._trace > 1:
206
+ if c is None or c != tree:
207
+ if c is None or c.prob() < tree.prob():
208
+ print(" Insert:", end=" ")
209
+ else:
210
+ print(" Discard:", end=" ")
211
+ self._trace_production(production, p, span, len(tokens))
212
+ if c is None or c.prob() < tree.prob():
213
+ constituents[span[0], span[1], production.lhs()] = tree
214
+ changed = True
215
+
216
+ def _find_instantiations(self, span, constituents):
217
+ """
218
+ :return: a list of the production instantiations that cover a
219
+ given span of the text. A "production instantiation" is
220
+ a tuple containing a production and a list of children,
221
+ where the production's right hand side matches the list of
222
+ children; and the children cover ``span``. :rtype: list
223
+ of ``pair`` of ``Production``, (list of
224
+ (``ProbabilisticTree`` or token.
225
+
226
+ :type span: tuple(int, int)
227
+ :param span: The section of the text for which we are
228
+ trying to find production instantiations. The span is
229
+ specified as a pair of integers, where the first integer
230
+ is the index of the first token that should be covered by
231
+ the production instantiation; and the second integer is
232
+ the index of the first token that should not be covered by
233
+ the production instantiation.
234
+ :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
235
+ :param constituents: The most likely constituents table. This
236
+ table records the most probable tree representation for
237
+ any given span and node value. See the module
238
+ documentation for more information.
239
+ """
240
+ rv = []
241
+ for production in self._grammar.productions():
242
+ childlists = self._match_rhs(production.rhs(), span, constituents)
243
+
244
+ for childlist in childlists:
245
+ rv.append((production, childlist))
246
+ return rv
247
+
248
+ def _match_rhs(self, rhs, span, constituents):
249
+ """
250
+ :return: a set of all the lists of children that cover ``span``
251
+ and that match ``rhs``.
252
+ :rtype: list(list(ProbabilisticTree or token)
253
+
254
+ :type rhs: list(Nonterminal or any)
255
+ :param rhs: The list specifying what kinds of children need to
256
+ cover ``span``. Each nonterminal in ``rhs`` specifies
257
+ that the corresponding child should be a tree whose node
258
+ value is that nonterminal's symbol. Each terminal in ``rhs``
259
+ specifies that the corresponding child should be a token
260
+ whose type is that terminal.
261
+ :type span: tuple(int, int)
262
+ :param span: The section of the text for which we are
263
+ trying to find child lists. The span is specified as a
264
+ pair of integers, where the first integer is the index of
265
+ the first token that should be covered by the child list;
266
+ and the second integer is the index of the first token
267
+ that should not be covered by the child list.
268
+ :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
269
+ :param constituents: The most likely constituents table. This
270
+ table records the most probable tree representation for
271
+ any given span and node value. See the module
272
+ documentation for more information.
273
+ """
274
+ (start, end) = span
275
+
276
+ # Base case
277
+ if start >= end and rhs == ():
278
+ return [[]]
279
+ if start >= end or rhs == ():
280
+ return []
281
+
282
+ # Find everything that matches the 1st symbol of the RHS
283
+ childlists = []
284
+ for split in range(start, end + 1):
285
+ l = constituents.get((start, split, rhs[0]))
286
+ if l is not None:
287
+ rights = self._match_rhs(rhs[1:], (split, end), constituents)
288
+ childlists += [[l] + r for r in rights]
289
+
290
+ return childlists
291
+
292
+ def _trace_production(self, production, p, span, width):
293
+ """
294
+ Print trace output indicating that a given production has been
295
+ applied at a given location.
296
+
297
+ :param production: The production that has been applied
298
+ :type production: Production
299
+ :param p: The probability of the tree produced by the production.
300
+ :type p: float
301
+ :param span: The span of the production
302
+ :type span: tuple
303
+ :rtype: None
304
+ """
305
+
306
+ str = "|" + "." * span[0]
307
+ str += "=" * (span[1] - span[0])
308
+ str += "." * (width - span[1]) + "| "
309
+ str += "%s" % production
310
+ if self._trace > 2:
311
+ str = f"{str:<40} {p:12.10f} "
312
+
313
+ print(str)
314
+
315
+ def _trace_lexical_insertion(self, token, index, width):
316
+ str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
317
+ str += f"{token}"
318
+ print(str)
319
+
320
+ def __repr__(self):
321
+ return "<ViterbiParser for %r>" % self._grammar
322
+
323
+
324
+ ##//////////////////////////////////////////////////////
325
+ ## Test Code
326
+ ##//////////////////////////////////////////////////////
327
+
328
+
329
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce  # used for the probability averages below

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print(" %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit; only bad input is expected here.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # NOTE: the original rebound the name ``time`` here, shadowing the module.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
450
+
451
+
452
# Run the interactive demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/sem/boxer.py ADDED
@@ -0,0 +1,1605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Interface to Boxer
2
+ # <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
3
+ #
4
+ # Author: Dan Garrette <dhgarrette@gmail.com>
5
+ #
6
+ # Copyright (C) 2001-2022 NLTK Project
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ An interface to Boxer.
12
+
13
+ This interface relies on the latest version of the development (subversion) version of
14
+ C&C and Boxer.
15
+
16
+ Usage
17
+ =====
18
+
19
+ Set the environment variable CANDC to the bin directory of your CandC installation.
20
+ The models directory should be in the CandC root directory.
21
+ For example::
22
+
23
+ /path/to/candc/
24
+ bin/
25
+ candc
26
+ boxer
27
+ models/
28
+ boxer/
29
+ """
30
+
31
+ import operator
32
+ import os
33
+ import re
34
+ import subprocess
35
+ import tempfile
36
+ from functools import reduce
37
+ from optparse import OptionParser
38
+
39
+ from nltk.internals import find_binary
40
+ from nltk.sem.drt import (
41
+ DRS,
42
+ DrtApplicationExpression,
43
+ DrtEqualityExpression,
44
+ DrtNegatedExpression,
45
+ DrtOrExpression,
46
+ DrtParser,
47
+ DrtProposition,
48
+ DrtTokens,
49
+ DrtVariableExpression,
50
+ )
51
+ from nltk.sem.logic import (
52
+ ExpectedMoreTokensException,
53
+ LogicalExpressionException,
54
+ UnexpectedTokenException,
55
+ Variable,
56
+ )
57
+
58
+
59
class Boxer:
    """
    This class is an interface to Johan Bos's program Boxer, a wide-coverage
    semantic parser that produces Discourse Representation Structures (DRSs).

    The ``candc`` and ``boxer`` binaries are located through the ``CANDC``
    environment variable (or an explicit ``bin_dir``); see the module
    docstring for the expected directory layout.
    """

    def __init__(
        self,
        boxer_drs_interpreter=None,
        elimeq=False,
        bin_dir=None,
        verbose=False,
        resolve=True,
    ):
        """
        :param boxer_drs_interpreter: A class that converts from the
            ``AbstractBoxerDrs`` object hierarchy to a different object.  The
            default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK
            DRT hierarchy.
        :param elimeq: When set to true, Boxer removes all equalities from the
            DRSs and discourse referents standing in the equality relation are
            unified, but only if this can be done in a meaning-preserving manner.
        :param bin_dir: Directory containing the ``candc``/``boxer`` binaries;
            falls back to the ``CANDC`` environment variable.
        :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
            Resolution follows Van der Sandt's theory of binding and accommodation.
        """
        if boxer_drs_interpreter is None:
            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
        self._boxer_drs_interpreter = boxer_drs_interpreter

        self._resolve = resolve
        self._elimeq = elimeq

        self.set_bin_dir(bin_dir, verbose)

    def set_bin_dir(self, bin_dir, verbose=False):
        """Locate the ``candc`` and ``boxer`` binaries and the models directory."""
        self._candc_bin = self._find_binary("candc", bin_dir, verbose)
        # Strip the trailing "candc" from the binary path; the models directory
        # is assumed to sit next to the bin directory.
        self._candc_models_path = os.path.normpath(
            os.path.join(self._candc_bin[:-5], "../models")
        )
        self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)

    def interpret(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: str Input sentence to parse
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :param question: bool Use the question models rather than the declarative ones.
        :return: ``drt.DrtExpression``
        :raises Exception: if Boxer cannot interpret the input.
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        :param input: list of str Input sentences to parse as a single discourse
        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
        :param question: bool Use the question models rather than the declarative ones.
        :return: ``drt.DrtExpression``
        :raises Exception: if Boxer cannot interpret the discourse.
        """
        discourse_ids = [discourse_id] if discourse_id is not None else None
        (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose)
        if not d:
            raise Exception(f'Unable to interpret: "{input}"')
        return d

    def interpret_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of str Input sentences to parse as individual discourses
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :param question: bool Use the question models rather than the declarative ones.
        :return: list of ``drt.DrtExpression``
        """
        return self.interpret_multi_sents(
            [[input] for input in inputs], discourse_ids, question, verbose
        )

    def interpret_multi_sents(
        self, inputs, discourse_ids=None, question=False, verbose=False
    ):
        """
        Use Boxer to give a first order representation.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :param question: bool Use the question models rather than the declarative ones.
        :return: list of ``drt.DrtExpression`` (None for discourses Boxer could not parse)
        """
        if discourse_ids is not None:
            assert len(inputs) == len(discourse_ids)
            assert reduce(operator.and_, (id is not None for id in discourse_ids))
            use_disc_id = True
        else:
            # Fall back to positional ids when the caller supplied none.
            discourse_ids = list(map(str, range(len(inputs))))
            use_disc_id = False

        candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
        boxer_out = self._call_boxer(candc_out, verbose=verbose)

        # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
        #    raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)

        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
        return [drs_dict.get(id, None) for id in discourse_ids]

    def _call_candc(self, inputs, discourse_ids, question, verbose=False):
        """
        Call the ``candc`` binary with the given input.

        :param inputs: list of list of str Input discourses to parse
        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
        :param question: bool Select the "questions" models instead of "boxer".
        :return: stdout
        """
        args = [
            "--models",
            os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
            "--candc-printer",
            "boxer",
        ]
        # Each discourse is preceded by a <META> line carrying its id.
        return self._call(
            "\n".join(
                sum(
                    ([f"<META>'{id}'"] + d for d, id in zip(inputs, discourse_ids)),
                    [],
                )
            ),
            self._candc_bin,
            args,
            verbose,
        )

    def _call_boxer(self, candc_out, verbose=False):
        """
        Call the ``boxer`` binary with the given input.

        :param candc_out: bytes output from the C&C parser
        :return: stdout
        """
        f = None
        try:
            fd, temp_filename = tempfile.mkstemp(
                prefix="boxer-", suffix=".in", text=True
            )
            f = os.fdopen(fd, "w")
            f.write(candc_out.decode("utf-8"))
        finally:
            if f:
                f.close()

        args = [
            "--box",
            "false",
            "--semantics",
            "drs",
            #'--flat', 'false', # removed from boxer
            "--resolve",
            ["false", "true"][self._resolve],
            "--elimeq",
            ["false", "true"][self._elimeq],
            "--format",
            "prolog",
            "--instantiate",
            "true",
            "--input",
            temp_filename,
        ]
        try:
            stdout = self._call(None, self._boxer_bin, args, verbose)
        finally:
            # Remove the temp file even if boxer fails (previously leaked on error).
            os.remove(temp_filename)
        return stdout

    def _find_binary(self, name, bin_dir, verbose=False):
        """Resolve ``name`` (candc/boxer) to an executable path via nltk.internals."""
        return find_binary(
            name,
            path_to_bin=bin_dir,
            env_vars=["CANDC"],
            url="http://svn.ask.it.usyd.edu.au/trac/candc/",
            binary_names=[name, name + ".exe"],
            verbose=verbose,
        )

    def _call(self, input_str, binary, args=None, verbose=False):
        """
        Call the binary with the given input.

        :param input_str: A string whose contents are used as stdin.
        :param binary: The location of the binary to call
        :param args: A list of command-line arguments.
        :return: stdout
        :raises Exception: if the subprocess exits with a non-zero status.
        """
        # ``args`` was previously a mutable default argument ([]).
        if args is None:
            args = []
        if verbose:
            print("Calling:", binary)
            print("Args:", args)
            print("Input:", input_str)
            print("Command:", binary + " " + " ".join(args))

        # Call via a subprocess
        if input_str is None:
            cmd = [binary] + args
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        else:
            # SECURITY NOTE: input_str is interpolated into a shell command
            # (shell=True); do not pass untrusted text through this path.
            cmd = 'echo "{}" | {} {}'.format(input_str, binary, " ".join(args))
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
            )
        stdout, stderr = p.communicate()

        if verbose:
            print("Return code:", p.returncode)
            if stdout:
                print("stdout:\n", stdout, "\n")
            if stderr:
                print("stderr:\n", stderr, "\n")
        if p.returncode != 0:
            raise Exception(
                "ERROR CALLING: {} {}\nReturncode: {}\n{}".format(
                    binary, " ".join(args), p.returncode, stderr
                )
            )

        return stdout

    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
        """
        Parse Boxer's Prolog output into a dict mapping discourse id -> DRS.

        :param boxer_out: bytes raw stdout from the boxer binary
        :param use_disc_id: bool whether to tag predicates with the discourse id
        """
        lines = boxer_out.decode("utf-8").split("\n")
        drs_dict = {}
        i = 0
        while i < len(lines):
            line = lines[i]
            if line.startswith("id("):
                # id('<discourse id>',<drs id>).
                comma_idx = line.index(",")
                discourse_id = line[3:comma_idx]
                if discourse_id[0] == "'" and discourse_id[-1] == "'":
                    discourse_id = discourse_id[1:-1]
                drs_id = line[comma_idx + 1 : line.index(")")]
                i += 1
                line = lines[i]
                assert line.startswith(f"sem({drs_id},")
                if line[-4:] == "').'":
                    line = line[:-4] + ")."
                assert line.endswith(")."), f"can't parse line: {line}"

                # Skip over the word-list argument by matching brackets,
                # leaving drs_start at the first character of the DRS term.
                search_start = len(f"sem({drs_id},[")
                brace_count = 1
                drs_start = -1
                for j, c in enumerate(line[search_start:]):
                    if c == "[":
                        brace_count += 1
                    if c == "]":
                        brace_count -= 1
                        if brace_count == 0:
                            drs_start = search_start + j + 1
                            if line[drs_start : drs_start + 3] == "','":
                                drs_start = drs_start + 3
                            else:
                                drs_start = drs_start + 1
                            break
                assert drs_start > -1

                drs_input = line[drs_start:-2].strip()
                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
            i += 1
        return drs_dict

    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
        """Parse one DRS term, tagging predicates with the discourse id if requested."""
        return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
334
+
335
+
336
+ class BoxerOutputDrsParser(DrtParser):
337
    def __init__(self, discourse_id=None):
        """
        This class is used to parse the Prolog DRS output from Boxer into a
        hierarchy of python objects.

        :param discourse_id: identifier inserted into occurrence-indexed
            predicates, or None to omit discourse ids.
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id
        # NOTE(review): set but never read in the code shown here — presumably
        # a sentence-index offset used by subclasses/callers; confirm before removing.
        self.sentence_id_offset = None
        # Single-quoted atoms, backslash-escaped; quotes are not kept in the token.
        self.quote_chars = [("'", "'", "\\", False)]
346
+
347
+ def parse(self, data, signature=None):
348
+ return DrtParser.parse(self, data, signature)
349
+
350
+ def get_all_symbols(self):
351
+ return ["(", ")", ",", "[", "]", ":"]
352
+
353
    def handle(self, tok, context):
        """Token dispatch entry point (called by DrtParser); every top-level
        term in Boxer output is a DRS-level construct, so delegate directly."""
        return self.handle_drs(tok)
355
+
356
    def attempt_adjuncts(self, expression, context):
        # Boxer output carries no trailing adjuncts, so the expression is
        # returned unchanged (overrides the DrtParser hook).
        return expression
358
+
359
    def parse_condition(self, indices):
        """
        Parse a DRS condition

        :param indices: list of int word indices attached to this condition
        :return: list of ``DrtExpression``
        :raises UnexpectedTokenException: if the functor is not a known condition
        """
        tok = self.token()
        accum = self.handle_condition(tok, indices)
        if accum is None:
            raise UnexpectedTokenException(tok)
        return accum
370
+
371
+ def handle_drs(self, tok):
372
+ if tok == "drs":
373
+ return self.parse_drs()
374
+ elif tok in ["merge", "smerge"]:
375
+ return self._handle_binary_expression(self._make_merge_expression)(None, [])
376
+ elif tok in ["alfa"]:
377
+ return self._handle_alfa(self._make_merge_expression)(None, [])
378
+
379
+ def handle_condition(self, tok, indices):
380
+ """
381
+ Handle a DRS condition
382
+
383
+ :param indices: list of int
384
+ :return: list of ``DrtExpression``
385
+ """
386
+ if tok == "not":
387
+ return [self._handle_not()]
388
+
389
+ if tok == "or":
390
+ conds = [self._handle_binary_expression(self._make_or_expression)]
391
+ elif tok == "imp":
392
+ conds = [self._handle_binary_expression(self._make_imp_expression)]
393
+ elif tok == "eq":
394
+ conds = [self._handle_eq()]
395
+ elif tok == "prop":
396
+ conds = [self._handle_prop()]
397
+
398
+ elif tok == "pred":
399
+ conds = [self._handle_pred()]
400
+ elif tok == "named":
401
+ conds = [self._handle_named()]
402
+ elif tok == "rel":
403
+ conds = [self._handle_rel()]
404
+ elif tok == "timex":
405
+ conds = self._handle_timex()
406
+ elif tok == "card":
407
+ conds = [self._handle_card()]
408
+
409
+ elif tok == "whq":
410
+ conds = [self._handle_whq()]
411
+ elif tok == "duplex":
412
+ conds = [self._handle_duplex()]
413
+
414
+ else:
415
+ conds = []
416
+
417
+ return sum(
418
+ (
419
+ [cond(sent_index, word_indices) for cond in conds]
420
+ for sent_index, word_indices in self._sent_and_word_indices(indices)
421
+ ),
422
+ [],
423
+ )
424
+
425
    def _handle_not(self):
        """Parse ``not(<drs>)`` into a BoxerNot wrapping the inner DRS."""
        self.assertToken(self.token(), "(")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return BoxerNot(drs)
430
+
431
    def _handle_pred(self):
        """Parse ``pred(var, name, pos, sense)`` into a BoxerPred factory.

        Returns a callable of (sent_index, word_indices) so the caller can
        instantiate the predicate once per index pair.
        """
        # pred(_G3943, dog, n, 0)
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        name = self.token()
        self.assertToken(self.token(), ",")
        pos = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")

        def _handle_pred_f(sent_index, word_indices):
            # Defer construction until the indices are known.
            return BoxerPred(
                self.discourse_id, sent_index, word_indices, variable, name, pos, sense
            )

        return _handle_pred_f
449
+
450
+ def _handle_duplex(self):
451
+ # duplex(whq, drs(...), var, drs(...))
452
+ self.assertToken(self.token(), "(")
453
+ # self.assertToken(self.token(), '[')
454
+ ans_types = []
455
+ # while self.token(0) != ']':
456
+ # cat = self.token()
457
+ # self.assertToken(self.token(), ':')
458
+ # if cat == 'des':
459
+ # ans_types.append(self.token())
460
+ # elif cat == 'num':
461
+ # ans_types.append('number')
462
+ # typ = self.token()
463
+ # if typ == 'cou':
464
+ # ans_types.append('count')
465
+ # else:
466
+ # ans_types.append(typ)
467
+ # else:
468
+ # ans_types.append(self.token())
469
+ # self.token() #swallow the ']'
470
+
471
+ self.assertToken(self.token(), "whq")
472
+ self.assertToken(self.token(), ",")
473
+ d1 = self.process_next_expression(None)
474
+ self.assertToken(self.token(), ",")
475
+ ref = self.parse_variable()
476
+ self.assertToken(self.token(), ",")
477
+ d2 = self.process_next_expression(None)
478
+ self.assertToken(self.token(), ")")
479
+ return lambda sent_index, word_indices: BoxerWhq(
480
+ self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
481
+ )
482
+
483
+ def _handle_named(self):
484
+ # named(x0, john, per, 0)
485
+ self.assertToken(self.token(), "(")
486
+ variable = self.parse_variable()
487
+ self.assertToken(self.token(), ",")
488
+ name = self.token()
489
+ self.assertToken(self.token(), ",")
490
+ type = self.token()
491
+ self.assertToken(self.token(), ",")
492
+ sense = self.token() # as per boxer rev 2554
493
+ self.assertToken(self.token(), ")")
494
+ return lambda sent_index, word_indices: BoxerNamed(
495
+ self.discourse_id, sent_index, word_indices, variable, name, type, sense
496
+ )
497
+
498
    def _handle_rel(self):
        """Parse ``rel(var1, var2, relation, sense)`` into a BoxerRel factory."""
        # rel(_G3993, _G3943, agent, 0)
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ",")
        rel = self.token()
        self.assertToken(self.token(), ",")
        sense = int(self.token())
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerRel(
            self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
        )
512
+
513
    def _handle_timex(self):
        """Parse ``timex(var, date(...)|time(...))``; returns a list of
        condition factories (one per date/time field present)."""
        # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
        self.assertToken(self.token(), "(")
        arg = self.parse_variable()
        self.assertToken(self.token(), ",")
        new_conds = self._handle_time_expression(arg)
        self.assertToken(self.token(), ")")
        return new_conds
521
+
522
+ def _handle_time_expression(self, arg):
523
+ # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
524
+ tok = self.token()
525
+ self.assertToken(self.token(), "(")
526
+ if tok == "date":
527
+ conds = self._handle_date(arg)
528
+ elif tok == "time":
529
+ conds = self._handle_time(arg)
530
+ else:
531
+ return None
532
+ self.assertToken(self.token(), ")")
533
+ return [
534
+ lambda sent_index, word_indices: BoxerPred(
535
+ self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
536
+ )
537
+ ] + [lambda sent_index, word_indices: cond for cond in conds]
538
+
539
+ def _handle_date(self, arg):
540
+ # []: (+), []:'XXXX', [1004]:'04', []:'XX'
541
+ conds = []
542
+ ((sent_index, word_indices),) = self._sent_and_word_indices(
543
+ self._parse_index_list()
544
+ )
545
+ self.assertToken(self.token(), "(")
546
+ pol = self.token()
547
+ self.assertToken(self.token(), ")")
548
+ conds.append(
549
+ BoxerPred(
550
+ self.discourse_id,
551
+ sent_index,
552
+ word_indices,
553
+ arg,
554
+ f"date_pol_{pol}",
555
+ "a",
556
+ 0,
557
+ )
558
+ )
559
+ self.assertToken(self.token(), ",")
560
+
561
+ ((sent_index, word_indices),) = self._sent_and_word_indices(
562
+ self._parse_index_list()
563
+ )
564
+ year = self.token()
565
+ if year != "XXXX":
566
+ year = year.replace(":", "_")
567
+ conds.append(
568
+ BoxerPred(
569
+ self.discourse_id,
570
+ sent_index,
571
+ word_indices,
572
+ arg,
573
+ f"date_year_{year}",
574
+ "a",
575
+ 0,
576
+ )
577
+ )
578
+ self.assertToken(self.token(), ",")
579
+
580
+ ((sent_index, word_indices),) = self._sent_and_word_indices(
581
+ self._parse_index_list()
582
+ )
583
+ month = self.token()
584
+ if month != "XX":
585
+ conds.append(
586
+ BoxerPred(
587
+ self.discourse_id,
588
+ sent_index,
589
+ word_indices,
590
+ arg,
591
+ f"date_month_{month}",
592
+ "a",
593
+ 0,
594
+ )
595
+ )
596
+ self.assertToken(self.token(), ",")
597
+
598
+ ((sent_index, word_indices),) = self._sent_and_word_indices(
599
+ self._parse_index_list()
600
+ )
601
+ day = self.token()
602
+ if day != "XX":
603
+ conds.append(
604
+ BoxerPred(
605
+ self.discourse_id,
606
+ sent_index,
607
+ word_indices,
608
+ arg,
609
+ f"date_day_{day}",
610
+ "a",
611
+ 0,
612
+ )
613
+ )
614
+
615
+ return conds
616
+
617
+ def _handle_time(self, arg):
618
+ # time([1018]:'18', []:'XX', []:'XX')
619
+ conds = []
620
+ self._parse_index_list()
621
+ hour = self.token()
622
+ if hour != "XX":
623
+ conds.append(self._make_atom("r_hour_2", arg, hour))
624
+ self.assertToken(self.token(), ",")
625
+
626
+ self._parse_index_list()
627
+ min = self.token()
628
+ if min != "XX":
629
+ conds.append(self._make_atom("r_min_2", arg, min))
630
+ self.assertToken(self.token(), ",")
631
+
632
+ self._parse_index_list()
633
+ sec = self.token()
634
+ if sec != "XX":
635
+ conds.append(self._make_atom("r_sec_2", arg, sec))
636
+
637
+ return conds
638
+
639
+ def _handle_card(self):
640
+ # card(_G18535, 28, ge)
641
+ self.assertToken(self.token(), "(")
642
+ variable = self.parse_variable()
643
+ self.assertToken(self.token(), ",")
644
+ value = self.token()
645
+ self.assertToken(self.token(), ",")
646
+ type = self.token()
647
+ self.assertToken(self.token(), ")")
648
+ return lambda sent_index, word_indices: BoxerCard(
649
+ self.discourse_id, sent_index, word_indices, variable, value, type
650
+ )
651
+
652
    def _handle_prop(self):
        """Parse ``prop(var, <drs>)`` into a BoxerProp factory (a variable
        labelling an embedded DRS)."""
        # prop(_G15949, drs(...))
        self.assertToken(self.token(), "(")
        variable = self.parse_variable()
        self.assertToken(self.token(), ",")
        drs = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerProp(
            self.discourse_id, sent_index, word_indices, variable, drs
        )
662
+
663
    def _parse_index_list(self):
        """Parse a word-index prefix ``[i, j, ...]:`` and return the indices.

        :return: list of int
        """
        # [1001,1002]:
        indices = []
        self.assertToken(self.token(), "[")
        while self.token(0) != "]":
            indices.append(self.parse_index())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ":")
        return indices
674
+
675
    def parse_drs(self):
        """Parse ``drs([<indexed refs>], [<indexed conditions>])`` into a
        BoxerDrs.  Token consumption order matters: each referent and each
        condition is preceded by its own index list."""
        # drs([[1001]:_G3943],
        #    [[1002]:pred(_G3943, dog, n, 0)]
        #   )
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        refs = set()
        while self.token(0) != "]":
            # Referent indices are consumed but not used.
            indices = self._parse_index_list()
            refs.add(self.parse_variable())
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ",")
        self.assertToken(self.token(), "[")
        conds = []
        while self.token(0) != "]":
            indices = self._parse_index_list()
            conds.extend(self.parse_condition(indices))
            if self.token(0) == ",":
                self.token()  # swallow ','
        self.token()  # swallow ']'
        self.assertToken(self.token(), ")")
        return BoxerDrs(list(refs), conds)
699
+
700
    def _handle_binary_expression(self, make_callback):
        """Parse a binary functor ``f(<drs>, <drs>)`` and defer construction
        to ``make_callback(sent_index, word_indices, drs1, drs2)``."""
        self.assertToken(self.token(), "(")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )
709
+
710
    def _handle_alfa(self, make_callback):
        """Parse an ``alfa(<type>, <drs1>, <drs2>)`` term.

        :param make_callback: ``(sent_index, word_indices, drs1, drs2) -> node``
        :return: a callable ``(sent_index, word_indices) -> node``
        """
        self.assertToken(self.token(), "(")
        # NOTE: the alfa type token is consumed but not passed to the
        # callback; only the two sub-DRSs are used.
        type = self.token()
        self.assertToken(self.token(), ",")
        drs1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        drs2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: make_callback(
            sent_index, word_indices, drs1, drs2
        )
721
+
722
    def _handle_eq(self):
        """Parse an ``eq(<var1>, <var2>)`` equality condition.

        :return: a callable ``(sent_index, word_indices) -> BoxerEq``
        """
        self.assertToken(self.token(), "(")
        var1 = self.parse_variable()
        self.assertToken(self.token(), ",")
        var2 = self.parse_variable()
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerEq(
            self.discourse_id, sent_index, word_indices, var1, var2
        )
731
+
732
    def _handle_whq(self):
        """Parse a ``whq([<answer types>], <drs1>, <variable>, <drs2>)`` term.

        The leading bracketed list describes the expected answer type(s);
        ``des`` and ``num`` categories carry an extra detail token, and a
        ``num:cou`` pair is normalized to ``count``.

        :return: a callable ``(sent_index, word_indices) -> BoxerWhq``
        """
        self.assertToken(self.token(), "(")
        self.assertToken(self.token(), "[")
        ans_types = []
        while self.token(0) != "]":
            cat = self.token()
            self.assertToken(self.token(), ":")
            if cat == "des":
                ans_types.append(self.token())
            elif cat == "num":
                ans_types.append("number")
                typ = self.token()
                if typ == "cou":
                    ans_types.append("count")
                else:
                    ans_types.append(typ)
            else:
                ans_types.append(self.token())
        self.token()  # swallow the ']'

        self.assertToken(self.token(), ",")
        d1 = self.process_next_expression(None)
        self.assertToken(self.token(), ",")
        ref = self.parse_variable()
        self.assertToken(self.token(), ",")
        d2 = self.process_next_expression(None)
        self.assertToken(self.token(), ")")
        return lambda sent_index, word_indices: BoxerWhq(
            self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
        )
762
+
763
    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
        """Merge two DRSs by concatenating their referents and conditions."""
        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
765
+
766
    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
        """Build the disjunction of two DRSs."""
        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
768
+
769
    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
        """Build an implication: drs1's DRS with drs2 as its consequent."""
        return BoxerDrs(drs1.refs, drs1.conds, drs2)
771
+
772
    def parse_variable(self):
        """Consume and return one variable token (e.g. ``x1``, ``e3``).

        NOTE: the well-formedness check is an ``assert`` and is therefore
        skipped when Python runs with ``-O``.
        """
        var = self.token()
        assert re.match(r"^[exps]\d+$", var), var
        return var
776
+
777
    def parse_index(self):
        """Consume one token and return it as an int index."""
        return int(self.token())
779
+
780
+ def _sent_and_word_indices(self, indices):
781
+ """
782
+ :return: list of (sent_index, word_indices) tuples
783
+ """
784
+ sent_indices = {(i / 1000) - 1 for i in indices if i >= 0}
785
+ if sent_indices:
786
+ pairs = []
787
+ for sent_index in sent_indices:
788
+ word_indices = [
789
+ (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1
790
+ ]
791
+ pairs.append((sent_index, word_indices))
792
+ return pairs
793
+ else:
794
+ word_indices = [(i % 1000) - 1 for i in indices]
795
+ return [(None, word_indices)]
796
+
797
+
798
class BoxerDrsParser(DrtParser):
    """
    Reparse the str form of subclasses of ``AbstractBoxerDrs`` back into
    Boxer AST objects.
    """

    def __init__(self, discourse_id=None):
        DrtParser.__init__(self)
        self.discourse_id = discourse_id

    def get_all_symbols(self):
        return [
            DrtTokens.OPEN,
            DrtTokens.CLOSE,
            DrtTokens.COMMA,
            DrtTokens.OPEN_BRACKET,
            DrtTokens.CLOSE_BRACKET,
        ]

    def attempt_adjuncts(self, expression, context):
        return expression

    def _disc_id(self):
        # Prefer the parser-level discourse id; otherwise the id is encoded
        # in the serialized form and must be consumed from the token stream.
        return self.discourse_id if self.discourse_id is not None else self.token()

    def handle(self, tok, context):
        """Rebuild one Boxer AST node from its serialized form.

        :param tok: the node's predicate name (``pred``, ``named``, ...)
        :raises LogicalExpressionException: if parsing the arguments fails
        """
        try:
            if tok == "pred":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                pos = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
            elif tok == "named":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                # CONSISTENCY FIX: sent_index may serialize as 'None';
                # use nullableIntToken() like the sibling branches instead
                # of int(), which would raise on 'None'.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUG FIX: materialize the map; a lazy map object is
                # iterable-once and never compares equal to a list in
                # BoxerIndexed.__eq__.
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                name = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                type = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNamed(
                    disc_id, sent_id, word_ids, variable, name, type, sense
                )
            elif tok == "rel":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                rel = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                sense = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
            elif tok == "prop":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                # CONSISTENCY FIX: accept a serialized 'None' sentence index.
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                variable = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
            elif tok == "not":
                self.assertNextToken(DrtTokens.OPEN)
                drs = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerNot(drs)
            elif tok == "imp":
                self.assertNextToken(DrtTokens.OPEN)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                # An implication is a DRS whose consequent is the second DRS.
                return BoxerDrs(drs1.refs, drs1.conds, drs2)
            elif tok == "or":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUG FIX: list(...) — see the 'named' branch.
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
            elif tok == "eq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var1 = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                var2 = int(self.token())
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
            elif tok == "card":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                # BUG FIX: list(...) — see the 'named' branch.
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                value = self.token()
                self.assertNextToken(DrtTokens.COMMA)
                type = self.token()
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
            elif tok == "whq":
                self.assertNextToken(DrtTokens.OPEN)
                disc_id = self._disc_id()
                self.assertNextToken(DrtTokens.COMMA)
                sent_id = self.nullableIntToken()
                self.assertNextToken(DrtTokens.COMMA)
                word_ids = list(map(int, self.handle_refs()))
                self.assertNextToken(DrtTokens.COMMA)
                ans_types = self.handle_refs()
                self.assertNextToken(DrtTokens.COMMA)
                drs1 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.COMMA)
                var = int(self.token())
                self.assertNextToken(DrtTokens.COMMA)
                drs2 = self.process_next_expression(None)
                self.assertNextToken(DrtTokens.CLOSE)
                return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
        except Exception as e:
            raise LogicalExpressionException(self._currentIndex, str(e)) from e
        # No branch matched: unknown predicate name.
        assert False, repr(tok)

    def nullableIntToken(self):
        """Consume one token; return it as int, or None for the string 'None'."""
        t = self.token()
        return int(t) if t != "None" else None

    def get_next_token_variable(self, description):
        try:
            return self.token()
        except ExpectedMoreTokensException as e:
            raise ExpectedMoreTokensException(e.index, "Variable expected.") from e
995
+
996
+
997
class AbstractBoxerDrs:
    """Base class for every node of the Boxer abstract syntax tree."""

    def variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        variables, events, propositions = self._variables()
        return (variables - (events | propositions), events, propositions - events)

    def variable_types(self):
        """Map each variable to 'z' (entity), 'e' (event) or 'p' (proposition)."""
        return {
            v: t
            for t, vars in zip(("z", "e", "p"), self.variables())
            for v in vars
        }

    def _variables(self):
        """
        :return: (set<variables>, set<events>, set<propositions>)
        """
        return (set(), set(), set())

    def atoms(self):
        return set()

    def clean(self):
        return self

    def _clean_name(self, name):
        # Normalize characters that are not legal in logic identifiers.
        return name.replace("-", "_").replace("'", "_")

    def renumber_sentences(self, f):
        return self

    def __hash__(self):
        return hash(f"{self}")
1032
+
1033
+
1034
class BoxerDrs(AbstractBoxerDrs):
    """A DRS: discourse referents plus conditions, with an optional
    implication consequent (serialized as ``imp(drs1, drs2)``)."""

    def __init__(self, refs, conds, consequent=None):
        AbstractBoxerDrs.__init__(self)
        self.refs = refs
        self.conds = conds
        self.consequent = consequent

    def _variables(self):
        # Union the (variables, events, propositions) triples of all parts.
        variables = (set(), set(), set())
        for cond in self.conds:
            for s, v in zip(variables, cond._variables()):
                s.update(v)
        if self.consequent is not None:
            for s, v in zip(variables, self.consequent._variables()):
                s.update(v)
        return variables

    def atoms(self):
        atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set())
        if self.consequent is not None:
            atoms.update(self.consequent.atoms())
        return atoms

    def clean(self):
        consequent = self.consequent.clean() if self.consequent else None
        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)

    def renumber_sentences(self, f):
        consequent = self.consequent.renumber_sentences(f) if self.consequent else None
        return BoxerDrs(
            self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
        )

    def __repr__(self):
        s = "drs([{}], [{}])".format(
            ", ".join("%s" % r for r in self.refs),
            ", ".join("%s" % c for c in self.conds),
        )
        if self.consequent is not None:
            s = f"imp({s}, {self.consequent})"
        return s

    def __eq__(self, other):
        # BUG FIX: the original folded the pairwise condition comparisons
        # with reduce(operator.and_, ...) and no initializer, which raises
        # TypeError when both condition lists are empty (reduce() of an
        # empty sequence).  all() correctly returns True in that case.
        return (
            self.__class__ == other.__class__
            and self.refs == other.refs
            and len(self.conds) == len(other.conds)
            and all(c1 == c2 for c1, c2 in zip(self.conds, other.conds))
            and self.consequent == other.consequent
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
1091
+
1092
+
1093
class BoxerNot(AbstractBoxerDrs):
    """Negation of a single sub-DRS; all queries delegate to it."""

    def __init__(self, drs):
        AbstractBoxerDrs.__init__(self)
        self.drs = drs

    def _variables(self):
        return self.drs._variables()

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        return BoxerNot(self.drs.clean())

    def renumber_sentences(self, f):
        return BoxerNot(self.drs.renumber_sentences(f))

    def __repr__(self):
        return f"not({self.drs})"

    def __eq__(self, other):
        return self.__class__ == other.__class__ and self.drs == other.drs

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__
1120
+
1121
+
1122
class BoxerIndexed(AbstractBoxerDrs):
    """Base class for conditions carrying provenance: a discourse id, a
    sentence index, and the word indices the condition came from.

    Subclasses implement ``__iter__`` (their payload fields, in order) and
    ``_pred()`` (their serialized predicate name).
    """

    def __init__(self, discourse_id, sent_index, word_indices):
        AbstractBoxerDrs.__init__(self)
        self.discourse_id = discourse_id
        self.sent_index = sent_index
        self.word_indices = word_indices

    def atoms(self):
        return {self}

    def __eq__(self, other):
        if self.__class__ != other.__class__:
            return False
        same_position = (
            self.discourse_id == other.discourse_id
            and self.sent_index == other.sent_index
            and self.word_indices == other.word_indices
        )
        return same_position and reduce(
            operator.and_, (s == o for s, o in zip(self, other))
        )

    def __ne__(self, other):
        return not self == other

    __hash__ = AbstractBoxerDrs.__hash__

    def __repr__(self):
        word_idx = ", ".join(str(wi) for wi in self.word_indices)
        pieces = [f"{self._pred()}({self.discourse_id}, {self.sent_index}, [{word_idx}]"]
        pieces.extend(f", {v}" for v in self)
        return "".join(pieces) + ")"
1156
+
1157
+
1158
class BoxerPred(BoxerIndexed):
    """A ``pred`` condition: a lemma applied to one variable, tagged with
    part of speech and sense number."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.pos = pos
        self.sense = sense

    def _variables(self):
        return ({self.var}, set(), set())

    def change_var(self, var):
        # Same predicate, rebound to a different variable.
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.pos, self.sense,
        )

    def clean(self):
        return BoxerPred(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.pos, self.sense,
        )

    def renumber_sentences(self, f):
        return BoxerPred(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.pos, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.pos
        yield self.sense

    def _pred(self):
        return "pred"
1208
+
1209
+
1210
class BoxerNamed(BoxerIndexed):
    """A ``named`` condition: a named entity bound to one variable, tagged
    with an entity type and sense number."""

    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.name = name
        self.type = type
        self.sense = sense

    def _variables(self):
        return ({self.var}, set(), set())

    def change_var(self, var):
        # Same entity, rebound to a different variable.
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            var, self.name, self.type, self.sense,
        )

    def clean(self):
        return BoxerNamed(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self._clean_name(self.name), self.type, self.sense,
        )

    def renumber_sentences(self, f):
        return BoxerNamed(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.name, self.type, self.sense,
        )

    def __iter__(self):
        yield self.var
        yield self.name
        yield self.type
        yield self.sense

    def _pred(self):
        return "named"
1259
+
1260
+
1261
class BoxerRel(BoxerIndexed):
    """A ``rel`` condition: a binary relation between two variables."""

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2
        self.rel = rel
        self.sense = sense

    def _variables(self):
        return ({self.var1, self.var2}, set(), set())

    def clean(self):
        return BoxerRel(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var1, self.var2, self._clean_name(self.rel), self.sense,
        )

    def renumber_sentences(self, f):
        return BoxerRel(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2, self.rel, self.sense,
        )

    def __iter__(self):
        yield self.var1
        yield self.var2
        yield self.rel
        yield self.sense

    def _pred(self):
        return "rel"
1299
+
1300
+
1301
class BoxerProp(BoxerIndexed):
    """A ``prop`` condition: a propositional variable bound to a sub-DRS."""

    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.drs = drs

    def _variables(self):
        # The bound variable is a proposition; everything else comes from
        # the embedded DRS.
        return tuple(
            map(operator.or_, (set(), set(), {self.var}), self.drs._variables())
        )

    def referenced_labels(self):
        return {self.drs}

    def atoms(self):
        return self.drs.atoms()

    def clean(self):
        return BoxerProp(
            self.discourse_id, self.sent_index, self.word_indices,
            self.var, self.drs.clean(),
        )

    def renumber_sentences(self, f):
        return BoxerProp(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.drs.renumber_sentences(f),
        )

    def __iter__(self):
        yield self.var
        yield self.drs

    def _pred(self):
        return "prop"
1341
+
1342
+
1343
class BoxerEq(BoxerIndexed):
    """An ``eq`` condition: equality between two variables."""

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1 = var1
        self.var2 = var2

    def _variables(self):
        return ({self.var1, self.var2}, set(), set())

    def atoms(self):
        # Equalities are eliminated during simplification, so they do not
        # count as atoms.
        return set()

    def renumber_sentences(self, f):
        return BoxerEq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var1, self.var2,
        )

    def __iter__(self):
        yield self.var1
        yield self.var2

    def _pred(self):
        return "eq"
1369
+
1370
+
1371
class BoxerCard(BoxerIndexed):
    """A ``card`` condition: a cardinality constraint on one variable."""

    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var = var
        self.value = value
        self.type = type

    def _variables(self):
        return ({self.var}, set(), set())

    def renumber_sentences(self, f):
        return BoxerCard(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.var, self.value, self.type,
        )

    def __iter__(self):
        yield self.var
        yield self.value
        yield self.type

    def _pred(self):
        return "card"
1396
+
1397
+
1398
class BoxerOr(BoxerIndexed):
    """An ``or`` condition: the disjunction of two sub-DRSs."""

    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.drs1 = drs1
        self.drs2 = drs2

    def _variables(self):
        return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        return BoxerOr(
            self.discourse_id, self.sent_index, self.word_indices,
            self.drs1.clean(), self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        return BoxerOr(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.drs1, self.drs2,
        )

    def __iter__(self):
        yield self.drs1
        yield self.drs2

    def _pred(self):
        return "or"
1433
+
1434
+
1435
class BoxerWhq(BoxerIndexed):
    """A ``whq`` condition: a wh-question with its answer types, restrictor
    DRS, queried variable, and body DRS."""

    def __init__(
        self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
    ):
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.ans_types = ans_types
        self.drs1 = drs1
        self.variable = variable
        self.drs2 = drs2

    def _variables(self):
        # The queried variable is an entity; union with both sub-DRSs.
        return tuple(
            map(
                operator.or_,
                ({self.variable}, set(), set()),
                self.drs1._variables(),
                self.drs2._variables(),
            )
        )

    def atoms(self):
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        return BoxerWhq(
            self.discourse_id, self.sent_index, self.word_indices,
            self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean(),
        )

    def renumber_sentences(self, f):
        return BoxerWhq(
            self.discourse_id, f(self.sent_index), self.word_indices,
            self.ans_types, self.drs1, self.variable, self.drs2,
        )

    def __iter__(self):
        yield "[" + ",".join(self.ans_types) + "]"
        yield self.drs1
        yield self.variable
        yield self.drs2

    def _pred(self):
        return "whq"
1487
+
1488
+
1489
class PassthroughBoxerDrsInterpreter:
    """Identity interpreter: hands the Boxer expression back unchanged."""

    def interpret(self, ex):
        """
        :param ex: an ``AbstractBoxerDrs``
        :return: *ex*, untouched
        """
        return ex
1492
+
1493
+
1494
class NltkDrtBoxerDrsInterpreter:
    """Convert a Boxer AST into NLTK DRT expressions.

    When ``occur_index`` is True, predicate names are suffixed with their
    discourse/sentence/word provenance so distinct occurrences stay distinct.
    """

    def __init__(self, occur_index=False):
        self._occur_index = occur_index

    def interpret(self, ex):
        """
        :param ex: ``AbstractBoxerDrs``
        :return: ``DrtExpression``
        """
        # NOTE: dispatch is an ordered isinstance chain; each Boxer node
        # type maps to the corresponding DRT construct.
        if isinstance(ex, BoxerDrs):
            drs = DRS(
                [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))
            )
            if ex.consequent is not None:
                drs.consequent = self.interpret(ex.consequent)
            return drs
        elif isinstance(ex, BoxerNot):
            return DrtNegatedExpression(self.interpret(ex.drs))
        elif isinstance(ex, BoxerPred):
            # e.g. pred(x, dog, n, 0) -> n_dog(x)
            pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerNamed):
            # e.g. named(x, john, per, 0) -> ne_per_john(x)
            pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerRel):
            pred = self._add_occur_indexing("%s" % (ex.rel), ex)
            return self._make_atom(pred, ex.var1, ex.var2)
        elif isinstance(ex, BoxerProp):
            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
        elif isinstance(ex, BoxerEq):
            return DrtEqualityExpression(
                DrtVariableExpression(Variable(ex.var1)),
                DrtVariableExpression(Variable(ex.var2)),
            )
        elif isinstance(ex, BoxerCard):
            pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex)
            return self._make_atom(pred, ex.var)
        elif isinstance(ex, BoxerOr):
            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
        elif isinstance(ex, BoxerWhq):
            # A wh-question becomes the merge of its restrictor and body.
            drs1 = self.interpret(ex.drs1)
            drs2 = self.interpret(ex.drs2)
            return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
        assert False, f"{ex.__class__.__name__}: {ex}"

    def _make_atom(self, pred, *args):
        """Build ``pred(arg1, arg2, ...)`` by repeated DRT application."""
        accum = DrtVariableExpression(Variable(pred))
        for arg in args:
            accum = DrtApplicationExpression(
                accum, DrtVariableExpression(Variable(arg))
            )
        return accum

    def _add_occur_indexing(self, base, ex):
        """Optionally suffix *base* with discourse id, sentence index, and
        the smallest word index of the occurrence."""
        if self._occur_index and ex.sent_index is not None:
            if ex.discourse_id:
                base += "_%s" % ex.discourse_id
            base += "_s%s" % ex.sent_index
            base += "_w%s" % sorted(ex.word_indices)[0]
        return base
1554
+
1555
+
1556
class UnparseableInputException(Exception):
    """Raised when Boxer's output cannot be parsed."""
1558
+
1559
+
1560
if __name__ == "__main__":
    # Command-line driver: interpret the argument text with Boxer and print
    # the resulting DRS (or its first-order form with --fol).
    opts = OptionParser("usage: %prog TEXT [options]")
    opts.add_option(
        "--verbose",
        "-v",
        help="display verbose logs",
        action="store_true",
        default=False,
        dest="verbose",
    )
    opts.add_option(
        "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol"
    )
    opts.add_option(
        "--question",
        "-q",
        help="input is a question",
        action="store_true",
        default=False,
        dest="question",
    )
    opts.add_option(
        "--occur",
        "-o",
        help="occurrence index",
        action="store_true",
        default=False,
        dest="occur_index",
    )
    (options, args) = opts.parse_args()

    if len(args) != 1:
        opts.error("incorrect number of arguments")

    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
    # NOTE: the argument is split on a literal backslash-n sequence (raw
    # string), so multi-sentence input uses "\n" inside one shell argument.
    drs = Boxer(interpreter).interpret_multi(
        args[0].split(r"\n"), question=options.question, verbose=options.verbose
    )
    if drs is None:
        print(None)
    else:
        drs = drs.simplify().eliminate_equality()
        if options.fol:
            print(drs.fol().normalize())
        else:
            drs.pretty_print()
.eggs/nltk-3.8-py3.10.egg/nltk/sem/drt_glue_demo.py ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse
2
+ # Representation Theory (DRT) as meaning language
3
+ #
4
+ # Author: Dan Garrette <dhgarrette@gmail.com>
5
+ #
6
+ # Copyright (C) 2001-2022 NLTK Project
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ try:
11
+ from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
12
+ from tkinter.font import Font
13
+
14
+ from nltk.draw.util import CanvasFrame, ShowText
15
+
16
+ except ImportError:
17
+ """Ignore ImportError because tkinter might not be available."""
18
+
19
+ from nltk.parse import MaltParser
20
+ from nltk.sem.drt import DrsDrawer, DrtVariableExpression
21
+ from nltk.sem.glue import DrtGlue
22
+ from nltk.sem.logic import Variable
23
+ from nltk.tag import RegexpTagger
24
+ from nltk.util import in_idle
25
+
26
+
27
+ class DrtGlueDemo:
28
+ def __init__(self, examples):
29
+ # Set up the main window.
30
+ self._top = Tk()
31
+ self._top.title("DRT Glue Demo")
32
+
33
+ # Set up key bindings.
34
+ self._init_bindings()
35
+
36
+ # Initialize the fonts.self._error = None
37
+ self._init_fonts(self._top)
38
+
39
+ self._examples = examples
40
+ self._readingCache = [None for example in examples]
41
+
42
+ # The user can hide the grammar.
43
+ self._show_grammar = IntVar(self._top)
44
+ self._show_grammar.set(1)
45
+
46
+ # Set the data to None
47
+ self._curExample = -1
48
+ self._readings = []
49
+ self._drs = None
50
+ self._drsWidget = None
51
+ self._error = None
52
+
53
+ self._init_glue()
54
+
55
+ # Create the basic frames.
56
+ self._init_menubar(self._top)
57
+ self._init_buttons(self._top)
58
+ self._init_exampleListbox(self._top)
59
+ self._init_readingListbox(self._top)
60
+ self._init_canvas(self._top)
61
+
62
+ # Resize callback
63
+ self._canvas.bind("<Configure>", self._configure)
64
+
65
+ #########################################
66
+ ## Initialization Helpers
67
+ #########################################
68
+
69
+ def _init_glue(self):
70
+ tagger = RegexpTagger(
71
+ [
72
+ ("^(David|Mary|John)$", "NNP"),
73
+ (
74
+ "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
75
+ "VB",
76
+ ),
77
+ ("^(go|order|vanish|find|approach)$", "VB"),
78
+ ("^(a)$", "ex_quant"),
79
+ ("^(every)$", "univ_quant"),
80
+ ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
81
+ ("^(big|gray|former)$", "JJ"),
82
+ ("^(him|himself)$", "PRP"),
83
+ ]
84
+ )
85
+
86
+ depparser = MaltParser(tagger=tagger)
87
+ self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
88
+
89
+ def _init_fonts(self, root):
90
+ # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
91
+ self._sysfont = Font(font=Button()["font"])
92
+ root.option_add("*Font", self._sysfont)
93
+
94
+ # TWhat's our font size (default=same as sysfont)
95
+ self._size = IntVar(root)
96
+ self._size.set(self._sysfont.cget("size"))
97
+
98
+ self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
99
+ self._font = Font(family="helvetica", size=self._size.get())
100
+ if self._size.get() < 0:
101
+ big = self._size.get() - 2
102
+ else:
103
+ big = self._size.get() + 2
104
+ self._bigfont = Font(family="helvetica", weight="bold", size=big)
105
+
106
+ def _init_exampleListbox(self, parent):
107
+ self._exampleFrame = listframe = Frame(parent)
108
+ self._exampleFrame.pack(fill="both", side="left", padx=2)
109
+ self._exampleList_label = Label(
110
+ self._exampleFrame, font=self._boldfont, text="Examples"
111
+ )
112
+ self._exampleList_label.pack()
113
+ self._exampleList = Listbox(
114
+ self._exampleFrame,
115
+ selectmode="single",
116
+ relief="groove",
117
+ background="white",
118
+ foreground="#909090",
119
+ font=self._font,
120
+ selectforeground="#004040",
121
+ selectbackground="#c0f0c0",
122
+ )
123
+
124
+ self._exampleList.pack(side="right", fill="both", expand=1)
125
+
126
+ for example in self._examples:
127
+ self._exampleList.insert("end", (" %s" % example))
128
+ self._exampleList.config(height=min(len(self._examples), 25), width=40)
129
+
130
+ # Add a scrollbar if there are more than 25 examples.
131
+ if len(self._examples) > 25:
132
+ listscroll = Scrollbar(self._exampleFrame, orient="vertical")
133
+ self._exampleList.config(yscrollcommand=listscroll.set)
134
+ listscroll.config(command=self._exampleList.yview)
135
+ listscroll.pack(side="left", fill="y")
136
+
137
+ # If they select a example, apply it.
138
+ self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
139
+
140
+ def _init_readingListbox(self, parent):
141
+ self._readingFrame = listframe = Frame(parent)
142
+ self._readingFrame.pack(fill="both", side="left", padx=2)
143
+ self._readingList_label = Label(
144
+ self._readingFrame, font=self._boldfont, text="Readings"
145
+ )
146
+ self._readingList_label.pack()
147
+ self._readingList = Listbox(
148
+ self._readingFrame,
149
+ selectmode="single",
150
+ relief="groove",
151
+ background="white",
152
+ foreground="#909090",
153
+ font=self._font,
154
+ selectforeground="#004040",
155
+ selectbackground="#c0f0c0",
156
+ )
157
+
158
+ self._readingList.pack(side="right", fill="both", expand=1)
159
+
160
+ # Add a scrollbar if there are more than 25 examples.
161
+ listscroll = Scrollbar(self._readingFrame, orient="vertical")
162
+ self._readingList.config(yscrollcommand=listscroll.set)
163
+ listscroll.config(command=self._readingList.yview)
164
+ listscroll.pack(side="right", fill="y")
165
+
166
+ self._populate_readingListbox()
167
+
168
+ def _populate_readingListbox(self):
169
+ # Populate the listbox with integers
170
+ self._readingList.delete(0, "end")
171
+ for i in range(len(self._readings)):
172
+ self._readingList.insert("end", (" %s" % (i + 1)))
173
+ self._readingList.config(height=min(len(self._readings), 25), width=5)
174
+
175
+ # If they select a example, apply it.
176
+ self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
177
+
178
+ def _init_bindings(self):
179
+ # Key bindings are a good thing.
180
+ self._top.bind("<Control-q>", self.destroy)
181
+ self._top.bind("<Control-x>", self.destroy)
182
+ self._top.bind("<Escape>", self.destroy)
183
+ self._top.bind("n", self.next)
184
+ self._top.bind("<space>", self.next)
185
+ self._top.bind("p", self.prev)
186
+ self._top.bind("<BackSpace>", self.prev)
187
+
188
+ def _init_buttons(self, parent):
189
+ # Set up the frames.
190
+ self._buttonframe = buttonframe = Frame(parent)
191
+ buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
192
+ Button(
193
+ buttonframe,
194
+ text="Prev",
195
+ background="#90c0d0",
196
+ foreground="black",
197
+ command=self.prev,
198
+ ).pack(side="left")
199
+ Button(
200
+ buttonframe,
201
+ text="Next",
202
+ background="#90c0d0",
203
+ foreground="black",
204
+ command=self.next,
205
+ ).pack(side="left")
206
+
207
+ def _configure(self, event):
208
+ self._autostep = 0
209
+ (x1, y1, x2, y2) = self._cframe.scrollregion()
210
+ y2 = event.height - 6
211
+ self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
212
+ self._redraw()
213
+
214
+ def _init_canvas(self, parent):
215
+ self._cframe = CanvasFrame(
216
+ parent,
217
+ background="white",
218
+ # width=525, height=250,
219
+ closeenough=10,
220
+ border=2,
221
+ relief="sunken",
222
+ )
223
+ self._cframe.pack(expand=1, fill="both", side="top", pady=2)
224
+ canvas = self._canvas = self._cframe.canvas()
225
+
226
+ # Initially, there's no tree or text
227
+ self._tree = None
228
+ self._textwidgets = []
229
+ self._textline = None
230
+
231
+ def _init_menubar(self, parent):
232
+ menubar = Menu(parent)
233
+
234
+ filemenu = Menu(menubar, tearoff=0)
235
+ filemenu.add_command(
236
+ label="Exit", underline=1, command=self.destroy, accelerator="q"
237
+ )
238
+ menubar.add_cascade(label="File", underline=0, menu=filemenu)
239
+
240
+ actionmenu = Menu(menubar, tearoff=0)
241
+ actionmenu.add_command(
242
+ label="Next", underline=0, command=self.next, accelerator="n, Space"
243
+ )
244
+ actionmenu.add_command(
245
+ label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
246
+ )
247
+ menubar.add_cascade(label="Action", underline=0, menu=actionmenu)
248
+
249
+ optionmenu = Menu(menubar, tearoff=0)
250
+ optionmenu.add_checkbutton(
251
+ label="Remove Duplicates",
252
+ underline=0,
253
+ variable=self._glue.remove_duplicates,
254
+ command=self._toggle_remove_duplicates,
255
+ accelerator="r",
256
+ )
257
+ menubar.add_cascade(label="Options", underline=0, menu=optionmenu)
258
+
259
+ viewmenu = Menu(menubar, tearoff=0)
260
+ viewmenu.add_radiobutton(
261
+ label="Tiny",
262
+ variable=self._size,
263
+ underline=0,
264
+ value=10,
265
+ command=self.resize,
266
+ )
267
+ viewmenu.add_radiobutton(
268
+ label="Small",
269
+ variable=self._size,
270
+ underline=0,
271
+ value=12,
272
+ command=self.resize,
273
+ )
274
+ viewmenu.add_radiobutton(
275
+ label="Medium",
276
+ variable=self._size,
277
+ underline=0,
278
+ value=14,
279
+ command=self.resize,
280
+ )
281
+ viewmenu.add_radiobutton(
282
+ label="Large",
283
+ variable=self._size,
284
+ underline=0,
285
+ value=18,
286
+ command=self.resize,
287
+ )
288
+ viewmenu.add_radiobutton(
289
+ label="Huge",
290
+ variable=self._size,
291
+ underline=0,
292
+ value=24,
293
+ command=self.resize,
294
+ )
295
+ menubar.add_cascade(label="View", underline=0, menu=viewmenu)
296
+
297
+ helpmenu = Menu(menubar, tearoff=0)
298
+ helpmenu.add_command(label="About", underline=0, command=self.about)
299
+ menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
300
+
301
+ parent.config(menu=menubar)
302
+
303
+ #########################################
304
+ ## Main draw procedure
305
+ #########################################
306
+
307
+ def _redraw(self):
308
+ canvas = self._canvas
309
+
310
+ # Delete the old DRS, widgets, etc.
311
+ if self._drsWidget is not None:
312
+ self._drsWidget.clear()
313
+
314
+ if self._drs:
315
+ self._drsWidget = DrsWidget(self._canvas, self._drs)
316
+ self._drsWidget.draw()
317
+
318
+ if self._error:
319
+ self._drsWidget = DrsWidget(self._canvas, self._error)
320
+ self._drsWidget.draw()
321
+
322
+ #########################################
323
+ ## Button Callbacks
324
+ #########################################
325
+
326
+ def destroy(self, *e):
327
+ self._autostep = 0
328
+ if self._top is None:
329
+ return
330
+ self._top.destroy()
331
+ self._top = None
332
+
333
+ def prev(self, *e):
334
+ selection = self._readingList.curselection()
335
+ readingListSize = self._readingList.size()
336
+
337
+ # there are readings
338
+ if readingListSize > 0:
339
+ # if one reading is currently selected
340
+ if len(selection) == 1:
341
+ index = int(selection[0])
342
+
343
+ # if it's on (or before) the first item
344
+ if index <= 0:
345
+ self._select_previous_example()
346
+ else:
347
+ self._readingList_store_selection(index - 1)
348
+
349
+ else:
350
+ # select its first reading
351
+ self._readingList_store_selection(readingListSize - 1)
352
+
353
+ else:
354
+ self._select_previous_example()
355
+
356
+ def _select_previous_example(self):
357
+ # if the current example is not the first example
358
+ if self._curExample > 0:
359
+ self._exampleList_store_selection(self._curExample - 1)
360
+ else:
361
+ # go to the last example
362
+ self._exampleList_store_selection(len(self._examples) - 1)
363
+
364
+ def next(self, *e):
365
+ selection = self._readingList.curselection()
366
+ readingListSize = self._readingList.size()
367
+
368
+ # if there are readings
369
+ if readingListSize > 0:
370
+ # if one reading is currently selected
371
+ if len(selection) == 1:
372
+ index = int(selection[0])
373
+
374
+ # if it's on (or past) the last item
375
+ if index >= (readingListSize - 1):
376
+ self._select_next_example()
377
+ else:
378
+ self._readingList_store_selection(index + 1)
379
+
380
+ else:
381
+ # select its first reading
382
+ self._readingList_store_selection(0)
383
+
384
+ else:
385
+ self._select_next_example()
386
+
387
+ def _select_next_example(self):
388
+ # if the current example is not the last example
389
+ if self._curExample < len(self._examples) - 1:
390
+ self._exampleList_store_selection(self._curExample + 1)
391
+ else:
392
+ # go to the first example
393
+ self._exampleList_store_selection(0)
394
+
395
+ def about(self, *e):
396
+ ABOUT = (
397
+ "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
398
+ + "Written by Daniel H. Garrette"
399
+ )
400
+ TITLE = "About: NLTK DRT Glue Demo"
401
+ try:
402
+ from tkinter.messagebox import Message
403
+
404
+ Message(message=ABOUT, title=TITLE).show()
405
+ except:
406
+ ShowText(self._top, TITLE, ABOUT)
407
+
408
+ def postscript(self, *e):
409
+ self._autostep = 0
410
+ self._cframe.print_to_file()
411
+
412
+ def mainloop(self, *args, **kwargs):
413
+ """
414
+ Enter the Tkinter mainloop. This function must be called if
415
+ this demo is created from a non-interactive program (e.g.
416
+ from a secript); otherwise, the demo will close as soon as
417
+ the script completes.
418
+ """
419
+ if in_idle():
420
+ return
421
+ self._top.mainloop(*args, **kwargs)
422
+
423
+ def resize(self, size=None):
424
+ if size is not None:
425
+ self._size.set(size)
426
+ size = self._size.get()
427
+ self._font.configure(size=-(abs(size)))
428
+ self._boldfont.configure(size=-(abs(size)))
429
+ self._sysfont.configure(size=-(abs(size)))
430
+ self._bigfont.configure(size=-(abs(size + 2)))
431
+ self._redraw()
432
+
433
+ def _toggle_remove_duplicates(self):
434
+ self._glue.remove_duplicates = not self._glue.remove_duplicates
435
+
436
+ self._exampleList.selection_clear(0, "end")
437
+ self._readings = []
438
+ self._populate_readingListbox()
439
+ self._readingCache = [None for ex in self._examples]
440
+ self._curExample = -1
441
+ self._error = None
442
+
443
+ self._drs = None
444
+ self._redraw()
445
+
446
+ def _exampleList_select(self, event):
447
+ selection = self._exampleList.curselection()
448
+ if len(selection) != 1:
449
+ return
450
+ self._exampleList_store_selection(int(selection[0]))
451
+
452
+ def _exampleList_store_selection(self, index):
453
+ self._curExample = index
454
+ example = self._examples[index]
455
+
456
+ self._exampleList.selection_clear(0, "end")
457
+ if example:
458
+ cache = self._readingCache[index]
459
+ if cache:
460
+ if isinstance(cache, list):
461
+ self._readings = cache
462
+ self._error = None
463
+ else:
464
+ self._readings = []
465
+ self._error = cache
466
+ else:
467
+ try:
468
+ self._readings = self._glue.parse_to_meaning(example)
469
+ self._error = None
470
+ self._readingCache[index] = self._readings
471
+ except Exception as e:
472
+ self._readings = []
473
+ self._error = DrtVariableExpression(Variable("Error: " + str(e)))
474
+ self._readingCache[index] = self._error
475
+
476
+ # add a star to the end of the example
477
+ self._exampleList.delete(index)
478
+ self._exampleList.insert(index, (" %s *" % example))
479
+ self._exampleList.config(
480
+ height=min(len(self._examples), 25), width=40
481
+ )
482
+
483
+ self._populate_readingListbox()
484
+
485
+ self._exampleList.selection_set(index)
486
+
487
+ self._drs = None
488
+ self._redraw()
489
+
490
+ def _readingList_select(self, event):
491
+ selection = self._readingList.curselection()
492
+ if len(selection) != 1:
493
+ return
494
+ self._readingList_store_selection(int(selection[0]))
495
+
496
+ def _readingList_store_selection(self, index):
497
+ reading = self._readings[index]
498
+
499
+ self._readingList.selection_clear(0, "end")
500
+ if reading:
501
+ self._readingList.selection_set(index)
502
+
503
+ self._drs = reading.simplify().normalize().resolve_anaphora()
504
+
505
+ self._redraw()
506
+
507
+
508
class DrsWidget:
    """Adapter that draws a single DRS onto a Tkinter canvas."""

    def __init__(self, canvas, drs, **attribs):
        self._drs = drs
        self._canvas = canvas
        # Expose the canvas's default text font for the drawer to use.
        canvas.font = Font(
            font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font")
        )
        canvas._BUFFER = 3
        # Bounding box of the most recent drawing; used by clear().
        self.bbox = (0, 0, 0, 0)

    def draw(self):
        """Render the DRS and remember the area it occupies."""
        (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw()
        self.bbox = (0, 0, right + 1, bottom + 1)

    def clear(self):
        """Blank out the area covered by the previous draw()."""
        self._canvas.create_rectangle(self.bbox, fill="white", width="0")
def demo():
    """Launch the DRT Glue demo on a small set of example sentences."""
    examples = [
        "John walks",
        "David sees Mary",
        "David eats a sandwich",
        "every man chases a dog",
        "John chases himself",
        # Further sentences the grammar supports, disabled by default:
        # 'every man believes a dog yawns',
        # 'John gives David a sandwich',
        # 'John persuades David to order a pizza',
        # 'John tries to go',
        # 'John tries to find a unicorn',
        # 'John seems to vanish',
        # 'a unicorn seems to approach',
        # 'every big cat leaves',
        # 'every gray cat leaves',
        # 'every big gray cat leaves',
        # 'a former senator leaves',
        # 'John likes a cat',
        # 'John likes every cat',
        # 'he walks',
        # 'John walks and he leaves'
    ]
    DrtGlueDemo(examples).mainloop()
+ if __name__ == "__main__":
553
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/sem/glue.py ADDED
@@ -0,0 +1,835 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Glue Semantics
2
+ #
3
+ # Author: Dan Garrette <dhgarrette@gmail.com>
4
+ #
5
+ # Copyright (C) 2001-2022 NLTK Project
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import os
10
+ from itertools import chain
11
+
12
+ import nltk
13
+ from nltk.internals import Counter
14
+ from nltk.sem import drt, linearlogic
15
+ from nltk.sem.logic import (
16
+ AbstractVariableExpression,
17
+ Expression,
18
+ LambdaExpression,
19
+ Variable,
20
+ VariableExpression,
21
+ )
22
+ from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger
23
+
24
# Mapping from specifier words to semantic-type names used by the glue
# dictionary; "default" is used for specifiers not listed explicitly.
SPEC_SEMTYPES = {
    "a": "ex_quant",
    "an": "ex_quant",
    "every": "univ_quant",
    "the": "def_art",
    "no": "no_quant",
    "default": "ex_quant",
}

# Dependency relations that a glue-dictionary entry is not required to match.
OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
class GlueFormula:
    """A meaning term paired with a linear-logic glue term, plus the set of
    compiled-premise indices that produced it."""

    def __init__(self, meaning, glue, indices=None):
        if not indices:
            indices = set()

        # Accept either source strings or already-parsed expressions.
        if isinstance(meaning, str):
            self.meaning = Expression.fromstring(meaning)
        elif isinstance(meaning, Expression):
            self.meaning = meaning
        else:
            raise RuntimeError(
                "Meaning term neither string or expression: %s, %s"
                % (meaning, meaning.__class__)
            )

        if isinstance(glue, str):
            self.glue = linearlogic.LinearLogicParser().parse(glue)
        elif isinstance(glue, linearlogic.Expression):
            self.glue = glue
        else:
            raise RuntimeError(
                "Glue term neither string or expression: %s, %s"
                % (glue, glue.__class__)
            )

        self.indices = indices

    def applyto(self, arg):
        """self = (\\x.(walk x), (subj -o f))
        arg = (john , subj)
        returns ((walk john), f)
        """
        # The premise index sets must be disjoint for a valid application.
        if self.indices & arg.indices:
            raise linearlogic.LinearLogicApplicationException(
                f"'{self}' applied to '{arg}'. Indices are not disjoint."
            )
        else:
            return_indices = self.indices | arg.indices

        try:
            return_glue = linearlogic.ApplicationExpression(
                self.glue, arg.glue, arg.indices
            )
        except linearlogic.LinearLogicApplicationException as e:
            raise linearlogic.LinearLogicApplicationException(
                f"'{self.simplify()}' applied to '{arg.simplify()}'"
            ) from e

        arg_meaning_abstracted = arg.meaning
        if return_indices:
            # If self.glue is (A -o B), abstract over each dependency of A
            # (in reverse order) so the argument can consume them.
            for dep in self.glue.simplify().antecedent.dependencies[::-1]:
                arg_meaning_abstracted = self.make_LambdaExpression(
                    Variable("v%s" % dep), arg_meaning_abstracted
                )
        return_meaning = self.meaning.applyto(arg_meaning_abstracted)

        return self.__class__(return_meaning, return_glue, return_indices)

    def make_VariableExpression(self, name):
        """Factory hook: build a meaning-side variable expression."""
        return VariableExpression(name)

    def make_LambdaExpression(self, variable, term):
        """Factory hook: build a meaning-side lambda abstraction."""
        return LambdaExpression(variable, term)

    def lambda_abstract(self, other):
        """Abstract this formula over `other` (a variable glue formula)."""
        assert isinstance(other, GlueFormula)
        assert isinstance(other.meaning, AbstractVariableExpression)
        return self.__class__(
            self.make_LambdaExpression(other.meaning.variable, self.meaning),
            linearlogic.ImpExpression(other.glue, self.glue),
        )

    def compile(self, counter=None):
        """From Iddo Lev's PhD Dissertation p108-109"""
        if not counter:
            counter = Counter()
        (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
            counter, self.__class__
        )
        return new_forms + [
            self.__class__(self.meaning, compiled_glue, {counter.get()})
        ]

    def simplify(self):
        """Return a copy with both the meaning and glue terms simplified."""
        return self.__class__(
            self.meaning.simplify(), self.glue.simplify(), self.indices
        )

    def __eq__(self, other):
        return (
            self.__class__ == other.__class__
            and self.meaning == other.meaning
            and self.glue == other.glue
        )

    def __ne__(self, other):
        return not self == other

    # sorting for use in doctests which must be deterministic
    def __lt__(self, other):
        return str(self) < str(other)

    def __str__(self):
        assert isinstance(self.indices, set)
        accum = f"{self.meaning} : {self.glue}"
        if self.indices:
            accum += (
                " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}"
            )
        return accum

    def __repr__(self):
        return "%s" % self
+ class GlueDict(dict):
154
+ def __init__(self, filename, encoding=None):
155
+ self.filename = filename
156
+ self.file_encoding = encoding
157
+ self.read_file()
158
+
159
+ def read_file(self, empty_first=True):
160
+ if empty_first:
161
+ self.clear()
162
+
163
+ try:
164
+ contents = nltk.data.load(
165
+ self.filename, format="text", encoding=self.file_encoding
166
+ )
167
+ # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
168
+ except LookupError as e:
169
+ try:
170
+ contents = nltk.data.load(
171
+ "file:" + self.filename, format="text", encoding=self.file_encoding
172
+ )
173
+ except LookupError:
174
+ raise e
175
+ lines = contents.splitlines()
176
+
177
+ for line in lines: # example: 'n : (\\x.(<word> x), (v-or))'
178
+ # lambdacalc -^ linear logic -^
179
+ line = line.strip() # remove trailing newline
180
+ if not len(line):
181
+ continue # skip empty lines
182
+ if line[0] == "#":
183
+ continue # skip commented out lines
184
+
185
+ parts = line.split(
186
+ " : ", 2
187
+ ) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
188
+
189
+ glue_formulas = []
190
+ paren_count = 0
191
+ tuple_start = 0
192
+ tuple_comma = 0
193
+
194
+ relationships = None
195
+
196
+ if len(parts) > 1:
197
+ for (i, c) in enumerate(parts[1]):
198
+ if c == "(":
199
+ if paren_count == 0: # if it's the first '(' of a tuple
200
+ tuple_start = i + 1 # then save the index
201
+ paren_count += 1
202
+ elif c == ")":
203
+ paren_count -= 1
204
+ if paren_count == 0: # if it's the last ')' of a tuple
205
+ meaning_term = parts[1][
206
+ tuple_start:tuple_comma
207
+ ] # '\\x.(<word> x)'
208
+ glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)'
209
+ glue_formulas.append(
210
+ [meaning_term, glue_term]
211
+ ) # add the GlueFormula to the list
212
+ elif c == ",":
213
+ if (
214
+ paren_count == 1
215
+ ): # if it's a comma separating the parts of the tuple
216
+ tuple_comma = i # then save the index
217
+ elif c == "#": # skip comments at the ends of lines
218
+ if (
219
+ paren_count != 0
220
+ ): # if the line hasn't parsed correctly so far
221
+ raise RuntimeError(
222
+ "Formula syntax is incorrect for entry " + line
223
+ )
224
+ break # break to the next line
225
+
226
+ if len(parts) > 2: # if there is a relationship entry at the end
227
+ rel_start = parts[2].index("[") + 1
228
+ rel_end = parts[2].index("]")
229
+ if rel_start == rel_end:
230
+ relationships = frozenset()
231
+ else:
232
+ relationships = frozenset(
233
+ r.strip() for r in parts[2][rel_start:rel_end].split(",")
234
+ )
235
+
236
+ try:
237
+ start_inheritance = parts[0].index("(")
238
+ end_inheritance = parts[0].index(")")
239
+ sem = parts[0][:start_inheritance].strip()
240
+ supertype = parts[0][start_inheritance + 1 : end_inheritance]
241
+ except:
242
+ sem = parts[0].strip()
243
+ supertype = None
244
+
245
+ if sem not in self:
246
+ self[sem] = {}
247
+
248
+ if (
249
+ relationships is None
250
+ ): # if not specified for a specific relationship set
251
+ # add all relationship entries for parents
252
+ if supertype:
253
+ for rels in self[supertype]:
254
+ if rels not in self[sem]:
255
+ self[sem][rels] = []
256
+ glue = self[supertype][rels]
257
+ self[sem][rels].extend(glue)
258
+ self[sem][rels].extend(
259
+ glue_formulas
260
+ ) # add the glue formulas to every rel entry
261
+ else:
262
+ if None not in self[sem]:
263
+ self[sem][None] = []
264
+ self[sem][None].extend(
265
+ glue_formulas
266
+ ) # add the glue formulas to every rel entry
267
+ else:
268
+ if relationships not in self[sem]:
269
+ self[sem][relationships] = []
270
+ if supertype:
271
+ self[sem][relationships].extend(self[supertype][relationships])
272
+ self[sem][relationships].extend(
273
+ glue_formulas
274
+ ) # add the glue entry to the dictionary
275
+
276
+ def __str__(self):
277
+ accum = ""
278
+ for pos in self:
279
+ str_pos = "%s" % pos
280
+ for relset in self[pos]:
281
+ i = 1
282
+ for gf in self[pos][relset]:
283
+ if i == 1:
284
+ accum += str_pos + ": "
285
+ else:
286
+ accum += " " * (len(str_pos) + 2)
287
+ accum += "%s" % gf
288
+ if relset and i == len(self[pos][relset]):
289
+ accum += " : %s" % relset
290
+ accum += "\n"
291
+ i += 1
292
+ return accum
293
+
294
+ def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
295
+ if node is None:
296
+ # TODO: should it be depgraph.root? Is this code tested?
297
+ top = depgraph.nodes[0]
298
+ depList = list(chain.from_iterable(top["deps"].values()))
299
+ root = depgraph.nodes[depList[0]]
300
+
301
+ return self.to_glueformula_list(depgraph, root, Counter(), verbose)
302
+
303
+ glueformulas = self.lookup(node, depgraph, counter)
304
+ for dep_idx in chain.from_iterable(node["deps"].values()):
305
+ dep = depgraph.nodes[dep_idx]
306
+ glueformulas.extend(
307
+ self.to_glueformula_list(depgraph, dep, counter, verbose)
308
+ )
309
+ return glueformulas
310
+
311
+ def lookup(self, node, depgraph, counter):
312
+ semtype_names = self.get_semtypes(node)
313
+
314
+ semtype = None
315
+ for name in semtype_names:
316
+ if name in self:
317
+ semtype = self[name]
318
+ break
319
+ if semtype is None:
320
+ # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
321
+ return []
322
+
323
+ self.add_missing_dependencies(node, depgraph)
324
+
325
+ lookup = self._lookup_semtype_option(semtype, node, depgraph)
326
+
327
+ if not len(lookup):
328
+ raise KeyError(
329
+ "There is no GlueDict entry for sem type of '%s' "
330
+ "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
331
+ )
332
+
333
+ return self.get_glueformulas_from_semtype_entry(
334
+ lookup, node["word"], node, depgraph, counter
335
+ )
336
+
337
+ def add_missing_dependencies(self, node, depgraph):
338
+ rel = node["rel"].lower()
339
+
340
+ if rel == "main":
341
+ headnode = depgraph.nodes[node["head"]]
342
+ subj = self.lookup_unique("subj", headnode, depgraph)
343
+ relation = subj["rel"]
344
+ node["deps"].setdefault(relation, [])
345
+ node["deps"][relation].append(subj["address"])
346
+ # node['deps'].append(subj['address'])
347
+
348
+ def _lookup_semtype_option(self, semtype, node, depgraph):
349
+ relationships = frozenset(
350
+ depgraph.nodes[dep]["rel"].lower()
351
+ for dep in chain.from_iterable(node["deps"].values())
352
+ if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
353
+ )
354
+
355
+ try:
356
+ lookup = semtype[relationships]
357
+ except KeyError:
358
+ # An exact match is not found, so find the best match where
359
+ # 'best' is defined as the glue entry whose relationship set has the
360
+ # most relations of any possible relationship set that is a subset
361
+ # of the actual depgraph
362
+ best_match = frozenset()
363
+ for relset_option in set(semtype) - {None}:
364
+ if (
365
+ len(relset_option) > len(best_match)
366
+ and relset_option < relationships
367
+ ):
368
+ best_match = relset_option
369
+ if not best_match:
370
+ if None in semtype:
371
+ best_match = None
372
+ else:
373
+ return None
374
+ lookup = semtype[best_match]
375
+
376
+ return lookup
377
+
378
+ def get_semtypes(self, node):
379
+ """
380
+ Based on the node, return a list of plausible semtypes in order of
381
+ plausibility.
382
+ """
383
+ rel = node["rel"].lower()
384
+ word = node["word"].lower()
385
+
386
+ if rel == "spec":
387
+ if word in SPEC_SEMTYPES:
388
+ return [SPEC_SEMTYPES[word]]
389
+ else:
390
+ return [SPEC_SEMTYPES["default"]]
391
+ elif rel in ["nmod", "vmod"]:
392
+ return [node["tag"], rel]
393
+ else:
394
+ return [node["tag"]]
395
+
396
+ def get_glueformulas_from_semtype_entry(
397
+ self, lookup, word, node, depgraph, counter
398
+ ):
399
+ glueformulas = []
400
+
401
+ glueFormulaFactory = self.get_GlueFormula_factory()
402
+ for meaning, glue in lookup:
403
+ gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
404
+ if not len(glueformulas):
405
+ gf.word = word
406
+ else:
407
+ gf.word = f"{word}{len(glueformulas) + 1}"
408
+
409
+ gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
410
+
411
+ glueformulas.append(gf)
412
+ return glueformulas
413
+
414
+ def get_meaning_formula(self, generic, word):
415
+ """
416
+ :param generic: A meaning formula string containing the
417
+ parameter "<word>"
418
+ :param word: The actual word to be replace "<word>"
419
+ """
420
+ word = word.replace(".", "")
421
+ return generic.replace("<word>", word)
422
+
423
+ def initialize_labels(self, expr, node, depgraph, unique_index):
424
+ if isinstance(expr, linearlogic.AtomicExpression):
425
+ name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
426
+ if name[0].isupper():
427
+ return linearlogic.VariableExpression(name)
428
+ else:
429
+ return linearlogic.ConstantExpression(name)
430
+ else:
431
+ return linearlogic.ImpExpression(
432
+ self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
433
+ self.initialize_labels(expr.consequent, node, depgraph, unique_index),
434
+ )
435
+
436
+ def find_label_name(self, name, node, depgraph, unique_index):
437
+ try:
438
+ dot = name.index(".")
439
+
440
+ before_dot = name[:dot]
441
+ after_dot = name[dot + 1 :]
442
+ if before_dot == "super":
443
+ return self.find_label_name(
444
+ after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
445
+ )
446
+ else:
447
+ return self.find_label_name(
448
+ after_dot,
449
+ self.lookup_unique(before_dot, node, depgraph),
450
+ depgraph,
451
+ unique_index,
452
+ )
453
+ except ValueError:
454
+ lbl = self.get_label(node)
455
+ if name == "f":
456
+ return lbl
457
+ elif name == "v":
458
+ return "%sv" % lbl
459
+ elif name == "r":
460
+ return "%sr" % lbl
461
+ elif name == "super":
462
+ return self.get_label(depgraph.nodes[node["head"]])
463
+ elif name == "var":
464
+ return f"{lbl.upper()}{unique_index}"
465
+ elif name == "a":
466
+ return self.get_label(self.lookup_unique("conja", node, depgraph))
467
+ elif name == "b":
468
+ return self.get_label(self.lookup_unique("conjb", node, depgraph))
469
+ else:
470
+ return self.get_label(self.lookup_unique(name, node, depgraph))
471
+
472
+ def get_label(self, node):
473
+ """
474
+ Pick an alphabetic character as identifier for an entity in the model.
475
+
476
+ :param value: where to index into the list of characters
477
+ :type value: int
478
+ """
479
+ value = node["address"]
480
+
481
+ letter = [
482
+ "f",
483
+ "g",
484
+ "h",
485
+ "i",
486
+ "j",
487
+ "k",
488
+ "l",
489
+ "m",
490
+ "n",
491
+ "o",
492
+ "p",
493
+ "q",
494
+ "r",
495
+ "s",
496
+ "t",
497
+ "u",
498
+ "v",
499
+ "w",
500
+ "x",
501
+ "y",
502
+ "z",
503
+ "a",
504
+ "b",
505
+ "c",
506
+ "d",
507
+ "e",
508
+ ][value - 1]
509
+ num = int(value) // 26
510
+ if num > 0:
511
+ return letter + str(num)
512
+ else:
513
+ return letter
514
+
515
+ def lookup_unique(self, rel, node, depgraph):
516
+ """
517
+ Lookup 'key'. There should be exactly one item in the associated relation.
518
+ """
519
+ deps = [
520
+ depgraph.nodes[dep]
521
+ for dep in chain.from_iterable(node["deps"].values())
522
+ if depgraph.nodes[dep]["rel"].lower() == rel.lower()
523
+ ]
524
+
525
+ if len(deps) == 0:
526
+ raise KeyError(
527
+ "'{}' doesn't contain a feature '{}'".format(node["word"], rel)
528
+ )
529
+ elif len(deps) > 1:
530
+ raise KeyError(
531
+ "'{}' should only have one feature '{}'".format(node["word"], rel)
532
+ )
533
+ else:
534
+ return deps[0]
535
+
536
+ def get_GlueFormula_factory(self):
537
+ return GlueFormula
538
+
539
+
540
+ class Glue:
541
+ def __init__(
542
+ self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
543
+ ):
544
+ self.verbose = verbose
545
+ self.remove_duplicates = remove_duplicates
546
+ self.depparser = depparser
547
+
548
+ from nltk import Prover9
549
+
550
+ self.prover = Prover9()
551
+
552
+ if semtype_file:
553
+ self.semtype_file = semtype_file
554
+ else:
555
+ self.semtype_file = os.path.join(
556
+ "grammars", "sample_grammars", "glue.semtype"
557
+ )
558
+
559
+ def train_depparser(self, depgraphs=None):
560
+ if depgraphs:
561
+ self.depparser.train(depgraphs)
562
+ else:
563
+ self.depparser.train_from_file(
564
+ nltk.data.find(
565
+ os.path.join("grammars", "sample_grammars", "glue_train.conll")
566
+ )
567
+ )
568
+
569
+ def parse_to_meaning(self, sentence):
570
+ readings = []
571
+ for agenda in self.parse_to_compiled(sentence):
572
+ readings.extend(self.get_readings(agenda))
573
+ return readings
574
+
575
+ def get_readings(self, agenda):
576
+ readings = []
577
+ agenda_length = len(agenda)
578
+ atomics = dict()
579
+ nonatomics = dict()
580
+ while agenda: # is not empty
581
+ cur = agenda.pop()
582
+ glue_simp = cur.glue.simplify()
583
+ if isinstance(
584
+ glue_simp, linearlogic.ImpExpression
585
+ ): # if cur.glue is non-atomic
586
+ for key in atomics:
587
+ try:
588
+ if isinstance(cur.glue, linearlogic.ApplicationExpression):
589
+ bindings = cur.glue.bindings
590
+ else:
591
+ bindings = linearlogic.BindingDict()
592
+ glue_simp.antecedent.unify(key, bindings)
593
+ for atomic in atomics[key]:
594
+ if not (
595
+ cur.indices & atomic.indices
596
+ ): # if the sets of indices are disjoint
597
+ try:
598
+ agenda.append(cur.applyto(atomic))
599
+ except linearlogic.LinearLogicApplicationException:
600
+ pass
601
+ except linearlogic.UnificationException:
602
+ pass
603
+ try:
604
+ nonatomics[glue_simp.antecedent].append(cur)
605
+ except KeyError:
606
+ nonatomics[glue_simp.antecedent] = [cur]
607
+
608
+ else: # else cur.glue is atomic
609
+ for key in nonatomics:
610
+ for nonatomic in nonatomics[key]:
611
+ try:
612
+ if isinstance(
613
+ nonatomic.glue, linearlogic.ApplicationExpression
614
+ ):
615
+ bindings = nonatomic.glue.bindings
616
+ else:
617
+ bindings = linearlogic.BindingDict()
618
+ glue_simp.unify(key, bindings)
619
+ if not (
620
+ cur.indices & nonatomic.indices
621
+ ): # if the sets of indices are disjoint
622
+ try:
623
+ agenda.append(nonatomic.applyto(cur))
624
+ except linearlogic.LinearLogicApplicationException:
625
+ pass
626
+ except linearlogic.UnificationException:
627
+ pass
628
+ try:
629
+ atomics[glue_simp].append(cur)
630
+ except KeyError:
631
+ atomics[glue_simp] = [cur]
632
+
633
+ for entry in atomics:
634
+ for gf in atomics[entry]:
635
+ if len(gf.indices) == agenda_length:
636
+ self._add_to_reading_list(gf, readings)
637
+ for entry in nonatomics:
638
+ for gf in nonatomics[entry]:
639
+ if len(gf.indices) == agenda_length:
640
+ self._add_to_reading_list(gf, readings)
641
+ return readings
642
+
643
+ def _add_to_reading_list(self, glueformula, reading_list):
644
+ add_reading = True
645
+ if self.remove_duplicates:
646
+ for reading in reading_list:
647
+ try:
648
+ if reading.equiv(glueformula.meaning, self.prover):
649
+ add_reading = False
650
+ break
651
+ except Exception as e:
652
+ # if there is an exception, the syntax of the formula
653
+ # may not be understandable by the prover, so don't
654
+ # throw out the reading.
655
+ print("Error when checking logical equality of statements", e)
656
+
657
+ if add_reading:
658
+ reading_list.append(glueformula.meaning)
659
+
660
+ def parse_to_compiled(self, sentence):
661
+ gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
662
+ return [self.gfl_to_compiled(gfl) for gfl in gfls]
663
+
664
+ def dep_parse(self, sentence):
665
+ """
666
+ Return a dependency graph for the sentence.
667
+
668
+ :param sentence: the sentence to be parsed
669
+ :type sentence: list(str)
670
+ :rtype: DependencyGraph
671
+ """
672
+
673
+ # Lazy-initialize the depparser
674
+ if self.depparser is None:
675
+ from nltk.parse import MaltParser
676
+
677
+ self.depparser = MaltParser(tagger=self.get_pos_tagger())
678
+ if not self.depparser._trained:
679
+ self.train_depparser()
680
+ return self.depparser.parse(sentence, verbose=self.verbose)
681
+
682
+ def depgraph_to_glue(self, depgraph):
683
+ return self.get_glue_dict().to_glueformula_list(depgraph)
684
+
685
+ def get_glue_dict(self):
686
+ return GlueDict(self.semtype_file)
687
+
688
+ def gfl_to_compiled(self, gfl):
689
+ index_counter = Counter()
690
+ return_list = []
691
+ for gf in gfl:
692
+ return_list.extend(gf.compile(index_counter))
693
+
694
+ if self.verbose:
695
+ print("Compiled Glue Premises:")
696
+ for cgf in return_list:
697
+ print(cgf)
698
+
699
+ return return_list
700
+
701
+ def get_pos_tagger(self):
702
+ from nltk.corpus import brown
703
+
704
+ regexp_tagger = RegexpTagger(
705
+ [
706
+ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
707
+ (r"(The|the|A|a|An|an)$", "AT"), # articles
708
+ (r".*able$", "JJ"), # adjectives
709
+ (r".*ness$", "NN"), # nouns formed from adjectives
710
+ (r".*ly$", "RB"), # adverbs
711
+ (r".*s$", "NNS"), # plural nouns
712
+ (r".*ing$", "VBG"), # gerunds
713
+ (r".*ed$", "VBD"), # past tense verbs
714
+ (r".*", "NN"), # nouns (default)
715
+ ]
716
+ )
717
+ brown_train = brown.tagged_sents(categories="news")
718
+ unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
719
+ bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
720
+ trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
721
+
722
+ # Override particular words
723
+ main_tagger = RegexpTagger(
724
+ [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
725
+ backoff=trigram_tagger,
726
+ )
727
+
728
+ return main_tagger
729
+
730
+
731
+ class DrtGlueFormula(GlueFormula):
732
+ def __init__(self, meaning, glue, indices=None):
733
+ if not indices:
734
+ indices = set()
735
+
736
+ if isinstance(meaning, str):
737
+ self.meaning = drt.DrtExpression.fromstring(meaning)
738
+ elif isinstance(meaning, drt.DrtExpression):
739
+ self.meaning = meaning
740
+ else:
741
+ raise RuntimeError(
742
+ "Meaning term neither string or expression: %s, %s"
743
+ % (meaning, meaning.__class__)
744
+ )
745
+
746
+ if isinstance(glue, str):
747
+ self.glue = linearlogic.LinearLogicParser().parse(glue)
748
+ elif isinstance(glue, linearlogic.Expression):
749
+ self.glue = glue
750
+ else:
751
+ raise RuntimeError(
752
+ "Glue term neither string or expression: %s, %s"
753
+ % (glue, glue.__class__)
754
+ )
755
+
756
+ self.indices = indices
757
+
758
+ def make_VariableExpression(self, name):
759
+ return drt.DrtVariableExpression(name)
760
+
761
+ def make_LambdaExpression(self, variable, term):
762
+ return drt.DrtLambdaExpression(variable, term)
763
+
764
+
765
+ class DrtGlueDict(GlueDict):
766
+ def get_GlueFormula_factory(self):
767
+ return DrtGlueFormula
768
+
769
+
770
+ class DrtGlue(Glue):
771
+ def __init__(
772
+ self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
773
+ ):
774
+ if not semtype_file:
775
+ semtype_file = os.path.join(
776
+ "grammars", "sample_grammars", "drt_glue.semtype"
777
+ )
778
+ Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
779
+
780
+ def get_glue_dict(self):
781
+ return DrtGlueDict(self.semtype_file)
782
+
783
+
784
+ def demo(show_example=-1):
785
+ from nltk.parse import MaltParser
786
+
787
+ examples = [
788
+ "David sees Mary",
789
+ "David eats a sandwich",
790
+ "every man chases a dog",
791
+ "every man believes a dog sleeps",
792
+ "John gives David a sandwich",
793
+ "John chases himself",
794
+ ]
795
+ # 'John persuades David to order a pizza',
796
+ # 'John tries to go',
797
+ # 'John tries to find a unicorn',
798
+ # 'John seems to vanish',
799
+ # 'a unicorn seems to approach',
800
+ # 'every big cat leaves',
801
+ # 'every gray cat leaves',
802
+ # 'every big gray cat leaves',
803
+ # 'a former senator leaves',
804
+
805
+ print("============== DEMO ==============")
806
+
807
+ tagger = RegexpTagger(
808
+ [
809
+ ("^(David|Mary|John)$", "NNP"),
810
+ (
811
+ "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
812
+ "VB",
813
+ ),
814
+ ("^(go|order|vanish|find|approach)$", "VB"),
815
+ ("^(a)$", "ex_quant"),
816
+ ("^(every)$", "univ_quant"),
817
+ ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
818
+ ("^(big|gray|former)$", "JJ"),
819
+ ("^(him|himself)$", "PRP"),
820
+ ]
821
+ )
822
+
823
+ depparser = MaltParser(tagger=tagger)
824
+ glue = Glue(depparser=depparser, verbose=False)
825
+
826
+ for (i, sentence) in enumerate(examples):
827
+ if i == show_example or show_example == -1:
828
+ print(f"[[[Example {i}]]] {sentence}")
829
+ for reading in glue.parse_to_meaning(sentence.split()):
830
+ print(reading.simplify())
831
+ print("")
832
+
833
+
834
+ if __name__ == "__main__":
835
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/sem/hole.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Logic
2
+ #
3
+ # Author: Peter Wang
4
+ # Updated by: Dan Garrette <dhgarrette@gmail.com>
5
+ #
6
+ # Copyright (C) 2001-2022 NLTK Project
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ An implementation of the Hole Semantics model, following Blackburn and Bos,
12
+ Representation and Inference for Natural Language (CSLI, 2005).
13
+
14
+ The semantic representations are built by the grammar hole.fcfg.
15
+ This module contains driver code to read in sentences and parse them
16
+ according to a hole semantics grammar.
17
+
18
+ After parsing, the semantic representation is in the form of an underspecified
19
+ representation that is not easy to read. We use a "plugging" algorithm to
20
+ convert that representation into first-order logic formulas.
21
+ """
22
+
23
+ from functools import reduce
24
+
25
+ from nltk.parse import load_parser
26
+ from nltk.sem.logic import (
27
+ AllExpression,
28
+ AndExpression,
29
+ ApplicationExpression,
30
+ ExistsExpression,
31
+ IffExpression,
32
+ ImpExpression,
33
+ LambdaExpression,
34
+ NegatedExpression,
35
+ OrExpression,
36
+ )
37
+ from nltk.sem.skolemize import skolemize
38
+
39
+ # Note that in this code there may be multiple types of trees being referred to:
40
+ #
41
+ # 1. parse trees
42
+ # 2. the underspecified representation
43
+ # 3. first-order logic formula trees
44
+ # 4. the search space when plugging (search tree)
45
+ #
46
+
47
+
48
+ class Constants:
49
+ ALL = "ALL"
50
+ EXISTS = "EXISTS"
51
+ NOT = "NOT"
52
+ AND = "AND"
53
+ OR = "OR"
54
+ IMP = "IMP"
55
+ IFF = "IFF"
56
+ PRED = "PRED"
57
+ LEQ = "LEQ"
58
+ HOLE = "HOLE"
59
+ LABEL = "LABEL"
60
+
61
+ MAP = {
62
+ ALL: lambda v, e: AllExpression(v.variable, e),
63
+ EXISTS: lambda v, e: ExistsExpression(v.variable, e),
64
+ NOT: NegatedExpression,
65
+ AND: AndExpression,
66
+ OR: OrExpression,
67
+ IMP: ImpExpression,
68
+ IFF: IffExpression,
69
+ PRED: ApplicationExpression,
70
+ }
71
+
72
+
73
+ class HoleSemantics:
74
+ """
75
+ This class holds the broken-down components of a hole semantics, i.e. it
76
+ extracts the holes, labels, logic formula fragments and constraints out of
77
+ a big conjunction of such as produced by the hole semantics grammar. It
78
+ then provides some operations on the semantics dealing with holes, labels
79
+ and finding legal ways to plug holes with labels.
80
+ """
81
+
82
+ def __init__(self, usr):
83
+ """
84
+ Constructor. `usr' is a ``sem.Expression`` representing an
85
+ Underspecified Representation Structure (USR). A USR has the following
86
+ special predicates:
87
+ ALL(l,v,n),
88
+ EXISTS(l,v,n),
89
+ AND(l,n,n),
90
+ OR(l,n,n),
91
+ IMP(l,n,n),
92
+ IFF(l,n,n),
93
+ PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions,
94
+ LEQ(n,n),
95
+ HOLE(n),
96
+ LABEL(n)
97
+ where l is the label of the node described by the predicate, n is either
98
+ a label or a hole, and v is a variable.
99
+ """
100
+ self.holes = set()
101
+ self.labels = set()
102
+ self.fragments = {} # mapping of label -> formula fragment
103
+ self.constraints = set() # set of Constraints
104
+ self._break_down(usr)
105
+ self.top_most_labels = self._find_top_most_labels()
106
+ self.top_hole = self._find_top_hole()
107
+
108
+ def is_node(self, x):
109
+ """
110
+ Return true if x is a node (label or hole) in this semantic
111
+ representation.
112
+ """
113
+ return x in (self.labels | self.holes)
114
+
115
+ def _break_down(self, usr):
116
+ """
117
+ Extract holes, labels, formula fragments and constraints from the hole
118
+ semantics underspecified representation (USR).
119
+ """
120
+ if isinstance(usr, AndExpression):
121
+ self._break_down(usr.first)
122
+ self._break_down(usr.second)
123
+ elif isinstance(usr, ApplicationExpression):
124
+ func, args = usr.uncurry()
125
+ if func.variable.name == Constants.LEQ:
126
+ self.constraints.add(Constraint(args[0], args[1]))
127
+ elif func.variable.name == Constants.HOLE:
128
+ self.holes.add(args[0])
129
+ elif func.variable.name == Constants.LABEL:
130
+ self.labels.add(args[0])
131
+ else:
132
+ label = args[0]
133
+ assert label not in self.fragments
134
+ self.fragments[label] = (func, args[1:])
135
+ else:
136
+ raise ValueError(usr.label())
137
+
138
+ def _find_top_nodes(self, node_list):
139
+ top_nodes = node_list.copy()
140
+ for f in self.fragments.values():
141
+ # the label is the first argument of the predicate
142
+ args = f[1]
143
+ for arg in args:
144
+ if arg in node_list:
145
+ top_nodes.discard(arg)
146
+ return top_nodes
147
+
148
+ def _find_top_most_labels(self):
149
+ """
150
+ Return the set of labels which are not referenced directly as part of
151
+ another formula fragment. These will be the top-most labels for the
152
+ subtree that they are part of.
153
+ """
154
+ return self._find_top_nodes(self.labels)
155
+
156
+ def _find_top_hole(self):
157
+ """
158
+ Return the hole that will be the top of the formula tree.
159
+ """
160
+ top_holes = self._find_top_nodes(self.holes)
161
+ assert len(top_holes) == 1 # it must be unique
162
+ return top_holes.pop()
163
+
164
+ def pluggings(self):
165
+ """
166
+ Calculate and return all the legal pluggings (mappings of labels to
167
+ holes) of this semantics given the constraints.
168
+ """
169
+ record = []
170
+ self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record)
171
+ return record
172
+
173
+ def _plug_nodes(self, queue, potential_labels, plug_acc, record):
174
+ """
175
+ Plug the nodes in `queue' with the labels in `potential_labels'.
176
+
177
+ Each element of `queue' is a tuple of the node to plug and the list of
178
+ ancestor holes from the root of the graph to that node.
179
+
180
+ `potential_labels' is a set of the labels which are still available for
181
+ plugging.
182
+
183
+ `plug_acc' is the incomplete mapping of holes to labels made on the
184
+ current branch of the search tree so far.
185
+
186
+ `record' is a list of all the complete pluggings that we have found in
187
+ total so far. It is the only parameter that is destructively updated.
188
+ """
189
+ if queue != []:
190
+ (node, ancestors) = queue[0]
191
+ if node in self.holes:
192
+ # The node is a hole, try to plug it.
193
+ self._plug_hole(
194
+ node, ancestors, queue[1:], potential_labels, plug_acc, record
195
+ )
196
+ else:
197
+ assert node in self.labels
198
+ # The node is a label. Replace it in the queue by the holes and
199
+ # labels in the formula fragment named by that label.
200
+ args = self.fragments[node][1]
201
+ head = [(a, ancestors) for a in args if self.is_node(a)]
202
+ self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
203
+ else:
204
+ raise Exception("queue empty")
205
+
206
+ def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
207
+ """
208
+ Try all possible ways of plugging a single hole.
209
+ See _plug_nodes for the meanings of the parameters.
210
+ """
211
+ # Add the current hole we're trying to plug into the list of ancestors.
212
+ assert hole not in ancestors0
213
+ ancestors = [hole] + ancestors0
214
+
215
+ # Try each potential label in this hole in turn.
216
+ for l in potential_labels0:
217
+ # Is the label valid in this hole?
218
+ if self._violates_constraints(l, ancestors):
219
+ continue
220
+
221
+ plug_acc = plug_acc0.copy()
222
+ plug_acc[hole] = l
223
+ potential_labels = potential_labels0.copy()
224
+ potential_labels.remove(l)
225
+
226
+ if len(potential_labels) == 0:
227
+ # No more potential labels. That must mean all the holes have
228
+ # been filled so we have found a legal plugging so remember it.
229
+ #
230
+ # Note that the queue might not be empty because there might
231
+ # be labels on there that point to formula fragments with
232
+ # no holes in them. _sanity_check_plugging will make sure
233
+ # all holes are filled.
234
+ self._sanity_check_plugging(plug_acc, self.top_hole, [])
235
+ record.append(plug_acc)
236
+ else:
237
+ # Recursively try to fill in the rest of the holes in the
238
+ # queue. The label we just plugged into the hole could have
239
+ # holes of its own so at the end of the queue. Putting it on
240
+ # the end of the queue gives us a breadth-first search, so that
241
+ # all the holes at level i of the formula tree are filled
242
+ # before filling level i+1.
243
+ # A depth-first search would work as well since the trees must
244
+ # be finite but the bookkeeping would be harder.
245
+ self._plug_nodes(
246
+ queue + [(l, ancestors)], potential_labels, plug_acc, record
247
+ )
248
+
249
+ def _violates_constraints(self, label, ancestors):
250
+ """
251
+ Return True if the `label' cannot be placed underneath the holes given
252
+ by the set `ancestors' because it would violate the constraints imposed
253
+ on it.
254
+ """
255
+ for c in self.constraints:
256
+ if c.lhs == label:
257
+ if c.rhs not in ancestors:
258
+ return True
259
+ return False
260
+
261
+ def _sanity_check_plugging(self, plugging, node, ancestors):
262
+ """
263
+ Make sure that a given plugging is legal. We recursively go through
264
+ each node and make sure that no constraints are violated.
265
+ We also check that all holes have been filled.
266
+ """
267
+ if node in self.holes:
268
+ ancestors = [node] + ancestors
269
+ label = plugging[node]
270
+ else:
271
+ label = node
272
+ assert label in self.labels
273
+ for c in self.constraints:
274
+ if c.lhs == label:
275
+ assert c.rhs in ancestors
276
+ args = self.fragments[label][1]
277
+ for arg in args:
278
+ if self.is_node(arg):
279
+ self._sanity_check_plugging(plugging, arg, [label] + ancestors)
280
+
281
+ def formula_tree(self, plugging):
282
+ """
283
+ Return the first-order logic formula tree for this underspecified
284
+ representation using the plugging given.
285
+ """
286
+ return self._formula_tree(plugging, self.top_hole)
287
+
288
+ def _formula_tree(self, plugging, node):
289
+ if node in plugging:
290
+ return self._formula_tree(plugging, plugging[node])
291
+ elif node in self.fragments:
292
+ pred, args = self.fragments[node]
293
+ children = [self._formula_tree(plugging, arg) for arg in args]
294
+ return reduce(Constants.MAP[pred.variable.name], children)
295
+ else:
296
+ return node
297
+
298
+
299
+ class Constraint:
300
+ """
301
+ This class represents a constraint of the form (L =< N),
302
+ where L is a label and N is a node (a label or a hole).
303
+ """
304
+
305
+ def __init__(self, lhs, rhs):
306
+ self.lhs = lhs
307
+ self.rhs = rhs
308
+
309
+ def __eq__(self, other):
310
+ if self.__class__ == other.__class__:
311
+ return self.lhs == other.lhs and self.rhs == other.rhs
312
+ else:
313
+ return False
314
+
315
+ def __ne__(self, other):
316
+ return not (self == other)
317
+
318
+ def __hash__(self):
319
+ return hash(repr(self))
320
+
321
+ def __repr__(self):
322
+ return f"({self.lhs} < {self.rhs})"
323
+
324
+
325
+ def hole_readings(sentence, grammar_filename=None, verbose=False):
326
+ if not grammar_filename:
327
+ grammar_filename = "grammars/sample_grammars/hole.fcfg"
328
+
329
+ if verbose:
330
+ print("Reading grammar file", grammar_filename)
331
+
332
+ parser = load_parser(grammar_filename)
333
+
334
+ # Parse the sentence.
335
+ tokens = sentence.split()
336
+ trees = list(parser.parse(tokens))
337
+ if verbose:
338
+ print("Got %d different parses" % len(trees))
339
+
340
+ all_readings = []
341
+ for tree in trees:
342
+ # Get the semantic feature from the top of the parse tree.
343
+ sem = tree.label()["SEM"].simplify()
344
+
345
+ # Print the raw semantic representation.
346
+ if verbose:
347
+ print("Raw: ", sem)
348
+
349
+ # Skolemize away all quantifiers. All variables become unique.
350
+ while isinstance(sem, LambdaExpression):
351
+ sem = sem.term
352
+ skolemized = skolemize(sem)
353
+
354
+ if verbose:
355
+ print("Skolemized:", skolemized)
356
+
357
+ # Break the hole semantics representation down into its components
358
+ # i.e. holes, labels, formula fragments and constraints.
359
+ hole_sem = HoleSemantics(skolemized)
360
+
361
+ # Maybe show the details of the semantic representation.
362
+ if verbose:
363
+ print("Holes: ", hole_sem.holes)
364
+ print("Labels: ", hole_sem.labels)
365
+ print("Constraints: ", hole_sem.constraints)
366
+ print("Top hole: ", hole_sem.top_hole)
367
+ print("Top labels: ", hole_sem.top_most_labels)
368
+ print("Fragments:")
369
+ for l, f in hole_sem.fragments.items():
370
+ print(f"\t{l}: {f}")
371
+
372
+ # Find all the possible ways to plug the formulas together.
373
+ pluggings = hole_sem.pluggings()
374
+
375
+ # Build FOL formula trees using the pluggings.
376
+ readings = list(map(hole_sem.formula_tree, pluggings))
377
+
378
+ # Print out the formulas in a textual format.
379
+ if verbose:
380
+ for i, r in enumerate(readings):
381
+ print()
382
+ print("%d. %s" % (i, r))
383
+ print()
384
+
385
+ all_readings.extend(readings)
386
+
387
+ return all_readings
388
+
389
+
390
+ if __name__ == "__main__":
391
+ for r in hole_readings("a dog barks"):
392
+ print(r)
393
+ print()
394
+ for r in hole_readings("every girl chases a dog"):
395
+ print(r)
.eggs/nltk-3.8-py3.10.egg/nltk/stem/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Stemmers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # Steven Bird <stevenbird1@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ NLTK Stemmers
12
+
13
+ Interfaces used to remove morphological affixes from words, leaving
14
+ only the word stem. Stemming algorithms aim to remove those affixes
15
+ required for eg. grammatical role, tense, derivational morphology
16
+ leaving only the stem of the word. This is a difficult problem due to
17
+ irregular words (eg. common verbs in English), complicated
18
+ morphological rules, and part-of-speech and sense ambiguities
19
+ (eg. ``ceil-`` is not the stem of ``ceiling``).
20
+
21
+ StemmerI defines a standard interface for stemmers.
22
+ """
23
+
24
+ from nltk.stem.api import StemmerI
25
+ from nltk.stem.arlstem import ARLSTem
26
+ from nltk.stem.arlstem2 import ARLSTem2
27
+ from nltk.stem.cistem import Cistem
28
+ from nltk.stem.isri import ISRIStemmer
29
+ from nltk.stem.lancaster import LancasterStemmer
30
+ from nltk.stem.porter import PorterStemmer
31
+ from nltk.stem.regexp import RegexpStemmer
32
+ from nltk.stem.rslp import RSLPStemmer
33
+ from nltk.stem.snowball import SnowballStemmer
34
+ from nltk.stem.wordnet import WordNetLemmatizer
.eggs/nltk-3.8-py3.10.egg/nltk/stem/api.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Stemmer Interface
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # Steven Bird <stevenbird1@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ from abc import ABCMeta, abstractmethod
11
+
12
+
13
+ class StemmerI(metaclass=ABCMeta):
14
+ """
15
+ A processing interface for removing morphological affixes from
16
+ words. This process is known as stemming.
17
+
18
+ """
19
+
20
+ @abstractmethod
21
+ def stem(self, token):
22
+ """
23
+ Strip affixes from the token and return the stem.
24
+
25
+ :param token: The token that should be stemmed.
26
+ :type token: str
27
+ """
.eggs/nltk-3.8-py3.10.egg/nltk/stem/lancaster.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Stemmers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Tomcavage <stomcava@law.upenn.edu>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
10
+ Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
11
+ """
12
+ import re
13
+
14
+ from nltk.stem.api import StemmerI
15
+
16
+
17
+ class LancasterStemmer(StemmerI):
18
+ """
19
+ Lancaster Stemmer
20
+
21
+ >>> from nltk.stem.lancaster import LancasterStemmer
22
+ >>> st = LancasterStemmer()
23
+ >>> st.stem('maximum') # Remove "-um" when word is intact
24
+ 'maxim'
25
+ >>> st.stem('presumably') # Don't remove "-um" when word is not intact
26
+ 'presum'
27
+ >>> st.stem('multiply') # No action taken if word ends with "-ply"
28
+ 'multiply'
29
+ >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
30
+ 'provid'
31
+ >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
32
+ 'ow'
33
+ >>> st.stem('ear') # ditto
34
+ 'ear'
35
+ >>> st.stem('saying') # Words starting with consonant must contain at least 3
36
+ 'say'
37
+ >>> st.stem('crying') # letters and one of those letters must be a vowel
38
+ 'cry'
39
+ >>> st.stem('string') # ditto
40
+ 'string'
41
+ >>> st.stem('meant') # ditto
42
+ 'meant'
43
+ >>> st.stem('cement') # ditto
44
+ 'cem'
45
+ >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
46
+ >>> st_pre.stem('kilometer') # Test Prefix
47
+ 'met'
48
+ >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
49
+ >>> st_custom.stem("ness") # Change s to t
50
+ 'nest'
51
+ """
52
+
53
+ # The rule list is static since it doesn't change between instances
54
+ default_rule_tuple = (
55
+ "ai*2.", # -ia > - if intact
56
+ "a*1.", # -a > - if intact
57
+ "bb1.", # -bb > -b
58
+ "city3s.", # -ytic > -ys
59
+ "ci2>", # -ic > -
60
+ "cn1t>", # -nc > -nt
61
+ "dd1.", # -dd > -d
62
+ "dei3y>", # -ied > -y
63
+ "deec2ss.", # -ceed >", -cess
64
+ "dee1.", # -eed > -ee
65
+ "de2>", # -ed > -
66
+ "dooh4>", # -hood > -
67
+ "e1>", # -e > -
68
+ "feil1v.", # -lief > -liev
69
+ "fi2>", # -if > -
70
+ "gni3>", # -ing > -
71
+ "gai3y.", # -iag > -y
72
+ "ga2>", # -ag > -
73
+ "gg1.", # -gg > -g
74
+ "ht*2.", # -th > - if intact
75
+ "hsiug5ct.", # -guish > -ct
76
+ "hsi3>", # -ish > -
77
+ "i*1.", # -i > - if intact
78
+ "i1y>", # -i > -y
79
+ "ji1d.", # -ij > -id -- see nois4j> & vis3j>
80
+ "juf1s.", # -fuj > -fus
81
+ "ju1d.", # -uj > -ud
82
+ "jo1d.", # -oj > -od
83
+ "jeh1r.", # -hej > -her
84
+ "jrev1t.", # -verj > -vert
85
+ "jsim2t.", # -misj > -mit
86
+ "jn1d.", # -nj > -nd
87
+ "j1s.", # -j > -s
88
+ "lbaifi6.", # -ifiabl > -
89
+ "lbai4y.", # -iabl > -y
90
+ "lba3>", # -abl > -
91
+ "lbi3.", # -ibl > -
92
+ "lib2l>", # -bil > -bl
93
+ "lc1.", # -cl > c
94
+ "lufi4y.", # -iful > -y
95
+ "luf3>", # -ful > -
96
+ "lu2.", # -ul > -
97
+ "lai3>", # -ial > -
98
+ "lau3>", # -ual > -
99
+ "la2>", # -al > -
100
+ "ll1.", # -ll > -l
101
+ "mui3.", # -ium > -
102
+ "mu*2.", # -um > - if intact
103
+ "msi3>", # -ism > -
104
+ "mm1.", # -mm > -m
105
+ "nois4j>", # -sion > -j
106
+ "noix4ct.", # -xion > -ct
107
+ "noi3>", # -ion > -
108
+ "nai3>", # -ian > -
109
+ "na2>", # -an > -
110
+ "nee0.", # protect -een
111
+ "ne2>", # -en > -
112
+ "nn1.", # -nn > -n
113
+ "pihs4>", # -ship > -
114
+ "pp1.", # -pp > -p
115
+ "re2>", # -er > -
116
+ "rae0.", # protect -ear
117
+ "ra2.", # -ar > -
118
+ "ro2>", # -or > -
119
+ "ru2>", # -ur > -
120
+ "rr1.", # -rr > -r
121
+ "rt1>", # -tr > -t
122
+ "rei3y>", # -ier > -y
123
+ "sei3y>", # -ies > -y
124
+ "sis2.", # -sis > -s
125
+ "si2>", # -is > -
126
+ "ssen4>", # -ness > -
127
+ "ss0.", # protect -ss
128
+ "suo3>", # -ous > -
129
+ "su*2.", # -us > - if intact
130
+ "s*1>", # -s > - if intact
131
+ "s0.", # -s > -s
132
+ "tacilp4y.", # -plicat > -ply
133
+ "ta2>", # -at > -
134
+ "tnem4>", # -ment > -
135
+ "tne3>", # -ent > -
136
+ "tna3>", # -ant > -
137
+ "tpir2b.", # -ript > -rib
138
+ "tpro2b.", # -orpt > -orb
139
+ "tcud1.", # -duct > -duc
140
+ "tpmus2.", # -sumpt > -sum
141
+ "tpec2iv.", # -cept > -ceiv
142
+ "tulo2v.", # -olut > -olv
143
+ "tsis0.", # protect -sist
144
+ "tsi3>", # -ist > -
145
+ "tt1.", # -tt > -t
146
+ "uqi3.", # -iqu > -
147
+ "ugo1.", # -ogu > -og
148
+ "vis3j>", # -siv > -j
149
+ "vie0.", # protect -eiv
150
+ "vi2>", # -iv > -
151
+ "ylb1>", # -bly > -bl
152
+ "yli3y>", # -ily > -y
153
+ "ylp0.", # protect -ply
154
+ "yl2>", # -ly > -
155
+ "ygo1.", # -ogy > -og
156
+ "yhp1.", # -phy > -ph
157
+ "ymo1.", # -omy > -om
158
+ "ypo1.", # -opy > -op
159
+ "yti3>", # -ity > -
160
+ "yte3>", # -ety > -
161
+ "ytl2.", # -lty > -l
162
+ "yrtsi5.", # -istry > -
163
+ "yra3>", # -ary > -
164
+ "yro3>", # -ory > -
165
+ "yfi3.", # -ify > -
166
+ "ycn2t>", # -ncy > -nt
167
+ "yca3>", # -acy > -
168
+ "zi2>", # -iz > -
169
+ "zy1s.", # -yz > -ys
170
+ )
171
+
172
+ def __init__(self, rule_tuple=None, strip_prefix_flag=False):
173
+ """Create an instance of the Lancaster stemmer."""
174
+ # Setup an empty rule dictionary - this will be filled in later
175
+ self.rule_dictionary = {}
176
+ # Check if a user wants to strip prefix
177
+ self._strip_prefix = strip_prefix_flag
178
+ # Check if a user wants to use his/her own rule tuples.
179
+ self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
180
+
181
+ def parseRules(self, rule_tuple=None):
182
+ """Validate the set of rules used in this stemmer.
183
+
184
+ If this function is called as an individual method, without using stem
185
+ method, rule_tuple argument will be compiled into self.rule_dictionary.
186
+ If this function is called within stem, self._rule_tuple will be used.
187
+
188
+ """
189
+ # If there is no argument for the function, use class' own rule tuple.
190
+ rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
191
+ valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$")
192
+ # Empty any old rules from the rule set before adding new ones
193
+ self.rule_dictionary = {}
194
+
195
+ for rule in rule_tuple:
196
+ if not valid_rule.match(rule):
197
+ raise ValueError(f"The rule {rule} is invalid")
198
+ first_letter = rule[0:1]
199
+ if first_letter in self.rule_dictionary:
200
+ self.rule_dictionary[first_letter].append(rule)
201
+ else:
202
+ self.rule_dictionary[first_letter] = [rule]
203
+
204
+ def stem(self, word):
205
+ """Stem a word using the Lancaster stemmer."""
206
+ # Lower-case the word, since all the rules are lower-cased
207
+ word = word.lower()
208
+ word = self.__stripPrefix(word) if self._strip_prefix else word
209
+
210
+ # Save a copy of the original word
211
+ intact_word = word
212
+
213
+ # If rule dictionary is empty, parse rule tuple.
214
+ if not self.rule_dictionary:
215
+ self.parseRules()
216
+
217
+ return self.__doStemming(word, intact_word)
218
+
219
+ def __doStemming(self, word, intact_word):
220
+ """Perform the actual word stemming"""
221
+
222
+ valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
223
+
224
+ proceed = True
225
+
226
+ while proceed:
227
+
228
+ # Find the position of the last letter of the word to be stemmed
229
+ last_letter_position = self.__getLastLetter(word)
230
+
231
+ # Only stem the word if it has a last letter and a rule matching that last letter
232
+ if (
233
+ last_letter_position < 0
234
+ or word[last_letter_position] not in self.rule_dictionary
235
+ ):
236
+ proceed = False
237
+
238
+ else:
239
+ rule_was_applied = False
240
+
241
+ # Go through each rule that matches the word's final letter
242
+ for rule in self.rule_dictionary[word[last_letter_position]]:
243
+ rule_match = valid_rule.match(rule)
244
+ if rule_match:
245
+ (
246
+ ending_string,
247
+ intact_flag,
248
+ remove_total,
249
+ append_string,
250
+ cont_flag,
251
+ ) = rule_match.groups()
252
+
253
+ # Convert the number of chars to remove when stemming
254
+ # from a string to an integer
255
+ remove_total = int(remove_total)
256
+
257
+ # Proceed if word's ending matches rule's word ending
258
+ if word.endswith(ending_string[::-1]):
259
+ if intact_flag:
260
+ if word == intact_word and self.__isAcceptable(
261
+ word, remove_total
262
+ ):
263
+ word = self.__applyRule(
264
+ word, remove_total, append_string
265
+ )
266
+ rule_was_applied = True
267
+ if cont_flag == ".":
268
+ proceed = False
269
+ break
270
+ elif self.__isAcceptable(word, remove_total):
271
+ word = self.__applyRule(
272
+ word, remove_total, append_string
273
+ )
274
+ rule_was_applied = True
275
+ if cont_flag == ".":
276
+ proceed = False
277
+ break
278
+ # If no rules apply, the word doesn't need any more stemming
279
+ if rule_was_applied == False:
280
+ proceed = False
281
+ return word
282
+
283
+ def __getLastLetter(self, word):
284
+ """Get the zero-based index of the last alphabetic character in this string"""
285
+ last_letter = -1
286
+ for position in range(len(word)):
287
+ if word[position].isalpha():
288
+ last_letter = position
289
+ else:
290
+ break
291
+ return last_letter
292
+
293
+ def __isAcceptable(self, word, remove_total):
294
+ """Determine if the word is acceptable for stemming."""
295
+ word_is_acceptable = False
296
+ # If the word starts with a vowel, it must be at least 2
297
+ # characters long to be stemmed
298
+ if word[0] in "aeiouy":
299
+ if len(word) - remove_total >= 2:
300
+ word_is_acceptable = True
301
+ # If the word starts with a consonant, it must be at least 3
302
+ # characters long (including one vowel) to be stemmed
303
+ elif len(word) - remove_total >= 3:
304
+ if word[1] in "aeiouy":
305
+ word_is_acceptable = True
306
+ elif word[2] in "aeiouy":
307
+ word_is_acceptable = True
308
+ return word_is_acceptable
309
+
310
+ def __applyRule(self, word, remove_total, append_string):
311
+ """Apply the stemming rule to the word"""
312
+ # Remove letters from the end of the word
313
+ new_word_length = len(word) - remove_total
314
+ word = word[0:new_word_length]
315
+
316
+ # And add new letters to the end of the truncated word
317
+ if append_string:
318
+ word += append_string
319
+ return word
320
+
321
+ def __stripPrefix(self, word):
322
+ """Remove prefix from a word.
323
+
324
+ This function originally taken from Whoosh.
325
+
326
+ """
327
+ for prefix in (
328
+ "kilo",
329
+ "micro",
330
+ "milli",
331
+ "intra",
332
+ "ultra",
333
+ "mega",
334
+ "nano",
335
+ "pico",
336
+ "pseudo",
337
+ ):
338
+ if word.startswith(prefix):
339
+ return word[len(prefix) :]
340
+ return word
341
+
342
+ def __repr__(self):
343
+ return "<LancasterStemmer>"
.eggs/nltk-3.8-py3.10.egg/nltk/stem/rslp.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: RSLP Stemmer
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Tiago Tresoldi <tresoldi@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ # This code is based on the algorithm presented in the paper "A Stemming
9
+ # Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
10
+ # Christian Huyck, which unfortunately I had no access to. The code is a
11
+ # Python version, with some minor modifications of mine, to the description
12
+ # presented at https://www.webcitation.org/5NnvdIzOb and to the C source code
13
+ # available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
14
+ # Please note that this stemmer is intended for demonstration and educational
15
+ # purposes only. Feel free to write me for any comments, including the
16
+ # development of a different and/or better stemmer for Portuguese. I also
17
+ # suggest using NLTK's mailing list for Portuguese for any discussion.
18
+
19
+ # Este código é baseado no algoritmo apresentado no artigo "A Stemming
20
+ # Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
21
+ # Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
22
+ # código é uma conversão para Python, com algumas pequenas modificações
23
+ # minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do
24
+ # código para linguagem C disponível em
25
+ # http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
26
+ # lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
27
+ # de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
28
+ # comentário, inclusive sobre o desenvolvimento de um stemmer diferente
29
+ # e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
30
+ # do NLTK para o português para qualquer debate.
31
+
32
+ from nltk.data import load
33
+ from nltk.stem.api import StemmerI
34
+
35
+
36
class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    >>> from nltk.stem import RSLPStemmer
    >>> st = RSLPStemmer()
    >>> # opening lines of Erico Verissimo's "Música ao Longe"
    >>> text = '''
    ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
    ... devem copiar . Uma casinha de porta e janela , em cima duma
    ... coxilha .'''
    >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE
    ...     print(st.stem(token))
    clariss risc com giz no quadro-negr a pais que os alun dev copi .
    uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        # One rule table per algorithm step.  Indices are referenced by
        # stem()/apply_rule(): 0 = plural, 1 = feminine, 2 = adverb,
        # 3 = augmentative, 4 = noun, 5 = verb, 6 = vowel removal.
        self._model = []
        for step in range(7):
            self._model.append(self.read_rule("step{}.pt".format(step)))

    def read_rule(self, filename):
        """Load and parse one rule file from the NLTK data package.

        Each non-blank, non-comment line holds four tab-separated fields:
        quoted suffix, minimum stem size, quoted replacement, and a
        comma-separated list of quoted exception words.

        :return: a list of ``[suffix, min_stem_size, replacement,
            exceptions]`` rules.
        """
        rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
        lines = rules.split("\n")

        lines = [line for line in lines if line != ""]  # remove blank lines
        lines = [line for line in lines if line[0] != "#"]  # remove comments

        # NOTE: a simple but ugly hack to make this parser happy with double '\t's
        lines = [line.replace("\t\t", "\t") for line in lines]

        # parse rules
        rules = []
        for line in lines:
            rule = []
            tokens = line.split("\t")

            # text to be searched for at the end of the string
            rule.append(tokens[0][1:-1])  # remove quotes

            # minimum stem size to perform the replacement
            rule.append(int(tokens[1]))

            # text to be replaced into
            rule.append(tokens[2][1:-1])  # remove quotes

            # exceptions to this rule
            rule.append([token[1:-1] for token in tokens[3].split(",")])

            # append to the results
            rules.append(rule)

        return rules

    def stem(self, word):
        """Return the RSLP stem of ``word``.

        The word is lower-cased first; an empty string is returned
        unchanged.
        """
        word = word.lower()
        if not word:
            # Guard: the suffix tests below index word[-1], which would
            # raise IndexError for an empty string.
            return word

        # the word ends in 's'? apply rule for plural reduction
        if word[-1] == "s":
            word = self.apply_rule(word, 0)

        # the word ends in 'a'? apply rule for feminine reduction
        if word[-1] == "a":
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        prev_word = word
        word = self.apply_rule(word, 4)
        if word == prev_word:
            # verb reduction
            prev_word = word
            word = self.apply_rule(word, 5)
            if word == prev_word:
                # vowel removal
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        """Apply the first matching rule of step ``rule_index`` to ``word``.

        A rule fires when its suffix matches the end of the word, the
        remaining stem meets the rule's minimum size, and the word is not
        listed as an exception.
        """
        rules = self._model[rule_index]
        for rule in rules:
            suffix_length = len(rule[0])
            if word[-suffix_length:] == rule[0]:  # if suffix matches
                if len(word) >= suffix_length + rule[1]:  # if we have minimum size
                    if word not in rule[3]:  # if not an exception
                        word = word[:-suffix_length] + rule[2]
                        break

        return word