msj19 commited on
Commit
234704f
·
verified ·
1 Parent(s): da806fb

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff.
Files changed (50) hide show
  1. .eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py +1500 -0
  2. .eggs/nltk-3.8-py3.10.egg/nltk/app/collocations_app.py +438 -0
  3. .eggs/nltk-3.8-py3.10.egg/nltk/app/concordance_app.py +709 -0
  4. .eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py +163 -0
  5. .eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py +1052 -0
  6. .eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py +937 -0
  7. .eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py +36 -0
  8. .eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py +997 -0
  9. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py +34 -0
  10. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py +358 -0
  11. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py +480 -0
  12. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/combinator.py +339 -0
  13. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/lexicon.py +338 -0
  14. .eggs/nltk-3.8-py3.10.egg/nltk/ccg/logic.py +60 -0
  15. .eggs/nltk-3.8-py3.10.egg/nltk/chat/__init__.py +48 -0
  16. .eggs/nltk-3.8-py3.10.egg/nltk/chat/eliza.py +337 -0
  17. .eggs/nltk-3.8-py3.10.egg/nltk/chat/iesha.py +160 -0
  18. .eggs/nltk-3.8-py3.10.egg/nltk/chat/rude.py +125 -0
  19. .eggs/nltk-3.8-py3.10.egg/nltk/chat/suntsu.py +140 -0
  20. .eggs/nltk-3.8-py3.10.egg/nltk/chat/util.py +124 -0
  21. .eggs/nltk-3.8-py3.10.egg/nltk/chat/zen.py +329 -0
  22. .eggs/nltk-3.8-py3.10.egg/nltk/chunk/__init__.py +197 -0
  23. .eggs/nltk-3.8-py3.10.egg/nltk/chunk/api.py +56 -0
  24. .eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py +352 -0
  25. .eggs/nltk-3.8-py3.10.egg/nltk/chunk/regexp.py +1475 -0
  26. .eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py +643 -0
  27. .eggs/nltk-3.8-py3.10.egg/nltk/classify/__init__.py +101 -0
  28. .eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py +195 -0
  29. .eggs/nltk-3.8-py3.10.egg/nltk/classify/decisiontree.py +349 -0
  30. .eggs/nltk-3.8-py3.10.egg/nltk/classify/maxent.py +1569 -0
  31. .eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py +184 -0
  32. .eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py +260 -0
  33. .eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py +180 -0
  34. .eggs/nltk-3.8-py3.10.egg/nltk/classify/util.py +346 -0
  35. .eggs/nltk-3.8-py3.10.egg/nltk/classify/weka.py +377 -0
  36. .eggs/nltk-3.8-py3.10.egg/nltk/cluster/__init__.py +92 -0
  37. .eggs/nltk-3.8-py3.10.egg/nltk/cluster/api.py +74 -0
  38. .eggs/nltk-3.8-py3.10.egg/nltk/cluster/em.py +219 -0
  39. .eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py +170 -0
  40. .eggs/nltk-3.8-py3.10.egg/nltk/cluster/kmeans.py +231 -0
  41. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py +529 -0
  42. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/europarl_raw.py +56 -0
  43. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/childes.py +630 -0
  44. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chunked.py +273 -0
  45. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/cmudict.py +88 -0
  46. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py +309 -0
  47. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py +579 -0
  48. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py +106 -0
  49. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py +115 -0
  50. .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/framenet.py +0 -0
.eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py ADDED
@@ -0,0 +1,1500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Regexp Chunk Parser Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A graphical tool for exploring the regular expression based chunk
10
+ parser ``nltk.chunk.RegexpChunkParser``.
11
+ """
12
+
13
+ # Todo: Add a way to select the development set from the menubar. This
14
+ # might just need to be a selection box (conll vs treebank etc) plus
15
+ # configuration parameters to select what's being chunked (eg VP vs NP)
16
+ # and what part of the data is being used as the development set.
17
+
18
+ import random
19
+ import re
20
+ import textwrap
21
+ import time
22
+ from tkinter import (
23
+ Button,
24
+ Canvas,
25
+ Checkbutton,
26
+ Frame,
27
+ IntVar,
28
+ Label,
29
+ Menu,
30
+ Scrollbar,
31
+ Text,
32
+ Tk,
33
+ )
34
+ from tkinter.filedialog import askopenfilename, asksaveasfilename
35
+ from tkinter.font import Font
36
+
37
+ from nltk.chunk import ChunkScore, RegexpChunkParser
38
+ from nltk.chunk.regexp import RegexpChunkRule
39
+ from nltk.corpus import conll2000, treebank_chunk
40
+ from nltk.draw.util import ShowText
41
+ from nltk.tree import Tree
42
+ from nltk.util import in_idle
43
+
44
+
45
+ class RegexpChunkApp:
46
+ """
47
+ A graphical tool for exploring the regular expression based chunk
48
+ parser ``nltk.chunk.RegexpChunkParser``.
49
+
50
+ See ``HELP`` for instructional text.
51
+ """
52
+
53
+ ##/////////////////////////////////////////////////////////////////
54
+ ## Help Text
55
+ ##/////////////////////////////////////////////////////////////////
56
+
57
#: A dictionary mapping from part of speech tags to descriptions,
#: which is used in the help text.  (This should probably live with
#: the conll and/or treebank corpus instead.)
TAGSET = {
    "CC": "Coordinating conjunction",
    "PRP$": "Possessive pronoun",
    "CD": "Cardinal number",
    "RB": "Adverb",
    "DT": "Determiner",
    "RBR": "Adverb, comparative",
    "EX": "Existential there",
    "RBS": "Adverb, superlative",
    "FW": "Foreign word",
    "RP": "Particle",
    "JJ": "Adjective",
    "TO": "to",
    "JJR": "Adjective, comparative",
    "UH": "Interjection",
    "JJS": "Adjective, superlative",
    "VB": "Verb, base form",
    "LS": "List item marker",
    "VBD": "Verb, past tense",
    "MD": "Modal",
    "NNS": "Noun, plural",
    # BUGFIX: was the garbled string "Noun, singular or masps".
    "NN": "Noun, singular or mass",
    "VBN": "Verb, past participle",
    "VBZ": "Verb,3rd ps. sing. present",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun plural",
    "WDT": "wh-determiner",
    "PDT": "Predeterminer",
    "WP": "wh-pronoun",
    "POS": "Possessive ending",
    "WP$": "Possessive wh-pronoun",
    "PRP": "Personal pronoun",
    "WRB": "wh-adverb",
    "(": "open parenthesis",
    ")": "close parenthesis",
    "``": "open quote",
    ",": "comma",
    "''": "close quote",
    ".": "period",
    "#": "pound sign (currency marker)",
    "$": "dollar sign (currency marker)",
    "IN": "Preposition/subord. conjunction",
    "SYM": "Symbol (mathematical or scientific)",
    "VBG": "Verb, gerund/present participle",
    "VBP": "Verb, non-3rd ps. sing. present",
    ":": "colon",
}
107
+
108
+ #: Contents for the help box. This is a list of tuples, one for
109
+ #: each help page, where each tuple has four elements:
110
+ #: - A title (displayed as a tab)
111
+ #: - A string description of tabstops (see Tkinter.Text for details)
112
+ #: - The text contents for the help page. You can use expressions
113
+ #: like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
114
+ #: for a list of tags you can use for colorizing.
115
+ HELP = [
116
+ (
117
+ "Help",
118
+ "20",
119
+ "Welcome to the regular expression chunk-parser grammar editor. "
120
+ "You can use this editor to develop and test chunk parser grammars "
121
+ "based on NLTK's RegexpChunkParser class.\n\n"
122
+ # Help box.
123
+ "Use this box ('Help') to learn more about the editor; click on the "
124
+ "tabs for help on specific topics:"
125
+ "<indent>\n"
126
+ "Rules: grammar rule types\n"
127
+ "Regexps: regular expression syntax\n"
128
+ "Tags: part of speech tags\n</indent>\n"
129
+ # Grammar.
130
+ "Use the upper-left box ('Grammar') to edit your grammar. "
131
+ "Each line of your grammar specifies a single 'rule', "
132
+ "which performs an action such as creating a chunk or merging "
133
+ "two chunks.\n\n"
134
+ # Dev set.
135
+ "The lower-left box ('Development Set') runs your grammar on the "
136
+ "development set, and displays the results. "
137
+ "Your grammar's chunks are <highlight>highlighted</highlight>, and "
138
+ "the correct (gold standard) chunks are "
139
+ "<underline>underlined</underline>. If they "
140
+ "match, they are displayed in <green>green</green>; otherwise, "
141
+ "they are displayed in <red>red</red>. The box displays a single "
142
+ "sentence from the development set at a time; use the scrollbar or "
143
+ "the next/previous buttons view additional sentences.\n\n"
144
+ # Performance
145
+ "The lower-right box ('Evaluation') tracks the performance of "
146
+ "your grammar on the development set. The 'precision' axis "
147
+ "indicates how many of your grammar's chunks are correct; and "
148
+ "the 'recall' axis indicates how many of the gold standard "
149
+ "chunks your system generated. Typically, you should try to "
150
+ "design a grammar that scores high on both metrics. The "
151
+ "exact precision and recall of the current grammar, as well "
152
+ "as their harmonic mean (the 'f-score'), are displayed in "
153
+ "the status bar at the bottom of the window.",
154
+ ),
155
+ (
156
+ "Rules",
157
+ "10",
158
+ "<h1>{...regexp...}</h1>"
159
+ "<indent>\nChunk rule: creates new chunks from words matching "
160
+ "regexp.</indent>\n\n"
161
+ "<h1>}...regexp...{</h1>"
162
+ "<indent>\nStrip rule: removes words matching regexp from existing "
163
+ "chunks.</indent>\n\n"
164
+ "<h1>...regexp1...}{...regexp2...</h1>"
165
+ "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
166
+ "regexp2 in two.</indent>\n\n"
167
+ "<h1>...regexp...{}...regexp...</h1>"
168
+ "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
169
+ "and regexp2</indent>\n",
170
+ ),
171
+ (
172
+ "Regexps",
173
+ "10 60",
174
+ # "Regular Expression Syntax Summary:\n\n"
175
+ "<h1>Pattern\t\tMatches...</h1>\n"
176
+ "<hangindent>"
177
+ "\t<<var>T</var>>\ta word with tag <var>T</var> "
178
+ "(where <var>T</var> may be a regexp).\n"
179
+ "\t<var>x</var>?\tan optional <var>x</var>\n"
180
+ "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
181
+ "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
182
+ "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
183
+ "\t.\tmatches any character\n"
184
+ "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
185
+ "\t# <var>x...</var>\tTreats <var>x...</var> "
186
+ "(to the end of the line) as a comment\n"
187
+ "\t\\<var>C</var>\tmatches character <var>C</var> "
188
+ "(useful when <var>C</var> is a special character "
189
+ "like + or #)\n"
190
+ "</hangindent>"
191
+ "\n<h1>Examples:</h1>\n"
192
+ "<hangindent>"
193
+ "\t<regexp><NN></regexp>\n"
194
+ '\t\tMatches <match>"cow/NN"</match>\n'
195
+ '\t\tMatches <match>"green/NN"</match>\n'
196
+ "\t<regexp><VB.*></regexp>\n"
197
+ '\t\tMatches <match>"eating/VBG"</match>\n'
198
+ '\t\tMatches <match>"ate/VBD"</match>\n'
199
+ "\t<regexp><IN><DT><NN></regexp>\n"
200
+ '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
201
+ "\t<regexp><RB>?<VBD></regexp>\n"
202
+ '\t\tMatches <match>"ran/VBD"</match>\n'
203
+ '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
204
+ r"\t<regexp><\#><CD> # This is a comment...</regexp>\n"
205
+ '\t\tMatches <match>"#/# 100/CD"</match>\n'
206
+ "</hangindent>",
207
+ ),
208
+ (
209
+ "Tags",
210
+ "10 60",
211
+ "<h1>Part of Speech Tags:</h1>\n"
212
+ + "<hangindent>"
213
+ + "<<TAGSET>>"
214
+ + "</hangindent>\n", # this gets auto-substituted w/ self.TAGSET
215
+ ),
216
+ ]
217
+
218
#: (tag name, Tkinter text-widget options) pairs; markup such as
#: <red>...</red> in the help pages is rendered with these options.
HELP_AUTOTAG = [
    ("red", dict(foreground="#a00")),
    ("green", dict(foreground="#080")),
    ("highlight", dict(background="#ddd")),
    ("underline", dict(underline=True)),
    ("h1", dict(underline=True)),
    ("indent", dict(lmargin1=20, lmargin2=20)),
    ("hangindent", dict(lmargin1=0, lmargin2=60)),
    ("var", dict(foreground="#88f")),
    ("regexp", dict(foreground="#ba7")),
    ("match", dict(foreground="#6a6")),
]
230
+
231
##/////////////////////////////////////////////////////////////////
## Config Parameters
##/////////////////////////////////////////////////////////////////

# If the user has pressed no key for this many seconds and the current
# grammar has not been evaluated, the eval demon evaluates it.
_EVAL_DELAY = 1

# Number of devset sentences the eval demon scores on each run.
_EVAL_CHUNK = 15

# How often (in seconds) the eval demon is run.
_EVAL_FREQ = 0.2

# If a demon run takes less than this, _EVAL_CHUNK is adjusted upwards.
_EVAL_DEMON_MIN = 0.02

# If a demon run takes more than this, _EVAL_CHUNK is adjusted downwards.
_EVAL_DEMON_MAX = 0.04

# Keyword arguments used when constructing the various widgets.
_GRAMMARBOX_PARAMS = dict(
    width=40,
    height=12,
    background="#efe",
    highlightbackground="#efe",
    highlightthickness=1,
    relief="groove",
    border=2,
    wrap="word",
)
_HELPBOX_PARAMS = dict(
    width=15,
    height=15,
    background="#efe",
    highlightbackground="#efe",
    foreground="#555",
    highlightthickness=1,
    relief="groove",
    border=2,
    wrap="word",
)
_DEVSETBOX_PARAMS = dict(
    width=70,
    height=10,
    background="#eef",
    highlightbackground="#eef",
    highlightthickness=1,
    relief="groove",
    border=2,
    wrap="word",
    tabs=(30,),
)
_STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
_FONT_PARAMS = dict(family="helvetica", size=-20)
_FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
    background="#eef",
    highlightbackground="#eef",
    highlightthickness=1,
    relief="groove",
    border=2,
    width=300,
    height=280,
)
_BUTTON_PARAMS = dict(
    background="#777", activebackground="#777", highlightbackground="#777"
)

# Colors for the help-box tab headers (selected vs unselected).
_HELPTAB_BG_COLOR = "#aba"
_HELPTAB_FG_COLOR = "#efe"

_HELPTAB_FG_PARAMS = dict(background="#efe")
_HELPTAB_BG_PARAMS = dict(background="#aba")
_HELPTAB_SPACER = 6
307
+
308
def normalize_grammar(self, grammar):
    """Return a canonical form of *grammar*: comments stripped,
    whitespace collapsed, and unescaped ``$`` characters backslashed."""
    # Drop everything from an unescaped '#' to the end of each line.
    text = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
    # Collapse runs of spaces, and whitespace following a newline.
    text = re.sub(" +", " ", text)
    text = re.sub(r"\n\s+", r"\n", text)
    # [xx] Hack: automatically backslash bare '$' (a regexp metachar).
    return re.sub(r"([^\\])\$", r"\1\\$", text.strip())
318
+
319
def __init__(
    self,
    devset_name="conll2000",
    devset=None,
    grammar="",
    chunk_label="NP",
    tagset=None,
):
    """
    :param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    :param devset: A list of chunked sentences
    :param grammar: The initial grammar to display.
    :param chunk_label: The node label used by ``ChunkScore`` when
        scoring the grammar's chunks.
    :param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to ``self.TAGSET``.
    """
    self._chunk_label = chunk_label

    if tagset is None:
        tagset = self.TAGSET
    self.tagset = tagset

    # Named development sets:
    if devset is None:
        if devset_name == "conll2000":
            devset = conll2000.chunked_sents("train.txt")  # [:100]
        elif devset_name == "treebank":
            # BUGFIX: was ``devset == "treebank"``, which is always
            # False on this branch (devset is None here), so asking
            # for the 'treebank' devset always raised ValueError.
            devset = treebank_chunk.chunked_sents()  # [:100]
        else:
            raise ValueError("Unknown development set %s" % devset_name)

    # The chunker built from the grammar string.
    self.chunker = None
    # The unparsed grammar string, and a normalized version of it.
    self.grammar = grammar
    self.normalized_grammar = None
    # The last time() that the grammar was changed.
    self.grammar_changed = 0
    # The development set -- a list of chunked sentences -- and its
    # name (used for save files).
    self.devset = devset
    self.devset_name = devset_name
    # Index into the devset of the first instance currently viewed.
    self.devset_index = -1
    # The time() when a key was most recently pressed.
    self._last_keypress = 0
    # (grammar, precision, recall, fscore) tuples for grammars the
    # user has already tried, plus the scroll position within them.
    self._history = []
    self._history_index = 0
    # State of the eval demon: the grammar being evaluated (plus a
    # normalized copy) and the index of the next sentence to score.
    self._eval_grammar = None
    self._eval_normalized_grammar = None
    self._eval_index = 0
    # Tracks the current grammar's score on the development set.
    self._eval_score = ChunkScore(chunk_label=chunk_label)

    # Set up the main window.
    top = self.top = Tk()
    top.geometry("+50+50")
    top.title("Regexp Chunk Parser App")
    top.bind("<Control-q>", self.destroy)

    # Variable that restricts how much of the devset we look at.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Set up all the tkinter widgets.
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # If a grammar was given, then display it.
    if grammar:
        self.grammarbox.insert("end", grammar + "\n")
        self.grammarbox.mark_set("insert", "1.0")

    # Display the first item in the development set.
    self.show_devset(0)
    self.update()
425
+
426
def _init_bindings(self, top):
    """Attach keyboard shortcuts and the eval-graph redraw handler."""
    # Window-level shortcuts.
    top.bind("<Control-n>", self._devset_next)
    top.bind("<Control-p>", self._devset_prev)
    top.bind("<Control-t>", self.toggle_show_trace)
    top.bind("<KeyPress>", self.update)
    top.bind("<Control-s>", lambda e: self.save_grammar())
    top.bind("<Control-o>", lambda e: self.load_grammar())
    # The grammar text box consumes key events itself, so these
    # shortcuts are bound on it directly as well.
    for sequence, handler in (
        ("<Control-t>", self.toggle_show_trace),
        ("<Control-n>", self._devset_next),
        ("<Control-p>", self._devset_prev),
    ):
        self.grammarbox.bind(sequence, handler)

    # Redraw the eval graph when the window size changes.
    self.evalbox.bind("<Configure>", self._eval_plot)
439
+
440
def _init_fonts(self, top):
    """Create the main and small fonts, sized from ``self._size``."""
    # What's our font size (default=same as sysfont).
    self._size = IntVar(top)
    self._size.set(20)
    size = self._size.get()
    # Negative Tk font sizes are pixel sizes.
    self._font = Font(family="helvetica", size=-size)
    self._smallfont = Font(family="helvetica", size=-(int(size * 14 // 20)))
448
+
449
def _init_menubar(self, parent):
    """Build the File / View / Development-Set / Help menus."""
    menubar = Menu(parent)

    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
    filemenu.add_command(
        label="Save Current Grammar",
        underline=0,
        accelerator="Ctrl-s",
        command=self.save_grammar,
    )
    filemenu.add_command(
        label="Load Grammar",
        underline=0,
        accelerator="Ctrl-o",
        command=self.load_grammar,
    )
    filemenu.add_command(
        label="Save Grammar History", underline=13, command=self.save_history
    )
    filemenu.add_command(
        label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
    )
    menubar.add_cascade(label="File", underline=0, menu=filemenu)

    # Font-size choices; the radiobutton value is the point size.
    viewmenu = Menu(menubar, tearoff=0)
    for label, points in (
        ("Tiny", 10),
        ("Small", 16),
        ("Medium", 20),
        ("Large", 24),
        ("Huge", 34),
    ):
        viewmenu.add_radiobutton(
            label=label,
            variable=self._size,
            underline=0,
            value=points,
            command=self.resize,
        )
    menubar.add_cascade(label="View", underline=0, menu=viewmenu)

    # How much of the development set to evaluate against.
    devsetmenu = Menu(menubar, tearoff=0)
    for n_sents in (50, 100, 200, 500):
        devsetmenu.add_radiobutton(
            label="%d sentences" % n_sents,
            variable=self._devset_size,
            value=n_sents,
            command=self.set_devset_size,
        )
    menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)

    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label="About", underline=0, command=self.about)
    menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

    parent.config(menu=menubar)
546
+
547
def toggle_show_trace(self, *e):
    """Flip between the development-set view and the trace view."""
    show = self.show_devset if self._showing_trace else self.show_trace
    show()
    # Stop Tkinter from processing the key event any further.
    return "break"
553
+
554
# Auto-scaling of the eval plot centers on the last 5 examples.
_SCALE_N = 5
# NOTE(review): not referenced in the visible portion of this file.
_DRAW_LINES = False
556
+
557
def _eval_plot(self, *e, **config):
    """Redraw the precision/recall scatter plot of the grammar history."""
    canvas = self.evalbox
    width = config.get("width", canvas.winfo_width())
    height = config.get("height", canvas.winfo_height())

    # Start from an empty canvas.
    canvas.delete("all")

    # Axis labels; their bounding boxes fix the plot margins.
    tag = canvas.create_text(
        10, height // 2 - 10, justify="left", anchor="w", text="Precision"
    )
    left = canvas.bbox(tag)[2] + 5
    right = width - 10
    tag = canvas.create_text(
        left + (width - left) // 2,
        height - 10,
        anchor="s",
        text="Recall",
        justify="center",
    )
    top = 10
    bot = canvas.bbox(tag)[1] - 10

    # Background-colored rectangles clip anything drawn outside the plot.
    bg = self._EVALBOX_PARAMS["background"]
    canvas.lower(
        canvas.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
    )
    canvas.lower(
        canvas.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
    )

    # Choose the data range: zoom to the last _SCALE_N scores when
    # autoscaling, otherwise show the full [0,1] x [0,1] square.
    if self._autoscale.get() and len(self._history) > 1:
        max_precision = max_recall = 0
        min_precision = min_recall = 1
        for i in range(1, min(len(self._history), self._SCALE_N + 1)):
            _grammar, precision, recall, _fmeasure = self._history[-i]
            min_precision = min(precision, min_precision)
            min_recall = min(recall, min_recall)
            max_precision = max(precision, max_precision)
            max_recall = max(recall, max_recall)
        # Pad the range by 1% on each side, clamped to [0, 1].
        min_precision = max(min_precision - 0.01, 0)
        min_recall = max(min_recall - 0.01, 0)
        max_precision = min(max_precision + 0.01, 1)
        max_recall = min(max_recall + 0.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    # Map recall -> x pixel and precision -> y pixel.
    def plot_x(recall):
        return left + (right - left) * (
            (recall - min_recall) / (max_recall - min_recall)
        )

    def plot_y(precision):
        return bot - (bot - top) * (
            (precision - min_precision) / (max_precision - min_precision)
        )

    # Grid lines at every 10%, plus the two axes.
    for i in range(11):
        x = plot_x(i / 10.0)
        y = plot_y(i / 10.0)
        if left < x < right:
            canvas.create_line(x, top, x, bot, fill="#888")
        if top < y < bot:
            canvas.create_line(left, y, right, y, fill="#888")
    canvas.create_line(left, top, left, bot)
    canvas.create_line(left, bot, right, bot)

    # Percentage labels at the ends of both axes.
    canvas.create_text(
        left - 3,
        bot,
        justify="right",
        anchor="se",
        text="%d%%" % (100 * min_precision),
    )
    canvas.create_text(
        left - 3,
        top,
        justify="right",
        anchor="ne",
        text="%d%%" % (100 * max_precision),
    )
    canvas.create_text(
        left,
        bot + 3,
        justify="center",
        anchor="nw",
        text="%d%%" % (100 * min_recall),
    )
    canvas.create_text(
        right,
        bot + 3,
        justify="center",
        anchor="ne",
        text="%d%%" % (100 * max_recall),
    )

    # One dot per history entry.  The selected entry is drawn in green
    # and its scores are echoed in the status bar; the others are drawn
    # pale (and optionally connected by lines) underneath.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = plot_x(recall)
        y = plot_y(precision)
        if i == self._history_index:
            canvas.create_oval(
                x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
            )
            self.status["text"] = (
                "Precision: %.2f%%\t" % (precision * 100)
                + "Recall: %.2f%%\t" % (recall * 100)
                + "F-score: %.2f%%" % (fscore * 100)
            )
        else:
            canvas.lower(
                canvas.create_oval(
                    x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
                )
            )
            if prev_x is not None and self._eval_lines.get():
                canvas.lower(
                    canvas.create_line(prev_x, prev_y, x, y, fill="#8c8")
                )
        prev_x, prev_y = x, y
691
+
692
# True while a rescheduled _eval_demon callback is pending.
_eval_demon_running = False

def _eval_demon(self):
    """Incrementally score the current grammar on the development set.

    Runs as a Tk ``after`` callback: each invocation scores at most
    ``_EVAL_CHUNK`` sentences and then reschedules itself until the
    whole (size-limited) devset has been evaluated.
    """
    if self.top is None:
        return
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time (used to adapt the chunk size below).
    t0 = time.time()

    # If the user is still typing, wait for them to finish.
    if (
        time.time() - self._last_keypress < self._EVAL_DELAY
        and self.normalized_grammar != self._eval_normalized_grammar
    ):
        self._eval_demon_running = True
        return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)

    # If the grammar changed, restart the evaluation.
    if self.normalized_grammar != self._eval_normalized_grammar:
        # Re-use cached scores if this grammar was already evaluated.
        for (g, p, r, f) in self._history:
            if self.normalized_grammar == self.normalize_grammar(g):
                self._history.append((g, p, r, f))
                self._history_index = len(self._history) - 1
                self._eval_plot()
                self._eval_demon_running = False
                self._eval_normalized_grammar = None
                return
        self._eval_index = 0
        self._eval_score = ChunkScore(chunk_label=self._chunk_label)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # An empty grammar would just score 0 -- don't evaluate it, and
    # don't record it in the history.
    if self.normalized_grammar.strip() == "":
        self._eval_demon_running = False
        return

    # Score the next batch of examples.
    batch_end = min(self._eval_index + self._EVAL_CHUNK, self._devset_size.get())
    for gold in self.devset[self._eval_index : batch_end]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # Advance our position in the devset.
    self._eval_index += self._EVAL_CHUNK

    if self._eval_index >= self._devset_size.get():
        # Finished: record the final scores and redraw the plot.
        self._history.append(
            (
                self._eval_grammar,
                self._eval_score.precision(),
                self._eval_score.recall(),
                self._eval_score.f_measure(),
            )
        )
        self._history_index = len(self._history) - 1
        self._eval_plot()
        self._eval_demon_running = False
        self._eval_normalized_grammar = None
    else:
        # Not finished: show progress and reschedule ourselves.
        progress = 100 * self._eval_index / self._devset_size.get()
        self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - t0)
        self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
768
+
769
    def _adaptively_modify_eval_chunk(self, t):
        """
        Modify _EVAL_CHUNK to try to keep the amount of time that the
        eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.

        The chunk size is scaled proportionally to the time overshoot or
        undershoot, but the change per call is clamped to at most 10
        sentences (and the chunk never drops to 5 or below).

        :param t: The amount of time that the eval demon took.
        """
        if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
            # Too slow: shrink by at least 1, at most 10.
            self._EVAL_CHUNK = min(
                self._EVAL_CHUNK - 1,
                max(
                    int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)),
                    self._EVAL_CHUNK - 10,
                ),
            )
        elif t < self._EVAL_DEMON_MIN:
            # Too fast: grow by at least 1, at most 10.
            self._EVAL_CHUNK = max(
                self._EVAL_CHUNK + 1,
                min(
                    int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)),
                    self._EVAL_CHUNK + 10,
                ),
            )
    def _init_widgets(self, top):
        """Build the full widget layout inside *top*.

        Grid layout of ``frame0``: grammar box (col 0) and help box
        (col 3) on top; devset box (col 0) and evaluation canvas (col 3)
        below; button rows under each; status line across the bottom.
        """
        frame0 = Frame(top, **self._FRAME_PARAMS)
        frame0.grid_columnconfigure(0, weight=4)
        frame0.grid_columnconfigure(3, weight=2)
        frame0.grid_rowconfigure(1, weight=1)
        frame0.grid_rowconfigure(5, weight=1)

        # The grammar
        self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
        self.grammarlabel = Label(
            frame0,
            font=self._font,
            text="Grammar:",
            highlightcolor="black",
            background=self._GRAMMARBOX_PARAMS["background"],
        )
        self.grammarlabel.grid(column=0, row=0, sticky="SW")
        self.grammarbox.grid(column=0, row=1, sticky="NEWS")

        # Scroll bar for grammar
        grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
        grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
        self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)

        # grammar buttons
        bg = self._FRAME_PARAMS["background"]
        frame3 = Frame(frame0, background=bg)
        frame3.grid(column=0, row=2, sticky="EW")
        Button(
            frame3,
            text="Prev Grammar",
            command=self._history_prev,
            **self._BUTTON_PARAMS,
        ).pack(side="left")
        Button(
            frame3,
            text="Next Grammar",
            command=self._history_next,
            **self._BUTTON_PARAMS,
        ).pack(side="left")

        # Help box
        self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
        self.helpbox.grid(column=3, row=1, sticky="NEWS")
        self.helptabs = {}
        bg = self._FRAME_PARAMS["background"]
        helptab_frame = Frame(frame0, background=bg)
        helptab_frame.grid(column=3, row=0, sticky="SW")
        for i, (tab, tabstops, text) in enumerate(self.HELP):
            label = Label(helptab_frame, text=tab, font=self._smallfont)
            label.grid(column=i * 2, row=0, sticky="S")
            # help_frame.grid_columnconfigure(i, weight=1)
            # label.pack(side='left')
            # Bind tab as a default arg so each label shows its own tab.
            label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
            self.helptabs[tab] = label
            Frame(
                helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
            ).grid(column=i * 2 + 1, row=0)
        self.helptabs[self.HELP[0][0]].configure(font=self._font)
        self.helpbox.tag_config("elide", elide=True)
        for (tag, params) in self.HELP_AUTOTAG:
            self.helpbox.tag_config("tag-%s" % tag, **params)
        self.show_help(self.HELP[0][0])

        # Scroll bar for helpbox
        help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
        self.helpbox.config(yscrollcommand=help_scrollbar.set)
        help_scrollbar.grid(column=4, row=1, sticky="NWS")

        # The dev set
        frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
        self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
        self.devsetbox.pack(expand=True, fill="both")
        self.devsetlabel = Label(
            frame0,
            font=self._font,
            text="Development Set:",
            justify="right",
            background=self._DEVSETBOX_PARAMS["background"],
        )
        self.devsetlabel.grid(column=0, row=4, sticky="SW")
        frame4.grid(column=0, row=5, sticky="NEWS")

        # dev set scrollbars
        self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
        self.devset_scroll.grid(column=1, row=5, sticky="NWS")
        self.devset_xscroll = Scrollbar(
            frame4, command=self.devsetbox.xview, orient="horiz"
        )
        self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
        self.devset_xscroll.pack(side="bottom", fill="x")

        # dev set buttons
        bg = self._FRAME_PARAMS["background"]
        frame1 = Frame(frame0, background=bg)
        frame1.grid(column=0, row=7, sticky="EW")
        Button(
            frame1,
            text="Prev Example (Ctrl-p)",
            command=self._devset_prev,
            **self._BUTTON_PARAMS,
        ).pack(side="left")
        Button(
            frame1,
            text="Next Example (Ctrl-n)",
            command=self._devset_next,
            **self._BUTTON_PARAMS,
        ).pack(side="left")
        self.devset_button = Button(
            frame1,
            text="Show example",
            command=self.show_devset,
            state="disabled",
            **self._BUTTON_PARAMS,
        )
        self.devset_button.pack(side="right")
        self.trace_button = Button(
            frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
        )
        self.trace_button.pack(side="right")

        # evaluation box
        self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
        label = Label(
            frame0,
            font=self._font,
            text="Evaluation:",
            justify="right",
            background=self._EVALBOX_PARAMS["background"],
        )
        label.grid(column=3, row=4, sticky="SW")
        self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)

        # evaluation box buttons
        bg = self._FRAME_PARAMS["background"]
        frame2 = Frame(frame0, background=bg)
        frame2.grid(column=3, row=7, sticky="EW")
        self._autoscale = IntVar(self.top)
        self._autoscale.set(False)
        Checkbutton(
            frame2,
            variable=self._autoscale,
            command=self._eval_plot,
            text="Zoom",
            **self._BUTTON_PARAMS,
        ).pack(side="left")
        self._eval_lines = IntVar(self.top)
        self._eval_lines.set(False)
        Checkbutton(
            frame2,
            variable=self._eval_lines,
            command=self._eval_plot,
            text="Lines",
            **self._BUTTON_PARAMS,
        ).pack(side="left")
        # NOTE(review): this "History" button has no command= -- it looks
        # like a stub; confirm whether a callback was intended.
        Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")

        # The status label
        self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
        self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)

        # Help box & devset box can't be edited.
        self.helpbox["state"] = "disabled"
        self.devsetbox["state"] = "disabled"

        # Spacers
        bg = self._FRAME_PARAMS["background"]
        Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
        Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
        Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)

        # pack the frame.
        frame0.pack(fill="both", expand=True)

        # Set up colors for the devset box
        self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
        self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
        self.devsetbox.tag_config("false-pos", background="#faa")
        self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
        self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
        self.devsetbox.tag_config("error", foreground="#800")

        # And for the grammarbox
        self.grammarbox.tag_config("error", background="#fec")
        self.grammarbox.tag_config("comment", foreground="#840")
        self.grammarbox.tag_config("angle", foreground="#00f")
        self.grammarbox.tag_config("brace", foreground="#0a0")
        self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
    # True while the devset box is displaying a rule-by-rule trace
    # (show_trace) rather than a plain example (show_devset).
    _showing_trace = False
+ def show_trace(self, *e):
985
+ self._showing_trace = True
986
+ self.trace_button["state"] = "disabled"
987
+ self.devset_button["state"] = "normal"
988
+
989
+ self.devsetbox["state"] = "normal"
990
+ # self.devsetbox['wrap'] = 'none'
991
+ self.devsetbox.delete("1.0", "end")
992
+ self.devsetlabel["text"] = "Development Set (%d/%d)" % (
993
+ (self.devset_index + 1, self._devset_size.get())
994
+ )
995
+
996
+ if self.chunker is None:
997
+ self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
998
+ self.devsetbox.tag_add("error", "1.0", "end")
999
+ return # can't do anything more
1000
+
1001
+ gold_tree = self.devset[self.devset_index]
1002
+ rules = self.chunker.rules()
1003
+
1004
+ # Calculate the tag sequence
1005
+ tagseq = "\t"
1006
+ charnum = [1]
1007
+ for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
1008
+ tagseq += "%s " % pos
1009
+ charnum.append(len(tagseq))
1010
+ self.charnum = {
1011
+ (i, j): charnum[j]
1012
+ for i in range(len(rules) + 1)
1013
+ for j in range(len(charnum))
1014
+ }
1015
+ self.linenum = {i: i * 2 + 2 for i in range(len(rules) + 1)}
1016
+
1017
+ for i in range(len(rules) + 1):
1018
+ if i == 0:
1019
+ self.devsetbox.insert("end", "Start:\n")
1020
+ self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
1021
+ else:
1022
+ self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
1023
+ self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
1024
+ # Display the tag sequence.
1025
+ self.devsetbox.insert("end", tagseq + "\n")
1026
+ self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
1027
+ # Run a partial parser, and extract gold & test chunks
1028
+ chunker = RegexpChunkParser(rules[:i])
1029
+ test_tree = self._chunkparse(gold_tree.leaves())
1030
+ gold_chunks = self._chunks(gold_tree)
1031
+ test_chunks = self._chunks(test_tree)
1032
+ # Compare them.
1033
+ for chunk in gold_chunks.intersection(test_chunks):
1034
+ self._color_chunk(i, chunk, "true-pos")
1035
+ for chunk in gold_chunks - test_chunks:
1036
+ self._color_chunk(i, chunk, "false-neg")
1037
+ for chunk in test_chunks - gold_chunks:
1038
+ self._color_chunk(i, chunk, "false-pos")
1039
+ self.devsetbox.insert("end", "Finished.\n")
1040
+ self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
1041
+
1042
+ # This is a hack, because the x-scrollbar isn't updating its
1043
+ # position right -- I'm not sure what the underlying cause is
1044
+ # though. (This is on OS X w/ python 2.5)
1045
+ self.top.after(100, self.devset_xscroll.set, 0, 0.3)
1046
+
1047
    def show_help(self, tab):
        """Display the help page named *tab* in the help box.

        The selected tab's label is highlighted; the page text has
        ``<<TAGSET>>`` expanded to a sorted listing of ``self.tagset``,
        and inline markup (``<tag>...</tag>`` pairs from HELP_AUTOTAG)
        is converted into Tk text tags, with the markup itself elided.
        """
        self.helpbox["state"] = "normal"
        self.helpbox.delete("1.0", "end")
        for (name, tabstops, text) in self.HELP:
            if name == tab:
                text = text.replace(
                    "<<TAGSET>>",
                    "\n".join(
                        "\t%s\t%s" % item
                        # Word-like tags sort before punctuation tags.
                        for item in sorted(
                            list(self.tagset.items()),
                            key=lambda t_w: re.match(r"\w+", t_w[0])
                            and (0, t_w[0])
                            or (1, t_w[0]),
                        )
                    ),
                )

                self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
                self.helpbox.config(tabs=tabstops)
                self.helpbox.insert("1.0", text + "\n" * 20)
                # Template for converting a character offset in *text*
                # into a Tk text index.
                C = "1.0 + %d chars"
                for (tag, params) in self.HELP_AUTOTAG:
                    pattern = f"(?s)(<{tag}>)(.*?)(</{tag}>)"
                    for m in re.finditer(pattern, text):
                        self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
                        self.helpbox.tag_add(
                            "tag-%s" % tag, C % m.start(2), C % m.end(2)
                        )
                        self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
            else:
                self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
        self.helpbox["state"] = "disabled"
+ def _history_prev(self, *e):
1082
+ self._view_history(self._history_index - 1)
1083
+ return "break"
1084
+
1085
+ def _history_next(self, *e):
1086
+ self._view_history(self._history_index + 1)
1087
+ return "break"
1088
+
1089
    def _view_history(self, index):
        """Display grammar number *index* from ``self._history``.

        Rebuilds the chunker for that grammar, refreshes the evaluation
        plot and the devset display, and updates the "Grammar i/n" label.
        """
        # Bounds & sanity checking:
        index = max(0, min(len(self._history) - 1, index))
        if not self._history:
            return
        # Already viewing the requested history item?
        if index == self._history_index:
            return
        # Show the requested grammar.  It will get added to _history
        # only if they edit it (causing self.update() to get run.)
        self.grammarbox["state"] = "normal"
        self.grammarbox.delete("1.0", "end")
        self.grammarbox.insert("end", self._history[index][0])
        self.grammarbox.mark_set("insert", "1.0")
        self._history_index = index
        self._syntax_highlight_grammar(self._history[index][0])
        # Record the normalized grammar & regenerate the chunker.
        self.normalized_grammar = self.normalize_grammar(self._history[index][0])
        if self.normalized_grammar:
            rules = [
                RegexpChunkRule.fromstring(line)
                for line in self.normalized_grammar.split("\n")
            ]
        else:
            rules = []
        self.chunker = RegexpChunkParser(rules)
        # Show the score.
        self._eval_plot()
        # Update the devset box
        self._highlight_devset()
        if self._showing_trace:
            self.show_trace()
        # Update the grammar label
        if self._history_index < len(self._history) - 1:
            self.grammarlabel["text"] = "Grammar {}/{}:".format(
                self._history_index + 1,
                len(self._history),
            )
        else:
            self.grammarlabel["text"] = "Grammar:"
+ def _devset_next(self, *e):
1131
+ self._devset_scroll("scroll", 1, "page")
1132
+ return "break"
1133
+
1134
+ def _devset_prev(self, *e):
1135
+ self._devset_scroll("scroll", -1, "page")
1136
+ return "break"
1137
+
1138
+ def destroy(self, *e):
1139
+ if self.top is None:
1140
+ return
1141
+ self.top.destroy()
1142
+ self.top = None
1143
+
1144
    def _devset_scroll(self, command, *args):
        """Scrollbar callback for the devset box.

        Translates Tk scrollbar protocol commands ("scroll ... units",
        "scroll ... pages", "moveto fraction") into show_devset() index
        changes, preserving trace mode if it was active.
        """
        N = 1  # size of a page -- one sentence.
        showing_trace = self._showing_trace
        if command == "scroll" and args[1].startswith("unit"):
            self.show_devset(self.devset_index + int(args[0]))
        elif command == "scroll" and args[1].startswith("page"):
            self.show_devset(self.devset_index + N * int(args[0]))
        elif command == "moveto":
            self.show_devset(int(float(args[0]) * self._devset_size.get()))
        else:
            assert 0, f"bad scroll command {command} {args}"
        # show_devset() clears trace mode; restore it if it was on.
        if showing_trace:
            self.show_trace()
+ def show_devset(self, index=None):
1159
+ if index is None:
1160
+ index = self.devset_index
1161
+
1162
+ # Bounds checking
1163
+ index = min(max(0, index), self._devset_size.get() - 1)
1164
+
1165
+ if index == self.devset_index and not self._showing_trace:
1166
+ return
1167
+ self.devset_index = index
1168
+
1169
+ self._showing_trace = False
1170
+ self.trace_button["state"] = "normal"
1171
+ self.devset_button["state"] = "disabled"
1172
+
1173
+ # Clear the text box.
1174
+ self.devsetbox["state"] = "normal"
1175
+ self.devsetbox["wrap"] = "word"
1176
+ self.devsetbox.delete("1.0", "end")
1177
+ self.devsetlabel["text"] = "Development Set (%d/%d)" % (
1178
+ (self.devset_index + 1, self._devset_size.get())
1179
+ )
1180
+
1181
+ # Add the sentences
1182
+ sample = self.devset[self.devset_index : self.devset_index + 1]
1183
+ self.charnum = {}
1184
+ self.linenum = {0: 1}
1185
+ for sentnum, sent in enumerate(sample):
1186
+ linestr = ""
1187
+ for wordnum, (word, pos) in enumerate(sent.leaves()):
1188
+ self.charnum[sentnum, wordnum] = len(linestr)
1189
+ linestr += f"{word}/{pos} "
1190
+ self.charnum[sentnum, wordnum + 1] = len(linestr)
1191
+ self.devsetbox.insert("end", linestr[:-1] + "\n\n")
1192
+
1193
+ # Highlight chunks in the dev set
1194
+ if self.chunker is not None:
1195
+ self._highlight_devset()
1196
+ self.devsetbox["state"] = "disabled"
1197
+
1198
+ # Update the scrollbar
1199
+ first = self.devset_index / self._devset_size.get()
1200
+ last = (self.devset_index + 2) / self._devset_size.get()
1201
+ self.devset_scroll.set(first, last)
1202
+
1203
+ def _chunks(self, tree):
1204
+ chunks = set()
1205
+ wordnum = 0
1206
+ for child in tree:
1207
+ if isinstance(child, Tree):
1208
+ if child.label() == self._chunk_label:
1209
+ chunks.add((wordnum, wordnum + len(child)))
1210
+ wordnum += len(child)
1211
+ else:
1212
+ wordnum += 1
1213
+ return chunks
1214
+
1215
    def _syntax_highlight_grammar(self, grammar):
        """Re-apply syntax-highlighting tags to the grammar box.

        Comments get the "comment" tag; ``<``/``>`` get "angle";
        ``{``/``}`` get "brace".  Characters inside a comment are not
        highlighted as angle/brace.
        """
        if self.top is None:
            return
        self.grammarbox.tag_remove("comment", "1.0", "end")
        self.grammarbox.tag_remove("angle", "1.0", "end")
        self.grammarbox.tag_remove("brace", "1.0", "end")
        self.grammarbox.tag_add("hangindent", "1.0", "end")
        for lineno, line in enumerate(grammar.split("\n")):
            if not line.strip():
                continue
            # Group 2 matches a (non-escaped) trailing "#..." comment.
            m = re.match(r"(\\.|[^#])*(#.*)?", line)
            comment_start = None
            if m.group(2):
                comment_start = m.start(2)
                s = "%d.%d" % (lineno + 1, m.start(2))
                e = "%d.%d" % (lineno + 1, m.end(2))
                self.grammarbox.tag_add("comment", s, e)
            for m in re.finditer("[<>{}]", line):
                if comment_start is not None and m.start() >= comment_start:
                    # Don't highlight brackets inside the comment.
                    break
                s = "%d.%d" % (lineno + 1, m.start())
                e = "%d.%d" % (lineno + 1, m.end())
                if m.group() in "<>":
                    self.grammarbox.tag_add("angle", s, e)
                else:
                    self.grammarbox.tag_add("brace", s, e)
+ def _grammarcheck(self, grammar):
1243
+ if self.top is None:
1244
+ return
1245
+ self.grammarbox.tag_remove("error", "1.0", "end")
1246
+ self._grammarcheck_errs = []
1247
+ for lineno, line in enumerate(grammar.split("\n")):
1248
+ line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line)
1249
+ line = line.strip()
1250
+ if line:
1251
+ try:
1252
+ RegexpChunkRule.fromstring(line)
1253
+ except ValueError as e:
1254
+ self.grammarbox.tag_add(
1255
+ "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1)
1256
+ )
1257
+ self.status["text"] = ""
1258
+
1259
+ def update(self, *event):
1260
+ # Record when update was called (for grammarcheck)
1261
+ if event:
1262
+ self._last_keypress = time.time()
1263
+
1264
+ # Read the grammar from the Text box.
1265
+ self.grammar = grammar = self.grammarbox.get("1.0", "end")
1266
+
1267
+ # If the grammar hasn't changed, do nothing:
1268
+ normalized_grammar = self.normalize_grammar(grammar)
1269
+ if normalized_grammar == self.normalized_grammar:
1270
+ return
1271
+ else:
1272
+ self.normalized_grammar = normalized_grammar
1273
+
1274
+ # If the grammar has changed, and we're looking at history,
1275
+ # then stop looking at history.
1276
+ if self._history_index < len(self._history) - 1:
1277
+ self.grammarlabel["text"] = "Grammar:"
1278
+
1279
+ self._syntax_highlight_grammar(grammar)
1280
+
1281
+ # The grammar has changed; try parsing it. If it doesn't
1282
+ # parse, do nothing. (flag error location?)
1283
+ try:
1284
+ # Note: the normalized grammar has no blank lines.
1285
+ if normalized_grammar:
1286
+ rules = [
1287
+ RegexpChunkRule.fromstring(line)
1288
+ for line in normalized_grammar.split("\n")
1289
+ ]
1290
+ else:
1291
+ rules = []
1292
+ except ValueError as e:
1293
+ # Use the un-normalized grammar for error highlighting.
1294
+ self._grammarcheck(grammar)
1295
+ self.chunker = None
1296
+ return
1297
+
1298
+ self.chunker = RegexpChunkParser(rules)
1299
+ self.grammarbox.tag_remove("error", "1.0", "end")
1300
+ self.grammar_changed = time.time()
1301
+ # Display the results
1302
+ if self._showing_trace:
1303
+ self.show_trace()
1304
+ else:
1305
+ self._highlight_devset()
1306
+ # Start the eval demon
1307
+ if not self._eval_demon_running:
1308
+ self._eval_demon()
1309
+
1310
    def _highlight_devset(self, sample=None):
        """Color-code chunking results for *sample* (default: the
        currently displayed devset sentence).

        True positives, false negatives, and false positives are tagged
        "true-pos", "false-neg", and "false-pos" respectively.
        """
        if sample is None:
            sample = self.devset[self.devset_index : self.devset_index + 1]

        self.devsetbox.tag_remove("true-pos", "1.0", "end")
        self.devsetbox.tag_remove("false-neg", "1.0", "end")
        self.devsetbox.tag_remove("false-pos", "1.0", "end")

        # Run the grammar on the test cases.
        for sentnum, gold_tree in enumerate(sample):
            # Run the chunk parser
            test_tree = self._chunkparse(gold_tree.leaves())
            # Extract gold & test chunks
            gold_chunks = self._chunks(gold_tree)
            test_chunks = self._chunks(test_tree)
            # Compare them.
            for chunk in gold_chunks.intersection(test_chunks):
                self._color_chunk(sentnum, chunk, "true-pos")
            for chunk in gold_chunks - test_chunks:
                self._color_chunk(sentnum, chunk, "false-neg")
            for chunk in test_chunks - gold_chunks:
                self._color_chunk(sentnum, chunk, "false-pos")
    def _chunkparse(self, words):
        """Parse the tagged tokens *words* with the current chunker,
        falling back to the unchunked token list if parsing blows up.
        """
        try:
            return self.chunker.parse(words)
        except (ValueError, IndexError) as e:
            # There's an error somewhere in the grammar, but we're not sure
            # exactly where, so just mark the whole grammar as bad.
            # E.g., this is caused by: "({<NN>})"
            self.grammarbox.tag_add("error", "1.0", "end")
            # Treat it as tagging nothing:
            return words
+ def _color_chunk(self, sentnum, chunk, tag):
1345
+ start, end = chunk
1346
+ self.devsetbox.tag_add(
1347
+ tag,
1348
+ f"{self.linenum[sentnum]}.{self.charnum[sentnum, start]}",
1349
+ f"{self.linenum[sentnum]}.{self.charnum[sentnum, end] - 1}",
1350
+ )
1351
+
1352
    def reset(self):
        """Reset the application: empty grammar, empty history, and the
        first devset example displayed.
        """
        # Clear various variables
        self.chunker = None
        self.grammar = None
        self.normalized_grammar = None
        self.grammar_changed = 0
        self._history = []
        self._history_index = 0
        # Update the on-screen display.
        self.grammarbox.delete("1.0", "end")
        self.show_devset(0)
        self.update()
        # self._eval_plot()
    # %-style template used by save_grammar() for the saved-file header;
    # load_grammar() strips this header back off when reading.
    SAVE_GRAMMAR_TEMPLATE = (
        "# Regexp Chunk Parsing Grammar\n"
        "# Saved %(date)s\n"
        "#\n"
        "# Development set: %(devset)s\n"
        "# Precision: %(precision)s\n"
        "# Recall: %(recall)s\n"
        "# F-score: %(fscore)s\n\n"
        "%(grammar)s\n"
    )
+ def save_grammar(self, filename=None):
1378
+ if not filename:
1379
+ ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
1380
+ filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
1381
+ if not filename:
1382
+ return
1383
+ if self._history and self.normalized_grammar == self.normalize_grammar(
1384
+ self._history[-1][0]
1385
+ ):
1386
+ precision, recall, fscore = (
1387
+ "%.2f%%" % (100 * v) for v in self._history[-1][1:]
1388
+ )
1389
+ elif self.chunker is None:
1390
+ precision = recall = fscore = "Grammar not well formed"
1391
+ else:
1392
+ precision = recall = fscore = "Not finished evaluation yet"
1393
+
1394
+ with open(filename, "w") as outfile:
1395
+ outfile.write(
1396
+ self.SAVE_GRAMMAR_TEMPLATE
1397
+ % dict(
1398
+ date=time.ctime(),
1399
+ devset=self.devset_name,
1400
+ precision=precision,
1401
+ recall=recall,
1402
+ fscore=fscore,
1403
+ grammar=self.grammar.strip(),
1404
+ )
1405
+ )
1406
+
1407
+ def load_grammar(self, filename=None):
1408
+ if not filename:
1409
+ ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
1410
+ filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
1411
+ if not filename:
1412
+ return
1413
+ self.grammarbox.delete("1.0", "end")
1414
+ self.update()
1415
+ with open(filename) as infile:
1416
+ grammar = infile.read()
1417
+ grammar = re.sub(
1418
+ r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
1419
+ ).lstrip()
1420
+ self.grammarbox.insert("1.0", grammar)
1421
+ self.update()
1422
+
1423
+ def save_history(self, filename=None):
1424
+ if not filename:
1425
+ ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")]
1426
+ filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
1427
+ if not filename:
1428
+ return
1429
+
1430
+ with open(filename, "w") as outfile:
1431
+ outfile.write("# Regexp Chunk Parsing Grammar History\n")
1432
+ outfile.write("# Saved %s\n" % time.ctime())
1433
+ outfile.write("# Development set: %s\n" % self.devset_name)
1434
+ for i, (g, p, r, f) in enumerate(self._history):
1435
+ hdr = (
1436
+ "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
1437
+ "fscore=%.2f%%)"
1438
+ % (i + 1, len(self._history), p * 100, r * 100, f * 100)
1439
+ )
1440
+ outfile.write("\n%s\n" % hdr)
1441
+ outfile.write("".join(" %s\n" % line for line in g.strip().split()))
1442
+
1443
+ if not (
1444
+ self._history
1445
+ and self.normalized_grammar
1446
+ == self.normalize_grammar(self._history[-1][0])
1447
+ ):
1448
+ if self.chunker is None:
1449
+ outfile.write("\nCurrent Grammar (not well-formed)\n")
1450
+ else:
1451
+ outfile.write("\nCurrent Grammar (not evaluated)\n")
1452
+ outfile.write(
1453
+ "".join(" %s\n" % line for line in self.grammar.strip().split())
1454
+ )
1455
+
1456
+ def about(self, *e):
1457
+ ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
1458
+ TITLE = "About: Regular Expression Chunk Parser Application"
1459
+ try:
1460
+ from tkinter.messagebox import Message
1461
+
1462
+ Message(message=ABOUT, title=TITLE).show()
1463
+ except:
1464
+ ShowText(self.top, TITLE, ABOUT)
1465
+
1466
    def set_devset_size(self, size=None):
        """Set how many devset sentences are used for evaluation,
        clamped to the actual devset length.
        """
        if size is not None:
            self._devset_size.set(size)
        self._devset_size.set(min(len(self.devset), self._devset_size.get()))
        # Toggle the index to force show_devset() to redraw.
        self.show_devset(1)
        self.show_devset(0)
        # what about history?  Evaluated at diff dev set sizes!
    def resize(self, size=None):
        """Set the base font size; the small font is scaled to 14/20 of
        it.  (Negative Tk font sizes are measured in pixels.)
        """
        if size is not None:
            self._size.set(size)
        size = self._size.get()
        self._font.configure(size=-(abs(size)))
        self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20))
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        # IDLE runs its own Tk mainloop; starting another would block.
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)
+
1493
def app():
    """Launch the regexp chunk-parser demo and block until it closes."""
    demo = RegexpChunkApp()
    demo.mainloop()
+
1497
+ if __name__ == "__main__":
1498
+ app()
1499
+
1500
# Public API of this module.
__all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/collocations_app.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Collocations Application
2
+ # Much of the GUI code is imported from concordance.py; We intend to merge these tools together
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+ #
8
+
9
+
10
+ import queue as q
11
+ import threading
12
+ from tkinter import (
13
+ END,
14
+ LEFT,
15
+ SUNKEN,
16
+ Button,
17
+ Frame,
18
+ IntVar,
19
+ Label,
20
+ Menu,
21
+ OptionMenu,
22
+ Scrollbar,
23
+ StringVar,
24
+ Text,
25
+ Tk,
26
+ )
27
+ from tkinter.font import Font
28
+
29
+ from nltk.corpus import (
30
+ alpino,
31
+ brown,
32
+ cess_cat,
33
+ cess_esp,
34
+ floresta,
35
+ indian,
36
+ mac_morpho,
37
+ machado,
38
+ nps_chat,
39
+ sinica_treebank,
40
+ treebank,
41
+ )
42
+ from nltk.probability import FreqDist
43
+ from nltk.util import in_idle
44
+
45
# Virtual Tk events used to signal corpus-loading results from the
# background worker thread back to the GUI thread.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
# How often (in milliseconds) the GUI polls the worker-result queue.
POLL_INTERVAL = 100

# Corpus selected when the application starts.
_DEFAULT = "English: Brown Corpus (Humor)"
# Map from human-readable corpus name to a zero-argument loader that
# returns the corpus's word list (lazy, so nothing loads at import time).
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}
+ class CollocationsView:
76
    _BACKGROUND_COLOUR = "#FFF"  # white
    def __init__(self):
        """Build the collocations GUI and start loading the default corpus."""
        # Queue used by the corpus-loading worker thread to deliver results.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Begin polling the queue for worker-thread results.
        self.after = self.top.after(POLL_INTERVAL, self._poll)
    def _init_top(self, top):
        """Configure the toplevel window: geometry, title, and close/quit
        bindings."""
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)
    def _init_widgets(self, parent):
        """Assemble the main frame from its sub-sections (corpus selector,
        results box, paging controls, status line)."""
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)
+ def _init_corpus_select(self, parent):
106
+ innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
107
+ self.var = StringVar(innerframe)
108
+ self.var.set(self.model.DEFAULT_CORPUS)
109
+ Label(
110
+ innerframe,
111
+ justify=LEFT,
112
+ text=" Corpus: ",
113
+ background=self._BACKGROUND_COLOUR,
114
+ padx=2,
115
+ pady=1,
116
+ border=0,
117
+ ).pack(side="left")
118
+
119
+ other_corpora = list(self.model.CORPORA.keys()).remove(
120
+ self.model.DEFAULT_CORPUS
121
+ )
122
+ om = OptionMenu(
123
+ innerframe,
124
+ self.var,
125
+ self.model.DEFAULT_CORPUS,
126
+ command=self.corpus_selected,
127
+ *self.model.non_default_corpora()
128
+ )
129
+ om["borderwidth"] = 0
130
+ om["highlightthickness"] = 1
131
+ om.pack(side="left")
132
+ innerframe.pack(side="top", fill="x", anchor="n")
133
+
134
+ def _init_status(self, parent):
135
+ self.status = Label(
136
+ parent,
137
+ justify=LEFT,
138
+ relief=SUNKEN,
139
+ background=self._BACKGROUND_COLOUR,
140
+ border=0,
141
+ padx=1,
142
+ pady=0,
143
+ )
144
+ self.status.pack(side="top", anchor="sw")
145
+
146
+ def _init_menubar(self):
147
+ self._result_size = IntVar(self.top)
148
+ menubar = Menu(self.top)
149
+
150
+ filemenu = Menu(menubar, tearoff=0, borderwidth=0)
151
+ filemenu.add_command(
152
+ label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
153
+ )
154
+ menubar.add_cascade(label="File", underline=0, menu=filemenu)
155
+
156
+ editmenu = Menu(menubar, tearoff=0)
157
+ rescntmenu = Menu(editmenu, tearoff=0)
158
+ rescntmenu.add_radiobutton(
159
+ label="20",
160
+ variable=self._result_size,
161
+ underline=0,
162
+ value=20,
163
+ command=self.set_result_size,
164
+ )
165
+ rescntmenu.add_radiobutton(
166
+ label="50",
167
+ variable=self._result_size,
168
+ underline=0,
169
+ value=50,
170
+ command=self.set_result_size,
171
+ )
172
+ rescntmenu.add_radiobutton(
173
+ label="100",
174
+ variable=self._result_size,
175
+ underline=0,
176
+ value=100,
177
+ command=self.set_result_size,
178
+ )
179
+ rescntmenu.invoke(1)
180
+ editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
181
+
182
+ menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
183
+ self.top.config(menu=menubar)
184
+
185
+ def set_result_size(self, **kwargs):
186
+ self.model.result_count = self._result_size.get()
187
+
188
+ def _init_results_box(self, parent):
189
+ innerframe = Frame(parent)
190
+ i1 = Frame(innerframe)
191
+ i2 = Frame(innerframe)
192
+ vscrollbar = Scrollbar(i1, borderwidth=1)
193
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
194
+ self.results_box = Text(
195
+ i1,
196
+ font=Font(family="courier", size="16"),
197
+ state="disabled",
198
+ borderwidth=1,
199
+ yscrollcommand=vscrollbar.set,
200
+ xscrollcommand=hscrollbar.set,
201
+ wrap="none",
202
+ width="40",
203
+ height="20",
204
+ exportselection=1,
205
+ )
206
+ self.results_box.pack(side="left", fill="both", expand=True)
207
+ vscrollbar.pack(side="left", fill="y", anchor="e")
208
+ vscrollbar.config(command=self.results_box.yview)
209
+ hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
210
+ hscrollbar.config(command=self.results_box.xview)
211
+ # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
212
+ Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
213
+ side="left", anchor="e"
214
+ )
215
+ i1.pack(side="top", fill="both", expand=True, anchor="n")
216
+ i2.pack(side="bottom", fill="x", anchor="s")
217
+ innerframe.pack(side="top", fill="both", expand=True)
218
+
219
+ def _init_paging(self, parent):
220
+ innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
221
+ self.prev = prev = Button(
222
+ innerframe,
223
+ text="Previous",
224
+ command=self.previous,
225
+ width="10",
226
+ borderwidth=1,
227
+ highlightthickness=1,
228
+ state="disabled",
229
+ )
230
+ prev.pack(side="left", anchor="center")
231
+ self.next = next = Button(
232
+ innerframe,
233
+ text="Next",
234
+ command=self.__next__,
235
+ width="10",
236
+ borderwidth=1,
237
+ highlightthickness=1,
238
+ state="disabled",
239
+ )
240
+ next.pack(side="right", anchor="center")
241
+ innerframe.pack(side="top", fill="y")
242
+ self.reset_current_page()
243
+
244
+ def reset_current_page(self):
245
+ self.current_page = -1
246
+
247
+ def _poll(self):
248
+ try:
249
+ event = self.queue.get(block=False)
250
+ except q.Empty:
251
+ pass
252
+ else:
253
+ if event == CORPUS_LOADED_EVENT:
254
+ self.handle_corpus_loaded(event)
255
+ elif event == ERROR_LOADING_CORPUS_EVENT:
256
+ self.handle_error_loading_corpus(event)
257
+ self.after = self.top.after(POLL_INTERVAL, self._poll)
258
+
259
+ def handle_error_loading_corpus(self, event):
260
+ self.status["text"] = "Error in loading " + self.var.get()
261
+ self.unfreeze_editable()
262
+ self.clear_results_box()
263
+ self.freeze_editable()
264
+ self.reset_current_page()
265
+
266
+ def handle_corpus_loaded(self, event):
267
+ self.status["text"] = self.var.get() + " is loaded"
268
+ self.unfreeze_editable()
269
+ self.clear_results_box()
270
+ self.reset_current_page()
271
+ # self.next()
272
+ collocations = self.model.next(self.current_page + 1)
273
+ self.write_results(collocations)
274
+ self.current_page += 1
275
+
276
+ def corpus_selected(self, *args):
277
+ new_selection = self.var.get()
278
+ self.load_corpus(new_selection)
279
+
280
+ def previous(self):
281
+ self.freeze_editable()
282
+ collocations = self.model.prev(self.current_page - 1)
283
+ self.current_page = self.current_page - 1
284
+ self.clear_results_box()
285
+ self.write_results(collocations)
286
+ self.unfreeze_editable()
287
+
288
+ def __next__(self):
289
+ self.freeze_editable()
290
+ collocations = self.model.next(self.current_page + 1)
291
+ self.clear_results_box()
292
+ self.write_results(collocations)
293
+ self.current_page += 1
294
+ self.unfreeze_editable()
295
+
296
+ def load_corpus(self, selection):
297
+ if self.model.selected_corpus != selection:
298
+ self.status["text"] = "Loading " + selection + "..."
299
+ self.freeze_editable()
300
+ self.model.load_corpus(selection)
301
+
302
+ def freeze_editable(self):
303
+ self.prev["state"] = "disabled"
304
+ self.next["state"] = "disabled"
305
+
306
+ def clear_results_box(self):
307
+ self.results_box["state"] = "normal"
308
+ self.results_box.delete("1.0", END)
309
+ self.results_box["state"] = "disabled"
310
+
311
+ def fire_event(self, event):
312
+ # Firing an event so that rendering of widgets happen in the mainloop thread
313
+ self.top.event_generate(event, when="tail")
314
+
315
+ def destroy(self, *e):
316
+ if self.top is None:
317
+ return
318
+ self.top.after_cancel(self.after)
319
+ self.top.destroy()
320
+ self.top = None
321
+
322
+ def mainloop(self, *args, **kwargs):
323
+ if in_idle():
324
+ return
325
+ self.top.mainloop(*args, **kwargs)
326
+
327
+ def unfreeze_editable(self):
328
+ self.set_paging_button_states()
329
+
330
+ def set_paging_button_states(self):
331
+ if self.current_page == -1 or self.current_page == 0:
332
+ self.prev["state"] = "disabled"
333
+ else:
334
+ self.prev["state"] = "normal"
335
+ if self.model.is_last_page(self.current_page):
336
+ self.next["state"] = "disabled"
337
+ else:
338
+ self.next["state"] = "normal"
339
+
340
+ def write_results(self, results):
341
+ self.results_box["state"] = "normal"
342
+ row = 1
343
+ for each in results:
344
+ self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
345
+ row += 1
346
+ self.results_box["state"] = "disabled"
347
+
348
+
349
+ class CollocationsModel:
350
+ def __init__(self, queue):
351
+ self.result_count = None
352
+ self.selected_corpus = None
353
+ self.collocations = None
354
+ self.CORPORA = _CORPORA
355
+ self.DEFAULT_CORPUS = _DEFAULT
356
+ self.queue = queue
357
+ self.reset_results()
358
+
359
+ def reset_results(self):
360
+ self.result_pages = []
361
+ self.results_returned = 0
362
+
363
+ def load_corpus(self, name):
364
+ self.selected_corpus = name
365
+ self.collocations = None
366
+ runner_thread = self.LoadCorpus(name, self)
367
+ runner_thread.start()
368
+ self.reset_results()
369
+
370
+ def non_default_corpora(self):
371
+ copy = []
372
+ copy.extend(list(self.CORPORA.keys()))
373
+ copy.remove(self.DEFAULT_CORPUS)
374
+ copy.sort()
375
+ return copy
376
+
377
+ def is_last_page(self, number):
378
+ if number < len(self.result_pages):
379
+ return False
380
+ return self.results_returned + (
381
+ number - len(self.result_pages)
382
+ ) * self.result_count >= len(self.collocations)
383
+
384
+ def next(self, page):
385
+ if (len(self.result_pages) - 1) < page:
386
+ for i in range(page - (len(self.result_pages) - 1)):
387
+ self.result_pages.append(
388
+ self.collocations[
389
+ self.results_returned : self.results_returned
390
+ + self.result_count
391
+ ]
392
+ )
393
+ self.results_returned += self.result_count
394
+ return self.result_pages[page]
395
+
396
+ def prev(self, page):
397
+ if page == -1:
398
+ return []
399
+ return self.result_pages[page]
400
+
401
+ class LoadCorpus(threading.Thread):
402
+ def __init__(self, name, model):
403
+ threading.Thread.__init__(self)
404
+ self.model, self.name = model, name
405
+
406
+ def run(self):
407
+ try:
408
+ words = self.model.CORPORA[self.name]()
409
+ from operator import itemgetter
410
+
411
+ text = [w for w in words if len(w) > 2]
412
+ fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
413
+ vocab = FreqDist(text)
414
+ scored = [
415
+ ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
416
+ for w1, w2 in fd
417
+ ]
418
+ scored.sort(key=itemgetter(1), reverse=True)
419
+ self.model.collocations = list(map(itemgetter(0), scored))
420
+ self.model.queue.put(CORPUS_LOADED_EVENT)
421
+ except Exception as e:
422
+ print(e)
423
+ self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
424
+
425
+
426
+ # def collocations():
427
+ # colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
428
+
429
+
430
+ def app():
431
+ c = CollocationsView()
432
+ c.mainloop()
433
+
434
+
435
+ if __name__ == "__main__":
436
+ app()
437
+
438
+ __all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/concordance_app.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Concordance Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ import queue as q
9
+ import re
10
+ import threading
11
+ from tkinter import (
12
+ END,
13
+ LEFT,
14
+ SUNKEN,
15
+ Button,
16
+ Entry,
17
+ Frame,
18
+ IntVar,
19
+ Label,
20
+ Menu,
21
+ OptionMenu,
22
+ Scrollbar,
23
+ StringVar,
24
+ Text,
25
+ Tk,
26
+ )
27
+ from tkinter.font import Font
28
+
29
+ from nltk.corpus import (
30
+ alpino,
31
+ brown,
32
+ cess_cat,
33
+ cess_esp,
34
+ floresta,
35
+ indian,
36
+ mac_morpho,
37
+ nps_chat,
38
+ sinica_treebank,
39
+ treebank,
40
+ )
41
+ from nltk.draw.util import ShowText
42
+ from nltk.util import in_idle
43
+
44
+ WORD_OR_TAG = "[^/ ]+"
45
+ BOUNDARY = r"\b"
46
+
47
+ CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
48
+ SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
49
+ SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
50
+ ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
51
+
52
+ POLL_INTERVAL = 50
53
+
54
+ # NB All corpora must be specified in a lambda expression so as not to be
55
+ # loaded when the module is imported.
56
+
57
+ _DEFAULT = "English: Brown Corpus (Humor, simplified)"
58
+ _CORPORA = {
59
+ "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
60
+ tagset="universal"
61
+ ),
62
+ "English: Brown Corpus": lambda: brown.tagged_sents(),
63
+ "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
64
+ tagset="universal"
65
+ ),
66
+ "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
67
+ categories=["news", "editorial", "reviews"], tagset="universal"
68
+ ),
69
+ "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
70
+ categories="religion", tagset="universal"
71
+ ),
72
+ "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
73
+ categories="learned", tagset="universal"
74
+ ),
75
+ "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
76
+ categories="science_fiction", tagset="universal"
77
+ ),
78
+ "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
79
+ categories="romance", tagset="universal"
80
+ ),
81
+ "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
82
+ categories="humor", tagset="universal"
83
+ ),
84
+ "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
85
+ "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
86
+ tagset="universal"
87
+ ),
88
+ "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
89
+ "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
90
+ tagset="universal"
91
+ ),
92
+ "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
93
+ "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
94
+ tagset="universal"
95
+ ),
96
+ "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
97
+ "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
98
+ tagset="universal"
99
+ ),
100
+ "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
101
+ "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
102
+ files="hindi.pos", tagset="universal"
103
+ ),
104
+ "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
105
+ "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
106
+ tagset="universal"
107
+ ),
108
+ "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
109
+ "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
110
+ tagset="universal"
111
+ ),
112
+ "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
113
+ tagset="universal"
114
+ ),
115
+ }
116
+
117
+
118
+ class ConcordanceSearchView:
119
+ _BACKGROUND_COLOUR = "#FFF" # white
120
+
121
+ # Colour of highlighted results
122
+ _HIGHLIGHT_WORD_COLOUR = "#F00" # red
123
+ _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
124
+
125
+ _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
126
+ _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
127
+
128
+ # Percentage of text left of the scrollbar position
129
+ _FRACTION_LEFT_TEXT = 0.30
130
+
131
+ def __init__(self):
132
+ self.queue = q.Queue()
133
+ self.model = ConcordanceSearchModel(self.queue)
134
+ self.top = Tk()
135
+ self._init_top(self.top)
136
+ self._init_menubar()
137
+ self._init_widgets(self.top)
138
+ self.load_corpus(self.model.DEFAULT_CORPUS)
139
+ self.after = self.top.after(POLL_INTERVAL, self._poll)
140
+
141
+ def _init_top(self, top):
142
+ top.geometry("950x680+50+50")
143
+ top.title("NLTK Concordance Search")
144
+ top.bind("<Control-q>", self.destroy)
145
+ top.protocol("WM_DELETE_WINDOW", self.destroy)
146
+ top.minsize(950, 680)
147
+
148
+ def _init_widgets(self, parent):
149
+ self.main_frame = Frame(
150
+ parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
151
+ )
152
+ self._init_corpus_select(self.main_frame)
153
+ self._init_query_box(self.main_frame)
154
+ self._init_results_box(self.main_frame)
155
+ self._init_paging(self.main_frame)
156
+ self._init_status(self.main_frame)
157
+ self.main_frame.pack(fill="both", expand=True)
158
+
159
+ def _init_menubar(self):
160
+ self._result_size = IntVar(self.top)
161
+ self._cntx_bf_len = IntVar(self.top)
162
+ self._cntx_af_len = IntVar(self.top)
163
+ menubar = Menu(self.top)
164
+
165
+ filemenu = Menu(menubar, tearoff=0, borderwidth=0)
166
+ filemenu.add_command(
167
+ label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
168
+ )
169
+ menubar.add_cascade(label="File", underline=0, menu=filemenu)
170
+
171
+ editmenu = Menu(menubar, tearoff=0)
172
+ rescntmenu = Menu(editmenu, tearoff=0)
173
+ rescntmenu.add_radiobutton(
174
+ label="20",
175
+ variable=self._result_size,
176
+ underline=0,
177
+ value=20,
178
+ command=self.set_result_size,
179
+ )
180
+ rescntmenu.add_radiobutton(
181
+ label="50",
182
+ variable=self._result_size,
183
+ underline=0,
184
+ value=50,
185
+ command=self.set_result_size,
186
+ )
187
+ rescntmenu.add_radiobutton(
188
+ label="100",
189
+ variable=self._result_size,
190
+ underline=0,
191
+ value=100,
192
+ command=self.set_result_size,
193
+ )
194
+ rescntmenu.invoke(1)
195
+ editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
196
+
197
+ cntxmenu = Menu(editmenu, tearoff=0)
198
+ cntxbfmenu = Menu(cntxmenu, tearoff=0)
199
+ cntxbfmenu.add_radiobutton(
200
+ label="60 characters",
201
+ variable=self._cntx_bf_len,
202
+ underline=0,
203
+ value=60,
204
+ command=self.set_cntx_bf_len,
205
+ )
206
+ cntxbfmenu.add_radiobutton(
207
+ label="80 characters",
208
+ variable=self._cntx_bf_len,
209
+ underline=0,
210
+ value=80,
211
+ command=self.set_cntx_bf_len,
212
+ )
213
+ cntxbfmenu.add_radiobutton(
214
+ label="100 characters",
215
+ variable=self._cntx_bf_len,
216
+ underline=0,
217
+ value=100,
218
+ command=self.set_cntx_bf_len,
219
+ )
220
+ cntxbfmenu.invoke(1)
221
+ cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
222
+
223
+ cntxafmenu = Menu(cntxmenu, tearoff=0)
224
+ cntxafmenu.add_radiobutton(
225
+ label="70 characters",
226
+ variable=self._cntx_af_len,
227
+ underline=0,
228
+ value=70,
229
+ command=self.set_cntx_af_len,
230
+ )
231
+ cntxafmenu.add_radiobutton(
232
+ label="90 characters",
233
+ variable=self._cntx_af_len,
234
+ underline=0,
235
+ value=90,
236
+ command=self.set_cntx_af_len,
237
+ )
238
+ cntxafmenu.add_radiobutton(
239
+ label="110 characters",
240
+ variable=self._cntx_af_len,
241
+ underline=0,
242
+ value=110,
243
+ command=self.set_cntx_af_len,
244
+ )
245
+ cntxafmenu.invoke(1)
246
+ cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
247
+
248
+ editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
249
+
250
+ menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
251
+
252
+ self.top.config(menu=menubar)
253
+
254
+ def set_result_size(self, **kwargs):
255
+ self.model.result_count = self._result_size.get()
256
+
257
+ def set_cntx_af_len(self, **kwargs):
258
+ self._char_after = self._cntx_af_len.get()
259
+
260
+ def set_cntx_bf_len(self, **kwargs):
261
+ self._char_before = self._cntx_bf_len.get()
262
+
263
+ def _init_corpus_select(self, parent):
264
+ innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
265
+ self.var = StringVar(innerframe)
266
+ self.var.set(self.model.DEFAULT_CORPUS)
267
+ Label(
268
+ innerframe,
269
+ justify=LEFT,
270
+ text=" Corpus: ",
271
+ background=self._BACKGROUND_COLOUR,
272
+ padx=2,
273
+ pady=1,
274
+ border=0,
275
+ ).pack(side="left")
276
+
277
+ other_corpora = list(self.model.CORPORA.keys()).remove(
278
+ self.model.DEFAULT_CORPUS
279
+ )
280
+ om = OptionMenu(
281
+ innerframe,
282
+ self.var,
283
+ self.model.DEFAULT_CORPUS,
284
+ command=self.corpus_selected,
285
+ *self.model.non_default_corpora()
286
+ )
287
+ om["borderwidth"] = 0
288
+ om["highlightthickness"] = 1
289
+ om.pack(side="left")
290
+ innerframe.pack(side="top", fill="x", anchor="n")
291
+
292
+ def _init_status(self, parent):
293
+ self.status = Label(
294
+ parent,
295
+ justify=LEFT,
296
+ relief=SUNKEN,
297
+ background=self._BACKGROUND_COLOUR,
298
+ border=0,
299
+ padx=1,
300
+ pady=0,
301
+ )
302
+ self.status.pack(side="top", anchor="sw")
303
+
304
+ def _init_query_box(self, parent):
305
+ innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
306
+ another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
307
+ self.query_box = Entry(another, width=60)
308
+ self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
309
+ self.search_button = Button(
310
+ another,
311
+ text="Search",
312
+ command=self.search,
313
+ borderwidth=1,
314
+ highlightthickness=1,
315
+ )
316
+ self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
317
+ self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
318
+ another.pack()
319
+ innerframe.pack(side="top", fill="x", anchor="n")
320
+
321
+ def search_enter_keypress_handler(self, *event):
322
+ self.search()
323
+
324
+ def _init_results_box(self, parent):
325
+ innerframe = Frame(parent)
326
+ i1 = Frame(innerframe)
327
+ i2 = Frame(innerframe)
328
+ vscrollbar = Scrollbar(i1, borderwidth=1)
329
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
330
+ self.results_box = Text(
331
+ i1,
332
+ font=Font(family="courier", size="16"),
333
+ state="disabled",
334
+ borderwidth=1,
335
+ yscrollcommand=vscrollbar.set,
336
+ xscrollcommand=hscrollbar.set,
337
+ wrap="none",
338
+ width="40",
339
+ height="20",
340
+ exportselection=1,
341
+ )
342
+ self.results_box.pack(side="left", fill="both", expand=True)
343
+ self.results_box.tag_config(
344
+ self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
345
+ )
346
+ self.results_box.tag_config(
347
+ self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
348
+ )
349
+ vscrollbar.pack(side="left", fill="y", anchor="e")
350
+ vscrollbar.config(command=self.results_box.yview)
351
+ hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
352
+ hscrollbar.config(command=self.results_box.xview)
353
+ # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
354
+ Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
355
+ side="left", anchor="e"
356
+ )
357
+ i1.pack(side="top", fill="both", expand=True, anchor="n")
358
+ i2.pack(side="bottom", fill="x", anchor="s")
359
+ innerframe.pack(side="top", fill="both", expand=True)
360
+
361
+ def _init_paging(self, parent):
362
+ innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
363
+ self.prev = prev = Button(
364
+ innerframe,
365
+ text="Previous",
366
+ command=self.previous,
367
+ width="10",
368
+ borderwidth=1,
369
+ highlightthickness=1,
370
+ state="disabled",
371
+ )
372
+ prev.pack(side="left", anchor="center")
373
+ self.next = next = Button(
374
+ innerframe,
375
+ text="Next",
376
+ command=self.__next__,
377
+ width="10",
378
+ borderwidth=1,
379
+ highlightthickness=1,
380
+ state="disabled",
381
+ )
382
+ next.pack(side="right", anchor="center")
383
+ innerframe.pack(side="top", fill="y")
384
+ self.current_page = 0
385
+
386
+ def previous(self):
387
+ self.clear_results_box()
388
+ self.freeze_editable()
389
+ self.model.prev(self.current_page - 1)
390
+
391
+ def __next__(self):
392
+ self.clear_results_box()
393
+ self.freeze_editable()
394
+ self.model.next(self.current_page + 1)
395
+
396
+ def about(self, *e):
397
+ ABOUT = "NLTK Concordance Search Demo\n"
398
+ TITLE = "About: NLTK Concordance Search Demo"
399
+ try:
400
+ from tkinter.messagebox import Message
401
+
402
+ Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
403
+ except:
404
+ ShowText(self.top, TITLE, ABOUT)
405
+
406
+ def _bind_event_handlers(self):
407
+ self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
408
+ self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
409
+ self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
410
+ self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
411
+
412
+ def _poll(self):
413
+ try:
414
+ event = self.queue.get(block=False)
415
+ except q.Empty:
416
+ pass
417
+ else:
418
+ if event == CORPUS_LOADED_EVENT:
419
+ self.handle_corpus_loaded(event)
420
+ elif event == SEARCH_TERMINATED_EVENT:
421
+ self.handle_search_terminated(event)
422
+ elif event == SEARCH_ERROR_EVENT:
423
+ self.handle_search_error(event)
424
+ elif event == ERROR_LOADING_CORPUS_EVENT:
425
+ self.handle_error_loading_corpus(event)
426
+ self.after = self.top.after(POLL_INTERVAL, self._poll)
427
+
428
+ def handle_error_loading_corpus(self, event):
429
+ self.status["text"] = "Error in loading " + self.var.get()
430
+ self.unfreeze_editable()
431
+ self.clear_all()
432
+ self.freeze_editable()
433
+
434
+ def handle_corpus_loaded(self, event):
435
+ self.status["text"] = self.var.get() + " is loaded"
436
+ self.unfreeze_editable()
437
+ self.clear_all()
438
+ self.query_box.focus_set()
439
+
440
+ def handle_search_terminated(self, event):
441
+ # todo: refactor the model such that it is less state sensitive
442
+ results = self.model.get_results()
443
+ self.write_results(results)
444
+ self.status["text"] = ""
445
+ if len(results) == 0:
446
+ self.status["text"] = "No results found for " + self.model.query
447
+ else:
448
+ self.current_page = self.model.last_requested_page
449
+ self.unfreeze_editable()
450
+ self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
451
+
452
+ def handle_search_error(self, event):
453
+ self.status["text"] = "Error in query " + self.model.query
454
+ self.unfreeze_editable()
455
+
456
+ def corpus_selected(self, *args):
457
+ new_selection = self.var.get()
458
+ self.load_corpus(new_selection)
459
+
460
+ def load_corpus(self, selection):
461
+ if self.model.selected_corpus != selection:
462
+ self.status["text"] = "Loading " + selection + "..."
463
+ self.freeze_editable()
464
+ self.model.load_corpus(selection)
465
+
466
+ def search(self):
467
+ self.current_page = 0
468
+ self.clear_results_box()
469
+ self.model.reset_results()
470
+ query = self.query_box.get()
471
+ if len(query.strip()) == 0:
472
+ return
473
+ self.status["text"] = "Searching for " + query
474
+ self.freeze_editable()
475
+ self.model.search(query, self.current_page + 1)
476
+
477
+ def write_results(self, results):
478
+ self.results_box["state"] = "normal"
479
+ row = 1
480
+ for each in results:
481
+ sent, pos1, pos2 = each[0].strip(), each[1], each[2]
482
+ if len(sent) != 0:
483
+ if pos1 < self._char_before:
484
+ sent, pos1, pos2 = self.pad(sent, pos1, pos2)
485
+ sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
486
+ if not row == len(results):
487
+ sentence += "\n"
488
+ self.results_box.insert(str(row) + ".0", sentence)
489
+ word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
490
+ for marker in word_markers:
491
+ self.results_box.tag_add(
492
+ self._HIGHLIGHT_WORD_TAG,
493
+ str(row) + "." + str(marker[0]),
494
+ str(row) + "." + str(marker[1]),
495
+ )
496
+ for marker in label_markers:
497
+ self.results_box.tag_add(
498
+ self._HIGHLIGHT_LABEL_TAG,
499
+ str(row) + "." + str(marker[0]),
500
+ str(row) + "." + str(marker[1]),
501
+ )
502
+ row += 1
503
+ self.results_box["state"] = "disabled"
504
+
505
+ def words_and_labels(self, sentence, pos1, pos2):
506
+ search_exp = sentence[pos1:pos2]
507
+ words, labels = [], []
508
+ labeled_words = search_exp.split(" ")
509
+ index = 0
510
+ for each in labeled_words:
511
+ if each == "":
512
+ index += 1
513
+ else:
514
+ word, label = each.split("/")
515
+ words.append(
516
+ (self._char_before + index, self._char_before + index + len(word))
517
+ )
518
+ index += len(word) + 1
519
+ labels.append(
520
+ (self._char_before + index, self._char_before + index + len(label))
521
+ )
522
+ index += len(label)
523
+ index += 1
524
+ return words, labels
525
+
526
+ def pad(self, sent, hstart, hend):
527
+ if hstart >= self._char_before:
528
+ return sent, hstart, hend
529
+ d = self._char_before - hstart
530
+ sent = "".join([" "] * d) + sent
531
+ return sent, hstart + d, hend + d
532
+
533
+ def destroy(self, *e):
534
+ if self.top is None:
535
+ return
536
+ self.top.after_cancel(self.after)
537
+ self.top.destroy()
538
+ self.top = None
539
+
540
+ def clear_all(self):
541
+ self.query_box.delete(0, END)
542
+ self.model.reset_query()
543
+ self.clear_results_box()
544
+
545
+ def clear_results_box(self):
546
+ self.results_box["state"] = "normal"
547
+ self.results_box.delete("1.0", END)
548
+ self.results_box["state"] = "disabled"
549
+
550
+ def freeze_editable(self):
551
+ self.query_box["state"] = "disabled"
552
+ self.search_button["state"] = "disabled"
553
+ self.prev["state"] = "disabled"
554
+ self.next["state"] = "disabled"
555
+
556
+ def unfreeze_editable(self):
557
+ self.query_box["state"] = "normal"
558
+ self.search_button["state"] = "normal"
559
+ self.set_paging_button_states()
560
+
561
+ def set_paging_button_states(self):
562
+ if self.current_page == 0 or self.current_page == 1:
563
+ self.prev["state"] = "disabled"
564
+ else:
565
+ self.prev["state"] = "normal"
566
+ if self.model.has_more_pages(self.current_page):
567
+ self.next["state"] = "normal"
568
+ else:
569
+ self.next["state"] = "disabled"
570
+
571
+ def fire_event(self, event):
572
+ # Firing an event so that rendering of widgets happen in the mainloop thread
573
+ self.top.event_generate(event, when="tail")
574
+
575
+ def mainloop(self, *args, **kwargs):
576
+ if in_idle():
577
+ return
578
+ self.top.mainloop(*args, **kwargs)
579
+
580
+
581
+ class ConcordanceSearchModel:
582
+ def __init__(self, queue):
583
+ self.queue = queue
584
+ self.CORPORA = _CORPORA
585
+ self.DEFAULT_CORPUS = _DEFAULT
586
+ self.selected_corpus = None
587
+ self.reset_query()
588
+ self.reset_results()
589
+ self.result_count = None
590
+ self.last_sent_searched = 0
591
+
592
+ def non_default_corpora(self):
593
+ copy = []
594
+ copy.extend(list(self.CORPORA.keys()))
595
+ copy.remove(self.DEFAULT_CORPUS)
596
+ copy.sort()
597
+ return copy
598
+
599
+ def load_corpus(self, name):
600
+ self.selected_corpus = name
601
+ self.tagged_sents = []
602
+ runner_thread = self.LoadCorpus(name, self)
603
+ runner_thread.start()
604
+
605
+ def search(self, query, page):
606
+ self.query = query
607
+ self.last_requested_page = page
608
+ self.SearchCorpus(self, page, self.result_count).start()
609
+
610
+ def next(self, page):
611
+ self.last_requested_page = page
612
+ if len(self.results) < page:
613
+ self.search(self.query, page)
614
+ else:
615
+ self.queue.put(SEARCH_TERMINATED_EVENT)
616
+
617
+ def prev(self, page):
618
+ self.last_requested_page = page
619
+ self.queue.put(SEARCH_TERMINATED_EVENT)
620
+
621
+ def reset_results(self):
622
+ self.last_sent_searched = 0
623
+ self.results = []
624
+ self.last_page = None
625
+
626
+ def reset_query(self):
627
+ self.query = None
628
+
629
+ def set_results(self, page, resultset):
630
+ self.results.insert(page - 1, resultset)
631
+
632
+ def get_results(self):
633
+ return self.results[self.last_requested_page - 1]
634
+
635
+ def has_more_pages(self, page):
636
+ if self.results == [] or self.results[0] == []:
637
+ return False
638
+ if self.last_page is None:
639
+ return True
640
+ return page < self.last_page
641
+
642
+ class LoadCorpus(threading.Thread):
643
+ def __init__(self, name, model):
644
+ threading.Thread.__init__(self)
645
+ self.model, self.name = model, name
646
+
647
+ def run(self):
648
+ try:
649
+ ts = self.model.CORPORA[self.name]()
650
+ self.model.tagged_sents = [
651
+ " ".join(w + "/" + t for (w, t) in sent) for sent in ts
652
+ ]
653
+ self.model.queue.put(CORPUS_LOADED_EVENT)
654
+ except Exception as e:
655
+ print(e)
656
+ self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
657
+
658
+ class SearchCorpus(threading.Thread):
659
+ def __init__(self, model, page, count):
660
+ self.model, self.count, self.page = model, count, page
661
+ threading.Thread.__init__(self)
662
+
663
+ def run(self):
664
+ q = self.processed_query()
665
+ sent_pos, i, sent_count = [], 0, 0
666
+ for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
667
+ try:
668
+ m = re.search(q, sent)
669
+ except re.error:
670
+ self.model.reset_results()
671
+ self.model.queue.put(SEARCH_ERROR_EVENT)
672
+ return
673
+ if m:
674
+ sent_pos.append((sent, m.start(), m.end()))
675
+ i += 1
676
+ if i > self.count:
677
+ self.model.last_sent_searched += sent_count - 1
678
+ break
679
+ sent_count += 1
680
+ if self.count >= len(sent_pos):
681
+ self.model.last_sent_searched += sent_count - 1
682
+ self.model.last_page = self.page
683
+ self.model.set_results(self.page, sent_pos)
684
+ else:
685
+ self.model.set_results(self.page, sent_pos[:-1])
686
+ self.model.queue.put(SEARCH_TERMINATED_EVENT)
687
+
688
+ def processed_query(self):
689
+ new = []
690
+ for term in self.model.query.split():
691
+ term = re.sub(r"\.", r"[^/ ]", term)
692
+ if re.match("[A-Z]+$", term):
693
+ new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
694
+ elif "/" in term:
695
+ new.append(BOUNDARY + term + BOUNDARY)
696
+ else:
697
+ new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
698
+ return " ".join(new)
699
+
700
+
701
+ def app():
702
+ d = ConcordanceSearchView()
703
+ d.mainloop()
704
+
705
+
706
+ if __name__ == "__main__":
707
+ app()
708
+
709
+ __all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
2
+ # https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
3
+
4
+ """
5
+ Finding (and Replacing) Nemo
6
+
7
+ Instant Regular Expressions
8
+ Created by Aristide Grange
9
+ """
10
+ import itertools
11
+ import re
12
+ from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk
13
+
14
+ windowTitle = "Finding (and Replacing) Nemo"
15
+ initialFind = r"n(.*?)e(.*?)m(.*?)o"
16
+ initialRepl = r"M\1A\2K\3I"
17
+ initialText = """\
18
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
19
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
20
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
21
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
22
+ """
23
+ images = {
24
+ "FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg
6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
25
+ "find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
26
+ "REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TD
St5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
27
+ "repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
28
+ }
29
+ colors = ["#FF7B39", "#80F121"]
30
+ emphColors = ["#DAFC33", "#F42548"]
31
+ fieldParams = {
32
+ "height": 3,
33
+ "width": 70,
34
+ "font": ("monaco", 14),
35
+ "highlightthickness": 0,
36
+ "borderwidth": 0,
37
+ "background": "white",
38
+ }
39
+ textParams = {
40
+ "bg": "#F7E0D4",
41
+ "fg": "#2321F1",
42
+ "highlightthickness": 0,
43
+ "width": 1,
44
+ "height": 10,
45
+ "font": ("verdana", 16),
46
+ "wrap": "word",
47
+ }
48
+
49
+
50
+ class Zone:
51
+ def __init__(self, image, initialField, initialText):
52
+ frm = Frame(root)
53
+ frm.config(background="white")
54
+ self.image = PhotoImage(format="gif", data=images[image.upper()])
55
+ self.imageDimmed = PhotoImage(format="gif", data=images[image])
56
+ self.img = Label(frm)
57
+ self.img.config(borderwidth=0)
58
+ self.img.pack(side="left")
59
+ self.fld = Text(frm, **fieldParams)
60
+ self.initScrollText(frm, self.fld, initialField)
61
+ frm = Frame(root)
62
+ self.txt = Text(frm, **textParams)
63
+ self.initScrollText(frm, self.txt, initialText)
64
+ for i in range(2):
65
+ self.txt.tag_config(colors[i], background=colors[i])
66
+ self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])
67
+
68
+ def initScrollText(self, frm, txt, contents):
69
+ scl = Scrollbar(frm)
70
+ scl.config(command=txt.yview)
71
+ scl.pack(side="right", fill="y")
72
+ txt.pack(side="left", expand=True, fill="x")
73
+ txt.config(yscrollcommand=scl.set)
74
+ txt.insert("1.0", contents)
75
+ frm.pack(fill="x")
76
+ Frame(height=2, bd=1, relief="ridge").pack(fill="x")
77
+
78
+ def refresh(self):
79
+ self.colorCycle = itertools.cycle(colors)
80
+ try:
81
+ self.substitute()
82
+ self.img.config(image=self.image)
83
+ except re.error:
84
+ self.img.config(image=self.imageDimmed)
85
+
86
+
87
+ class FindZone(Zone):
88
+ def addTags(self, m):
89
+ color = next(self.colorCycle)
90
+ self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
91
+ try:
92
+ self.txt.tag_add(
93
+ "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
94
+ )
95
+ except:
96
+ pass
97
+
98
+ def substitute(self, *args):
99
+ for color in colors:
100
+ self.txt.tag_remove(color, "1.0", "end")
101
+ self.txt.tag_remove("emph" + color, "1.0", "end")
102
+ self.rex = re.compile("") # default value in case of malformed regexp
103
+ self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
104
+ try:
105
+ re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
106
+ self.rexSel = re.compile(
107
+ "%s(?P<emph>%s)%s"
108
+ % (
109
+ self.fld.get("1.0", SEL_FIRST),
110
+ self.fld.get(SEL_FIRST, SEL_LAST),
111
+ self.fld.get(SEL_LAST, "end")[:-1],
112
+ ),
113
+ re.MULTILINE,
114
+ )
115
+ except:
116
+ self.rexSel = self.rex
117
+ self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
118
+
119
+
120
+ class ReplaceZone(Zone):
121
+ def addTags(self, m):
122
+ s = sz.rex.sub(self.repl, m.group())
123
+ self.txt.delete(
124
+ "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
125
+ )
126
+ self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
127
+ self.diff += len(s) - (m.end() - m.start())
128
+
129
+ def substitute(self):
130
+ self.txt.delete("1.0", "end")
131
+ self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
132
+ self.diff = 0
133
+ self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
134
+ sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
135
+
136
+
137
+ def launchRefresh(_):
138
+ sz.fld.after_idle(sz.refresh)
139
+ rz.fld.after_idle(rz.refresh)
140
+
141
+
142
+ def app():
143
+ global root, sz, rz, rex0
144
+ root = Tk()
145
+ root.resizable(height=False, width=True)
146
+ root.title(windowTitle)
147
+ root.minsize(width=250, height=0)
148
+ sz = FindZone("find", initialFind, initialText)
149
+ sz.fld.bind("<Button-1>", launchRefresh)
150
+ sz.fld.bind("<ButtonRelease-1>", launchRefresh)
151
+ sz.fld.bind("<B1-Motion>", launchRefresh)
152
+ sz.rexSel = re.compile("")
153
+ rz = ReplaceZone("repl", initialRepl, "")
154
+ rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
155
+ root.bind_all("<Key>", launchRefresh)
156
+ launchRefresh(None)
157
+ root.mainloop()
158
+
159
+
160
+ if __name__ == "__main__":
161
+ app()
162
+
163
+ __all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py ADDED
@@ -0,0 +1,1052 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Recursive Descent Parser Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A graphical tool for exploring the recursive descent parser.
10
+
11
+ The recursive descent parser maintains a tree, which records the
12
+ structure of the portion of the text that has been parsed. It uses
13
+ CFG productions to expand the fringe of the tree, and matches its
14
+ leaves against the text. Initially, the tree contains the start
15
+ symbol ("S"). It is shown in the main canvas, to the right of the
16
+ list of available expansions.
17
+
18
+ The parser builds up a tree structure for the text using three
19
+ operations:
20
+
21
+ - "expand" uses a CFG production to add children to a node on the
22
+ fringe of the tree.
23
+ - "match" compares a leaf in the tree to a text token.
24
+ - "backtrack" returns the tree to its state before the most recent
25
+ expand or match operation.
26
+
27
+ The parser maintains a list of tree locations called a "frontier" to
28
+ remember which nodes have not yet been expanded and which leaves have
29
+ not yet been matched against the text. The leftmost frontier node is
30
+ shown in green, and the other frontier nodes are shown in blue. The
31
+ parser always performs expand and match operations on the leftmost
32
+ element of the frontier.
33
+
34
+ You can control the parser's operation by using the "expand," "match,"
35
+ and "backtrack" buttons; or you can use the "step" button to let the
36
+ parser automatically decide which operation to apply. The parser uses
37
+ the following rules to decide which operation to apply:
38
+
39
+ - If the leftmost frontier element is a token, try matching it.
40
+ - If the leftmost frontier element is a node, try expanding it with
41
+ the first untried expansion.
42
+ - Otherwise, backtrack.
43
+
44
+ The "expand" button applies the untried expansion whose CFG production
45
+ is listed earliest in the grammar. To manually choose which expansion
46
+ to apply, click on a CFG production from the list of available
47
+ expansions, on the left side of the main window.
48
+
49
+ The "autostep" button will let the parser continue applying
50
+ applications to the tree until it reaches a complete parse. You can
51
+ cancel an autostep in progress at any time by clicking on the
52
+ "autostep" button again.
53
+
54
+ Keyboard Shortcuts::
55
+ [Space]\t Perform the next expand, match, or backtrack operation
56
+ [a]\t Step through operations until the next complete parse
57
+ [e]\t Perform an expand operation
58
+ [m]\t Perform a match operation
59
+ [b]\t Perform a backtrack operation
60
+ [Delete]\t Reset the parser
61
+ [g]\t Show/hide available expansions list
62
+ [h]\t Help
63
+ [Ctrl-p]\t Print
64
+ [q]\t Quit
65
+ """
66
+
67
+ from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
68
+ from tkinter.font import Font
69
+
70
+ from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
71
+ from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
72
+ from nltk.parse import SteppingRecursiveDescentParser
73
+ from nltk.tree import Tree
74
+ from nltk.util import in_idle
75
+
76
+
77
+ class RecursiveDescentApp:
78
+ """
79
+ A graphical tool for exploring the recursive descent parser. The tool
80
+ displays the parser's tree and the remaining text, and allows the
81
+ user to control the parser's operation. In particular, the user
82
+ can expand subtrees on the frontier, match tokens on the frontier
83
+ against the text, and backtrack. A "step" button simply steps
84
+ through the parsing process, performing the operations that
85
+ ``RecursiveDescentParser`` would use.
86
+ """
87
+
88
+ def __init__(self, grammar, sent, trace=0):
89
+ self._sent = sent
90
+ self._parser = SteppingRecursiveDescentParser(grammar, trace)
91
+
92
+ # Set up the main window.
93
+ self._top = Tk()
94
+ self._top.title("Recursive Descent Parser Application")
95
+
96
+ # Set up key bindings.
97
+ self._init_bindings()
98
+
99
+ # Initialize the fonts.
100
+ self._init_fonts(self._top)
101
+
102
+ # Animations. animating_lock is a lock to prevent the demo
103
+ # from performing new operations while it's animating.
104
+ self._animation_frames = IntVar(self._top)
105
+ self._animation_frames.set(5)
106
+ self._animating_lock = 0
107
+ self._autostep = 0
108
+
109
+ # The user can hide the grammar.
110
+ self._show_grammar = IntVar(self._top)
111
+ self._show_grammar.set(1)
112
+
113
+ # Create the basic frames.
114
+ self._init_menubar(self._top)
115
+ self._init_buttons(self._top)
116
+ self._init_feedback(self._top)
117
+ self._init_grammar(self._top)
118
+ self._init_canvas(self._top)
119
+
120
+ # Initialize the parser.
121
+ self._parser.initialize(self._sent)
122
+
123
+ # Resize callback
124
+ self._canvas.bind("<Configure>", self._configure)
125
+
126
+ #########################################
127
+ ## Initialization Helpers
128
+ #########################################
129
+
130
+ def _init_fonts(self, root):
131
+ # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
132
+ self._sysfont = Font(font=Button()["font"])
133
+ root.option_add("*Font", self._sysfont)
134
+
135
+ # TWhat's our font size (default=same as sysfont)
136
+ self._size = IntVar(root)
137
+ self._size.set(self._sysfont.cget("size"))
138
+
139
+ self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
140
+ self._font = Font(family="helvetica", size=self._size.get())
141
+ if self._size.get() < 0:
142
+ big = self._size.get() - 2
143
+ else:
144
+ big = self._size.get() + 2
145
+ self._bigfont = Font(family="helvetica", weight="bold", size=big)
146
+
147
+ def _init_grammar(self, parent):
148
+ # Grammar view.
149
+ self._prodframe = listframe = Frame(parent)
150
+ self._prodframe.pack(fill="both", side="left", padx=2)
151
+ self._prodlist_label = Label(
152
+ self._prodframe, font=self._boldfont, text="Available Expansions"
153
+ )
154
+ self._prodlist_label.pack()
155
+ self._prodlist = Listbox(
156
+ self._prodframe,
157
+ selectmode="single",
158
+ relief="groove",
159
+ background="white",
160
+ foreground="#909090",
161
+ font=self._font,
162
+ selectforeground="#004040",
163
+ selectbackground="#c0f0c0",
164
+ )
165
+
166
+ self._prodlist.pack(side="right", fill="both", expand=1)
167
+
168
+ self._productions = list(self._parser.grammar().productions())
169
+ for production in self._productions:
170
+ self._prodlist.insert("end", (" %s" % production))
171
+ self._prodlist.config(height=min(len(self._productions), 25))
172
+
173
+ # Add a scrollbar if there are more than 25 productions.
174
+ if len(self._productions) > 25:
175
+ listscroll = Scrollbar(self._prodframe, orient="vertical")
176
+ self._prodlist.config(yscrollcommand=listscroll.set)
177
+ listscroll.config(command=self._prodlist.yview)
178
+ listscroll.pack(side="left", fill="y")
179
+
180
+ # If they select a production, apply it.
181
+ self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
182
+
183
+ def _init_bindings(self):
184
+ # Key bindings are a good thing.
185
+ self._top.bind("<Control-q>", self.destroy)
186
+ self._top.bind("<Control-x>", self.destroy)
187
+ self._top.bind("<Escape>", self.destroy)
188
+ self._top.bind("e", self.expand)
189
+ # self._top.bind('<Alt-e>', self.expand)
190
+ # self._top.bind('<Control-e>', self.expand)
191
+ self._top.bind("m", self.match)
192
+ self._top.bind("<Alt-m>", self.match)
193
+ self._top.bind("<Control-m>", self.match)
194
+ self._top.bind("b", self.backtrack)
195
+ self._top.bind("<Alt-b>", self.backtrack)
196
+ self._top.bind("<Control-b>", self.backtrack)
197
+ self._top.bind("<Control-z>", self.backtrack)
198
+ self._top.bind("<BackSpace>", self.backtrack)
199
+ self._top.bind("a", self.autostep)
200
+ # self._top.bind('<Control-a>', self.autostep)
201
+ self._top.bind("<Control-space>", self.autostep)
202
+ self._top.bind("<Control-c>", self.cancel_autostep)
203
+ self._top.bind("<space>", self.step)
204
+ self._top.bind("<Delete>", self.reset)
205
+ self._top.bind("<Control-p>", self.postscript)
206
+ # self._top.bind('<h>', self.help)
207
+ # self._top.bind('<Alt-h>', self.help)
208
+ self._top.bind("<Control-h>", self.help)
209
+ self._top.bind("<F1>", self.help)
210
+ # self._top.bind('<g>', self.toggle_grammar)
211
+ # self._top.bind('<Alt-g>', self.toggle_grammar)
212
+ # self._top.bind('<Control-g>', self.toggle_grammar)
213
+ self._top.bind("<Control-g>", self.edit_grammar)
214
+ self._top.bind("<Control-t>", self.edit_sentence)
215
+
216
+ def _init_buttons(self, parent):
217
+ # Set up the frames.
218
+ self._buttonframe = buttonframe = Frame(parent)
219
+ buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
220
+ Button(
221
+ buttonframe,
222
+ text="Step",
223
+ background="#90c0d0",
224
+ foreground="black",
225
+ command=self.step,
226
+ ).pack(side="left")
227
+ Button(
228
+ buttonframe,
229
+ text="Autostep",
230
+ background="#90c0d0",
231
+ foreground="black",
232
+ command=self.autostep,
233
+ ).pack(side="left")
234
+ Button(
235
+ buttonframe,
236
+ text="Expand",
237
+ underline=0,
238
+ background="#90f090",
239
+ foreground="black",
240
+ command=self.expand,
241
+ ).pack(side="left")
242
+ Button(
243
+ buttonframe,
244
+ text="Match",
245
+ underline=0,
246
+ background="#90f090",
247
+ foreground="black",
248
+ command=self.match,
249
+ ).pack(side="left")
250
+ Button(
251
+ buttonframe,
252
+ text="Backtrack",
253
+ underline=0,
254
+ background="#f0a0a0",
255
+ foreground="black",
256
+ command=self.backtrack,
257
+ ).pack(side="left")
258
+ # Replace autostep...
259
+
260
+ # self._autostep_button = Button(buttonframe, text='Autostep',
261
+ # underline=0, command=self.autostep)
262
+ # self._autostep_button.pack(side='left')
263
+
264
+ def _configure(self, event):
265
+ self._autostep = 0
266
+ (x1, y1, x2, y2) = self._cframe.scrollregion()
267
+ y2 = event.height - 6
268
+ self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
269
+ self._redraw()
270
+
271
+ def _init_feedback(self, parent):
272
+ self._feedbackframe = feedbackframe = Frame(parent)
273
+ feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
274
+ self._lastoper_label = Label(
275
+ feedbackframe, text="Last Operation:", font=self._font
276
+ )
277
+ self._lastoper_label.pack(side="left")
278
+ lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
279
+ lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
280
+ self._lastoper1 = Label(
281
+ lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
282
+ )
283
+ self._lastoper2 = Label(
284
+ lastoperframe,
285
+ anchor="w",
286
+ width=30,
287
+ foreground="#004040",
288
+ background="#f0f0f0",
289
+ font=self._font,
290
+ )
291
+ self._lastoper1.pack(side="left")
292
+ self._lastoper2.pack(side="left", fill="x", expand=1)
293
+
294
+ def _init_canvas(self, parent):
295
+ self._cframe = CanvasFrame(
296
+ parent,
297
+ background="white",
298
+ # width=525, height=250,
299
+ closeenough=10,
300
+ border=2,
301
+ relief="sunken",
302
+ )
303
+ self._cframe.pack(expand=1, fill="both", side="top", pady=2)
304
+ canvas = self._canvas = self._cframe.canvas()
305
+
306
+ # Initially, there's no tree or text
307
+ self._tree = None
308
+ self._textwidgets = []
309
+ self._textline = None
310
+
311
+ def _init_menubar(self, parent):
312
+ menubar = Menu(parent)
313
+
314
+ filemenu = Menu(menubar, tearoff=0)
315
+ filemenu.add_command(
316
+ label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
317
+ )
318
+ filemenu.add_command(
319
+ label="Print to Postscript",
320
+ underline=0,
321
+ command=self.postscript,
322
+ accelerator="Ctrl-p",
323
+ )
324
+ filemenu.add_command(
325
+ label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
326
+ )
327
+ menubar.add_cascade(label="File", underline=0, menu=filemenu)
328
+
329
+ editmenu = Menu(menubar, tearoff=0)
330
+ editmenu.add_command(
331
+ label="Edit Grammar",
332
+ underline=5,
333
+ command=self.edit_grammar,
334
+ accelerator="Ctrl-g",
335
+ )
336
+ editmenu.add_command(
337
+ label="Edit Text",
338
+ underline=5,
339
+ command=self.edit_sentence,
340
+ accelerator="Ctrl-t",
341
+ )
342
+ menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
343
+
344
+ rulemenu = Menu(menubar, tearoff=0)
345
+ rulemenu.add_command(
346
+ label="Step", underline=1, command=self.step, accelerator="Space"
347
+ )
348
+ rulemenu.add_separator()
349
+ rulemenu.add_command(
350
+ label="Match", underline=0, command=self.match, accelerator="Ctrl-m"
351
+ )
352
+ rulemenu.add_command(
353
+ label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e"
354
+ )
355
+ rulemenu.add_separator()
356
+ rulemenu.add_command(
357
+ label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b"
358
+ )
359
+ menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
360
+
361
+ viewmenu = Menu(menubar, tearoff=0)
362
+ viewmenu.add_checkbutton(
363
+ label="Show Grammar",
364
+ underline=0,
365
+ variable=self._show_grammar,
366
+ command=self._toggle_grammar,
367
+ )
368
+ viewmenu.add_separator()
369
+ viewmenu.add_radiobutton(
370
+ label="Tiny",
371
+ variable=self._size,
372
+ underline=0,
373
+ value=10,
374
+ command=self.resize,
375
+ )
376
+ viewmenu.add_radiobutton(
377
+ label="Small",
378
+ variable=self._size,
379
+ underline=0,
380
+ value=12,
381
+ command=self.resize,
382
+ )
383
+ viewmenu.add_radiobutton(
384
+ label="Medium",
385
+ variable=self._size,
386
+ underline=0,
387
+ value=14,
388
+ command=self.resize,
389
+ )
390
+ viewmenu.add_radiobutton(
391
+ label="Large",
392
+ variable=self._size,
393
+ underline=0,
394
+ value=18,
395
+ command=self.resize,
396
+ )
397
+ viewmenu.add_radiobutton(
398
+ label="Huge",
399
+ variable=self._size,
400
+ underline=0,
401
+ value=24,
402
+ command=self.resize,
403
+ )
404
+ menubar.add_cascade(label="View", underline=0, menu=viewmenu)
405
+
406
+ animatemenu = Menu(menubar, tearoff=0)
407
+ animatemenu.add_radiobutton(
408
+ label="No Animation", underline=0, variable=self._animation_frames, value=0
409
+ )
410
+ animatemenu.add_radiobutton(
411
+ label="Slow Animation",
412
+ underline=0,
413
+ variable=self._animation_frames,
414
+ value=10,
415
+ accelerator="-",
416
+ )
417
+ animatemenu.add_radiobutton(
418
+ label="Normal Animation",
419
+ underline=0,
420
+ variable=self._animation_frames,
421
+ value=5,
422
+ accelerator="=",
423
+ )
424
+ animatemenu.add_radiobutton(
425
+ label="Fast Animation",
426
+ underline=0,
427
+ variable=self._animation_frames,
428
+ value=2,
429
+ accelerator="+",
430
+ )
431
+ menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
432
+
433
+ helpmenu = Menu(menubar, tearoff=0)
434
+ helpmenu.add_command(label="About", underline=0, command=self.about)
435
+ helpmenu.add_command(
436
+ label="Instructions", underline=0, command=self.help, accelerator="F1"
437
+ )
438
+ menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
439
+
440
+ parent.config(menu=menubar)
441
+
442
+ #########################################
443
+ ## Helper
444
+ #########################################
445
+
446
+ def _get(self, widget, treeloc):
447
+ for i in treeloc:
448
+ widget = widget.subtrees()[i]
449
+ if isinstance(widget, TreeSegmentWidget):
450
+ widget = widget.label()
451
+ return widget
452
+
453
+ #########################################
454
+ ## Main draw procedure
455
+ #########################################
456
+
457
+ def _redraw(self):
458
+ canvas = self._canvas
459
+
460
+ # Delete the old tree, widgets, etc.
461
+ if self._tree is not None:
462
+ self._cframe.destroy_widget(self._tree)
463
+ for twidget in self._textwidgets:
464
+ self._cframe.destroy_widget(twidget)
465
+ if self._textline is not None:
466
+ self._canvas.delete(self._textline)
467
+
468
+ # Draw the tree.
469
+ helv = ("helvetica", -self._size.get())
470
+ bold = ("helvetica", -self._size.get(), "bold")
471
+ attribs = {
472
+ "tree_color": "#000000",
473
+ "tree_width": 2,
474
+ "node_font": bold,
475
+ "leaf_font": helv,
476
+ }
477
+ tree = self._parser.tree()
478
+ self._tree = tree_to_treesegment(canvas, tree, **attribs)
479
+ self._cframe.add_widget(self._tree, 30, 5)
480
+
481
+ # Draw the text.
482
+ helv = ("helvetica", -self._size.get())
483
+ bottom = y = self._cframe.scrollregion()[3]
484
+ self._textwidgets = [
485
+ TextWidget(canvas, word, font=self._font) for word in self._sent
486
+ ]
487
+ for twidget in self._textwidgets:
488
+ self._cframe.add_widget(twidget, 0, 0)
489
+ twidget.move(0, bottom - twidget.bbox()[3] - 5)
490
+ y = min(y, twidget.bbox()[1])
491
+
492
+ # Draw a line over the text, to separate it from the tree.
493
+ self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".")
494
+
495
+ # Highlight appropriate nodes.
496
+ self._highlight_nodes()
497
+ self._highlight_prodlist()
498
+
499
+ # Make sure the text lines up.
500
+ self._position_text()
501
+
502
+ def _redraw_quick(self):
503
+ # This should be more-or-less sufficient after an animation.
504
+ self._highlight_nodes()
505
+ self._highlight_prodlist()
506
+ self._position_text()
507
+
508
+ def _highlight_nodes(self):
509
+ # Highlight the list of nodes to be checked.
510
+ bold = ("helvetica", -self._size.get(), "bold")
511
+ for treeloc in self._parser.frontier()[:1]:
512
+ self._get(self._tree, treeloc)["color"] = "#20a050"
513
+ self._get(self._tree, treeloc)["font"] = bold
514
+ for treeloc in self._parser.frontier()[1:]:
515
+ self._get(self._tree, treeloc)["color"] = "#008080"
516
+
517
+ def _highlight_prodlist(self):
518
+ # Highlight the productions that can be expanded.
519
+ # Boy, too bad tkinter doesn't implement Listbox.itemconfig;
520
+ # that would be pretty useful here.
521
+ self._prodlist.delete(0, "end")
522
+ expandable = self._parser.expandable_productions()
523
+ untried = self._parser.untried_expandable_productions()
524
+ productions = self._productions
525
+ for index in range(len(productions)):
526
+ if productions[index] in expandable:
527
+ if productions[index] in untried:
528
+ self._prodlist.insert(index, " %s" % productions[index])
529
+ else:
530
+ self._prodlist.insert(index, " %s (TRIED)" % productions[index])
531
+ self._prodlist.selection_set(index)
532
+ else:
533
+ self._prodlist.insert(index, " %s" % productions[index])
534
+
535
    def _position_text(self):
        """Align the sentence widgets with the tree's matched leaves."""
        # Line up the text widgets that are matched against the tree.
        numwords = len(self._sent)
        num_matched = numwords - len(self._parser.remaining_text())
        leaves = self._tree_leaves()[:num_matched]
        xmax = self._tree.bbox()[0]
        for i in range(0, len(leaves)):
            widget = self._textwidgets[i]
            leaf = leaves[i]
            # Matched word and its leaf share the "matched" color and x position.
            widget["color"] = "#006040"
            leaf["color"] = "#006040"
            widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
            xmax = widget.bbox()[2] + 10

        # Line up the text widgets that are not matched against the tree.
        # They are greyed out and laid left-to-right after the matched ones.
        for i in range(len(leaves), numwords):
            widget = self._textwidgets[i]
            widget["color"] = "#a0a0a0"
            widget.move(xmax - widget.bbox()[0], 0)
            xmax = widget.bbox()[2] + 10

        # If we have a complete parse, make everything green :)
        if self._parser.currently_complete():
            for twidget in self._textwidgets:
                twidget["color"] = "#00a000"

        # Move the matched leaves down to the text, but never above their
        # parent node (the max() keeps at least 10px below the parent label).
        for i in range(0, len(leaves)):
            widget = self._textwidgets[i]
            leaf = leaves[i]
            dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0
            dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10)
            leaf.move(0, dy)
+
569
+ def _tree_leaves(self, tree=None):
570
+ if tree is None:
571
+ tree = self._tree
572
+ if isinstance(tree, TreeSegmentWidget):
573
+ leaves = []
574
+ for child in tree.subtrees():
575
+ leaves += self._tree_leaves(child)
576
+ return leaves
577
+ else:
578
+ return [tree]
579
+
580
+ #########################################
581
+ ## Button Callbacks
582
+ #########################################
583
+
584
+ def destroy(self, *e):
585
+ self._autostep = 0
586
+ if self._top is None:
587
+ return
588
+ self._top.destroy()
589
+ self._top = None
590
+
591
+ def reset(self, *e):
592
+ self._autostep = 0
593
+ self._parser.initialize(self._sent)
594
+ self._lastoper1["text"] = "Reset Application"
595
+ self._lastoper2["text"] = ""
596
+ self._redraw()
597
+
598
+ def autostep(self, *e):
599
+ if self._animation_frames.get() == 0:
600
+ self._animation_frames.set(2)
601
+ if self._autostep:
602
+ self._autostep = 0
603
+ else:
604
+ self._autostep = 1
605
+ self._step()
606
+
607
+ def cancel_autostep(self, *e):
608
+ # self._autostep_button['text'] = 'Autostep'
609
+ self._autostep = 0
610
+
611
+ # Make sure to stop auto-stepping if we get any user input.
612
+ def step(self, *e):
613
+ self._autostep = 0
614
+ self._step()
615
+
616
+ def match(self, *e):
617
+ self._autostep = 0
618
+ self._match()
619
+
620
+ def expand(self, *e):
621
+ self._autostep = 0
622
+ self._expand()
623
+
624
+ def backtrack(self, *e):
625
+ self._autostep = 0
626
+ self._backtrack()
627
+
628
    def _step(self):
        """Perform the next available operation: expand, then match, then
        backtrack -- the same priority order ShiftReduceParser's stepping
        recursive-descent parser uses."""
        if self._animating_lock:
            return

        # Try expanding, matching, and backtracking (in that order)
        if self._expand():
            pass
        elif self._parser.untried_match() and self._match():
            pass
        elif self._backtrack():
            pass
        else:
            # Nothing left to do: the search space is exhausted.
            self._lastoper1["text"] = "Finished"
            self._lastoper2["text"] = ""
            self._autostep = 0

        # Check if we just completed a parse.
        if self._parser.currently_complete():
            self._autostep = 0
            self._lastoper2["text"] += " [COMPLETE PARSE]"
+
649
+ def _expand(self, *e):
650
+ if self._animating_lock:
651
+ return
652
+ old_frontier = self._parser.frontier()
653
+ rv = self._parser.expand()
654
+ if rv is not None:
655
+ self._lastoper1["text"] = "Expand:"
656
+ self._lastoper2["text"] = rv
657
+ self._prodlist.selection_clear(0, "end")
658
+ index = self._productions.index(rv)
659
+ self._prodlist.selection_set(index)
660
+ self._animate_expand(old_frontier[0])
661
+ return True
662
+ else:
663
+ self._lastoper1["text"] = "Expand:"
664
+ self._lastoper2["text"] = "(all expansions tried)"
665
+ return False
666
+
667
+ def _match(self, *e):
668
+ if self._animating_lock:
669
+ return
670
+ old_frontier = self._parser.frontier()
671
+ rv = self._parser.match()
672
+ if rv is not None:
673
+ self._lastoper1["text"] = "Match:"
674
+ self._lastoper2["text"] = rv
675
+ self._animate_match(old_frontier[0])
676
+ return True
677
+ else:
678
+ self._lastoper1["text"] = "Match:"
679
+ self._lastoper2["text"] = "(failed)"
680
+ return False
681
+
682
    def _backtrack(self, *e):
        """Undo the most recent parser operation, animating the removal.

        Returns True if a backtrack was performed, False if the parser had
        nothing left to undo, and None if an animation is in progress.
        """
        if self._animating_lock:
            return
        if self._parser.backtrack():
            # Walk down to the element at the (new) first frontier position
            # to decide which removal animation applies.
            elt = self._parser.tree()
            for i in self._parser.frontier()[0]:
                elt = elt[i]
            self._lastoper1["text"] = "Backtrack"
            self._lastoper2["text"] = ""
            if isinstance(elt, Tree):
                # An expansion was undone: collapse the subtree.
                self._animate_backtrack(self._parser.frontier()[0])
            else:
                # A match was undone: lift the leaf back up.
                self._animate_match_backtrack(self._parser.frontier()[0])
            return True
        else:
            self._autostep = 0
            self._lastoper1["text"] = "Finished"
            self._lastoper2["text"] = ""
            return False
+
702
+ def about(self, *e):
703
+ ABOUT = (
704
+ "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper"
705
+ )
706
+ TITLE = "About: Recursive Descent Parser Application"
707
+ try:
708
+ from tkinter.messagebox import Message
709
+
710
+ Message(message=ABOUT, title=TITLE).show()
711
+ except:
712
+ ShowText(self._top, TITLE, ABOUT)
713
+
714
+ def help(self, *e):
715
+ self._autostep = 0
716
+ # The default font's not very legible; try using 'fixed' instead.
717
+ try:
718
+ ShowText(
719
+ self._top,
720
+ "Help: Recursive Descent Parser Application",
721
+ (__doc__ or "").strip(),
722
+ width=75,
723
+ font="fixed",
724
+ )
725
+ except:
726
+ ShowText(
727
+ self._top,
728
+ "Help: Recursive Descent Parser Application",
729
+ (__doc__ or "").strip(),
730
+ width=75,
731
+ )
732
+
733
+ def postscript(self, *e):
734
+ self._autostep = 0
735
+ self._cframe.print_to_file()
736
+
737
+ def mainloop(self, *args, **kwargs):
738
+ """
739
+ Enter the Tkinter mainloop. This function must be called if
740
+ this demo is created from a non-interactive program (e.g.
741
+ from a secript); otherwise, the demo will close as soon as
742
+ the script completes.
743
+ """
744
+ if in_idle():
745
+ return
746
+ self._top.mainloop(*args, **kwargs)
747
+
748
+ def resize(self, size=None):
749
+ if size is not None:
750
+ self._size.set(size)
751
+ size = self._size.get()
752
+ self._font.configure(size=-(abs(size)))
753
+ self._boldfont.configure(size=-(abs(size)))
754
+ self._sysfont.configure(size=-(abs(size)))
755
+ self._bigfont.configure(size=-(abs(size + 2)))
756
+ self._redraw()
757
+
758
+ #########################################
759
+ ## Expand Production Selection
760
+ #########################################
761
+
762
+ def _toggle_grammar(self, *e):
763
+ if self._show_grammar.get():
764
+ self._prodframe.pack(
765
+ fill="both", side="left", padx=2, after=self._feedbackframe
766
+ )
767
+ self._lastoper1["text"] = "Show Grammar"
768
+ else:
769
+ self._prodframe.pack_forget()
770
+ self._lastoper1["text"] = "Hide Grammar"
771
+ self._lastoper2["text"] = ""
772
+
773
+ # def toggle_grammar(self, *e):
774
+ # self._show_grammar = not self._show_grammar
775
+ # if self._show_grammar:
776
+ # self._prodframe.pack(fill='both', expand='y', side='left',
777
+ # after=self._feedbackframe)
778
+ # self._lastoper1['text'] = 'Show Grammar'
779
+ # else:
780
+ # self._prodframe.pack_forget()
781
+ # self._lastoper1['text'] = 'Hide Grammar'
782
+ # self._lastoper2['text'] = ''
783
+
784
+ def _prodlist_select(self, event):
785
+ selection = self._prodlist.curselection()
786
+ if len(selection) != 1:
787
+ return
788
+ index = int(selection[0])
789
+ old_frontier = self._parser.frontier()
790
+ production = self._parser.expand(self._productions[index])
791
+
792
+ if production:
793
+ self._lastoper1["text"] = "Expand:"
794
+ self._lastoper2["text"] = production
795
+ self._prodlist.selection_clear(0, "end")
796
+ self._prodlist.selection_set(index)
797
+ self._animate_expand(old_frontier[0])
798
+ else:
799
+ # Reset the production selections.
800
+ self._prodlist.selection_clear(0, "end")
801
+ for prod in self._parser.expandable_productions():
802
+ index = self._productions.index(prod)
803
+ self._prodlist.selection_set(index)
804
+
805
+ #########################################
806
+ ## Animation
807
+ #########################################
808
+
809
    def _animate_expand(self, treeloc):
        """Replace the node at *treeloc* with its newly-expanded subtree and
        start the fade-in animation."""
        oldwidget = self._get(self._tree, treeloc)
        oldtree = oldwidget.parent()
        # `top` is true when the expanded node is the tree root.
        top = not isinstance(oldtree.parent(), TreeSegmentWidget)

        # Find the corresponding subtree in the parser's current tree.
        tree = self._parser.tree()
        for i in treeloc:
            tree = tree[i]

        # Draw the new subtree in white; the animation fades it to black.
        widget = tree_to_treesegment(
            self._canvas,
            tree,
            node_font=self._boldfont,
            leaf_color="white",
            tree_width=2,
            tree_color="white",
            node_color="white",
            leaf_font=self._font,
        )
        widget.label()["color"] = "#20a050"

        # Align the new subtree's node label with the old node's position.
        (oldx, oldy) = oldtree.label().bbox()[:2]
        (newx, newy) = widget.label().bbox()[:2]
        widget.move(oldx - newx, oldy - newy)

        if top:
            self._cframe.add_widget(widget, 0, 5)
            widget.move(30 - widget.label().bbox()[0], 0)
            self._tree = widget
        else:
            oldtree.parent().replace_child(oldtree, widget)

        # Move the children over so they don't overlap.
        # Line the children up in a strange way.
        if widget.subtrees():
            dx = (
                oldx
                + widget.label().width() / 2
                - widget.subtrees()[0].bbox()[0] / 2
                - widget.subtrees()[0].bbox()[2] / 2
            )
            for subtree in widget.subtrees():
                subtree.move(dx, 0)

        # Push siblings apart so nothing overlaps the new subtree.
        self._makeroom(widget)

        if top:
            self._cframe.destroy_widget(oldtree)
        else:
            oldtree.destroy()

        # Grayscale ramp from light to dark for the fade-in frames.
        colors = [
            "gray%d" % (10 * int(10 * x / self._animation_frames.get()))
            for x in range(self._animation_frames.get(), 0, -1)
        ]

        # Move the text string down, if necessary.
        dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1]
        if dy > 0:
            for twidget in self._textwidgets:
                twidget.move(0, dy)
            self._canvas.move(self._textline, 0, dy)

        self._animate_expand_frame(widget, colors)
+
874
    def _makeroom(self, treeseg):
        """
        Make sure that no sibling tree bbox's overlap.

        Recurses upward so ancestors also make room for the widened child.
        """
        parent = treeseg.parent()
        if not isinstance(parent, TreeSegmentWidget):
            # Reached the root; nothing above to adjust.
            return

        index = parent.subtrees().index(treeseg)

        # Handle siblings to the right: shift them all right of this subtree.
        rsiblings = parent.subtrees()[index + 1 :]
        if rsiblings:
            dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10
            for sibling in rsiblings:
                sibling.move(dx, 0)

        # Handle siblings to the left: shift this subtree right if it
        # overlaps its immediate left neighbor.
        if index > 0:
            lsibling = parent.subtrees()[index - 1]
            dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10)
            treeseg.move(dx, 0)

        # Keep working up the tree.
        self._makeroom(parent)
+
900
    def _animate_expand_frame(self, widget, colors):
        """Run one frame of the expand fade-in, then reschedule itself.

        *colors* is the remaining grayscale ramp; when it is exhausted the
        subtree is finalized in black and the display is refreshed.
        """
        if len(colors) > 0:
            self._animating_lock = 1
            widget["color"] = colors[0]
            for subtree in widget.subtrees():
                if isinstance(subtree, TreeSegmentWidget):
                    subtree.label()["color"] = colors[0]
                else:
                    subtree["color"] = colors[0]
            # 50ms per frame; after() keeps the Tk event loop responsive.
            self._top.after(50, self._animate_expand_frame, widget, colors[1:])
        else:
            widget["color"] = "black"
            for subtree in widget.subtrees():
                if isinstance(subtree, TreeSegmentWidget):
                    subtree.label()["color"] = "black"
                else:
                    subtree["color"] = "black"
            self._redraw_quick()
            widget.label()["color"] = "black"
            self._animating_lock = 0
            # Auto-stepping continues once the animation releases the lock.
            if self._autostep:
                self._step()
+
923
    def _animate_backtrack(self, treeloc):
        """Animate undoing an expansion: flash the subtree red, fade it
        out, then remove its children."""
        # Flash red first, if we're animating.
        if self._animation_frames.get() == 0:
            colors = []
        else:
            colors = ["#a00000", "#000000", "#a00000"]
        # Then fade from dark to light gray before removal.
        colors += [
            "gray%d" % (10 * int(10 * x / (self._animation_frames.get())))
            for x in range(1, self._animation_frames.get() + 1)
        ]

        # Collect the subtree's node plus each child (label for segments,
        # the widget itself for leaves) so they recolor together.
        widgets = [self._get(self._tree, treeloc).parent()]
        for subtree in widgets[0].subtrees():
            if isinstance(subtree, TreeSegmentWidget):
                widgets.append(subtree.label())
            else:
                widgets.append(subtree)

        self._animate_backtrack_frame(widgets, colors)
+
943
    def _animate_backtrack_frame(self, widgets, colors):
        """Run one frame of the backtrack fade-out, then reschedule itself.

        When *colors* is exhausted, the children of the first widget (the
        collapsed node) are detached and destroyed.
        """
        if len(colors) > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget["color"] = colors[0]
            self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:])
        else:
            for widget in widgets[0].subtrees():
                widgets[0].remove_child(widget)
                widget.destroy()
            self._redraw_quick()
            self._animating_lock = 0
            # Auto-stepping continues once the animation releases the lock.
            if self._autostep:
                self._step()
+
958
+ def _animate_match_backtrack(self, treeloc):
959
+ widget = self._get(self._tree, treeloc)
960
+ node = widget.parent().label()
961
+ dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max(
962
+ 1, self._animation_frames.get()
963
+ )
964
+ self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy)
965
+
966
+ def _animate_match(self, treeloc):
967
+ widget = self._get(self._tree, treeloc)
968
+
969
+ dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max(
970
+ 1, self._animation_frames.get()
971
+ )
972
+ self._animate_match_frame(self._animation_frames.get(), widget, dy)
973
+
974
+ def _animate_match_frame(self, frame, widget, dy):
975
+ if frame > 0:
976
+ self._animating_lock = 1
977
+ widget.move(0, dy)
978
+ self._top.after(10, self._animate_match_frame, frame - 1, widget, dy)
979
+ else:
980
+ widget["color"] = "#006040"
981
+ self._redraw_quick()
982
+ self._animating_lock = 0
983
+ if self._autostep:
984
+ self._step()
985
+
986
+ def _animate_match_backtrack_frame(self, frame, widget, dy):
987
+ if frame > 0:
988
+ self._animating_lock = 1
989
+ widget.move(0, dy)
990
+ self._top.after(
991
+ 10, self._animate_match_backtrack_frame, frame - 1, widget, dy
992
+ )
993
+ else:
994
+ widget.parent().remove_child(widget)
995
+ widget.destroy()
996
+ self._animating_lock = 0
997
+ if self._autostep:
998
+ self._step()
999
+
1000
+ def edit_grammar(self, *e):
1001
+ CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
1002
+
1003
+ def set_grammar(self, grammar):
1004
+ self._parser.set_grammar(grammar)
1005
+ self._productions = list(grammar.productions())
1006
+ self._prodlist.delete(0, "end")
1007
+ for production in self._productions:
1008
+ self._prodlist.insert("end", (" %s" % production))
1009
+
1010
+ def edit_sentence(self, *e):
1011
+ sentence = " ".join(self._sent)
1012
+ title = "Edit Text"
1013
+ instr = "Enter a new sentence to parse."
1014
+ EntryDialog(self._top, sentence, instr, self.set_sentence, title)
1015
+
1016
+ def set_sentence(self, sentence):
1017
+ self._sent = sentence.split() # [XX] use tagged?
1018
+ self.reset()
1019
+
1020
+
1021
def app():
    """Launch the recursive-descent parser demo on a small sample
    grammar and sentence."""
    from nltk.grammar import CFG

    demo_grammar = CFG.fromstring(
        """
    # Grammatical productions.
    S -> NP VP
    NP -> Det N PP | Det N
    VP -> V NP PP | V NP | V
    PP -> P NP
    # Lexical productions.
    NP -> 'I'
    Det -> 'the' | 'a'
    N -> 'man' | 'park' | 'dog' | 'telescope'
    V -> 'ate' | 'saw'
    P -> 'in' | 'under' | 'with'
    """
    )

    demo_sent = "the dog saw a man in the park".split()

    RecursiveDescentApp(demo_grammar, demo_sent).mainloop()
+
1048
+
1049
__all__ = ["app"]

if __name__ == "__main__":
    app()
.eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Shift-Reduce Parser Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A graphical tool for exploring the shift-reduce parser.
10
+
11
+ The shift-reduce parser maintains a stack, which records the structure
12
+ of the portion of the text that has been parsed. The stack is
13
+ initially empty. Its contents are shown on the left side of the main
14
+ canvas.
15
+
16
+ On the right side of the main canvas is the remaining text. This is
17
+ the portion of the text which has not yet been considered by the
18
+ parser.
19
+
20
+ The parser builds up a tree structure for the text using two
21
+ operations:
22
+
23
+ - "shift" moves the first token from the remaining text to the top
24
+ of the stack. In the demo, the top of the stack is its right-hand
25
+ side.
26
+ - "reduce" uses a grammar production to combine the rightmost stack
27
+ elements into a single tree token.
28
+
29
+ You can control the parser's operation by using the "shift" and
30
+ "reduce" buttons; or you can use the "step" button to let the parser
31
+ automatically decide which operation to apply. The parser uses the
32
+ following rules to decide which operation to apply:
33
+
34
+ - Only shift if no reductions are available.
35
+ - If multiple reductions are available, then apply the reduction
36
+ whose CFG production is listed earliest in the grammar.
37
+
38
+ The "reduce" button applies the reduction whose CFG production is
39
+ listed earliest in the grammar. There are two ways to manually choose
40
+ which reduction to apply:
41
+
42
+ - Click on a CFG production from the list of available reductions,
43
+ on the left side of the main window. The reduction based on that
44
+ production will be applied to the top of the stack.
45
+ - Click on one of the stack elements. A popup window will appear,
46
+ containing all available reductions. Select one, and it will be
47
+ applied to the top of the stack.
48
+
49
+ Note that reductions can only be applied to the top of the stack.
50
+
51
+ Keyboard Shortcuts::
52
+ [Space]\t Perform the next shift or reduce operation
53
+ [s]\t Perform a shift operation
54
+ [r]\t Perform a reduction operation
55
+ [Ctrl-z]\t Undo most recent operation
56
+ [Delete]\t Reset the parser
57
+ [g]\t Show/hide available production list
58
+ [Ctrl-a]\t Toggle animations
59
+ [h]\t Help
60
+ [Ctrl-p]\t Print
61
+ [q]\t Quit
62
+
63
+ """
64
+
65
+ from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
66
+ from tkinter.font import Font
67
+
68
+ from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
69
+ from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
70
+ from nltk.parse import SteppingShiftReduceParser
71
+ from nltk.tree import Tree
72
+ from nltk.util import in_idle
73
+
74
+ """
75
+ Possible future improvements:
76
+ - button/window to change and/or select text. Just pop up a window
77
+ with an entry, and let them modify the text; and then retokenize
78
+ it? Maybe give a warning if it contains tokens whose types are
79
+ not in the grammar.
80
+ - button/window to change and/or select grammar. Select from
81
+ several alternative grammars? Or actually change the grammar? If
82
+ the later, then I'd want to define nltk.draw.cfg, which would be
83
+ responsible for that.
84
+ """
85
+
86
+
87
+ class ShiftReduceApp:
88
+ """
89
+ A graphical tool for exploring the shift-reduce parser. The tool
90
+ displays the parser's stack and the remaining text, and allows the
91
+ user to control the parser's operation. In particular, the user
92
+ can shift tokens onto the stack, and can perform reductions on the
93
+ top elements of the stack. A "step" button simply steps through
94
+ the parsing process, performing the operations that
95
+ ``nltk.parse.ShiftReduceParser`` would use.
96
+ """
97
+
98
    def __init__(self, grammar, sent, trace=0):
        """Build the shift-reduce demo window for *grammar* and token
        list *sent*; *trace* is passed to the stepping parser."""
        self._sent = sent
        self._parser = SteppingShiftReduceParser(grammar, trace)

        # Set up the main window.
        self._top = Tk()
        self._top.title("Shift Reduce Parser Application")

        # Animations. animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animating_lock = 0
        self._animate = IntVar(self._top)
        self._animate.set(10)  # = medium

        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)

        # Initialize fonts.
        self._init_fonts(self._top)

        # Set up key bindings.
        self._init_bindings()

        # Create the basic frames.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)

        # A popup menu for reducing.
        self._reduce_menu = Menu(self._canvas, tearoff=0)

        # Reset the demo, and set the feedback frame to empty.
        self.reset()
        self._lastoper1["text"] = ""
+
136
+ #########################################
137
+ ## Initialization Helpers
138
+ #########################################
139
+
140
    def _init_fonts(self, root):
        """Create the fonts used by the app and register the system default."""
        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
        self._sysfont = Font(font=Button()["font"])
        root.option_add("*Font", self._sysfont)

        # What's our font size (default=same as sysfont)
        self._size = IntVar(root)
        self._size.set(self._sysfont.cget("size"))

        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
        self._font = Font(family="helvetica", size=self._size.get())
+
152
+ def _init_grammar(self, parent):
153
+ # Grammar view.
154
+ self._prodframe = listframe = Frame(parent)
155
+ self._prodframe.pack(fill="both", side="left", padx=2)
156
+ self._prodlist_label = Label(
157
+ self._prodframe, font=self._boldfont, text="Available Reductions"
158
+ )
159
+ self._prodlist_label.pack()
160
+ self._prodlist = Listbox(
161
+ self._prodframe,
162
+ selectmode="single",
163
+ relief="groove",
164
+ background="white",
165
+ foreground="#909090",
166
+ font=self._font,
167
+ selectforeground="#004040",
168
+ selectbackground="#c0f0c0",
169
+ )
170
+
171
+ self._prodlist.pack(side="right", fill="both", expand=1)
172
+
173
+ self._productions = list(self._parser.grammar().productions())
174
+ for production in self._productions:
175
+ self._prodlist.insert("end", (" %s" % production))
176
+ self._prodlist.config(height=min(len(self._productions), 25))
177
+
178
+ # Add a scrollbar if there are more than 25 productions.
179
+ if 1: # len(self._productions) > 25:
180
+ listscroll = Scrollbar(self._prodframe, orient="vertical")
181
+ self._prodlist.config(yscrollcommand=listscroll.set)
182
+ listscroll.config(command=self._prodlist.yview)
183
+ listscroll.pack(side="left", fill="y")
184
+
185
+ # If they select a production, apply it.
186
+ self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
187
+
188
+ # When they hover over a production, highlight it.
189
+ self._hover = -1
190
+ self._prodlist.bind("<Motion>", self._highlight_hover)
191
+ self._prodlist.bind("<Leave>", self._clear_hover)
192
+
193
+ def _init_bindings(self):
194
+ # Quit
195
+ self._top.bind("<Control-q>", self.destroy)
196
+ self._top.bind("<Control-x>", self.destroy)
197
+ self._top.bind("<Alt-q>", self.destroy)
198
+ self._top.bind("<Alt-x>", self.destroy)
199
+
200
+ # Ops (step, shift, reduce, undo)
201
+ self._top.bind("<space>", self.step)
202
+ self._top.bind("<s>", self.shift)
203
+ self._top.bind("<Alt-s>", self.shift)
204
+ self._top.bind("<Control-s>", self.shift)
205
+ self._top.bind("<r>", self.reduce)
206
+ self._top.bind("<Alt-r>", self.reduce)
207
+ self._top.bind("<Control-r>", self.reduce)
208
+ self._top.bind("<Delete>", self.reset)
209
+ self._top.bind("<u>", self.undo)
210
+ self._top.bind("<Alt-u>", self.undo)
211
+ self._top.bind("<Control-u>", self.undo)
212
+ self._top.bind("<Control-z>", self.undo)
213
+ self._top.bind("<BackSpace>", self.undo)
214
+
215
+ # Misc
216
+ self._top.bind("<Control-p>", self.postscript)
217
+ self._top.bind("<Control-h>", self.help)
218
+ self._top.bind("<F1>", self.help)
219
+ self._top.bind("<Control-g>", self.edit_grammar)
220
+ self._top.bind("<Control-t>", self.edit_sentence)
221
+
222
+ # Animation speed control
223
+ self._top.bind("-", lambda e, a=self._animate: a.set(20))
224
+ self._top.bind("=", lambda e, a=self._animate: a.set(10))
225
+ self._top.bind("+", lambda e, a=self._animate: a.set(4))
226
+
227
+ def _init_buttons(self, parent):
228
+ # Set up the frames.
229
+ self._buttonframe = buttonframe = Frame(parent)
230
+ buttonframe.pack(fill="none", side="bottom")
231
+ Button(
232
+ buttonframe,
233
+ text="Step",
234
+ background="#90c0d0",
235
+ foreground="black",
236
+ command=self.step,
237
+ ).pack(side="left")
238
+ Button(
239
+ buttonframe,
240
+ text="Shift",
241
+ underline=0,
242
+ background="#90f090",
243
+ foreground="black",
244
+ command=self.shift,
245
+ ).pack(side="left")
246
+ Button(
247
+ buttonframe,
248
+ text="Reduce",
249
+ underline=0,
250
+ background="#90f090",
251
+ foreground="black",
252
+ command=self.reduce,
253
+ ).pack(side="left")
254
+ Button(
255
+ buttonframe,
256
+ text="Undo",
257
+ underline=0,
258
+ background="#f0a0a0",
259
+ foreground="black",
260
+ command=self.undo,
261
+ ).pack(side="left")
262
+
263
    def _init_menubar(self, parent):
        """Build the File/Edit/Apply/View/Animate/Help menu bar."""
        menubar = Menu(parent)

        # File: reset, print, exit.
        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(
            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
        )
        filemenu.add_command(
            label="Print to Postscript",
            underline=0,
            command=self.postscript,
            accelerator="Ctrl-p",
        )
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        # Edit: change the grammar or the sentence being parsed.
        editmenu = Menu(menubar, tearoff=0)
        editmenu.add_command(
            label="Edit Grammar",
            underline=5,
            command=self.edit_grammar,
            accelerator="Ctrl-g",
        )
        editmenu.add_command(
            label="Edit Text",
            underline=5,
            command=self.edit_sentence,
            accelerator="Ctrl-t",
        )
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        # Apply: the parser operations.
        rulemenu = Menu(menubar, tearoff=0)
        rulemenu.add_command(
            label="Step", underline=1, command=self.step, accelerator="Space"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
        )
        rulemenu.add_command(
            label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
        )
        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)

        # View: grammar visibility and font size (radio values are point sizes).
        viewmenu = Menu(menubar, tearoff=0)
        viewmenu.add_checkbutton(
            label="Show Grammar",
            underline=0,
            variable=self._show_grammar,
            command=self._toggle_grammar,
        )
        viewmenu.add_separator()
        viewmenu.add_radiobutton(
            label="Tiny",
            variable=self._size,
            underline=0,
            value=10,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Small",
            variable=self._size,
            underline=0,
            value=12,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Medium",
            variable=self._size,
            underline=0,
            value=14,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Large",
            variable=self._size,
            underline=0,
            value=18,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Huge",
            variable=self._size,
            underline=0,
            value=24,
            command=self.resize,
        )
        menubar.add_cascade(label="View", underline=0, menu=viewmenu)

        # Animate: radio values are frame counts (0 disables animation).
        animatemenu = Menu(menubar, tearoff=0)
        animatemenu.add_radiobutton(
            label="No Animation", underline=0, variable=self._animate, value=0
        )
        animatemenu.add_radiobutton(
            label="Slow Animation",
            underline=0,
            variable=self._animate,
            value=20,
            accelerator="-",
        )
        animatemenu.add_radiobutton(
            label="Normal Animation",
            underline=0,
            variable=self._animate,
            value=10,
            accelerator="=",
        )
        animatemenu.add_radiobutton(
            label="Fast Animation",
            underline=0,
            variable=self._animate,
            value=4,
            accelerator="+",
        )
        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)

        # Help: about box and instructions.
        helpmenu = Menu(menubar, tearoff=0)
        helpmenu.add_command(label="About", underline=0, command=self.about)
        helpmenu.add_command(
            label="Instructions", underline=0, command=self.help, accelerator="F1"
        )
        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

        parent.config(menu=menubar)
+
394
+ def _init_feedback(self, parent):
395
+ self._feedbackframe = feedbackframe = Frame(parent)
396
+ feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
397
+ self._lastoper_label = Label(
398
+ feedbackframe, text="Last Operation:", font=self._font
399
+ )
400
+ self._lastoper_label.pack(side="left")
401
+ lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
402
+ lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
403
+ self._lastoper1 = Label(
404
+ lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
405
+ )
406
+ self._lastoper2 = Label(
407
+ lastoperframe,
408
+ anchor="w",
409
+ width=30,
410
+ foreground="#004040",
411
+ background="#f0f0f0",
412
+ font=self._font,
413
+ )
414
+ self._lastoper1.pack(side="left")
415
+ self._lastoper2.pack(side="left", fill="x", expand=1)
416
+
417
+ def _init_canvas(self, parent):
418
+ self._cframe = CanvasFrame(
419
+ parent,
420
+ background="white",
421
+ width=525,
422
+ closeenough=10,
423
+ border=2,
424
+ relief="sunken",
425
+ )
426
+ self._cframe.pack(expand=1, fill="both", side="top", pady=2)
427
+ canvas = self._canvas = self._cframe.canvas()
428
+
429
+ self._stackwidgets = []
430
+ self._rtextwidgets = []
431
+ self._titlebar = canvas.create_rectangle(
432
+ 0, 0, 0, 0, fill="#c0f0f0", outline="black"
433
+ )
434
+ self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
435
+ self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
436
+ size = self._size.get() + 4
437
+ self._stacklabel = TextWidget(
438
+ canvas, "Stack", color="#004040", font=self._boldfont
439
+ )
440
+ self._rtextlabel = TextWidget(
441
+ canvas, "Remaining Text", color="#004040", font=self._boldfont
442
+ )
443
+ self._cframe.add_widget(self._stacklabel)
444
+ self._cframe.add_widget(self._rtextlabel)
445
+
446
    #########################################
    ## Main draw procedure
    #########################################

    def _redraw(self):
        """Redraw the whole canvas from the parser's current state.

        Rebuilds the stack and remaining-text widgets, repositions the title
        bar, labels, and stack-top divider, and re-installs the drag/click
        bindings that let the user shift the next token.
        """
        scrollregion = self._canvas["scrollregion"].split()
        (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)

        # Delete the old stack & rtext widgets.
        for stackwidget in self._stackwidgets:
            self._cframe.destroy_widget(stackwidget)
        self._stackwidgets = []
        for rtextwidget in self._rtextwidgets:
            self._cframe.destroy_widget(rtextwidget)
        self._rtextwidgets = []

        # Position the titlebar & exprline
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        # y is the baseline row for stack/remaining-text widgets, derived
        # from the label height.
        y = y2 - y1 + 10
        self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
        self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)

        # Position the titlebar labels..
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        self._stacklabel.move(5 - x1, 3 - y1)
        (x1, y1, x2, y2) = self._rtextlabel.bbox()
        self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)

        # Draw the stack: trees get full tree segments, plain tokens get text.
        stackx = 5
        for tok in self._parser.stack():
            if isinstance(tok, Tree):
                attribs = {
                    "tree_color": "#4080a0",
                    "tree_width": 2,
                    "node_font": self._boldfont,
                    "node_color": "#006060",
                    "leaf_color": "#006060",
                    "leaf_font": self._font,
                }
                widget = tree_to_treesegment(self._canvas, tok, **attribs)
                widget.label()["color"] = "#000000"
            else:
                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            widget.bind_click(self._popup_reduce)
            self._stackwidgets.append(widget)
            self._cframe.add_widget(widget, stackx, y)
            stackx = widget.bbox()[2] + 10

        # Draw the remaining text.
        rtextwidth = 0
        for tok in self._parser.remaining_text():
            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            self._rtextwidgets.append(widget)
            self._cframe.add_widget(widget, rtextwidth, y)
            rtextwidth = widget.bbox()[2] + 4

        # Allow enough room to shift the next token (for animations)
        if len(self._rtextwidgets) > 0:
            stackx += self._rtextwidgets[0].width()

        # Move the remaining text to the correct location (keep it
        # right-justified, when possible); and move the remaining text
        # label, if necessary.
        stackx = max(stackx, self._stacklabel.width() + 25)
        rlabelwidth = self._rtextlabel.width() + 10
        if stackx >= cx2 - max(rtextwidth, rlabelwidth):
            cx2 = stackx + max(rtextwidth, rlabelwidth)
        for rtextwidget in self._rtextwidgets:
            rtextwidget.move(4 + cx2 - rtextwidth, 0)
        self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)

        # midx is the x coordinate of the divider between stack and input.
        midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
        (x1, y1, x2, y2) = self._stacklabel.bbox()

        # Set up binding to allow them to shift a token by dragging it.
        if len(self._rtextwidgets) > 0:

            def drag_shift(widget, midx=midx, self=self):
                # Dragging the token past the divider performs a shift;
                # otherwise snap everything back.
                if widget.bbox()[0] < midx:
                    self.shift()
                else:
                    self._redraw()

            self._rtextwidgets[0].bind_drag(drag_shift)
            self._rtextwidgets[0].bind_click(self.shift)

        # Draw the stack top.
        self._highlight_productions()
537
+ def _draw_stack_top(self, widget):
538
+ # hack..
539
+ midx = widget.bbox()[2] + 50
540
+ self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
541
+
542
+ def _highlight_productions(self):
543
+ # Highlight the productions that can be reduced.
544
+ self._prodlist.selection_clear(0, "end")
545
+ for prod in self._parser.reducible_productions():
546
+ index = self._productions.index(prod)
547
+ self._prodlist.selection_set(index)
548
+
549
+ #########################################
550
+ ## Button Callbacks
551
+ #########################################
552
+
553
+ def destroy(self, *e):
554
+ if self._top is None:
555
+ return
556
+ self._top.destroy()
557
+ self._top = None
558
+
559
    def reset(self, *e):
        """Restart the parse of the current sentence from scratch."""
        self._parser.initialize(self._sent)
        self._lastoper1["text"] = "Reset App"
        self._lastoper2["text"] = ""
        self._redraw()
565
+ def step(self, *e):
566
+ if self.reduce():
567
+ return True
568
+ elif self.shift():
569
+ return True
570
+ else:
571
+ if list(self._parser.parses()):
572
+ self._lastoper1["text"] = "Finished:"
573
+ self._lastoper2["text"] = "Success"
574
+ else:
575
+ self._lastoper1["text"] = "Finished:"
576
+ self._lastoper2["text"] = "Failure"
577
+
578
+ def shift(self, *e):
579
+ if self._animating_lock:
580
+ return
581
+ if self._parser.shift():
582
+ tok = self._parser.stack()[-1]
583
+ self._lastoper1["text"] = "Shift:"
584
+ self._lastoper2["text"] = "%r" % tok
585
+ if self._animate.get():
586
+ self._animate_shift()
587
+ else:
588
+ self._redraw()
589
+ return True
590
+ return False
591
+
592
+ def reduce(self, *e):
593
+ if self._animating_lock:
594
+ return
595
+ production = self._parser.reduce()
596
+ if production:
597
+ self._lastoper1["text"] = "Reduce:"
598
+ self._lastoper2["text"] = "%s" % production
599
+ if self._animate.get():
600
+ self._animate_reduce()
601
+ else:
602
+ self._redraw()
603
+ return production
604
+
605
    def undo(self, *e):
        """Undo the most recent parser operation and redraw (no-op mid-animation)."""
        if self._animating_lock:
            return
        if self._parser.undo():
            self._redraw()
611
    def postscript(self, *e):
        """Print the canvas contents to a PostScript file (prompts for a name)."""
        self._cframe.print_to_file()
614
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop. This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        # IDLE runs its own event loop; starting another would conflict.
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)
625
+ #########################################
626
+ ## Menubar callbacks
627
+ #########################################
628
+
629
+ def resize(self, size=None):
630
+ if size is not None:
631
+ self._size.set(size)
632
+ size = self._size.get()
633
+ self._font.configure(size=-(abs(size)))
634
+ self._boldfont.configure(size=-(abs(size)))
635
+ self._sysfont.configure(size=-(abs(size)))
636
+
637
+ # self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
638
+ # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
639
+ # self._lastoper_label['font'] = ('helvetica', -size)
640
+ # self._lastoper1['font'] = ('helvetica', -size)
641
+ # self._lastoper2['font'] = ('helvetica', -size)
642
+ # self._prodlist['font'] = ('helvetica', -size)
643
+ # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
644
+ self._redraw()
645
+
646
+ def help(self, *e):
647
+ # The default font's not very legible; try using 'fixed' instead.
648
+ try:
649
+ ShowText(
650
+ self._top,
651
+ "Help: Shift-Reduce Parser Application",
652
+ (__doc__ or "").strip(),
653
+ width=75,
654
+ font="fixed",
655
+ )
656
+ except:
657
+ ShowText(
658
+ self._top,
659
+ "Help: Shift-Reduce Parser Application",
660
+ (__doc__ or "").strip(),
661
+ width=75,
662
+ )
663
+
664
+ def about(self, *e):
665
+ ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
666
+ TITLE = "About: Shift-Reduce Parser Application"
667
+ try:
668
+ from tkinter.messagebox import Message
669
+
670
+ Message(message=ABOUT, title=TITLE).show()
671
+ except:
672
+ ShowText(self._top, TITLE, ABOUT)
673
+
674
    def edit_grammar(self, *e):
        """Open a CFG editor on the current grammar; set_grammar is the callback."""
        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
677
    def set_grammar(self, grammar):
        """Install *grammar* in the parser and refresh the production listbox."""
        self._parser.set_grammar(grammar)
        self._productions = list(grammar.productions())
        self._prodlist.delete(0, "end")
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
684
    def edit_sentence(self, *e):
        """Open a dialog letting the user replace the input sentence."""
        sentence = " ".join(self._sent)
        title = "Edit Text"
        instr = "Enter a new sentence to parse."
        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
690
    def set_sentence(self, sent):
        """Tokenize *sent* on whitespace and restart the parse."""
        self._sent = sent.split()  # [XX] use tagged?
        self.reset()
694
+ #########################################
695
+ ## Reduce Production Selection
696
+ #########################################
697
+
698
    def _toggle_grammar(self, *e):
        """Show or hide the grammar listbox, per the View-menu checkbutton."""
        if self._show_grammar.get():
            self._prodframe.pack(
                fill="both", side="left", padx=2, after=self._feedbackframe
            )
            self._lastoper1["text"] = "Show Grammar"
        else:
            self._prodframe.pack_forget()
            self._lastoper1["text"] = "Hide Grammar"
        self._lastoper2["text"] = ""
709
+ def _prodlist_select(self, event):
710
+ selection = self._prodlist.curselection()
711
+ if len(selection) != 1:
712
+ return
713
+ index = int(selection[0])
714
+ production = self._parser.reduce(self._productions[index])
715
+ if production:
716
+ self._lastoper1["text"] = "Reduce:"
717
+ self._lastoper2["text"] = "%s" % production
718
+ if self._animate.get():
719
+ self._animate_reduce()
720
+ else:
721
+ self._redraw()
722
+ else:
723
+ # Reset the production selections.
724
+ self._prodlist.selection_clear(0, "end")
725
+ for prod in self._parser.reducible_productions():
726
+ index = self._productions.index(prod)
727
+ self._prodlist.selection_set(index)
728
+
729
    def _popup_reduce(self, widget):
        """Pop up a context menu listing the currently reducible productions."""
        # Remove old commands.
        productions = self._parser.reducible_productions()
        if len(productions) == 0:
            return

        self._reduce_menu.delete(0, "end")
        for production in productions:
            # NOTE(review): every menu entry invokes self.reduce with no
            # argument, so the parser chooses its default reduction rather
            # than the production the user clicked — confirm this is intended.
            self._reduce_menu.add_command(label=str(production), command=self.reduce)
        self._reduce_menu.post(
            self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
        )
742
+ #########################################
743
+ ## Animations
744
+ #########################################
745
+
746
+ def _animate_shift(self):
747
+ # What widget are we shifting?
748
+ widget = self._rtextwidgets[0]
749
+
750
+ # Where are we shifting from & to?
751
+ right = widget.bbox()[0]
752
+ if len(self._stackwidgets) == 0:
753
+ left = 5
754
+ else:
755
+ left = self._stackwidgets[-1].bbox()[2] + 10
756
+
757
+ # Start animating.
758
+ dt = self._animate.get()
759
+ dx = (left - right) * 1.0 / dt
760
+ self._animate_shift_frame(dt, widget, dx)
761
+
762
    def _animate_shift_frame(self, frame, widget, dx):
        """Run one frame of the shift animation, rescheduling itself via
        Tk's after() until `frame` reaches zero, then finalize the shift."""
        if frame > 0:
            self._animating_lock = 1
            widget.move(dx, 0)
            self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
        else:
            # but: stacktop??

            # Shift the widget to the stack.
            del self._rtextwidgets[0]
            self._stackwidgets.append(widget)
            self._animating_lock = 0

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()
779
+ def _animate_reduce(self):
780
+ # What widgets are we shifting?
781
+ numwidgets = len(self._parser.stack()[-1]) # number of children
782
+ widgets = self._stackwidgets[-numwidgets:]
783
+
784
+ # How far are we moving?
785
+ if isinstance(widgets[0], TreeSegmentWidget):
786
+ ydist = 15 + widgets[0].label().height()
787
+ else:
788
+ ydist = 15 + widgets[0].height()
789
+
790
+ # Start animating.
791
+ dt = self._animate.get()
792
+ dy = ydist * 2.0 / dt
793
+ self._animate_reduce_frame(dt / 2, widgets, dy)
794
+
795
    def _animate_reduce_frame(self, frame, widgets, dy):
        """Run one frame of the reduce animation; on the final frame, replace
        the reduced widgets with a new tree-segment widget on the stack."""
        if frame > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget.move(0, dy)
            self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
        else:
            # Detach the reduced widgets and rebuild them as children of a
            # new tree segment labeled with the production's LHS.
            del self._stackwidgets[-len(widgets) :]
            for widget in widgets:
                self._cframe.remove_widget(widget)
            tok = self._parser.stack()[-1]
            if not isinstance(tok, Tree):
                raise ValueError()
            label = TextWidget(
                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
            )
            widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
            (x1, y1, x2, y2) = self._stacklabel.bbox()
            y = y2 - y1 + 10
            if not self._stackwidgets:
                x = 5
            else:
                x = self._stackwidgets[-1].bbox()[2] + 10
            self._cframe.add_widget(widget, x, y)
            self._stackwidgets.append(widget)

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()

            # # Delete the old widgets..
            # del self._stackwidgets[-len(widgets):]
            # for widget in widgets:
            #     self._cframe.destroy_widget(widget)
            #
            # # Make a new one.
            # tok = self._parser.stack()[-1]
            # if isinstance(tok, Tree):
            #     attribs = {'tree_color': '#4080a0', 'tree_width': 2,
            #                'node_font': bold, 'node_color': '#006060',
            #                'leaf_color': '#006060', 'leaf_font':self._font}
            #     widget = tree_to_treesegment(self._canvas, tok.type(),
            #                                  **attribs)
            #     widget.node()['color'] = '#000000'
            # else:
            #     widget = TextWidget(self._canvas, tok.type(),
            #                         color='#000000', font=self._font)
            # widget.bind_click(self._popup_reduce)
            # (x1, y1, x2, y2) = self._stacklabel.bbox()
            # y = y2-y1+10
            # if not self._stackwidgets: x = 5
            # else: x = self._stackwidgets[-1].bbox()[2] + 10
            # self._cframe.add_widget(widget, x, y)
            # self._stackwidgets.append(widget)

            # self._redraw()
            self._animating_lock = 0
853
+ #########################################
854
+ ## Hovering.
855
+ #########################################
856
+
857
+ def _highlight_hover(self, event):
858
+ # What production are we hovering over?
859
+ index = self._prodlist.nearest(event.y)
860
+ if self._hover == index:
861
+ return
862
+
863
+ # Clear any previous hover highlighting.
864
+ self._clear_hover()
865
+
866
+ # If the production corresponds to an available reduction,
867
+ # highlight the stack.
868
+ selection = [int(s) for s in self._prodlist.curselection()]
869
+ if index in selection:
870
+ rhslen = len(self._productions[index].rhs())
871
+ for stackwidget in self._stackwidgets[-rhslen:]:
872
+ if isinstance(stackwidget, TreeSegmentWidget):
873
+ stackwidget.label()["color"] = "#00a000"
874
+ else:
875
+ stackwidget["color"] = "#00a000"
876
+
877
+ # Remember what production we're hovering over.
878
+ self._hover = index
879
+
880
+ def _clear_hover(self, *event):
881
+ # Clear any previous hover highlighting.
882
+ if self._hover == -1:
883
+ return
884
+ self._hover = -1
885
+ for stackwidget in self._stackwidgets:
886
+ if isinstance(stackwidget, TreeSegmentWidget):
887
+ stackwidget.label()["color"] = "black"
888
+ else:
889
+ stackwidget["color"] = "black"
890
+
891
+
892
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import CFG, Nonterminal, Production

    # Build the nonterminal symbols.
    symbol_names = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = (
        Nonterminal(name) for name in symbol_names.split()
    )

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
933
+
934
# Launch the demo when run as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Wordfreq Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from matplotlib import pylab
9
+
10
+ from nltk.corpus import gutenberg
11
+ from nltk.text import Text
12
+
13
+
14
+ def plot_word_freq_dist(text):
15
+ fd = text.vocab()
16
+
17
+ samples = [item for item, _ in fd.most_common(50)]
18
+ values = [fd[sample] for sample in samples]
19
+ values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
20
+ pylab.title(text.name)
21
+ pylab.xlabel("Samples")
22
+ pylab.ylabel("Cumulative Percentage")
23
+ pylab.plot(values)
24
+ pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
25
+ pylab.show()
26
+
27
+
28
def app():
    """Show the cumulative word-frequency plot for Moby Dick."""
    moby = Text(gutenberg.words("melville-moby_dick.txt"))
    plot_word_freq_dist(moby)
32
+
33
# Launch the demo when run as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py ADDED
@@ -0,0 +1,997 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: WordNet Browser Application
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
5
+ # Paul Bone <pbone@students.csse.unimelb.edu.au>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ A WordNet Browser application which launches the default browser
11
+ (if it is not already running) and opens a new tab with a connection
12
+ to http://localhost:port/ . It also starts an HTTP server on the
13
+ specified port and begins serving browser requests. The default
14
+ port is 8000. (For command-line help, run "python wordnet -h")
15
+ This application requires that the user's web browser supports
16
+ Javascript.
17
+
18
+ BrowServer is a server for browsing the NLTK Wordnet database It first
19
+ launches a browser client to be used for browsing and then starts
20
+ serving the requests of that and maybe other clients
21
+
22
+ Usage::
23
+
24
+ browserver.py -h
25
+ browserver.py [-s] [-p <port>]
26
+
27
+ Options::
28
+
29
+ -h or --help
30
+ Display this help message.
31
+
32
+ -l <file> or --log-file <file>
33
+ Logs messages to the given file, If this option is not specified
34
+ messages are silently dropped.
35
+
36
+ -p <port> or --port <port>
37
+ Run the web server on this TCP port, defaults to 8000.
38
+
39
+ -s or --server-mode
40
+ Do not start a web browser, and do not allow a user to
41
+ shutdown the server through the web interface.
42
+ """
43
+ # TODO: throughout this package variable names and docstrings need
44
+ # modifying to be compliant with NLTK's coding standards. Tests also
45
+ # need to be develop to ensure this continues to work in the face of
46
+ # changes to other NLTK packages.
47
+
48
+ import base64
49
+ import copy
50
+ import datetime
51
+ import getopt
52
+ import os
53
+ import pickle
54
+ import re
55
+ import sys
56
+ import threading
57
+ import time
58
+ import webbrowser
59
+ from collections import defaultdict
60
+ from http.server import BaseHTTPRequestHandler, HTTPServer
61
+
62
+ # Allow this program to run inside the NLTK source tree.
63
+ from sys import argv, path
64
+ from urllib.parse import unquote_plus
65
+
66
+ from nltk.corpus import wordnet as wn
67
+ from nltk.corpus.reader.wordnet import Lemma, Synset
68
+
69
+ # now included in local file
70
+ # from util import html_header, html_trailer, \
71
+ # get_static_index_page, get_static_page_by_path, \
72
+ # page_from_word, page_from_href
73
+
74
+ firstClient = True
75
+
76
+ # True if we're not also running a web browser. The value f server_mode
77
+ # gets set by demo().
78
+ server_mode = None
79
+
80
+ # If set this is a file object for writing log messages.
81
+ logfile = None
82
+
83
+
84
class MyServerHandler(BaseHTTPRequestHandler):
    """HTTP handler for the WordNet browser.

    Dispatches on the request path: shutdown command, index page, static
    HTML, word searches ("search?..."), and hyperlink lookups ("lookup_...").
    Relies on module-level helpers (get_static_index_page, page_from_word,
    page_from_href, html_header/html_trailer) defined later in this file.
    """

    def do_HEAD(self):
        self.send_head()

    def do_GET(self):
        global firstClient
        # Request path without the leading "/".
        sp = self.path[1:]
        if unquote_plus(sp) == "SHUTDOWN THE SERVER":
            # In server mode the web UI may not stop the server.
            if server_mode:
                page = "Server must be killed with SIGTERM."
                type = "text/plain"
            else:
                print("Server shutting down!")
                os._exit(0)

        elif sp == "":  # First request.
            type = "text/html"
            # Only the very first client (when running alongside a browser)
            # gets the "first visit" variant of the index page.
            if not server_mode and firstClient:
                firstClient = False
                page = get_static_index_page(True)
            else:
                page = get_static_index_page(False)
            word = "green"

        elif sp.endswith(".html"):  # Trying to fetch a HTML file TODO:
            type = "text/html"
            usp = unquote_plus(sp)
            if usp == "NLTK Wordnet Browser Database Info.html":
                word = "* Database Info *"
                if os.path.isfile(usp):
                    with open(usp) as infile:
                        page = infile.read()
                else:
                    # Database-info file missing; tell the user how to
                    # generate it.
                    page = (
                        (html_header % word) + "<p>The database info file:"
                        "<p><b>"
                        + usp
                        + "</b>"
                        + "<p>was not found. Run this:"
                        + "<p><b>python dbinfo_html.py</b>"
                        + "<p>to produce it."
                        + html_trailer
                    )
            else:
                # Handle files here.
                word = sp
                page = get_static_page_by_path(usp)
        elif sp.startswith("search"):
            # This doesn't seem to work with MWEs.
            type = "text/html"
            # Pull the "nextWord" value out of the query string.
            parts = (sp.split("?")[1]).split("&")
            word = [
                p.split("=")[1].replace("+", " ")
                for p in parts
                if p.startswith("nextWord")
            ][0]
            page, word = page_from_word(word)
        elif sp.startswith("lookup_"):
            # TODO add a variation of this that takes a non ecoded word or MWE.
            type = "text/html"
            sp = sp[len("lookup_") :]
            page, word = page_from_href(sp)
        elif sp == "start_page":
            # if this is the first request we should display help
            # information, and possibly set a default word.
            type = "text/html"
            page, word = page_from_word("wordnet")
        else:
            type = "text/plain"
            page = "Could not parse request: '%s'" % sp

        # Send result.
        self.send_head(type)
        self.wfile.write(page.encode("utf8"))

    def send_head(self, type=None):
        # Always reports 200; error conditions are reported in the page body.
        self.send_response(200)
        self.send_header("Content-type", type)
        self.end_headers()

    def log_message(self, format, *args):
        # Log to the module-level logfile if one was configured; otherwise
        # messages are silently dropped.
        global logfile

        if logfile:
            logfile.write(
                "%s - - [%s] %s\n"
                % (self.address_string(), self.log_date_time_string(), format % args)
            )
173
+
174
def get_unique_counter_from_url(sp):
    """
    Extract the unique counter from the URL if it has one. Otherwise return
    null.
    """
    # "%23" is the URL-encoding of "#"; the counter follows the last one.
    marker = sp.rfind("%23")
    if marker == -1:
        return None
    return int(sp[marker + 3 :])
184
+
185
+
186
def wnb(port=8000, runBrowser=True, logfilename=None):
    """
    Run NLTK Wordnet Browser Server.

    :param port: The port number for the server to listen on, defaults to
                 8000
    :type port: int

    :param runBrowser: True to start a web browser and point it at the web
                       server.
    :type runBrowser: bool

    :param logfilename: Optional path of a file to append request logs to;
                        if None, log messages are dropped.
    :type logfilename: str
    """
    # The webbrowser module is unpredictable, typically it blocks if it uses
    # a console web browser, and doesn't block if it uses a GUI webbrowser,
    # so we need to force it to have a clear correct behaviour.
    #
    # Normally the server should run for as long as the user wants. they
    # should idealy be able to control this from the UI by closing the
    # window or tab. Second best would be clicking a button to say
    # 'Shutdown' that first shutsdown the server and closes the window or
    # tab, or exits the text-mode browser. Both of these are unfreasable.
    #
    # The next best alternative is to start the server, have it close when
    # it receives SIGTERM (default), and run the browser as well. The user
    # may have to shutdown both programs.
    #
    # Since webbrowser may block, and the webserver will block, we must run
    # them in separate threads.
    #
    global server_mode, logfile
    server_mode = not runBrowser

    # Setup logging.
    if logfilename:
        try:
            logfile = open(logfilename, "a", 1)  # 1 means 'line buffering'
        except OSError as e:
            sys.stderr.write("Couldn't open %s for writing: %s", logfilename, e)
            sys.exit(1)
    else:
        logfile = None

    # Compute URL and start web browser
    url = "http://localhost:" + str(port)

    server_ready = None
    browser_thread = None

    if runBrowser:
        # The browser thread waits on this event so it does not race the
        # server startup.
        server_ready = threading.Event()
        browser_thread = startBrowser(url, server_ready)

    # Start the server.
    server = HTTPServer(("", port), MyServerHandler)
    if logfile:
        logfile.write("NLTK Wordnet browser server running serving: %s\n" % url)
    if runBrowser:
        server_ready.set()

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass

    if runBrowser:
        browser_thread.join()

    if logfile:
        logfile.close()
255
+
256
+
257
def startBrowser(url, server_ready):
    """Open *url* in a web browser once *server_ready* is set.

    The browser is launched from a background thread; the Thread object is
    returned so the caller can join() it.
    """

    def run():
        server_ready.wait()
        # Wait a little bit more, there's still the chance of
        # a race condition.
        time.sleep(1)
        webbrowser.open(url, new=2, autoraise=1)

    browser_thread = threading.Thread(target=run)
    browser_thread.start()
    return browser_thread
267
+
268
+
269
+ #####################################################################
270
+ # Utilities
271
+ #####################################################################
272
+
273
+
274
+ """
275
+ WordNet Browser Utilities.
276
+
277
+ This provides a backend to both wxbrowse and browserver.py.
278
+ """
279
+
280
+ ################################################################################
281
+ #
282
+ # Main logic for wordnet browser.
283
+ #
284
+
285
+ # This is wrapped inside a function since wn is only available if the
286
+ # WordNet corpus is installed.
287
def _pos_tuples():
    """Return the supported POS categories as
    (wordnet constant, abbreviation letter, display name) tuples."""
    return [
        (wn.NOUN, "N", "noun"),
        (wn.VERB, "V", "verb"),
        (wn.ADJ, "J", "adj"),
        (wn.ADV, "R", "adv"),
    ]
294
+
295
+
296
def _pos_match(pos_tuple):
    """
    This function returns the complete pos tuple for the partial pos
    tuple given to it. It attempts to match it against the first
    non-null component of the given pos tuple.
    """
    # Satellite adjectives ('s') are matched as plain adjectives ('a').
    if pos_tuple[0] == "s":
        pos_tuple = ("a", pos_tuple[1], pos_tuple[2])
    # Find the index of the first non-null component...
    for idx, component in enumerate(pos_tuple):
        if component is not None:
            break
    # ...and match candidates on that component only.
    for candidate in _pos_tuples():
        if candidate[idx] == pos_tuple[idx]:
            return candidate
    return None
311
+
312
+
313
# Integer identifiers for the relation types shown in the browser UI.
HYPONYM = 0
HYPERNYM = 1
CLASS_REGIONAL = 2
PART_HOLONYM = 3
PART_MERONYM = 4
ATTRIBUTE = 5
SUBSTANCE_HOLONYM = 6
SUBSTANCE_MERONYM = 7
MEMBER_HOLONYM = 8
MEMBER_MERONYM = 9
VERB_GROUP = 10
INSTANCE_HYPONYM = 12
INSTANCE_HYPERNYM = 13
CAUSE = 14
ALSO_SEE = 15
SIMILAR = 16
ENTAILMENT = 17
ANTONYM = 18
FRAMES = 19
PERTAINYM = 20

# NOTE(review): CLASS_CATEGORY, CLASS_USAGE and CLASS_REGIONAL are each
# assigned more than once; only the last assignment of each survives
# (CLASS_REGIONAL == 23, CLASS_USAGE == 24, CLASS_CATEGORY == 11).
# Confirm which values are intended before relying on them.
CLASS_CATEGORY = 21
CLASS_USAGE = 22
CLASS_REGIONAL = 23
CLASS_USAGE = 24
CLASS_CATEGORY = 11

DERIVATIONALLY_RELATED_FORM = 25

INDIRECT_HYPERNYMS = 26
344
+
345
def lemma_property(word, synset, func):
    """Return the concatenation of ``func(lemma)`` for every lemma of
    *synset* whose name equals *word*.

    ``func`` must return a list; the per-lemma lists are flattened into a
    single list (empty when no lemma matches).
    """

    def flattern(l):
        # Recursively concatenate a list of lists into one flat list.
        if l == []:
            return []
        else:
            return l[0] + flattern(l[1:])

    # Bug fix: Lemma.name is a *method* in NLTK 3.x, so the original
    # comparison `l.name == word` compared a bound method against a string
    # and never matched; it must be called.
    return flattern([func(l) for l in synset.lemmas() if l.name() == word])
353
+
354
+
355
def rebuild_tree(orig_tree):
    """Convert a tree of the form ``(node, child, child, ...)`` into
    ``(node, [subtree, subtree, ...])``, recursively."""
    root, rest = orig_tree[0], orig_tree[1:]
    rebuilt_children = [rebuild_tree(subtree) for subtree in rest]
    return (root, rebuilt_children)
359
+
360
+
361
def get_relations_data(word, synset):
    """
    Get synset relations data for a synset. Note that this doesn't
    yet support things such as full hyponym vs direct hyponym.

    :param word: the current search word (used for lexical relations)
    :type word: str
    :param synset: the synset whose relations are collected
    :return: tuple of (relation-id, display name, related items) triples
    :raises TypeError: for an unknown part of speech
    """
    if synset.pos() == wn.NOUN:
        return (
            (HYPONYM, "Hyponyms", synset.hyponyms()),
            (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()),
            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
            (
                INDIRECT_HYPERNYMS,
                "Indirect hypernyms",
                rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
            ),
            # hypernyms', 'Sister terms',
            (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()),
            # (CLASS_REGIONAL, ['domain term region'], ),
            (PART_HOLONYM, "Part holonyms", synset.part_holonyms()),
            (PART_MERONYM, "Part meronyms", synset.part_meronyms()),
            (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()),
            (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()),
            (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()),
            (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()),
            (ATTRIBUTE, "Attributes", synset.attributes()),
            (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())),
            (
                DERIVATIONALLY_RELATED_FORM,
                "Derivationally related form",
                lemma_property(
                    word, synset, lambda l: l.derivationally_related_forms()
                ),
            ),
        )
    elif synset.pos() == wn.VERB:
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
            (HYPONYM, "Hyponym", synset.hyponyms()),
            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
            (
                INDIRECT_HYPERNYMS,
                "Indirect hypernyms",
                rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
            ),
            (ENTAILMENT, "Entailments", synset.entailments()),
            (CAUSE, "Causes", synset.causes()),
            (ALSO_SEE, "Also see", synset.also_sees()),
            (VERB_GROUP, "Verb Groups", synset.verb_groups()),
            (
                DERIVATIONALLY_RELATED_FORM,
                "Derivationally related form",
                lemma_property(
                    word, synset, lambda l: l.derivationally_related_forms()
                ),
            ),
        )
    # Bug fix: the second comparison was ``synset.pos == wn.ADJ_SAT``
    # (a bound method compared to a string, always False), so satellite
    # adjectives fell through to the TypeError below.
    elif synset.pos() == wn.ADJ or synset.pos() == wn.ADJ_SAT:
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
            (SIMILAR, "Similar to", synset.similar_tos()),
            # Participle of verb - not supported by corpus
            (
                PERTAINYM,
                "Pertainyms",
                lemma_property(word, synset, lambda l: l.pertainyms()),
            ),
            (ATTRIBUTE, "Attributes", synset.attributes()),
            (ALSO_SEE, "Also see", synset.also_sees()),
        )
    elif synset.pos() == wn.ADV:
        # This is weird. adverbs such as 'quick' and 'fast' don't seem
        # to have antonyms returned by the corpus.a
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
        )
    # Derived from adjective - not supported by corpus
    else:
        raise TypeError("Unhandled synset POS type: " + str(synset.pos()))
439
+
440
+
441
# Skeleton of every generated page; the single ``%s`` is filled with
# the current search word (used as the page title).
html_header = """
<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
'http://www.w3.org/TR/html4/strict.dtd'>
<html>
<head>
<meta name='generator' content=
'HTML Tidy for Windows (vers 14 February 2006), see www.w3.org'>
<meta http-equiv='Content-Type' content=
'text/html; charset=us-ascii'>
<title>NLTK Wordnet Browser display of: %s</title></head>
<body bgcolor='#F5F5F5' text='#000000'>
"""
# Closing markup appended after the page body by ``pg``.
html_trailer = """
</body>
</html>
"""

# Inline search-help text shown above an example search result.
explanation = """
<h3>Search Help</h3>
<ul><li>The display below the line is an example of the output the browser
shows you when you enter a search word. The search word was <b>green</b>.</li>
<li>The search result shows for different parts of speech the <b>synsets</b>
i.e. different meanings for the word.</li>
<li>All underlined texts are hypertext links. There are two types of links:
word links and others. Clicking a word link carries out a search for the word
in the Wordnet database.</li>
<li>Clicking a link of the other type opens a display section of data attached
to that link. Clicking that link a second time closes the section again.</li>
<li>Clicking <u>S:</u> opens a section showing the relations for that synset.
</li>
<li>Clicking on a relation name opens a section that displays the associated
synsets.</li>
<li>Type a search word in the <b>Word</b> field and start the search by the
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
</ul>
<hr width='100%'>
"""
478
+
479
+ # HTML oriented functions
480
+
481
+
482
+ def _bold(txt):
483
+ return "<b>%s</b>" % txt
484
+
485
+
486
+ def _center(txt):
487
+ return "<center>%s</center>" % txt
488
+
489
+
490
+ def _hlev(n, txt):
491
+ return "<h%d>%s</h%d>" % (n, txt, n)
492
+
493
+
494
+ def _italic(txt):
495
+ return "<i>%s</i>" % txt
496
+
497
+
498
+ def _li(txt):
499
+ return "<li>%s</li>" % txt
500
+
501
+
502
def pg(word, body):
    """
    Return a HTML page of NLTK Browser format constructed from the
    word and body

    :param word: The word that the body corresponds to
    :type word: str
    :param body: The HTML body corresponding to the word
    :type body: str
    :return: a HTML page for the word-body combination
    :rtype: str
    """
    page_head = html_header % word
    return page_head + body + html_trailer
515
+
516
+
517
+ def _ul(txt):
518
+ return "<ul>" + txt + "</ul>"
519
+
520
+
521
def _abbc(txt):
    """
    abbc = asterisks, breaks, bold, center
    """
    stars = "*" * 10
    breaks = "<br>" * 10
    return _center(_bold(breaks + stars + " " + txt + " " + stars))
526
+
527
+
528
# Pre-built marker appended when a full hyponym listing is truncated.
full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n"
529
+
530
+
531
def _get_synset(synset_key):
    """
    The synset key is the unique name of the synset, this can be
    retrieved via synset.name()
    """
    return wn.synset(synset_key)
537
+
538
+
539
def _collect_one_synset(word, synset, synset_relations):
    """
    Returns the HTML string for one synset or word

    :param word: the current word
    :type word: str
    :param synset: a synset
    :type synset: synset
    :param synset_relations: information about which synset relations
    to display.
    :type synset_relations: dict(synset_key, set(relation_id))
    :return: The HTML string built for this synset
    :rtype: str
    """
    if isinstance(synset, tuple):  # It's a word
        raise NotImplementedError("word not supported by _collect_one_synset")

    typ = "S"
    pos_tuple = _pos_match((synset.pos(), None, None))
    assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
    descr = pos_tuple[2]
    # Deep-copy before toggling so the caller's relation mapping is not
    # mutated by building this link.
    ref = copy.deepcopy(Reference(word, synset_relations))
    ref.toggle_synset(synset)
    synset_label = typ + ";"
    # Embolden the "S;" label when this synset's relations are unfolded.
    if synset.name() in synset_relations:
        synset_label = _bold(synset_label)
    s = f"<li>{make_lookup_link(ref, synset_label)} ({descr}) "

    def format_lemma(w):
        # Lemma names use underscores for spaces; display them as spaces.
        w = w.replace("_", " ")
        if w.lower() == word:
            # The searched word itself is shown in bold, not as a link.
            return _bold(w)
        else:
            ref = Reference(w)
            return make_lookup_link(ref, w)

    s += ", ".join(format_lemma(l.name()) for l in synset.lemmas())

    # Gloss: the definition followed by quoted example sentences.
    gl = " ({}) <i>{}</i> ".format(
        synset.definition(),
        "; ".join('"%s"' % e for e in synset.examples()),
    )
    return s + gl + _synset_relations(word, synset, synset_relations) + "</li>\n"
582
+
583
+
584
def _collect_all_synsets(word, pos, synset_relations=None):
    """
    Return a HTML unordered list of synsets for the given word and
    part of speech.

    :param word: the search word
    :type word: str
    :param pos: WordNet part-of-speech constant
    :param synset_relations: which relations to unfold per synset;
        defaults to an empty mapping.
    :type synset_relations: dict(synset_key, set(relation_id))
    :rtype: str
    """
    # Bug fix: the original used a mutable default argument
    # (``dict()``), shared across all calls.
    if synset_relations is None:
        synset_relations = {}
    return "<ul>%s\n</ul>\n" % "".join(
        _collect_one_synset(word, synset, synset_relations)
        for synset in wn.synsets(word, pos)
    )
593
+
594
+
595
def _synset_relations(word, synset, synset_relations):
    """
    Builds the HTML string for the relations of a synset

    :param word: The current word
    :type word: str
    :param synset: The synset for which we're building the relations.
    :type synset: Synset
    :param synset_relations: synset keys and relation types for which to display relations.
    :type synset_relations: dict(synset_key, set(relation_type))
    :return: The HTML for a synset's relations
    :rtype: str
    """

    # Nothing to render unless this synset's relation panel is unfolded.
    if not synset.name() in synset_relations:
        return ""
    ref = Reference(word, synset_relations)

    def relation_html(r):
        # Render one related item; handles Synsets, Lemmas and
        # (Synset, [children]) trees produced by rebuild_tree.
        if isinstance(r, Synset):
            return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0])
        elif isinstance(r, Lemma):
            return relation_html(r.synset())
        elif isinstance(r, tuple):
            # It's probably a tuple containing a Synset and a list of
            # similar tuples. This forms a tree of synsets.
            return "{}\n<ul>{}</ul>\n".format(
                relation_html(r[0]),
                "".join("<li>%s</li>\n" % relation_html(sr) for sr in r[1]),
            )
        else:
            raise TypeError(
                "r must be a synset, lemma or list, it was: type(r) = %s, r = %s"
                % (type(r), r)
            )

    def make_synset_html(db_name, disp_name, rels):
        # Toggle link for one relation type; expands to a list of
        # related items when the relation is currently unfolded.
        synset_html = "<i>%s</i>\n" % make_lookup_link(
            copy.deepcopy(ref).toggle_synset_relation(synset, db_name),
            disp_name,
        )

        if db_name in ref.synset_relations[synset.name()]:
            synset_html += "<ul>%s</ul>\n" % "".join(
                "<li>%s</li>\n" % relation_html(r) for r in rels
            )

        return synset_html

    # One list item per relation type that actually has data.
    html = (
        "<ul>"
        + "\n".join(
            "<li>%s</li>" % make_synset_html(*rel_data)
            for rel_data in get_relations_data(word, synset)
            if rel_data[2] != []
        )
        + "</ul>"
    )

    return html
655
+
656
+
657
class Reference:
    """
    A reference to a page that may be generated by page_word
    """

    def __init__(self, word, synset_relations=None):
        """
        Build a reference to a new page.

        word is the word or words (separated by commas) for which to
        search for synsets of

        synset_relations is a dictionary of synset keys to sets of
        synset relation identifiers to unfold a list of synset
        relations for.
        """
        self.word = word
        # Bug fix: the original default was the mutable ``dict()``,
        # which toggle_synset()/toggle_synset_relation() mutate — so
        # every Reference created without an explicit mapping shared
        # (and corrupted) one dict.  Use a fresh dict per instance.
        self.synset_relations = {} if synset_relations is None else synset_relations

    def encode(self):
        """
        Encode this reference into a string to be used in a URL.
        """
        # This uses a tuple rather than an object since the python
        # pickle representation is much smaller and there is no need
        # to represent the complete object.
        string = pickle.dumps((self.word, self.synset_relations), -1)
        return base64.urlsafe_b64encode(string).decode()

    @staticmethod
    def decode(string):
        """
        Decode a reference encoded with Reference.encode
        """
        string = base64.urlsafe_b64decode(string.encode())
        word, synset_relations = pickle.loads(string)
        return Reference(word, synset_relations)

    def toggle_synset_relation(self, synset, relation):
        """
        Toggle the display of the relations for the given synset and
        relation type.

        This function will throw a KeyError if the synset is currently
        not being displayed.
        """
        relations = self.synset_relations[synset.name()]
        if relation in relations:
            relations.remove(relation)
        else:
            relations.add(relation)

        return self

    def toggle_synset(self, synset):
        """
        Toggle displaying of the relation types for the given synset
        """
        if synset.name() in self.synset_relations:
            del self.synset_relations[synset.name()]
        else:
            self.synset_relations[synset.name()] = set()

        return self
720
+
721
+
722
def make_lookup_link(ref, label):
    """Return an HTML anchor that triggers a lookup for *ref*,
    displaying *label* as the link text."""
    return '<a href="lookup_{}">{}</a>'.format(ref.encode(), label)
724
+
725
+
726
def page_from_word(word):
    """
    Return a HTML page for the given word.

    :type word: str
    :param word: The currently active word
    :return: A tuple (page,word), where page is the new current HTML page
    to be sent to the browser and
    word is the new current word
    :rtype: A tuple (str,str)
    """
    ref = Reference(word)
    return page_from_reference(ref)
738
+
739
+
740
def page_from_href(href):
    """
    Returns a tuple of the HTML page built and the new current word

    :param href: The hypertext reference to be solved
    :type href: str
    :return: A tuple (page,word), where page is the new current HTML page
    to be sent to the browser and
    word is the new current word
    :rtype: A tuple (str,str)
    """
    decoded_ref = Reference.decode(href)
    return page_from_reference(decoded_ref)
752
+
753
+
754
def page_from_reference(href):
    """
    Returns a tuple of the HTML page built and the new current word

    :param href: The hypertext reference to be solved
    :type href: Reference
    :return: A tuple (page,word), where page is the new current HTML page
    to be sent to the browser and
    word is the new current word
    :rtype: A tuple (str,str)
    """
    word = href.word
    pos_forms = defaultdict(list)
    # Normalise the comma-separated search terms: lower-case, spaces
    # become underscores (WordNet lemma form), empties dropped.
    words = word.split(",")
    words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""]
    if len(words) == 0:
        # No words were found.
        return "", "Please specify a word to search for."

    # This looks up multiple words at once. This is probably not
    # necessary and may lead to problems.
    for w in words:
        for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
            form = wn.morphy(w, pos)
            if form and form not in pos_forms[pos]:
                pos_forms[pos].append(form)
    body = ""
    for pos, pos_str, name in _pos_tuples():
        if pos in pos_forms:
            body += _hlev(3, name) + "\n"
            for w in pos_forms[pos]:
                # Not all words of exc files are in the database, skip
                # to the next word if a KeyError is raised.
                try:
                    body += _collect_all_synsets(w, pos, href.synset_relations)
                except KeyError:
                    pass
    if not body:
        # Bug fix: the original message read "where not found".
        body = "The word or words '%s' were not found in the dictionary." % word
    return body, word
794
+
795
+
796
+ #####################################################################
797
+ # Static pages
798
+ #####################################################################
799
+
800
+
801
def get_static_page_by_path(path):
    """
    Return a static HTML page from the path given.
    """
    # Dispatch table instead of an if/elif ladder; values are zero-arg
    # callables so pages are only built on demand.
    dispatch = {
        "index_2.html": lambda: get_static_index_page(False),
        "index.html": lambda: get_static_index_page(True),
        "NLTK Wordnet Browser Database Info.html": lambda: "Display of Wordnet Database Statistics is not supported",
        "upper_2.html": lambda: get_static_upper_page(False),
        "upper.html": lambda: get_static_upper_page(True),
        "web_help.html": get_static_web_help_page,
        "wx_help.html": get_static_wx_help_page,
    }
    handler = dispatch.get(path)
    if handler is None:
        return "Internal error: Path for static page '%s' is unknown" % path
    return handler()
821
+
822
+
823
def get_static_web_help_page():
    """
    Return the static web help page.
    """
    # The page is a fixed HTML literal; the MfLIeS / "Mcheer up,clear up"
    # hrefs are example-search links interpreted by the browser server.
    return """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
        Copyright (C) 2001-2022 NLTK Project
        Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
        URL: <https://www.nltk.org/>
        For license information, see LICENSE.TXT -->
     <head>
          <meta http-equiv='Content-Type' content='text/html; charset=us-ascii'>
          <title>NLTK Wordnet Browser display of: * Help *</title>
     </head>
<body bgcolor='#F5F5F5' text='#000000'>
<h2>NLTK Wordnet Browser Help</h2>
<p>The NLTK Wordnet Browser is a tool to use in browsing the Wordnet database. It tries to behave like the Wordnet project's web browser but the difference is that the NLTK Wordnet Browser uses a local Wordnet database.
<p><b>You are using the Javascript client part of the NLTK Wordnet BrowseServer.</b> We assume your browser is in tab sheets enabled mode.</p>
<p>For background information on Wordnet, see the Wordnet project home page: <a href="https://wordnet.princeton.edu/"><b> https://wordnet.princeton.edu/</b></a>. For more information on the NLTK project, see the project home:
<a href="https://www.nltk.org/"><b>https://www.nltk.org/</b></a>. To get an idea of what the Wordnet version used by this browser includes choose <b>Show Database Info</b> from the <b>View</b> submenu.</p>
<h3>Word search</h3>
<p>The word to be searched is typed into the <b>New Word</b> field and the search started with Enter or by clicking the <b>Search</b> button. There is no uppercase/lowercase distinction: the search word is transformed to lowercase before the search.</p>
<p>In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by making certain morphological substitutions. Typing <b>fLIeS</b> as an obscure example gives one <a href="MfLIeS">this</a>. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination.</p>
<p>The result of a search is a display of one or more
<b>synsets</b> for every part of speech in which a form of the
search word was found to occur. A synset is a set of words
having the same sense or meaning. Each word in a synset that is
underlined is a hyperlink which can be clicked to trigger an
automatic search for that word.</p>
<p>Every synset has a hyperlink <b>S:</b> at the start of its
display line. Clicking that symbol shows you the name of every
<b>relation</b> that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.</p>
<p>It is also possible to give two or more words or collocations to be searched at the same time separating them with a comma like this <a href="Mcheer up,clear up">cheer up,clear up</a>, for example. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination. As you could see the search result includes the synsets found in the same order than the forms were given in the search field.</p>
<p>
There are also word level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink <b>W:</b> at their beginning. Clicking this link shows more info on the word in question.</p>
<h3>The Buttons</h3>
<p>The <b>Search</b> and <b>Help</b> buttons need no more explanation. </p>
<p>The <b>Show Database Info</b> button shows a collection of Wordnet database statistics.</p>
<p>The <b>Shutdown the Server</b> button is shown for the first client of the BrowServer program i.e. for the client that is automatically launched when the BrowServer is started but not for the succeeding clients in order to protect the server from accidental shutdowns.
</p></body>
</html>
"""
867
+
868
+
869
def get_static_welcome_message():
    """
    Get the static welcome page.
    """
    # Shown as the initial body frame before any search has been made.
    return """
<h3>Search Help</h3>
<ul><li>The display below the line is an example of the output the browser
shows you when you enter a search word. The search word was <b>green</b>.</li>
<li>The search result shows for different parts of speech the <b>synsets</b>
i.e. different meanings for the word.</li>
<li>All underlined texts are hypertext links. There are two types of links:
word links and others. Clicking a word link carries out a search for the word
in the Wordnet database.</li>
<li>Clicking a link of the other type opens a display section of data attached
to that link. Clicking that link a second time closes the section again.</li>
<li>Clicking <u>S:</u> opens a section showing the relations for that synset.</li>
<li>Clicking on a relation name opens a section that displays the associated
synsets.</li>
<li>Type a search word in the <b>Next Word</b> field and start the search by the
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
</ul>
"""
891
+
892
+
893
def get_static_index_page(with_shutdown):
    """
    Get the static index page.

    :param with_shutdown: when True, use the upper-frame variant that
        includes a server-shutdown link.
    :rtype: str
    """
    # ``%%`` in the frameset rows escapes the literal percent signs.
    template = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
<HTML>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
        Copyright (C) 2001-2022 NLTK Project
        Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
        URL: <https://www.nltk.org/>
        For license information, see LICENSE.TXT -->
     <HEAD>
         <TITLE>NLTK Wordnet Browser</TITLE>
     </HEAD>

<frameset rows="7%%,93%%">
    <frame src="%s" name="header">
    <frame src="start_page" name="body">
</frameset>
</HTML>
"""
    upper_link = "upper.html" if with_shutdown else "upper_2.html"
    return template % upper_link
921
+
922
+
923
def get_static_upper_page(with_shutdown):
    """
    Return the upper frame page,

    If with_shutdown is True then a 'shutdown' button is also provided
    to shutdown the server.
    """
    template = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
        Copyright (C) 2001-2022 NLTK Project
        Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
        URL: <https://www.nltk.org/>
        For license information, see LICENSE.TXT -->
     <head>
                <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
                <title>Untitled Document</title>
        </head>
        <body>
        <form method="GET" action="search" target="body">
                Current Word:&nbsp;<input type="text" id="currentWord" size="10" disabled>
                Next Word:&nbsp;<input type="text" id="nextWord" name="nextWord" size="10">
                <input name="searchButton" type="submit" value="Search">
        </form>
        <a target="body" href="web_help.html">Help</a>
        %s

</body>
</html>
"""
    # The shutdown anchor is only offered to the first (local) client.
    shutdown_link = (
        '<a href="SHUTDOWN THE SERVER">Shutdown</a>' if with_shutdown else ""
    )
    return template % shutdown_link
960
+
961
+
962
def usage():
    """
    Display the command line help message.
    """
    # The module docstring documents the command-line interface.
    print(__doc__)
967
+
968
+
969
def app():
    """Command-line entry point: parse options and launch the
    WordNet browser server (``wnb``)."""
    # Parse and interpret options.
    (opts, _) = getopt.getopt(
        argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"]
    )
    port = 8000            # HTTP port the browser server listens on
    server_mode = False    # when True, do not auto-open a web browser
    help_mode = False
    logfilename = None
    for (opt, value) in opts:
        if (opt == "-l") or (opt == "--logfile"):
            logfilename = str(value)
        elif (opt == "-p") or (opt == "--port"):
            port = int(value)
        elif (opt == "-s") or (opt == "--server-mode"):
            server_mode = True
        elif (opt == "-h") or (opt == "--help"):
            help_mode = True

    if help_mode:
        usage()
    else:
        # wnb(port, runBrowser, logfilename): browser launch is
        # suppressed in server mode.
        wnb(port, not server_mode, logfilename)
992
+
993
+
994
+ if __name__ == "__main__":
995
+ app()
996
+
997
+ __all__ = ["app"]
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Combinatory Categorial Grammar
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Combinatory Categorial Grammar.
10
+
11
+ For more information see nltk/doc/contrib/ccg/ccg.pdf
12
+ """
13
+
14
+ from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
15
+ from nltk.ccg.combinator import (
16
+ BackwardApplication,
17
+ BackwardBx,
18
+ BackwardCombinator,
19
+ BackwardComposition,
20
+ BackwardSx,
21
+ BackwardT,
22
+ DirectedBinaryCombinator,
23
+ ForwardApplication,
24
+ ForwardCombinator,
25
+ ForwardComposition,
26
+ ForwardSubstitution,
27
+ ForwardT,
28
+ UndirectedBinaryCombinator,
29
+ UndirectedComposition,
30
+ UndirectedFunctionApplication,
31
+ UndirectedSubstitution,
32
+ UndirectedTypeRaise,
33
+ )
34
+ from nltk.ccg.lexicon import CCGLexicon
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: CCG Categories
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from abc import ABCMeta, abstractmethod
9
+ from functools import total_ordering
10
+
11
+ from nltk.internals import raise_unorderable_types
12
+
13
+
14
@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
    """
    Interface for categories in combinatory grammars.

    Concrete subclasses must set ``self._comparison_key`` (a hashable
    value) in their constructor; equality, ordering and hashing are all
    derived from it below.
    """

    @abstractmethod
    def is_primitive(self):
        """
        Returns true if the category is primitive.
        """

    @abstractmethod
    def is_function(self):
        """
        Returns true if the category is a function application.
        """

    @abstractmethod
    def is_var(self):
        """
        Returns true if the category is a variable.
        """

    @abstractmethod
    def substitute(self, substitutions):
        """
        Takes a set of (var, category) substitutions, and replaces every
        occurrence of the variable with the corresponding category.
        """

    @abstractmethod
    def can_unify(self, other):
        """
        Determines whether two categories can be unified.
        - Returns None if they cannot be unified
        - Returns a list of necessary substitutions if they can.
        """

    # Utility functions: comparison, strings and hashing.
    @abstractmethod
    def __str__(self):
        pass

    def __eq__(self, other):
        # Equal only when both the concrete class and the comparison
        # key match.
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # @total_ordering fills in <=, >, >= from __eq__ and __lt__.
        if not isinstance(other, AbstractCCGCategory):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            # Categories of different classes order by class name.
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        # Hash is computed lazily from the comparison key and cached.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash
82
+
83
class CCGVar(AbstractCCGCategory):
    """
    Class representing a variable CCG category.
    Used for conjunctions (and possibly type-raising, if implemented as a
    unary rule).
    """

    # Class-wide counter used to hand out unique variable ids.
    _maxID = 0

    def __init__(self, prim_only=False):
        """Initialize a variable (selects a new identifier)

        :param prim_only: a boolean that determines whether the variable is
                          restricted to primitives
        :type prim_only: bool
        """
        self._id = self.new_id()
        self._prim_only = prim_only
        self._comparison_key = self._id

    @classmethod
    def new_id(cls):
        """
        A class method allowing generation of unique variable identifiers.
        """
        cls._maxID = cls._maxID + 1
        return cls._maxID - 1

    @classmethod
    def reset_id(cls):
        # Restart identifier generation (mainly for tests/parsers).
        cls._maxID = 0

    def is_primitive(self):
        return False

    def is_function(self):
        return False

    def is_var(self):
        return True

    def substitute(self, substitutions):
        """If there is a substitution corresponding to this variable,
        return the substituted category.
        """
        for (var, cat) in substitutions:
            if var == self:
                return cat
        # No substitution applies; the variable stands unchanged.
        return self

    def can_unify(self, other):
        """If the variable can be replaced with other
        a substitution is returned.
        """
        if other.is_primitive() or not self._prim_only:
            return [(self, other)]
        return None

    def id(self):
        # The unique integer identifier assigned at construction.
        return self._id

    def __str__(self):
        return "_var" + str(self._id)
146
+
147
+
148
@total_ordering
class Direction:
    """
    Class representing the direction of a function application.
    Also contains maintains information as to which combinators
    may be used with the category.
    """

    def __init__(self, dir, restrictions):
        # dir is "/" (forward) or "\\" (backward); restrictions is an
        # iterable of flag characters (see restrs()).
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # Testing the application direction
    def is_forward(self):
        return self._dir == "/"

    def is_backward(self):
        return self._dir == "\\"

    def dir(self):
        return self._dir

    def restrs(self):
        """A list of restrictions on the combinators.
        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        return self._restrs == "_"

    # Unification and substitution of variable directions.
    # Used only if type-raising is implemented as a unary rule, as it
    # must inherit restrictions from the argument category.
    def can_unify(self, other):
        if other.is_variable():
            return [("_", self.restrs())]
        elif self.is_variable():
            return [("_", other.restrs())]
        else:
            if self.restrs() == other.restrs():
                # Identical restrictions unify with no substitutions.
                return []
        return None

    def substitute(self, subs):
        if not self.is_variable():
            return self

        for (var, restrs) in subs:
            if var == "_":
                return Direction(self._dir, restrs)
        return self

    # Testing permitted combinators
    def can_compose(self):
        return "," not in self._restrs

    def can_cross(self):
        return "." not in self._restrs

    def __eq__(self, other):
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # @total_ordering derives the remaining comparison operators.
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        # Lazily computed and cached, mirroring AbstractCCGCategory.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash

    def __str__(self):
        r_str = ""
        for r in self._restrs:
            r_str = r_str + "%s" % r
        return f"{self._dir}{r_str}"

    # The negation operator reverses the direction of the application
    def __neg__(self):
        if self._dir == "/":
            return Direction("\\", self._restrs)
        else:
            return Direction("/", self._restrs)
248
+
249
+
250
class PrimitiveCategory(AbstractCCGCategory):
    """
    Class representing primitive categories.
    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=None):
        # Fix: avoid a mutable default argument (was ``restrictions=[]``);
        # None stands in for "no restrictions" and becomes a fresh list.
        if restrictions is None:
            restrictions = []
        self._categ = categ
        self._restrs = restrictions
        self._comparison_key = (categ, tuple(restrictions))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        """The morphological subcategories, e.g. ['sg'] for singular."""
        return self._restrs

    def categ(self):
        """The base category name, e.g. 'N'."""
        return self._categ

    def substitute(self, subs):
        # Substitution affects only variables; a primitive is unchanged.
        return self

    def can_unify(self, other):
        """Unify with a variable, or with a primitive of the same base
        category whose restrictions include all of ours.

        Returns a (possibly empty) substitution list on success, or None
        when unification is impossible.
        """
        if not other.is_primitive():
            return None
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        if self._restrs == []:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"
301
+
302
+
303
class FunctionalCategory(AbstractCCGCategory):
    """
    Class that represents a function application category.
    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    def substitute(self, subs):
        """Apply a substitution to each constituent and rebuild the
        category.
        """
        sub_res = self._res.substitute(subs)
        sub_dir = self._dir.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        # Fix: the substituted direction was previously computed but then
        # discarded (``self._dir`` was passed instead of ``sub_dir``), so
        # variable direction restrictions were never propagated.
        return FunctionalCategory(sub_res, sub_arg, sub_dir)

    def can_unify(self, other):
        """A function unifies with an unrestricted variable, or with
        another function whose result, direction and (substituted)
        argument all unify.

        Returns the combined substitution list, or None on failure.
        """
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Combinatory Categorial Grammar
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ The lexicon is constructed by calling
10
+ ``lexicon.fromstring(<lexicon string>)``.
11
+
12
+ In order to construct a parser, you also need a rule set.
13
+ The standard English rules are provided in chart as
14
+ ``chart.DefaultRuleSet``.
15
+
16
+ The parser can then be constructed by calling, for example:
17
+ ``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
18
+
19
+ Parsing is then performed by running
20
+ ``parser.parse(<sentence>.split())``.
21
+
22
+ While this returns a list of trees, the default representation
23
+ of the produced trees is not very enlightening, particularly
24
+ given that it uses the same tree class as the CFG parsers.
25
+ It is probably better to call:
26
+ ``chart.printCCGDerivation(<parse tree extracted from list>)``
27
+ which should print a nice representation of the derivation.
28
+
29
+ This entire process is shown far more clearly in the demonstration:
30
+ python chart.py
31
+ """
32
+
33
+ import itertools
34
+
35
+ from nltk.ccg.combinator import *
36
+ from nltk.ccg.combinator import (
37
+ BackwardApplication,
38
+ BackwardBx,
39
+ BackwardComposition,
40
+ BackwardSx,
41
+ BackwardT,
42
+ ForwardApplication,
43
+ ForwardComposition,
44
+ ForwardSubstitution,
45
+ ForwardT,
46
+ )
47
+ from nltk.ccg.lexicon import Token, fromstring
48
+ from nltk.ccg.logic import *
49
+ from nltk.parse import ParserI
50
+ from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
51
+ from nltk.sem.logic import *
52
+ from nltk.tree import Tree
53
+
54
+
55
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    """Edge in a CCG chart: a span plus the derived category and the
    combinator rule that produced it.  Several EdgeI properties are
    meaningless for CCG and are stubbed (rhs, dot, nextsym)."""

    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        # Fix: previously read ``self.span[0]`` — indexing the bound
        # method — which raised TypeError whenever length() was called.
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        # CCG edges are always complete.
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule
101
+
102
+
103
class CCGLeafEdge(EdgeI):
    """
    Class representing leaf edges in a CCG derivation.

    A leaf edge covers exactly one input position and carries the
    lexicon token assigned to the word at that position.
    """

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # ---- span information -------------------------------------------
    def span(self):
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        # A leaf always covers a single token.
        return 1

    # ---- category / token accessors ---------------------------------
    def lhs(self):
        return self._token.categ()

    def categ(self):
        return self._token.categ()

    def token(self):
        return self._token

    def leaf(self):
        return self._leaf

    # ---- EdgeI interface (leaves are always complete) ---------------
    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None
153
+
154
+
155
class BinaryCombinatorRule(AbstractChartRule):
    """
    Class implementing application of a binary combinator to a chart.
    Takes the directed combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    def apply(self, chart, grammar, left_edge, right_edge):
        """Try to combine two adjacent edges; yield each new edge that
        was actually inserted into the chart."""
        # Only touching edges may combine.
        if left_edge.end() != right_edge.start():
            return

        combinator = self._combinator
        if not combinator.can_combine(left_edge.categ(), right_edge.categ()):
            return

        for categ in combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(
                span=(left_edge.start(), right_edge.end()),
                categ=categ,
                rule=combinator,
            )
            if chart.insert(edge, (left_edge, right_edge)):
                yield edge

    def __str__(self):
        # The combinator's representation (used when printing derivations).
        return "%s" % self._combinator
187
+
188
+
189
# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.


class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying forward type raising
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The pair must be adjacent, but the raised edge covers only the
        # left edge's span.
        if left_edge.end() != right_edge.start():
            return

        for categ in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(span=left_edge.span(), categ=categ, rule=self._combinator)
            if chart.insert(edge, (left_edge,)):
                yield edge

    def __str__(self):
        return "%s" % self._combinator
214
+
215
+
216
class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying backward type raising.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The pair must be adjacent, but the raised edge covers only the
        # right edge's span.
        if left_edge.end() != right_edge.start():
            return

        for categ in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(span=right_edge.span(), categ=categ, rule=self._combinator)
            if chart.insert(edge, (right_edge,)):
                yield edge

    def __str__(self):
        return "%s" % self._combinator
237
+
238
+
239
# Common sets of combinators used for English derivations.
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set: application, composition, substitution
# and type-raising together.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
259
+
260
+
261
class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    def parse(self, tokens):
        """Parse ``tokens`` with the CYK algorithm and return an iterator
        over the parse trees rooted in the lexicon's start category.
        """
        # Fix: the original copied ``tokens`` into a list twice; once is
        # enough to allow arbitrary iterables.
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges from the lexicon.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                chart.insert(CCGLeafEdge(index, token, chart.leaf(index)), ())

        # Bottom-up CYK: spans of increasing width; for each span, every
        # split point; combine the two sub-edges with every rule.
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                for part in range(1, span):
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(start, mid)):
                        for right in chart.select(span=(mid, rend)):
                            for rule in self._rules:
                                # Fix: dropped the dead ``edges_added_by_rule``
                                # counter; we only need to exhaust the
                                # generator so the edges enter the chart.
                                for _ in rule.apply(chart, lex, left, right):
                                    pass

        # Output the resulting parses.
        return chart.parses(lex.start())
307
+
308
+
309
class CCGChart(Chart):
    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # Builds the parse trees for an edge.  Unfortunately the trees need
    # to be constructed slightly differently from those in the default
    # Chart class, so the construction is reimplemented here.
    def _trees(self, edge, complete, memo, tree_class):
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word_tree = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf_tree = tree_class((edge.token(), "Leaf"), [word_tree])
            memo[edge] = [leaf_tree]
            return [leaf_tree]

        # Pre-register an empty entry so cyclic child pointers terminate.
        memo[edge] = []
        results = []

        for pointers in self.child_pointer_lists(edge):
            alternatives = [
                self._trees(child, complete, memo, tree_class) for child in pointers
            ]
            for subtrees in itertools.product(*alternatives):
                label = (
                    Token(
                        self._tokens[edge.start() : edge.end()],
                        edge.lhs(),
                        compute_semantics(subtrees, edge),
                    ),
                    str(edge.rule()),
                )
                results.append(tree_class(label, subtrees))

        memo[edge] = results
        return results
346
+
347
+
348
def compute_semantics(children, edge):
    """Compute the semantics of the category produced by ``edge`` from
    the semantics of its ``children``, dispatching on the combinator
    that built the edge.  Returns None when the lexicon carries no
    semantics.
    """
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        # For backward combinators the functor is the right child.
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            # Fix: ``"..." + combinator`` concatenated a non-str object and
            # raised TypeError instead of the intended AssertionError.
            raise AssertionError(f"Unsupported combinator '{combinator}'")
    else:
        # Unary case: type-raising.
        return compute_type_raised_semantics(children[0].label()[0].semantics())
370
+
371
+
372
# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
    """Print the leaves, their lexical categories, and every derivation
    step of a CCG parse tree in an aligned, human-readable layout."""
    # Pair each leaf word with its lexical category and centre both in
    # a shared column.
    word_line = ""
    cat_line = ""
    for (word, category) in tree.pos():
        cat_text = "%s" % category
        width = 2 + max(len(word), len(cat_text))
        pad = width - len(cat_text)
        cat_line += " " * (pad // 2) + cat_text + " " * (pad // 2 + pad % 2)
        pad = width - len(word)
        word_line += " " * (pad // 2) + word + " " * (pad // 2 + pad % 2)
    print(word_line.rstrip())
    print(cat_line.rstrip())

    # Display the derivation steps.
    printCCGTree(0, tree)
397
+
398
+
399
# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    """Recursively print the derivation step for ``tree`` starting at
    column ``lwidth``; return the rightmost column occupied."""
    rwidth = lwidth

    # A bare string is a word: account for its width plus padding.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # The width of this step is the rightmost extent of its children.
    for subtree in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, subtree))

    # A lexical node: print nothing, just account for the space used.
    if not isinstance(tree.label(), tuple):
        return max(
            rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
        )

    (token, op) = tree.label()

    if op == "Leaf":
        return rwidth

    # Rule line: spaces up to lwidth, dashes to rwidth, then the rule name.
    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
    # Resulting category (and semantics, if any), centred beneath.
    result = "%s" % (token.categ(),)
    if token.semantics() is not None:
        result += " {" + str(token.semantics()) + "}"
    print(((rwidth - lwidth - len(result)) // 2 + lwidth) * " " + result)
    return rwidth
434
+
435
+
436
+ ### Demonstration code
437
+
438
+ # Construct the lexicon
439
+ lex = fromstring(
440
+ """
441
+ :- S, NP, N, VP # Primitive categories, S is the target primitive
442
+
443
+ Det :: NP/N # Family of words
444
+ Pro :: NP
445
+ TV :: VP/NP
446
+ Modal :: (S\\NP)/VP # Backslashes need to be escaped
447
+
448
+ I => Pro # Word -> Category mapping
449
+ you => Pro
450
+
451
+ the => Det
452
+
453
+ # Variables have the special keyword 'var'
454
+ # '.' prevents permutation
455
+ # ',' prevents composition
456
+ and => var\\.,var/.,var
457
+
458
+ which => (N\\N)/(S/NP)
459
+
460
+ will => Modal # Categories can be either explicit, or families.
461
+ might => Modal
462
+
463
+ cook => TV
464
+ eat => TV
465
+
466
+ mushrooms => N
467
+ parsnips => N
468
+ bacon => N
469
+ """
470
+ )
471
+
472
+
473
def demo():
    """Parse a short sentence with the default English rule set and
    print every derivation found."""
    parser = CCGChartParser(lex, DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        printCCGDerivation(parse)


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/combinator.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Combinatory Categorial Grammar
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+ """
8
+ CCG Combinators
9
+ """
10
+
11
+ from abc import ABCMeta, abstractmethod
12
+
13
+ from nltk.ccg.api import FunctionalCategory
14
+
15
+
16
class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract class for representing a binary combinator.
    Merely defines functions for checking if the function and argument
    are able to be combined, and what the resulting category is.

    Note that as no assumptions are made as to direction, the unrestricted
    combinators can perform all backward, forward and crossed variations
    of the combinators; these restrictions must be added in the rule
    class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        """Return True when ``function`` may be combined with ``argument``."""

    @abstractmethod
    def combine(self, function, argument):
        """Yield the categories resulting from combining the pair."""
35
+
36
+
37
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Wrapper for the undirected binary combinator.
    It takes left and right categories, and decides which is to be
    the function, and which the argument.
    It then decides whether or not they can be combined.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Return True when the left/right pair may combine."""

    @abstractmethod
    def combine(self, left, right):
        """Yield the categories resulting from combining the pair."""
52
+
53
+
54
class ForwardCombinator(DirectedBinaryCombinator):
    """
    Class representing combinators where the primary functor is on the left.

    Takes an undirected combinator, and a predicate which adds constraints
    restricting the cases in which it may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Both the underlying combinator and the extra predicate must agree.
        return self._combinator.can_combine(left, right) and self._predicate(
            left, right
        )

    def combine(self, left, right):
        # The left category is the functor; delegate unchanged.
        yield from self._combinator.combine(left, right)

    def __str__(self):
        return f">{self._combinator}{self._suffix}"
77
+
78
+
79
class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward equivalent of the ForwardCombinator class.

    The primary functor is on the right, so the operands are swapped
    before being handed to the underlying undirected combinator.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Note the operand swap: the functor is the right-hand category.
        return self._combinator.can_combine(right, left) and self._predicate(
            left, right
        )

    def combine(self, left, right):
        yield from self._combinator.combine(right, left)

    def __str__(self):
        return f"<{self._combinator}{self._suffix}"
99
+
100
+
101
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Class representing function application.
    Implements rules of the form:
      X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        """True when ``function`` is functional and its argument slot
        unifies with ``argument``."""
        if not function.is_function():
            return False

        # Idiom fix: ``not x is None`` rewritten as ``x is not None``.
        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        """Yield the result of applying ``function`` to ``argument``
        (at most one category)."""
        if not function.is_function():
            return

        subs = function.arg().can_unify(argument)
        if subs is None:
            return

        yield function.res().substitute(subs)

    def __str__(self):
        # Function application carries no extra symbol in derivations.
        return ""
127
+
128
+
129
# Predicates for function application.

def forwardOnly(left, right):
    """The left functor must take its argument on the right."""
    return left.dir().is_forward()


def backwardOnly(left, right):
    """The right functor must take its argument on the left."""
    return right.dir().is_backward()
139
+
140
+
141
# Application combinator instances: forward (>) and backward (<)
# function application with the matching direction predicate.
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
144
+
145
+
146
class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.
    Implements rules of the form
      X/Y Y/Z -> X/Z (B>)
    And the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        # Only two functions whose slashes both permit composition may
        # combine, and the outer argument must unify with the inner result.
        if not (function.is_function() and argument.is_function()):
            return False
        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        return function.arg().can_unify(argument.res()) is not None

    def combine(self, function, argument):
        if not (function.is_function() and argument.is_function()):
            return
        if function.dir().can_compose() and argument.dir().can_compose():
            subs = function.arg().can_unify(argument.res())
            if subs is not None:
                yield FunctionalCategory(
                    function.res().substitute(subs),
                    argument.arg().substitute(subs),
                    argument.dir(),
                )

    def __str__(self):
        return "B"
177
+
178
+
179
# Predicates for restricting application of straight composition.
def bothForward(left, right):
    """Both slashes point forward."""
    return left.dir().is_forward() and right.dir().is_forward()


def bothBackward(left, right):
    """Both slashes point backward."""
    return left.dir().is_backward() and right.dir().is_backward()


# Predicates for crossed composition
def crossedDirs(left, right):
    """The functors are crossed inwards: forward on the left, backward
    on the right."""
    return left.dir().is_forward() and right.dir().is_backward()


def backwardBxConstraint(left, right):
    """Predicate restricting backward crossed composition (Bx)."""
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed on BOTH slashes.
    # Fix: ``not left.dir().can_cross() and right.dir().can_cross()``
    # parsed as ``(not left...) and right...``, so a non-crossable left
    # functor was never rejected.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
202
+
203
+
204
# Straight composition combinators
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)

# Backward crossed composition (the "x" suffix marks the crossed variant)
BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint, suffix="x")
212
+
213
+
214
class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.
    Implements rules of the form
      Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        # Substitution requires two functional categories.
        if function.is_primitive() or argument.is_primitive():
            return False

        # These constraints could potentially be moved to the predicates,
        # as they may not be general to all languages.
        if function.res().is_primitive() or not function.arg().is_primitive():
            return False

        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        # The shared argument and the intermediate category must match.
        return (function.res().arg() == argument.res()) and (
            function.arg() == argument.arg()
        )

    def combine(self, function, argument):
        if self.can_combine(function, argument):
            yield FunctionalCategory(
                function.res().res(), argument.arg(), argument.dir()
            )

    def __str__(self):
        return "S"
247
+
248
+
249
# Predicate for forward substitution
def forwardSConstraint(left, right):
    """Both slashes forward, the left result's slash forward, and the
    left argument primitive."""
    return (
        bothForward(left, right)
        and left.res().dir().is_forward()
        and left.arg().is_primitive()
    )
254
+
255
+
256
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    """Predicate restricting backward crossed substitution (Sx)."""
    # Fix: ``not left.dir().can_cross() and right.dir().can_cross()``
    # parsed as ``(not left...) and right...``; BOTH slashes must permit
    # crossing for the crossed variant to apply.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()
263
+
264
+
265
# Instances of substitution combinators (the "x" suffix marks the
# crossed backward variant).
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
268
+
269
+
270
# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    """Follow result categories until the result is no longer a function."""
    current = categ
    while current.res().is_function():
        current = current.res()
    return current
276
+
277
+
278
class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        # Fix: this previously referenced the undefined names ``left`` and
        # ``arg_categ`` (NameError at runtime); the raised category must
        # unify with the innermost argument, mirroring combine() below.
        subs = function.can_unify(arg.arg())
        return subs is not None

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"
320
+
321
+
322
# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
    inner = innermostFunction(right)
    return inner.dir().is_backward() and inner.res().is_primitive()


def backwardTConstraint(left, right):
    inner = innermostFunction(left)
    return inner.dir().is_forward() and inner.res().is_primitive()
335
+
336
+
337
# Instances of type-raising combinators, restricted by direction.
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/lexicon.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Combinatory Categorial Grammar
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+ """
8
+ CCG Lexicons
9
+ """
10
+
11
+ import re
12
+ from collections import defaultdict
13
+
14
+ from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
15
+ from nltk.internals import deprecated
16
+ from nltk.sem.logic import Expression
17
+
18
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts, e.g. "NP[sg]" ->
# ("NP", "[sg]")
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator (slash plus up to two
# modifier characters) from the remainder
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
# ("::" introduces a family, "=>"/"->" a word entry)
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate, e.g. "{\x.eat(x)}"
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line ('#' to end of line)
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
43
+
44
+
45
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the token's syntactic category."""
        return self._categ

    def semantics(self):
        """Return the token's semantic expression, or None."""
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return str(self._categ) + semantics_str

    def __cmp__(self, other):
        # BUG FIX: the original called the Python 2 builtin `cmp`, which no
        # longer exists in Python 3, and passed it three arguments (a tuple
        # and two scalars) instead of two comparable values.  Emulate the
        # old cmp() contract with tuple comparisons.
        if not isinstance(other, Token):
            return -1
        mine = (self._categ, self._semantics)
        theirs = (other.categ(), other.semantics())
        return (mine > theirs) - (mine < theirs)
+ return cmp((self._categ, self._semantics), other.categ(), other.semantics())
78
+
79
+
80
class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """Return all the possible categories for a word."""
        return self._entries[word]

    def start(self):
        """Return the target category for the parser."""
        return self._start

    def __str__(self):
        """String representation of the lexicon.  Used for debugging."""
        text = ""
        need_newline = False
        for ident in sorted(self._entries):
            if need_newline:
                text += "\n"
            text += ident + " => "
            delimiter = ""
            for categ in self._entries[ident]:
                text += delimiter + "%s" % categ
                delimiter = " | "
                # Matches the original: entries with no categories do not
                # trigger a newline before the next identifier.
                need_newline = True
        return text
126
+
127
+
128
+ # -----------
129
+ # Parsing lexicons
130
+ # -----------
131
+
132
+
133
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    Assumes `string` starts with "(".  Returns (bracketed_part, remainder)
    and raises AssertionError when the bracket is never closed.
    """
    remainder = string[1:]
    collected = "("

    while remainder and not remainder.startswith(")"):
        if remainder.startswith("("):
            (inner, remainder) = matchBrackets(remainder)
            collected += inner
        else:
            collected += remainder[0]
            remainder = remainder[1:]
    if remainder.startswith(")"):
        return (collected + ")", remainder[1:])
    raise AssertionError("Unmatched bracket in string '" + string + "'")
151
+
152
+
153
def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string.
    """
    # Bracketed categories are delimited by their matching parenthesis;
    # otherwise the next primitive is peeled off with the regex.
    if not string.startswith("("):
        return NEXTPRIM_RE.match(string).groups()
    return matchBrackets(string)
161
+
162
+
163
def parseApplication(app):
    """
    Parse an application operator.

    `app` is the 3-tuple of groups captured by APP_RE: the slash character
    followed by up to two optional modifier characters.
    """
    return Direction(app[0], app[1:])
168
+
169
+
170
def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category.

    `subscr` is either falsy (no subscripts) or a bracketed list such as
    "[sg,pl]"; returns the list of subscript names.
    """
    return subscr[1:-1].split(",") if subscr else []
177
+
178
+
179
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.

    `chunks` is the (name, subscripts) pair captured by PRIM_RE; returns a
    (category, var) tuple so the variable can be threaded through the
    enclosing parse.
    """
    if chunks[0] == "var":
        # Only an unsubscripted 'var' becomes a CCGVar; a subscripted
        # 'var' deliberately falls through to the lookups below (and will
        # normally fail the assertion there).
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            # Re-use the caller's variable inside the family's category.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )
207
+
208
+
209
def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and returns a tuple with
    (possibly) the CCG variable for the category

    Slashes are folded left-associatively: each iteration wraps the result
    so far in a FunctionalCategory with the next argument.
    """
    (cat_string, rest) = nextCategory(line)

    if cat_string.startswith("("):
        # Bracketed sub-category: strip the parentheses and recurse.
        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)

    else:
        (res, var) = parsePrimitiveCategory(
            PRIM_RE.match(cat_string).groups(), primitives, families, var
        )

    while rest != "":
        # Next operator: slash character plus optional modifiers.
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        if cat_string.startswith("("):
            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
        else:
            (arg, var) = parsePrimitiveCategory(
                PRIM_RE.match(cat_string).groups(), primitives, families, var
            )
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
239
+
240
+
241
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.

    Lines starting with ":-" declare primitive categories (the first one
    is the parser's target); "ident :: category" defines a family; all
    other lines define word entries.  Raises AssertionError when
    `include_semantics` is True but an entry lacks a "{...}" predicate.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    # The first declared primitive is the lexicon's start category.
    return CCGLexicon(primitives[0], primitives, families, entries)
288
+
289
+
290
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    """Deprecated alias for :func:`fromstring`."""
    return fromstring(lex_str)
293
+
294
+
295
+ openccg_tinytiny = fromstring(
296
+ """
297
+ # Rather minimal lexicon based on the openccg `tinytiny' grammar.
298
+ # Only incorporates a subset of the morphological subcategories, however.
299
+ :- S,NP,N # Primitive categories
300
+ Det :: NP/N # Determiners
301
+ Pro :: NP
302
+ IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
303
+ IntransVpl :: S\\NP[pl] # Plural
304
+ TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
305
+ TransVpl :: S\\NP[pl]/NP # Plural
306
+
307
+ the => NP[sg]/N[sg]
308
+ the => NP[pl]/N[pl]
309
+
310
+ I => Pro
311
+ me => Pro
312
+ we => Pro
313
+ us => Pro
314
+
315
+ book => N[sg]
316
+ books => N[pl]
317
+
318
+ peach => N[sg]
319
+ peaches => N[pl]
320
+
321
+ policeman => N[sg]
322
+ policemen => N[pl]
323
+
324
+ boy => N[sg]
325
+ boys => N[pl]
326
+
327
+ sleep => IntransVsg
328
+ sleep => IntransVpl
329
+
330
+ eat => IntransVpl
331
+ eat => TransVpl
332
+ eats => IntransVsg
333
+ eats => TransVsg
334
+
335
+ see => TransVpl
336
+ sees => TransVsg
337
+ """
338
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/logic.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Combinatory Categorial Grammar
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Tanin Na Nakorn (@tanin)
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+ """
8
+ Helper functions for CCG semantics computation
9
+ """
10
+
11
+ from nltk.sem.logic import *
12
+
13
+
14
def compute_type_raised_semantics(semantics):
    """
    Type-raise `semantics`: apply a fresh function variable F to the
    lambda-stripped body, then abstract over F, yielding \\F. ... F(body).

    NOTE(review): when `semantics` is a LambdaExpression this assigns to
    `parent.term`, i.e. it mutates the argument in place — confirm callers
    do not reuse the original expression afterwards.
    """
    core = semantics
    parent = None
    # Peel off the outer lambda binders to reach the innermost body.
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    # Choose a function-variable name that is not already free in the body.
    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    if parent is not None:
        # Splice the wrapped body back under the original binders.
        parent.term = core
    else:
        semantics = core

    return LambdaExpression(var, semantics)
32
+
33
+
34
def compute_function_semantics(function, argument):
    """Combine semantics by function application: function(argument), simplified."""
    return ApplicationExpression(function, argument).simplify()
36
+
37
+
38
def compute_composition_semantics(function, argument):
    """
    Combine semantics by composition (the B combinator):
    \\x. function(argument(x)).
    """
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )
    return LambdaExpression(
        argument.variable, ApplicationExpression(function, argument.term).simplify()
    )
45
+
46
+
47
def compute_substitution_semantics(function, argument):
    """
    Combine semantics by substitution (the S combinator):
    \\x. function(x)(argument(x)).
    """
    assert isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    # Apply the argument to the function's bound variable before feeding
    # the result to the function's inner term.
    new_argument = ApplicationExpression(
        argument, VariableExpression(function.variable)
    ).simplify()
    new_term = ApplicationExpression(function.term, new_argument).simplify()

    return LambdaExpression(function.variable, new_term)
.eggs/nltk-3.8-py3.10.egg/nltk/chat/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chatbots
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Authors: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ # Based on an Eliza implementation by Joe Strout <joe@strout.net>,
9
+ # Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
10
+
11
+ """
12
+ A class for simple chatbots. These perform simple pattern matching on sentences
13
+ typed by users, and respond with automatically generated sentences.
14
+
15
+ These chatbots may not work using the windows command line or the
16
+ windows IDLE GUI.
17
+ """
18
+
19
+ from nltk.chat.eliza import eliza_chat
20
+ from nltk.chat.iesha import iesha_chat
21
+ from nltk.chat.rude import rude_chat
22
+ from nltk.chat.suntsu import suntsu_chat
23
+ from nltk.chat.util import Chat
24
+ from nltk.chat.zen import zen_chat
25
+
26
# (callable, description) pairs for every bundled chatbot, in menu order;
# consumed by chatbots() below.
bots = [
    (eliza_chat, "Eliza (psycho-babble)"),
    (iesha_chat, "Iesha (teen anime junky)"),
    (rude_chat, "Rude (abusive bot)"),
    (suntsu_chat, "Suntsu (Chinese sayings)"),
    (zen_chat, "Zen (gems of wisdom)"),
]
33
+
34
+
35
def chatbots():
    """Prompt the user to choose one of the bundled chatbots and run it."""
    print("Which chatbot would you like to talk to?")
    botcount = len(bots)
    for number, (_, description) in enumerate(bots, start=1):
        print(" %d: %s" % (number, description))

    while True:
        choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
        if choice.isdigit() and (int(choice) - 1) in range(botcount):
            break
        print(" Error: bad chatbot number")

    selected = bots[int(choice) - 1][0]
    selected()
.eggs/nltk-3.8-py3.10.egg/nltk/chat/eliza.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Eliza
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Authors: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ # Based on an Eliza implementation by Joe Strout <joe@strout.net>,
10
+ # Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
11
+
12
+ # a translation table used to convert things you say into things the
13
+ # computer says back, e.g. "I am" --> "you are"
14
+
15
+ from nltk.chat.util import Chat, reflections
16
+
17
+ # a table of response pairs, where each pair consists of a
18
+ # regular expression, and a list of possible responses,
19
+ # with group-macros labelled as %1, %2.
20
+
21
+ pairs = (
22
+ (
23
+ r"I need (.*)",
24
+ (
25
+ "Why do you need %1?",
26
+ "Would it really help you to get %1?",
27
+ "Are you sure you need %1?",
28
+ ),
29
+ ),
30
+ (
31
+ r"Why don\'t you (.*)",
32
+ (
33
+ "Do you really think I don't %1?",
34
+ "Perhaps eventually I will %1.",
35
+ "Do you really want me to %1?",
36
+ ),
37
+ ),
38
+ (
39
+ r"Why can\'t I (.*)",
40
+ (
41
+ "Do you think you should be able to %1?",
42
+ "If you could %1, what would you do?",
43
+ "I don't know -- why can't you %1?",
44
+ "Have you really tried?",
45
+ ),
46
+ ),
47
+ (
48
+ r"I can\'t (.*)",
49
+ (
50
+ "How do you know you can't %1?",
51
+ "Perhaps you could %1 if you tried.",
52
+ "What would it take for you to %1?",
53
+ ),
54
+ ),
55
+ (
56
+ r"I am (.*)",
57
+ (
58
+ "Did you come to me because you are %1?",
59
+ "How long have you been %1?",
60
+ "How do you feel about being %1?",
61
+ ),
62
+ ),
63
+ (
64
+ r"I\'m (.*)",
65
+ (
66
+ "How does being %1 make you feel?",
67
+ "Do you enjoy being %1?",
68
+ "Why do you tell me you're %1?",
69
+ "Why do you think you're %1?",
70
+ ),
71
+ ),
72
+ (
73
+ r"Are you (.*)",
74
+ (
75
+ "Why does it matter whether I am %1?",
76
+ "Would you prefer it if I were not %1?",
77
+ "Perhaps you believe I am %1.",
78
+ "I may be %1 -- what do you think?",
79
+ ),
80
+ ),
81
+ (
82
+ r"What (.*)",
83
+ (
84
+ "Why do you ask?",
85
+ "How would an answer to that help you?",
86
+ "What do you think?",
87
+ ),
88
+ ),
89
+ (
90
+ r"How (.*)",
91
+ (
92
+ "How do you suppose?",
93
+ "Perhaps you can answer your own question.",
94
+ "What is it you're really asking?",
95
+ ),
96
+ ),
97
+ (
98
+ r"Because (.*)",
99
+ (
100
+ "Is that the real reason?",
101
+ "What other reasons come to mind?",
102
+ "Does that reason apply to anything else?",
103
+ "If %1, what else must be true?",
104
+ ),
105
+ ),
106
+ (
107
+ r"(.*) sorry (.*)",
108
+ (
109
+ "There are many times when no apology is needed.",
110
+ "What feelings do you have when you apologize?",
111
+ ),
112
+ ),
113
+ (
114
+ r"Hello(.*)",
115
+ (
116
+ "Hello... I'm glad you could drop by today.",
117
+ "Hi there... how are you today?",
118
+ "Hello, how are you feeling today?",
119
+ ),
120
+ ),
121
+ (
122
+ r"I think (.*)",
123
+ ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
124
+ ),
125
+ (
126
+ r"(.*) friend (.*)",
127
+ (
128
+ "Tell me more about your friends.",
129
+ "When you think of a friend, what comes to mind?",
130
+ "Why don't you tell me about a childhood friend?",
131
+ ),
132
+ ),
133
+ (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
134
+ (
135
+ r"(.*) computer(.*)",
136
+ (
137
+ "Are you really talking about me?",
138
+ "Does it seem strange to talk to a computer?",
139
+ "How do computers make you feel?",
140
+ "Do you feel threatened by computers?",
141
+ ),
142
+ ),
143
+ (
144
+ r"Is it (.*)",
145
+ (
146
+ "Do you think it is %1?",
147
+ "Perhaps it's %1 -- what do you think?",
148
+ "If it were %1, what would you do?",
149
+ "It could well be that %1.",
150
+ ),
151
+ ),
152
+ (
153
+ r"It is (.*)",
154
+ (
155
+ "You seem very certain.",
156
+ "If I told you that it probably isn't %1, what would you feel?",
157
+ ),
158
+ ),
159
+ (
160
+ r"Can you (.*)",
161
+ (
162
+ "What makes you think I can't %1?",
163
+ "If I could %1, then what?",
164
+ "Why do you ask if I can %1?",
165
+ ),
166
+ ),
167
+ (
168
+ r"Can I (.*)",
169
+ (
170
+ "Perhaps you don't want to %1.",
171
+ "Do you want to be able to %1?",
172
+ "If you could %1, would you?",
173
+ ),
174
+ ),
175
+ (
176
+ r"You are (.*)",
177
+ (
178
+ "Why do you think I am %1?",
179
+ "Does it please you to think that I'm %1?",
180
+ "Perhaps you would like me to be %1.",
181
+ "Perhaps you're really talking about yourself?",
182
+ ),
183
+ ),
184
+ (
185
+ r"You\'re (.*)",
186
+ (
187
+ "Why do you say I am %1?",
188
+ "Why do you think I am %1?",
189
+ "Are we talking about you, or me?",
190
+ ),
191
+ ),
192
+ (
193
+ r"I don\'t (.*)",
194
+ ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
195
+ ),
196
+ (
197
+ r"I feel (.*)",
198
+ (
199
+ "Good, tell me more about these feelings.",
200
+ "Do you often feel %1?",
201
+ "When do you usually feel %1?",
202
+ "When you feel %1, what do you do?",
203
+ ),
204
+ ),
205
+ (
206
+ r"I have (.*)",
207
+ (
208
+ "Why do you tell me that you've %1?",
209
+ "Have you really %1?",
210
+ "Now that you have %1, what will you do next?",
211
+ ),
212
+ ),
213
+ (
214
+ r"I would (.*)",
215
+ (
216
+ "Could you explain why you would %1?",
217
+ "Why would you %1?",
218
+ "Who else knows that you would %1?",
219
+ ),
220
+ ),
221
+ (
222
+ r"Is there (.*)",
223
+ (
224
+ "Do you think there is %1?",
225
+ "It's likely that there is %1.",
226
+ "Would you like there to be %1?",
227
+ ),
228
+ ),
229
+ (
230
+ r"My (.*)",
231
+ (
232
+ "I see, your %1.",
233
+ "Why do you say that your %1?",
234
+ "When your %1, how do you feel?",
235
+ ),
236
+ ),
237
+ (
238
+ r"You (.*)",
239
+ (
240
+ "We should be discussing you, not me.",
241
+ "Why do you say that about me?",
242
+ "Why do you care whether I %1?",
243
+ ),
244
+ ),
245
+ (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
246
+ (
247
+ r"I want (.*)",
248
+ (
249
+ "What would it mean to you if you got %1?",
250
+ "Why do you want %1?",
251
+ "What would you do if you got %1?",
252
+ "If you got %1, then what would you do?",
253
+ ),
254
+ ),
255
+ (
256
+ r"(.*) mother(.*)",
257
+ (
258
+ "Tell me more about your mother.",
259
+ "What was your relationship with your mother like?",
260
+ "How do you feel about your mother?",
261
+ "How does this relate to your feelings today?",
262
+ "Good family relations are important.",
263
+ ),
264
+ ),
265
+ (
266
+ r"(.*) father(.*)",
267
+ (
268
+ "Tell me more about your father.",
269
+ "How did your father make you feel?",
270
+ "How do you feel about your father?",
271
+ "Does your relationship with your father relate to your feelings today?",
272
+ "Do you have trouble showing affection with your family?",
273
+ ),
274
+ ),
275
+ (
276
+ r"(.*) child(.*)",
277
+ (
278
+ "Did you have close friends as a child?",
279
+ "What is your favorite childhood memory?",
280
+ "Do you remember any dreams or nightmares from childhood?",
281
+ "Did the other children sometimes tease you?",
282
+ "How do you think your childhood experiences relate to your feelings today?",
283
+ ),
284
+ ),
285
+ (
286
+ r"(.*)\?",
287
+ (
288
+ "Why do you ask that?",
289
+ "Please consider whether you can answer your own question.",
290
+ "Perhaps the answer lies within yourself?",
291
+ "Why don't you tell me?",
292
+ ),
293
+ ),
294
+ (
295
+ r"quit",
296
+ (
297
+ "Thank you for talking with me.",
298
+ "Good-bye.",
299
+ "Thank you, that will be $150. Have a good day!",
300
+ ),
301
+ ),
302
+ (
303
+ r"(.*)",
304
+ (
305
+ "Please tell me more.",
306
+ "Let's change focus a bit... Tell me about your family.",
307
+ "Can you elaborate on that?",
308
+ "Why do you say that %1?",
309
+ "I see.",
310
+ "Very interesting.",
311
+ "%1.",
312
+ "I see. And what does that tell you?",
313
+ "How does that make you feel?",
314
+ "How do you feel when you say that?",
315
+ ),
316
+ ),
317
+ )
318
+
319
+ eliza_chatbot = Chat(pairs, reflections)
320
+
321
+
322
def eliza_chat():
    """Print the Eliza banner and start an interactive therapy session."""
    banner = (
        "Therapist\n---------",
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "Hello. How are you feeling today?",
    )
    for line in banner:
        print(line)

    eliza_chatbot.converse()
330
+
331
+
332
def demo():
    """Entry point for NLTK's demo machinery; runs the Eliza chatbot."""
    eliza_chat()


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chat/iesha.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Teen Chatbot
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ This chatbot is a tongue-in-cheek take on the average teen
10
+ anime junky that frequents YahooMessenger or MSNM.
11
+ All spelling mistakes and flawed grammar are intentional.
12
+ """
13
+
14
+ from nltk.chat.util import Chat
15
+
16
# Word swaps applied to the user's input before it is echoed back in a
# response (see nltk.chat.util.Chat); deliberately written in "netspeak"
# to stay in character.
reflections = {
    "am": "r",
    "was": "were",
    "i": "u",
    "i'd": "u'd",
    "i've": "u'v",
    "ive": "u'v",
    "i'll": "u'll",
    "my": "ur",
    "are": "am",
    "you're": "im",
    "you've": "ive",
    "you'll": "i'll",
    "your": "my",
    "yours": "mine",
    "you": "me",
    "u": "me",
    "ur": "my",
    "urs": "mine",
    "me": "u",
}
37
+
38
+ # Note: %1/2/etc are used without spaces prior as the chat bot seems
39
+ # to add a superfluous space when matching.
40
+
41
+ pairs = (
42
+ (
43
+ r"I\'m (.*)",
44
+ (
45
+ "ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
46
+ "ur%1? neat!! kekeke >_<",
47
+ ),
48
+ ),
49
+ (
50
+ r"(.*) don\'t you (.*)",
51
+ (
52
+ r"u think I can%2??! really?? kekeke \<_\<",
53
+ "what do u mean%2??!",
54
+ "i could if i wanted, don't you think!! kekeke",
55
+ ),
56
+ ),
57
+ (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
58
+ (
59
+ r"do (you|u) (.*)\??",
60
+ ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
61
+ ),
62
+ (
63
+ r"(.*)\?",
64
+ (
65
+ "man u ask lots of questions!",
66
+ "booooring! how old r u??",
67
+ "boooooring!! ur not very fun",
68
+ ),
69
+ ),
70
+ (
71
+ r"(cos|because) (.*)",
72
+ ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
73
+ ),
74
+ (
75
+ r"why can\'t [iI] (.*)",
76
+ (
77
+ "i dunno! y u askin me for!",
78
+ "try harder, silly! hee! ^_^",
79
+ "i dunno! but when i can't%1 i jump up and down!",
80
+ ),
81
+ ),
82
+ (
83
+ r"I can\'t (.*)",
84
+ (
85
+ "u can't what??! >_<",
86
+ "that's ok! i can't%1 either! kekekekeke ^_^",
87
+ "try harder, silly! hee! ^&^",
88
+ ),
89
+ ),
90
+ (
91
+ r"(.*) (like|love|watch) anime",
92
+ (
93
+ "omg i love anime!! do u like sailor moon??! ^&^",
94
+ "anime yay! anime rocks sooooo much!",
95
+ "oooh anime! i love anime more than anything!",
96
+ "anime is the bestest evar! evangelion is the best!",
97
+ "hee anime is the best! do you have ur fav??",
98
+ ),
99
+ ),
100
+ (
101
+ r"I (like|love|watch|play) (.*)",
102
+ ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
103
+ ),
104
+ (
105
+ r"anime sucks|(.*) (hate|detest) anime",
106
+ (
107
+ "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
108
+ "no way! anime is the best ever!",
109
+ "nuh-uh, anime is the best!",
110
+ ),
111
+ ),
112
+ (
113
+ r"(are|r) (you|u) (.*)",
114
+ ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
115
+ ),
116
+ (
117
+ r"what (.*)",
118
+ ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
119
+ ),
120
+ (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
121
+ (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
122
+ (
123
+ r"quit",
124
+ (
125
+ "mom says i have to go eat dinner now :,( bye!!",
126
+ "awww u have to go?? see u next time!!",
127
+ "how to see u again soon! ^_^",
128
+ ),
129
+ ),
130
+ (
131
+ r"(.*)",
132
+ (
133
+ "ur funny! kekeke",
134
+ "boooooring! talk about something else! tell me wat u like!",
135
+ "do u like anime??",
136
+ "do u watch anime? i like sailor moon! ^_^",
137
+ "i wish i was a kitty!! kekekeke ^_^",
138
+ ),
139
+ ),
140
+ )
141
+
142
+ iesha_chatbot = Chat(pairs, reflections)
143
+
144
+
145
+ def iesha_chat():
146
+ print("Iesha the TeenBoT\n---------")
147
+ print("Talk to the program by typing in plain English, using normal upper-")
148
+ print('and lower-case letters and punctuation. Enter "quit" when done.')
149
+ print("=" * 72)
150
+ print("hi!! i'm iesha! who r u??!")
151
+
152
+ iesha_chatbot.converse()
153
+
154
+
155
def demo():
    """Entry point for NLTK's demo machinery; runs the Iesha chatbot."""
    iesha_chat()


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chat/rude.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Rude Chatbot
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from nltk.chat.util import Chat, reflections
9
+
10
+ pairs = (
11
+ (
12
+ r"We (.*)",
13
+ (
14
+ "What do you mean, 'we'?",
15
+ "Don't include me in that!",
16
+ "I wouldn't be so sure about that.",
17
+ ),
18
+ ),
19
+ (
20
+ r"You should (.*)",
21
+ ("Don't tell me what to do, buddy.", "Really? I should, should I?"),
22
+ ),
23
+ (
24
+ r"You\'re(.*)",
25
+ (
26
+ "More like YOU'RE %1!",
27
+ "Hah! Look who's talking.",
28
+ "Come over here and tell me I'm %1.",
29
+ ),
30
+ ),
31
+ (
32
+ r"You are(.*)",
33
+ (
34
+ "More like YOU'RE %1!",
35
+ "Hah! Look who's talking.",
36
+ "Come over here and tell me I'm %1.",
37
+ ),
38
+ ),
39
+ (
40
+ r"I can\'t(.*)",
41
+ (
42
+ "You do sound like the type who can't %1.",
43
+ "Hear that splashing sound? That's my heart bleeding for you.",
44
+ "Tell somebody who might actually care.",
45
+ ),
46
+ ),
47
+ (
48
+ r"I think (.*)",
49
+ (
50
+ "I wouldn't think too hard if I were you.",
51
+ "You actually think? I'd never have guessed...",
52
+ ),
53
+ ),
54
+ (
55
+ r"I (.*)",
56
+ (
57
+ "I'm getting a bit tired of hearing about you.",
58
+ "How about we talk about me instead?",
59
+ "Me, me, me... Frankly, I don't care.",
60
+ ),
61
+ ),
62
+ (
63
+ r"How (.*)",
64
+ (
65
+ "How do you think?",
66
+ "Take a wild guess.",
67
+ "I'm not even going to dignify that with an answer.",
68
+ ),
69
+ ),
70
+ (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
71
+ (
72
+ r"Why (.*)",
73
+ (
74
+ "Why not?",
75
+ "That's so obvious I thought even you'd have already figured it out.",
76
+ ),
77
+ ),
78
+ (
79
+ r"(.*)shut up(.*)",
80
+ (
81
+ "Make me.",
82
+ "Getting angry at a feeble NLP assignment? Somebody's losing it.",
83
+ "Say that again, I dare you.",
84
+ ),
85
+ ),
86
+ (
87
+ r"Shut up(.*)",
88
+ (
89
+ "Make me.",
90
+ "Getting angry at a feeble NLP assignment? Somebody's losing it.",
91
+ "Say that again, I dare you.",
92
+ ),
93
+ ),
94
+ (
95
+ r"Hello(.*)",
96
+ ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
97
+ ),
98
+ (
99
+ r"(.*)",
100
+ (
101
+ "I'm getting bored here. Become more interesting.",
102
+ "Either become more thrilling or get lost, buddy.",
103
+ "Change the subject before I die of fatal boredom.",
104
+ ),
105
+ ),
106
+ )
107
+
108
+ rude_chatbot = Chat(pairs, reflections)
109
+
110
+
111
def rude_chat():
    """Print the usage banner and start an interactive rude-bot session."""
    banner = (
        "Talk to the program by typing in plain English, using normal upper-",
        'and lower-case letters and punctuation. Enter "quit" when done.',
        "=" * 72,
        "I suppose I should say hello.",
    )
    for line in banner:
        print(line)

    rude_chatbot.converse()
118
+
119
+
120
def demo():
    """Entry point for NLTK's demo machinery; runs the rude chatbot."""
    rude_chat()


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chat/suntsu.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Sun Tsu-Bot
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Sam Huston 2007
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Tsu bot responds to all queries with a Sun Tsu sayings
10
+
11
+ Quoted from Sun Tsu's The Art of War
12
+ Translated by LIONEL GILES, M.A. 1910
13
+ Hosted by the Gutenberg Project
14
+ https://www.gutenberg.org/
15
+ """
16
+
17
+ from nltk.chat.util import Chat, reflections
18
+
19
+ pairs = (
20
+ (r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
21
+ (
22
+ r"[^\?]*\?",
23
+ (
24
+ "Please consider whether you can answer your own question.",
25
+ "Ask me no questions!",
26
+ ),
27
+ ),
28
+ (
29
+ r"[0-9]+(.*)",
30
+ (
31
+ "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
32
+ "There are five essentials for victory",
33
+ ),
34
+ ),
35
+ (
36
+ r"[A-Ca-c](.*)",
37
+ (
38
+ "The art of war is of vital importance to the State.",
39
+ "All warfare is based on deception.",
40
+ "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
41
+ "If the campaign is protracted, the resources of the State will not be equal to the strain.",
42
+ "Attack him where he is unprepared, appear where you are not expected.",
43
+ "There is no instance of a country having benefited from prolonged warfare.",
44
+ ),
45
+ ),
46
+ (
47
+ r"[D-Fd-f](.*)",
48
+ (
49
+ "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
50
+ "Bring war material with you from home, but forage on the enemy.",
51
+ "In war, then, let your great object be victory, not lengthy campaigns.",
52
+ "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
53
+ ),
54
+ ),
55
+ (
56
+ r"[G-Ig-i](.*)",
57
+ (
58
+ "Heaven signifies night and day, cold and heat, times and seasons.",
59
+ "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
60
+ "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
61
+ "One may know how to conquer without being able to do it.",
62
+ ),
63
+ ),
64
+ (
65
+ r"[J-Lj-l](.*)",
66
+ (
67
+ "There are three ways in which a ruler can bring misfortune upon his army.",
68
+ "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
69
+ "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
70
+ "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
71
+ "There are five essentials for victory",
72
+ "He will win who knows when to fight and when not to fight.",
73
+ "He will win who knows how to handle both superior and inferior forces.",
74
+ "He will win whose army is animated by the same spirit throughout all its ranks.",
75
+ "He will win who, prepared himself, waits to take the enemy unprepared.",
76
+ "He will win who has military capacity and is not interfered with by the sovereign.",
77
+ ),
78
+ ),
79
+ (
80
+ r"[M-Om-o](.*)",
81
+ (
82
+ "If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
83
+ "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
84
+ "If you know neither the enemy nor yourself, you will succumb in every battle.",
85
+ "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
86
+ ),
87
+ ),
88
+ (
89
+ r"[P-Rp-r](.*)",
90
+ (
91
+ "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
92
+ "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
93
+ "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
94
+ "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
95
+ "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
96
+ ),
97
+ ),
98
+ (
99
+ r"[S-Us-u](.*)",
100
+ (
101
+ "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
102
+ "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
103
+ "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
104
+ "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
105
+ "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
106
+ "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
107
+ ),
108
+ ),
109
+ (
110
+ r"[V-Zv-z](.*)",
111
+ (
112
+ "It is a matter of life and death, a road either to safety or to ruin.",
113
+ "Hold out baits to entice the enemy. Feign disorder, and crush him.",
114
+ "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
115
+ "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
116
+ "So in war, the way is to avoid what is strong and to strike at what is weak.",
117
+ "Just as water retains no constant shape, so in warfare there are no constant conditions.",
118
+ ),
119
+ ),
120
+ (r"(.*)", ("Your statement insults me.", "")),
121
+ )
122
+
123
+ suntsu_chatbot = Chat(pairs, reflections)
124
+
125
+
126
+ def suntsu_chat():
127
+ print("Talk to the program by typing in plain English, using normal upper-")
128
+ print('and lower-case letters and punctuation. Enter "quit" when done.')
129
+ print("=" * 72)
130
+ print("You seek enlightenment?")
131
+
132
+ suntsu_chatbot.converse()
133
+
134
+
135
+ def demo():
136
+ suntsu_chat()
137
+
138
+
139
+ if __name__ == "__main__":
140
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chat/util.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chatbot Utilities
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Authors: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ # Based on an Eliza implementation by Joe Strout <joe@strout.net>,
9
+ # Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
10
+
11
+ import random
12
+ import re
13
+
14
+ reflections = {
15
+ "i am": "you are",
16
+ "i was": "you were",
17
+ "i": "you",
18
+ "i'm": "you are",
19
+ "i'd": "you would",
20
+ "i've": "you have",
21
+ "i'll": "you will",
22
+ "my": "your",
23
+ "you are": "I am",
24
+ "you were": "I was",
25
+ "you've": "I have",
26
+ "you'll": "I will",
27
+ "your": "my",
28
+ "yours": "mine",
29
+ "you": "me",
30
+ "me": "you",
31
+ }
32
+
33
+
34
+ class Chat:
35
+ def __init__(self, pairs, reflections={}):
36
+ """
37
+ Initialize the chatbot. Pairs is a list of patterns and responses. Each
38
+ pattern is a regular expression matching the user's statement or question,
39
+ e.g. r'I like (.*)'. For each such pattern a list of possible responses
40
+ is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material
41
+ which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
42
+ the numbered positions in the responses, e.g. %1.
43
+
44
+ :type pairs: list of tuple
45
+ :param pairs: The patterns and responses
46
+ :type reflections: dict
47
+ :param reflections: A mapping between first and second person expressions
48
+ :rtype: None
49
+ """
50
+
51
+ self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
52
+ self._reflections = reflections
53
+ self._regex = self._compile_reflections()
54
+
55
+ def _compile_reflections(self):
56
+ sorted_refl = sorted(self._reflections, key=len, reverse=True)
57
+ return re.compile(
58
+ r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
59
+ )
60
+
61
+ def _substitute(self, str):
62
+ """
63
+ Substitute words in the string, according to the specified reflections,
64
+ e.g. "I'm" -> "you are"
65
+
66
+ :type str: str
67
+ :param str: The string to be mapped
68
+ :rtype: str
69
+ """
70
+
71
+ return self._regex.sub(
72
+ lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
73
+ )
74
+
75
+ def _wildcards(self, response, match):
76
+ pos = response.find("%")
77
+ while pos >= 0:
78
+ num = int(response[pos + 1 : pos + 2])
79
+ response = (
80
+ response[:pos]
81
+ + self._substitute(match.group(num))
82
+ + response[pos + 2 :]
83
+ )
84
+ pos = response.find("%")
85
+ return response
86
+
87
+ def respond(self, str):
88
+ """
89
+ Generate a response to the user input.
90
+
91
+ :type str: str
92
+ :param str: The string to be mapped
93
+ :rtype: str
94
+ """
95
+
96
+ # check each pattern
97
+ for (pattern, response) in self._pairs:
98
+ match = pattern.match(str)
99
+
100
+ # did the pattern match?
101
+ if match:
102
+ resp = random.choice(response) # pick a random response
103
+ resp = self._wildcards(resp, match) # process wildcards
104
+
105
+ # fix munged punctuation at the end
106
+ if resp[-2:] == "?.":
107
+ resp = resp[:-2] + "."
108
+ if resp[-2:] == "??":
109
+ resp = resp[:-2] + "?"
110
+ return resp
111
+
112
+ # Hold a conversation with a chatbot
113
+ def converse(self, quit="quit"):
114
+ user_input = ""
115
+ while user_input != quit:
116
+ user_input = quit
117
+ try:
118
+ user_input = input(">")
119
+ except EOFError:
120
+ print(user_input)
121
+ if user_input:
122
+ while user_input[-1] in "!.":
123
+ user_input = user_input[:-1]
124
+ print(self.respond(user_input))
.eggs/nltk-3.8-py3.10.egg/nltk/chat/zen.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Zen Chatbot
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Amy Holland <amyrh@csse.unimelb.edu.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Zen Chatbot talks in gems of Zen wisdom.
10
+
11
+ This is a sample conversation with Zen Chatbot:
12
+ ZC: Welcome, my child.
13
+ me: Good afternoon.
14
+ ZC: Ask the question you have come to ask.
15
+ me: How can I achieve enlightenment?
16
+ ZC: How do you suppose?
17
+ me: Through meditation.
18
+ ZC: Form is emptiness, and emptiness form.
19
+ me: How can I empty my mind of worldly troubles?
20
+ ZC: Will an answer to that really help in your search for enlightenment?
21
+ me: Yes.
22
+ ZC: It is better to be right than to be certain.
23
+ me: I seek truth and wisdom.
24
+ ZC: The search for truth is a long journey.
25
+ me: Are you sure?
26
+ ZC: Maybe sure, maybe not sure.
27
+
28
+
29
+ The chatbot structure is based on that of chat.eliza. Thus, it uses
30
+ a translation table to convert from question to response
31
+ i.e. "I am" --> "you are"
32
+
33
+ Of course, since Zen Chatbot does not understand the meaning of any words,
34
+ responses are very limited. Zen Chatbot will usually answer very vaguely, or
35
+ respond to a question by asking a different question, in much the same way
36
+ as Eliza.
37
+ """
38
+
39
+ from nltk.chat.util import Chat, reflections
40
+
41
+ # responses are matched top to bottom, so non-specific matches occur later
42
+ # for each match, a list of possible responses is provided
43
+ responses = (
44
+ # Zen Chatbot opens with the line "Welcome, my child." The usual
45
+ # response will be a greeting problem: 'good' matches "good morning",
46
+ # "good day" etc, but also "good grief!" and other sentences starting
47
+ # with the word 'good' that may not be a greeting
48
+ (
49
+ r"(hello(.*))|(good [a-zA-Z]+)",
50
+ (
51
+ "The path to enlightenment is often difficult to see.",
52
+ "Greetings. I sense your mind is troubled. Tell me of your troubles.",
53
+ "Ask the question you have come to ask.",
54
+ "Hello. Do you seek englightenment?",
55
+ ),
56
+ ),
57
+ # "I need" and "I want" can be followed by a thing (eg 'help')
58
+ # or an action (eg 'to see you')
59
+ #
60
+ # This is a problem with this style of response -
61
+ # person: "I need you"
62
+ # chatbot: "me can be achieved by hard work and dedication of the mind"
63
+ # i.e. 'you' is not really a thing that can be mapped this way, so this
64
+ # interpretation only makes sense for some inputs
65
+ #
66
+ (
67
+ r"i need (.*)",
68
+ (
69
+ "%1 can be achieved by hard work and dedication of the mind.",
70
+ "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
71
+ "Focus your mind on%1, and you will find what you need.",
72
+ ),
73
+ ),
74
+ (
75
+ r"i want (.*)",
76
+ (
77
+ "Desires of the heart will distract you from the path to enlightenment.",
78
+ "Will%1 help you attain enlightenment?",
79
+ "Is%1 a desire of the mind, or of the heart?",
80
+ ),
81
+ ),
82
+ # why questions are separated into three types:
83
+ # "why..I" e.g. "why am I here?" "Why do I like cake?"
84
+ # "why..you" e.g. "why are you here?" "Why won't you tell me?"
85
+ # "why..." e.g. "Why is the sky blue?"
86
+ # problems:
87
+ # person: "Why can't you tell me?"
88
+ # chatbot: "Are you sure I tell you?"
89
+ # - this style works for positives (e.g. "why do you like cake?")
90
+ # but does not work for negatives (e.g. "why don't you like cake?")
91
+ (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
92
+ (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
93
+ (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
94
+ # e.g. "are you listening?", "are you a duck"
95
+ (
96
+ r"are you (.*)\?",
97
+ ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
98
+ ),
99
+ # e.g. "am I a duck?", "am I going to die?"
100
+ (
101
+ r"am i (.*)\?",
102
+ ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
103
+ ),
104
+ # what questions, e.g. "what time is it?"
105
+ # problems:
106
+ # person: "What do you want?"
107
+ # chatbot: "Seek truth, not what do me want."
108
+ (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
109
+ # how questions, e.g. "how do you do?"
110
+ (
111
+ r"how (.*)\?",
112
+ (
113
+ "How do you suppose?",
114
+ "Will an answer to that really help in your search for enlightenment?",
115
+ "Ask yourself not how, but why.",
116
+ ),
117
+ ),
118
+ # can questions, e.g. "can you run?", "can you come over here please?"
119
+ (
120
+ r"can you (.*)\?",
121
+ (
122
+ "I probably can, but I may not.",
123
+ "Maybe I can%1, and maybe I cannot.",
124
+ "I can do all, and I can do nothing.",
125
+ ),
126
+ ),
127
+ # can questions, e.g. "can I have some cake?", "can I know truth?"
128
+ (
129
+ r"can i (.*)\?",
130
+ (
131
+ "You can%1 if you believe you can%1, and have a pure spirit.",
132
+ "Seek truth and you will know if you can%1.",
133
+ ),
134
+ ),
135
+ # e.g. "It is raining" - implies the speaker is certain of a fact
136
+ (
137
+ r"it is (.*)",
138
+ (
139
+ "How can you be certain that%1, when you do not even know yourself?",
140
+ "Whether it is%1 or not does not change the way the world is.",
141
+ ),
142
+ ),
143
+ # e.g. "is there a doctor in the house?"
144
+ (
145
+ r"is there (.*)\?",
146
+ ("There is%1 if you believe there is.", "It is possible that there is%1."),
147
+ ),
148
+ # e.g. "is it possible?", "is this true?"
149
+ (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
150
+ # non-specific question
151
+ (
152
+ r"(.*)\?",
153
+ (
154
+ "Do you think %1?",
155
+ "You seek the truth. Does the truth seek you?",
156
+ "If you intentionally pursue the answers to your questions, the answers become hard to see.",
157
+ "The answer to your question cannot be told. It must be experienced.",
158
+ ),
159
+ ),
160
+ # expression of hate of form "I hate you" or "Kelly hates cheese"
161
+ (
162
+ r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
163
+ (
164
+ "Perhaps it is not about hating %2, but about hate from within.",
165
+ "Weeds only grow when we dislike them",
166
+ "Hate is a very strong emotion.",
167
+ ),
168
+ ),
169
+ # statement containing the word 'truth'
170
+ (
171
+ r"(.*) truth(.*)",
172
+ (
173
+ "Seek truth, and truth will seek you.",
174
+ "Remember, it is not the spoon which bends - only yourself.",
175
+ "The search for truth is a long journey.",
176
+ ),
177
+ ),
178
+ # desire to do an action
179
+ # e.g. "I want to go shopping"
180
+ (
181
+ r"i want to (.*)",
182
+ ("You may %1 if your heart truly desires to.", "You may have to %1."),
183
+ ),
184
+ # desire for an object
185
+ # e.g. "I want a pony"
186
+ (
187
+ r"i want (.*)",
188
+ (
189
+ "Does your heart truly desire %1?",
190
+ "Is this a desire of the heart, or of the mind?",
191
+ ),
192
+ ),
193
+ # e.g. "I can't wait" or "I can't do this"
194
+ (
195
+ r"i can\'t (.*)",
196
+ (
197
+ "What we can and can't do is a limitation of the mind.",
198
+ "There are limitations of the body, and limitations of the mind.",
199
+ "Have you tried to%1 with a clear mind?",
200
+ ),
201
+ ),
202
+ # "I think.." indicates uncertainty. e.g. "I think so."
203
+ # problem: exceptions...
204
+ # e.g. "I think, therefore I am"
205
+ (
206
+ r"i think (.*)",
207
+ (
208
+ "Uncertainty in an uncertain world.",
209
+ "Indeed, how can we be certain of anything in such uncertain times.",
210
+ "Are you not, in fact, certain that%1?",
211
+ ),
212
+ ),
213
+ # "I feel...emotions/sick/light-headed..."
214
+ (
215
+ r"i feel (.*)",
216
+ (
217
+ "Your body and your emotions are both symptoms of your mind."
218
+ "What do you believe is the root of such feelings?",
219
+ "Feeling%1 can be a sign of your state-of-mind.",
220
+ ),
221
+ ),
222
+ # exclaimation mark indicating emotion
223
+ # e.g. "Wow!" or "No!"
224
+ (
225
+ r"(.*)!",
226
+ (
227
+ "I sense that you are feeling emotional today.",
228
+ "You need to calm your emotions.",
229
+ ),
230
+ ),
231
+ # because [statement]
232
+ # e.g. "because I said so"
233
+ (
234
+ r"because (.*)",
235
+ (
236
+ "Does knowning the reasons behind things help you to understand"
237
+ " the things themselves?",
238
+ "If%1, what else must be true?",
239
+ ),
240
+ ),
241
+ # yes or no - raise an issue of certainty/correctness
242
+ (
243
+ r"(yes)|(no)",
244
+ (
245
+ "Is there certainty in an uncertain world?",
246
+ "It is better to be right than to be certain.",
247
+ ),
248
+ ),
249
+ # sentence containing word 'love'
250
+ (
251
+ r"(.*)love(.*)",
252
+ (
253
+ "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
254
+ "Free love!",
255
+ ),
256
+ ),
257
+ # sentence containing word 'understand' - r
258
+ (
259
+ r"(.*)understand(.*)",
260
+ (
261
+ "If you understand, things are just as they are;"
262
+ " if you do not understand, things are just as they are.",
263
+ "Imagination is more important than knowledge.",
264
+ ),
265
+ ),
266
+ # 'I', 'me', 'my' - person is talking about themself.
267
+ # this breaks down when words contain these - eg 'Thyme', 'Irish'
268
+ (
269
+ r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
270
+ (
271
+ "'I', 'me', 'my'... these are selfish expressions.",
272
+ "Have you ever considered that you might be a selfish person?",
273
+ "Try to consider others, not just yourself.",
274
+ "Think not just of yourself, but of others.",
275
+ ),
276
+ ),
277
+ # 'you' starting a sentence
278
+ # e.g. "you stink!"
279
+ (
280
+ r"you (.*)",
281
+ ("My path is not of concern to you.", "I am but one, and you but one more."),
282
+ ),
283
+ # say goodbye with some extra Zen wisdom.
284
+ (
285
+ r"exit",
286
+ (
287
+ "Farewell. The obstacle is the path.",
288
+ "Farewell. Life is a journey, not a destination.",
289
+ "Good bye. We are cups, constantly and quietly being filled."
290
+ "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
291
+ ),
292
+ ),
293
+ # fall through case -
294
+ # when stumped, respond with generic zen wisdom
295
+ #
296
+ (
297
+ r"(.*)",
298
+ (
299
+ "When you're enlightened, every word is wisdom.",
300
+ "Random talk is useless.",
301
+ "The reverse side also has a reverse side.",
302
+ "Form is emptiness, and emptiness is form.",
303
+ "I pour out a cup of water. Is the cup empty?",
304
+ ),
305
+ ),
306
+ )
307
+
308
+ zen_chatbot = Chat(responses, reflections)
309
+
310
+
311
+ def zen_chat():
312
+ print("*" * 75)
313
+ print("Zen Chatbot!".center(75))
314
+ print("*" * 75)
315
+ print('"Look beyond mere words and letters - look into your mind"'.center(75))
316
+ print("* Talk your way to truth with Zen Chatbot.")
317
+ print("* Type 'quit' when you have had enough.")
318
+ print("*" * 75)
319
+ print("Welcome, my child.")
320
+
321
+ zen_chatbot.converse()
322
+
323
+
324
+ def demo():
325
+ zen_chat()
326
+
327
+
328
+ if __name__ == "__main__":
329
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/__init__.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chunkers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+ #
9
+
10
+ """
11
+ Classes and interfaces for identifying non-overlapping linguistic
12
+ groups (such as base noun phrases) in unrestricted text. This task is
13
+ called "chunk parsing" or "chunking", and the identified groups are
14
+ called "chunks". The chunked text is represented using a shallow
15
+ tree called a "chunk structure." A chunk structure is a tree
16
+ containing tokens and chunks, where each chunk is a subtree containing
17
+ only tokens. For example, the chunk structure for base noun phrase
18
+ chunks in the sentence "I saw the big dog on the hill" is::
19
+
20
+ (SENTENCE:
21
+ (NP: <I>)
22
+ <saw>
23
+ (NP: <the> <big> <dog>)
24
+ <on>
25
+ (NP: <the> <hill>))
26
+
27
+ To convert a chunk structure back to a list of tokens, simply use the
28
+ chunk structure's ``leaves()`` method.
29
+
30
+ This module defines ``ChunkParserI``, a standard interface for
31
+ chunking texts; and ``RegexpChunkParser``, a regular-expression based
32
+ implementation of that interface. It also defines ``ChunkScore``, a
33
+ utility class for scoring chunk parsers.
34
+
35
+ RegexpChunkParser
36
+ =================
37
+
38
+ ``RegexpChunkParser`` is an implementation of the chunk parser interface
39
+ that uses regular-expressions over tags to chunk a text. Its
40
+ ``parse()`` method first constructs a ``ChunkString``, which encodes a
41
+ particular chunking of the input text. Initially, nothing is
42
+ chunked. ``parse.RegexpChunkParser`` then applies a sequence of
43
+ ``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
44
+ the chunking that it encodes. Finally, the ``ChunkString`` is
45
+ transformed back into a chunk structure, which is returned.
46
+
47
+ ``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
48
+ For example, you can use an ``RegexpChunkParser`` to chunk the noun
49
+ phrases in a text, or the verb phrases in a text; but you can not
50
+ use it to simultaneously chunk both noun phrases and verb phrases in
51
+ the same text. (This is a limitation of ``RegexpChunkParser``, not of
52
+ chunk parsers in general.)
53
+
54
+ RegexpChunkRules
55
+ ----------------
56
+
57
+ A ``RegexpChunkRule`` is a transformational rule that updates the
58
+ chunking of a text by modifying its ``ChunkString``. Each
59
+ ``RegexpChunkRule`` defines the ``apply()`` method, which modifies
60
+ the chunking encoded by a ``ChunkString``. The
61
+ ``RegexpChunkRule`` class itself can be used to implement any
62
+ transformational rule based on regular expressions. There are
63
+ also a number of subclasses, which can be used to implement
64
+ simpler types of rules:
65
+
66
+ - ``ChunkRule`` chunks anything that matches a given regular
67
+ expression.
68
+ - ``StripRule`` strips anything that matches a given regular
69
+ expression.
70
+ - ``UnChunkRule`` will un-chunk any chunk that matches a given
71
+ regular expression.
72
+ - ``MergeRule`` can be used to merge two contiguous chunks.
73
+ - ``SplitRule`` can be used to split a single chunk into two
74
+ smaller chunks.
75
+ - ``ExpandLeftRule`` will expand a chunk to incorporate new
76
+ unchunked material on the left.
77
+ - ``ExpandRightRule`` will expand a chunk to incorporate new
78
+ unchunked material on the right.
79
+
80
+ Tag Patterns
81
+ ~~~~~~~~~~~~
82
+
83
+ A ``RegexpChunkRule`` uses a modified version of regular
84
+ expression patterns, called "tag patterns". Tag patterns are
85
+ used to match sequences of tags. Examples of tag patterns are::
86
+
87
+ r'(<DT>|<JJ>|<NN>)+'
88
+ r'<NN>+'
89
+ r'<NN.*>'
90
+
91
+ The differences between regular expression patterns and tag
92
+ patterns are:
93
+
94
+ - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
95
+ ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
96
+ ``'<NN'`` followed by one or more repetitions of ``'>'``.
97
+ - Whitespace in tag patterns is ignored. So
98
+ ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
99
+ - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
100
+ ``'<NN.*>'`` matches any single tag starting with ``'NN'``.
101
+
102
+ The function ``tag_pattern2re_pattern`` can be used to transform
103
+ a tag pattern to an equivalent regular expression pattern.
104
+
105
+ Efficiency
106
+ ----------
107
+
108
+ Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
109
+ rate of about 300 tokens/second, with a moderately complex rule set.
110
+
111
+ There may be problems if ``RegexpChunkParser`` is used with more than
112
+ 5,000 tokens at a time. In particular, evaluation of some regular
113
+ expressions may cause the Python regular expression engine to
114
+ exceed its maximum recursion depth. We have attempted to minimize
115
+ these problems, but it is impossible to avoid them completely. We
116
+ therefore recommend that you apply the chunk parser to a single
117
+ sentence at a time.
118
+
119
+ Emacs Tip
120
+ ---------
121
+
122
+ If you evaluate the following elisp expression in emacs, it will
123
+ colorize a ``ChunkString`` when you use an interactive python shell
124
+ with emacs or xemacs ("C-c !")::
125
+
126
+ (let ()
127
+ (defconst comint-mode-font-lock-keywords
128
+ '(("<[^>]+>" 0 'font-lock-reference-face)
129
+ ("[{}]" 0 'font-lock-function-name-face)))
130
+ (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
131
+
132
+ You can evaluate this code by copying it to a temporary buffer,
133
+ placing the cursor after the last close parenthesis, and typing
134
+ "``C-x C-e``". You should evaluate it before running the interactive
135
+ session. The change will last until you close emacs.
136
+
137
+ Unresolved Issues
138
+ -----------------
139
+
140
+ If we use the ``re`` module for regular expressions, Python's
141
+ regular expression engine generates "maximum recursion depth
142
+ exceeded" errors when processing very large texts, even for
143
+ regular expressions that should not require any recursion. We
144
+ therefore use the ``pre`` module instead. But note that ``pre``
145
+ does not include Unicode support, so this module will not work
146
+ with unicode strings. Note also that ``pre`` regular expressions
147
+ are not quite as advanced as ``re`` ones (e.g., no leftward
148
+ zero-length assertions).
149
+
150
+ :type CHUNK_TAG_PATTERN: regexp
151
+ :var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
152
+ pattern is valid.
153
+ """
154
+
155
+ from nltk.chunk.api import ChunkParserI
156
+ from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
157
+ from nltk.chunk.util import (
158
+ ChunkScore,
159
+ accuracy,
160
+ conllstr2tree,
161
+ conlltags2tree,
162
+ ieerstr2tree,
163
+ tagstr2tree,
164
+ tree2conllstr,
165
+ tree2conlltags,
166
+ )
167
+ from nltk.data import load
168
+
169
+ # Standard treebank POS tagger
170
+ _BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle"
171
+ _MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"
172
+
173
+
174
+ def ne_chunk(tagged_tokens, binary=False):
175
+ """
176
+ Use NLTK's currently recommended named entity chunker to
177
+ chunk the given list of tagged tokens.
178
+ """
179
+ if binary:
180
+ chunker_pickle = _BINARY_NE_CHUNKER
181
+ else:
182
+ chunker_pickle = _MULTICLASS_NE_CHUNKER
183
+ chunker = load(chunker_pickle)
184
+ return chunker.parse(tagged_tokens)
185
+
186
+
187
+ def ne_chunk_sents(tagged_sentences, binary=False):
188
+ """
189
+ Use NLTK's currently recommended named entity chunker to chunk the
190
+ given list of tagged sentences, each consisting of a list of tagged tokens.
191
+ """
192
+ if binary:
193
+ chunker_pickle = _BINARY_NE_CHUNKER
194
+ else:
195
+ chunker_pickle = _MULTICLASS_NE_CHUNKER
196
+ chunker = load(chunker_pickle)
197
+ return chunker.parse_sents(tagged_sentences)
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/api.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chunk parsing API
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ ##//////////////////////////////////////////////////////
10
+ ## Chunk Parser Interface
11
+ ##//////////////////////////////////////////////////////
12
+
13
+ from nltk.chunk.util import ChunkScore
14
+ from nltk.internals import deprecated
15
+ from nltk.parse import ParserI
16
+
17
+
18
class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text. Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases. Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        :raise NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError()

    @deprecated("Use accuracy(gold) instead.")
    def evaluate(self, gold):
        # Deprecated alias retained for backward compatibility.
        return self.accuracy(gold)

    def accuracy(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object
        reflecting the performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        # correct.leaves() yields the sentence with chunking stripped;
        # re-parse it and score the guess against the gold tree.
        for correct in gold:
            chunkscore.score(correct, self.parse(correct.leaves()))
        return chunkscore
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chunk parsing API
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Named entity chunker
10
+ """
11
+
12
+ import os
13
+ import pickle
14
+ import re
15
+ from xml.etree import ElementTree as ET
16
+
17
+ from nltk.tag import ClassifierBasedTagger, pos_tag
18
+
19
+ try:
20
+ from nltk.classify import MaxentClassifier
21
+ except ImportError:
22
+ pass
23
+
24
+ from nltk.chunk.api import ChunkParserI
25
+ from nltk.chunk.util import ChunkScore
26
+ from nltk.data import find
27
+ from nltk.tokenize import word_tokenize
28
+ from nltk.tree import Tree
29
+
30
+
31
class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.

    Assigns each (word, pos) token an IOB tag using a maximum-entropy
    classifier over contextual features of the token, its neighbours,
    and the tags already assigned (``history``).
    """

    def __init__(self, train):
        ClassifierBasedTagger.__init__(
            self, train=train, classifier_builder=self._classifier_builder
        )

    def _classifier_builder(self, train):
        # NOTE(review): algorithm="megam" requires the external megam
        # binary to be available to MaxentClassifier -- confirm in the
        # deployment environment.
        return MaxentClassifier.train(
            train, algorithm="megam", gaussian_prior_sigma=1, trace=2
        )

    def _english_wordlist(self):
        # Lazily build and cache the basic-English word set used for
        # the "en-wordlist" feature below.
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        """
        Return the feature dictionary for the token at ``index``.

        :param tokens: the full sentence as a list of (word, pos) pairs
        :param index: position of the token being classified
        :param history: IOB tags already assigned to tokens 0..index-1
        :rtype: dict
        """
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        # Left context: word/pos/tag of the previous one or two tokens,
        # with None placeholders at the sentence start.
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            # NOTE(review): this takes history[index - 1][0] -- the first
            # *character* of the previous tag -- while the general case
            # below uses the whole tag. Looks like an upstream
            # inconsistency, but pretrained models were built with these
            # exact features, so confirm before changing.
            prevtag = history[index - 1][0]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        # Right context: word/pos of the next one or two tokens.
        # NOTE(review): next POS tags are only lowercased here, not
        # passed through simplify_pos like the previous POS tags --
        # another apparent upstream inconsistency; verify.
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6  (presumably a recorded dev accuracy for this feature
        # set -- unverified)
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
113
+
114
+
115
class NEChunkParser(ChunkParserI):
    """
    A named-entity chunker built on top of an IOB sequence tagger.

    Expected input: list of pos-tagged words.
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Chunk one sentence; each token should be a pos-tagged word.
        """
        return self._tagged_to_parse(self._tagger.tag(tokens))

    def _train(self, corpus):
        # Flatten each training tree into an IOB-tagged token sequence
        # before handing it to the tagger.
        iob_corpus = [self._parse_to_tagged(sent) for sent in corpus]
        self._tagger = NEChunkParserTagger(train=iob_corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of (token, IOB-tag) pairs to a chunk-parse tree.
        """
        sent = Tree("S", [])

        for tok, tag in tagged_tokens:
            if tag == "O":
                sent.append(tok)
            elif tag.startswith("B-"):
                sent.append(Tree(tag[2:], [tok]))
            elif tag.startswith("I-"):
                # Extend the current chunk only if its label matches;
                # otherwise start a fresh chunk (robust to stray I- tags).
                last = sent[-1] if sent else None
                if isinstance(last, Tree) and last.label() == tag[2:]:
                    last.append(tok)
                else:
                    sent.append(Tree(tag[2:], [tok]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of (token, IOB-tag) pairs.
        """
        tagged = []
        for node in sent:
            if not isinstance(node, Tree):
                tagged.append((node, "O"))
                continue
            if len(node) == 0:
                print("Warning -- empty chunk in sentence")
                continue
            label = node.label()
            tagged.append((node[0], f"B-{label}"))
            tagged.extend((tok, f"I-{label}") for tok in node[1:])
        return tagged
172
+
173
+
174
def shape(word):
    """
    Classify the orthographic "shape" of ``word``.

    :return: one of ``"number"``, ``"punct"``, ``"upcase"``,
        ``"downcase"``, ``"mixedcase"`` or ``"other"``.
    """
    # Bug fix: the original pattern "[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$"
    # anchored only the *second* alternative, so re.match classified any
    # word with a digit prefix (e.g. "123abc") as "number".  Group the
    # alternation and anchor the whole thing.  Note: this changes the
    # feature value for such words relative to older pretrained models.
    if re.match(r"(?:[0-9]+(?:\.[0-9]*)?|[0-9]*\.[0-9]+)$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"
188
+
189
+
190
def simplify_pos(s):
    """
    Collapse a fine-grained POS tag to a coarse category: every verb
    tag becomes "V"; any other tag keeps only the part before the
    first hyphen (e.g. "NN-TL" -> "NN").
    """
    return "V" if s.startswith("V") else s.split("-")[0]
195
+
196
+
197
def postag_tree(tree):
    """
    Part-of-speech tag the leaves of a chunk tree, returning a new
    tree of the same shape whose leaves are (word, pos) pairs.
    """
    # Tag the flattened sentence once, then thread the tags back
    # through the original tree structure in order.
    tags = iter(tag for (_, tag) in pos_tag(tree.leaves()))
    result = Tree("S", [])
    for node in tree:
        if isinstance(node, Tree):
            tagged_chunk = Tree(node.label(), [])
            for leaf in node:
                tagged_chunk.append((leaf, next(tags)))
            result.append(tagged_chunk)
        else:
            result.append((node, next(tags)))
    return result
210
+
211
+
212
def load_ace_data(roots, fmt="binary", skip_bnews=True):
    """
    Recursively find every ``.sgm`` ACE file under the given root
    directories and yield the chunk trees parsed from each one.

    :param roots: directories to walk
    :param fmt: passed through to ``load_ace_file`` ("binary" or "multiclass")
    :param skip_bnews: if True, skip "bnews" (broadcast news) directories
    """
    for top in roots:
        for dirpath, _dirnames, filenames in os.walk(top):
            if skip_bnews and dirpath.endswith("bnews"):
                continue
            for name in filenames:
                if name.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(dirpath, name), fmt)
220
+
221
+
222
def load_ace_file(textfile, fmt):
    """
    Parse one ACE ``.sgm`` file (plus its ``.tmx.rdc.xml`` annotation
    file) and yield a single chunk tree for its text.

    :param textfile: path to the ``.sgm`` text file; the annotation
        file is assumed to live at ``textfile + ".tmx.rdc.xml"``
    :param fmt: "binary" (all entities labelled "NE") or "multiclass"
        (entities labelled with their ACE entity type)
    :raise ValueError: if ``fmt`` is neither "binary" nor "multiclass"
    """
    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities as
    # (start_offset, end_offset, type) triples.  Only "NAME" mentions
    # (proper-name named entities) are kept.
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    # (the <TEXT> tags themselves are preserved for the next step).
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>.  The replacement is
    # whitespace of matching length (minus the 6 chars of "<TEXT>")
    # so that the annotation character offsets stay valid.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes (TeX-style `` and '' become plain double quotes,
    # padded to keep the offsets aligned).
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # NOTE(review): entity_types is computed but never used below --
    # presumably left over from debugging.
    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE): every entity span becomes an
    # "NE" subtree; the text between spans is word-tokenized.
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping! Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type): same as above, but each span's
    # subtree is labelled with its ACE entity type.
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping! Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")
291
+
292
+
293
+ # This probably belongs in a more general-purpose location (as does
294
+ # the parse_to_tagged function).
295
def cmp_chunks(correct, guessed):
    """
    Print a side-by-side comparison of gold and guessed IOB tags for
    one sentence, eliding runs where both tags are "O".

    :param correct: the gold-standard chunk tree
    :param guessed: the chunker's output tree for the same sentence
    """
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (_, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print(f"  {ct:15} {gt:15} {w}")
                # Bug fix: the original used "{:15} {:15} {2}", mixing
                # automatic and manual field numbering, which raises
                # ValueError at runtime.
                print("  {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f"  {ct:15} {gt:15} {w}")
308
+
309
+
310
def build_model(fmt="binary"):
    """
    Train a named-entity chunker on the ACE corpora, evaluate it on
    the ACE eval split, pickle it to /tmp, and return it.

    :param fmt: "binary" (single "NE" label) or "multiclass"
        (per-entity-type labels); passed through to load_ace_data
    :rtype: NEChunkParser
    """
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    # Release the training data before evaluation to reduce peak memory.
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        # Print a detailed tag-by-tag comparison for the first few
        # sentences only.
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
    print(f"Saving chunker to {outfilename}...")

    # -1 selects the highest available pickle protocol.
    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
345
+
346
+
347
if __name__ == "__main__":
    # Make sure that the pickled object has the right class name:
    # importing build_model via its canonical module path ensures the
    # pickle records "nltk.chunk.named_entity" rather than "__main__".
    from nltk.chunk.named_entity import build_model

    build_model("binary")
    build_model("multiclass")
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/regexp.py ADDED
@@ -0,0 +1,1475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Regular Expression Chunkers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import re
10
+
11
+ import regex
12
+
13
+ from nltk.chunk.api import ChunkParserI
14
+ from nltk.tree import Tree
15
+
16
+ # //////////////////////////////////////////////////////
17
+ # ChunkString
18
+ # //////////////////////////////////////////////////////
19
+
20
+
21
class ChunkString:
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text. This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces. An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` are created from tagged texts (i.e., lists of
    ``tokens`` whose type is ``TaggedType``). Initially, nothing is
    chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation. These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding. This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces. An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _debug: The debug level. See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_STRIP_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in strips.
    """

    # A tag may contain any character except braces and angle brackets.
    CHUNK_TAG_CHAR = r"[^\{\}<>]"
    CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR

    # Zero-width lookaheads: "inside a chunk" means a "}" comes before
    # any "{"; "inside a strip" means a "{" (or end of string) comes
    # before any "}".
    IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
    IN_STRIP_PATTERN = r"(?=[^\}]*(\{|$))"

    # These are used by _verify
    _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
    _STRIP = r"(%s+?)+?" % CHUNK_TAG
    _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
    _BRACKETS = re.compile(r"[^\{\}]+")
    _BALANCED_BRACKETS = re.compile(r"(\{\})*$")

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text ``tagged_tokens``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``. The
            valid levels are:

            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
              each transformation.
            - 3: full check on to_chunkstruct and full check after
              each transformation.

            We recommend you use at least level 1. You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        """
        self._root_label = chunk_struct.label()
        self._pieces = chunk_struct[:]
        # Encode each piece as its tag, joined into "<t1><t2>...<tn>".
        tags = [self._tag(tok) for tok in self._pieces]
        self._str = "<" + "><".join(tags) + ">"
        self._debug = debug_level

    def _tag(self, tok):
        # A (word, tag) pair contributes its tag; an existing chunk
        # subtree contributes its node label.
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError("chunk structures must contain tagged " "tokens or trees")

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked. If this is false, ``_verify`` will check to make
            sure that ``_str`` encodes a chunked version of *some*
            list of tokens. If this is true, then ``_verify`` will
            check to make sure that the tags in ``_str`` match those in
            ``_pieces``.

        :raise ValueError: if the internal string representation of
            this ``ChunkString`` is invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError(
                "Transformation generated invalid " "chunkstring:\n  %s" % s
            )

        # Check that parens are balanced. If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.
        # (After removing everything except braces, a valid encoding
        # is a run of "{}" pairs; the 5000-char window is even, so a
        # pair never straddles a window boundary.)
        brackets = ChunkString._BRACKETS.sub("", s)
        for i in range(1 + len(brackets) // 5000):
            substr = brackets[i * 5000 : i * 5000 + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    "Transformation generated invalid " "chunkstring:\n  %s" % s
                )

        if verify_tags <= 0:
            return

        # Compare the tag sequence in s against the tags of _pieces.
        tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError(
                "Transformation generated invalid " "chunkstring: tag changed"
            )

    def to_chunkstruct(self, chunk_label="CHUNK"):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Use this alternating list to create the chunkstruct.
        # Splitting on braces yields pieces that alternate between
        # "outside a chunk" and "inside a chunk", starting outside.
        pieces = []
        index = 0
        piece_in_chunk = 0
        for piece in re.split("[{}]", self._str):

            # Find the list of tokens contained in this piece.
            length = piece.count("<")
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``. In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags. Furthermore, this transformation may not result in
        improper bracketing. Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced. This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring. Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}"). Remove them, so they don't
        # interfere with other transformations.
        s = re.sub(r"\{\}", "", s)

        # Make sure that the transformation was legal.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return "<ChunkString: %s>" % repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        # NOTE(review): the local name "str" shadows the builtin within
        # this method.
        str = re.sub(r">(?!\})", r"> ", self._str)
        str = re.sub(r"([^\{])<", r"\1 <", str)
        if str[0] == "<":
            str = " " + str
        return str
251
+
252
+
253
+ # //////////////////////////////////////////////////////
254
+ # Chunking Rules
255
+ # //////////////////////////////////////////////////////
256
+
257
+
258
class RegexpChunkRule:
    """
    A rule specifying how to modify the chunking in a ``ChunkString``,
    using a transformational regular expression. The
    ``RegexpChunkRule`` class itself can be used to implement any
    transformational rule based on regular expressions. There are
    also a number of subclasses, which can be used to implement
    simpler types of rules, based on matching regular expressions.

    Each ``RegexpChunkRule`` has a regular expression and a
    replacement expression. When a ``RegexpChunkRule`` is "applied"
    to a ``ChunkString``, it searches the ``ChunkString`` for any
    substring that matches the regular expression, and replaces it
    using the replacement expression. This search/replace operation
    has the same semantics as ``re.sub``.

    Each ``RegexpChunkRule`` also has a description string, which
    gives a short (typically less than 75 characters) description of
    the purpose of the rule.

    This transformation defined by this ``RegexpChunkRule`` should
    only add and remove braces; it should *not* modify the sequence
    of angle-bracket delimited tags. Furthermore, this transformation
    may not result in nested or mismatched bracketing.
    """

    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        :type regexp: regexp or str
        :param regexp: The regular expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any
            substring that matches ``regexp`` will be replaced using
            the replacement string ``repl``. Note that this must be a
            normal regular expression, not a tag pattern.
        :type repl: str
        :param repl: The replacement expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any substring
            that matches ``regexp`` will be replaced using ``repl``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Accept either a compiled pattern or a pattern string.
        if isinstance(regexp, str):
            regexp = re.compile(regexp)
        self._repl = repl
        self._descr = descr
        self._regexp = regexp

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given ``ChunkString``. See the
        class reference documentation for a description of what it
        means to apply a rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunkstring to which this rule is applied.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        Return a short description of the purpose and/or effect of
        this rule.

        :rtype: str
        """
        return self._descr

    def __repr__(self):
        """
        Return a string representation of this rule. It has the form::

            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<RegexpChunkRule: "
            + repr(self._regexp.pattern)
            + "->"
            + repr(self._repl)
            + ">"
        )

    @staticmethod
    def fromstring(s):
        """
        Create a RegexpChunkRule from a string description.
        Currently, the following formats are supported::

          {regexp}         # chunk rule
          }regexp{         # strip rule
          regexp}{regexp   # split rule
          regexp{}regexp   # merge rule

        Where ``regexp`` is a regular expression for the rule. Any
        text following the comment marker (``#``) will be used as
        the rule's description:

        >>> from nltk.chunk.regexp import RegexpChunkRule
        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
        <ChunkRule: '<DT>?<NN.*>+'>
        """
        # Split off the comment (but don't split on '\#')
        m = re.match(r"(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?", s)
        rule = m.group("rule").strip()
        comment = (m.group("comment") or "")[1:].strip()

        # Pattern bodies: chunk, strip, split, merge
        # If "}{" or "{}" occurs more than once, the 2-way unpacking
        # below raises ValueError, which the handler converts into an
        # "Illegal chunk pattern" error.
        try:
            if not rule:
                raise ValueError("Empty chunk pattern")
            if rule[0] == "{" and rule[-1] == "}":
                return ChunkRule(rule[1:-1], comment)
            elif rule[0] == "}" and rule[-1] == "{":
                return StripRule(rule[1:-1], comment)
            elif "}{" in rule:
                left, right = rule.split("}{")
                return SplitRule(left, right, comment)
            elif "{}" in rule:
                left, right = rule.split("{}")
                return MergeRule(left, right, comment)
            elif re.match("[^{}]*{[^{}]*}[^{}]*", rule):
                left, chunk, right = re.split("[{}]", rule)
                return ChunkRuleWithContext(left, chunk, right, comment)
            else:
                raise ValueError("Illegal chunk pattern: %s" % rule)
        except (ValueError, re.error) as e:
            raise ValueError("Illegal chunk pattern: %s" % rule) from e
397
+
398
+
399
class ChunkRule(RegexpChunkRule):
    """
    A rule that adds chunks to a ``ChunkString`` using a matching tag
    pattern. When applied, it finds any substring that matches the tag
    pattern and is not already part of a chunk, and wraps that
    substring in a new chunk.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern. When applied to a
            ``ChunkString``, this rule will chunk any substring that
            matches this tag pattern and that is not already part of a
            chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Restrict matches to "strip" regions (outside existing chunks)
        # via the zero-width lookahead, and wrap the match in braces.
        tag_re = tag_pattern2re_pattern(tag_pattern)
        regexp = re.compile(f"(?P<chunk>{tag_re}){ChunkString.IN_STRIP_PATTERN}")
        RegexpChunkRule.__init__(self, regexp, r"{\g<chunk>}", descr)

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <ChunkRule: '<IN|VB.*>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return f"<ChunkRule: {self._pattern!r}>"
441
+
442
+
443
class StripRule(RegexpChunkRule):
    """
    A rule that removes material from chunks in a ``ChunkString``.  Any
    substring that matches this rule's tag pattern and is contained in
    a chunk is stripped out of that chunk, splitting it into two new
    chunks.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``StripRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When applied to a
            ``ChunkString``, any in-chunk substring matching this
            pattern is removed from its chunk, creating two new chunks.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Only match material that lies inside an existing chunk.
        strip_re = re.compile(
            "(?P<strip>%s)%s"
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
        )
        RegexpChunkRule.__init__(self, strip_re, r"}\g<strip>{", descr)

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <StripRule: '<IN|VB.*>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return f"<StripRule: {self._pattern!r}>"
486
+
487
+
488
class UnChunkRule(RegexpChunkRule):
    """
    A rule that dissolves chunks in a ``ChunkString``.  Any complete
    chunk whose contents match this rule's tag pattern is un-chunked.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``UnChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When applied to a
            ``ChunkString``, any complete chunk matching this pattern
            is un-chunked.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Match an entire chunk (braces included) so the replacement
        # drops the braces and keeps the contents.
        unchunk_re = re.compile(
            r"\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern)
        )
        RegexpChunkRule.__init__(self, unchunk_re, r"\g<chunk>", descr)

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <UnChunkRule: '<IN|VB.*>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return f"<UnChunkRule: {self._pattern!r}>"
526
+
527
+
528
class MergeRule(RegexpChunkRule):
    """
    A rule that joins adjacent chunks in a ``ChunkString``, using two
    tag patterns.  Whenever a chunk whose end matches the left pattern
    is immediately followed by a chunk whose beginning matches the
    right pattern, the two chunks are merged into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: Pattern that the end of the first
            chunk must match for this rule to apply.
        :type right_tag_pattern: str
        :param right_tag_pattern: Pattern that the beginning of the
            immediately following chunk must match for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each pattern on its own first, so that incoherent
        # patterns (e.g. left='(' and right=')') raise an error here.
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match "}{" between the two chunks; the right side is a
        # lookahead so only the boundary itself is consumed.
        merge_re = re.compile(
            "(?P<left>%s)}{(?=%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, merge_re, r"\g<left>", descr)

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<MergeRule: {!r}, {!r}>".format(
            self._left_tag_pattern, self._right_tag_pattern
        )
596
+
597
+
598
class SplitRule(RegexpChunkRule):
    """
    A rule that divides chunks in a ``ChunkString``, using two tag
    patterns.  Any chunk containing a substring matching the left
    pattern immediately followed by the right pattern is split into
    two chunks at the point between the two matches.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``SplitRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: Pattern matching the material that
            ends up at the tail of the first new chunk.
        :type right_tag_pattern: str
        :param right_tag_pattern: Pattern matching the material that
            ends up at the head of the second new chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each pattern on its own first, so that incoherent
        # patterns (e.g. left='(' and right=')') raise an error here.
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right side is a lookahead: only the left material is
        # consumed, and "}{" is inserted just after it.
        split_re = re.compile(
            "(?P<left>%s)(?=%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, split_re, r"\g<left>}{", descr)

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <SplitRule: '<NN>', '<DT>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<SplitRule: {!r}, {!r}>".format(
            self._left_tag_pattern, self._right_tag_pattern
        )
665
+
666
+
667
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
    using two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose beginning
    matches right pattern, and immediately preceded by a strip whose
    end matches left pattern.  It will then expand the chunk to incorporate
    the new material on the left.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandLeftRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag pattern.  When
            applied to a ``ChunkString``, this rule will find any strip
            whose end matches this pattern, and that is immediately
            followed by a chunk whose beginning matches
            ``right_tag_pattern``.  It will then expand the chunk to
            incorporate the matching strip material on the left.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag pattern.  When
            applied to a ``ChunkString``, this rule will find any chunk
            whose beginning matches this pattern, and that is
            immediately preceded by a strip whose end matches
            ``left_tag_pattern``.  It will then expand the chunk to
            incorporate the matching strip material on the left.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match strip material, then the "{" chunk-open, then the chunk
        # head; the replacement moves "{" before the strip material.
        regexp = re.compile(
            r"(?P<left>%s)\{(?P<right>%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, regexp, r"{\g<left>\g<right>", descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<ExpandLeftRule: "
            + repr(self._left_tag_pattern)
            + ", "
            + repr(self._right_tag_pattern)
            + ">"
        )
735
+
736
+
737
class ExpandRightRule(RegexpChunkRule):
    """
    A rule that grows chunks in a ``ChunkString`` to the right, using
    two tag patterns.  Whenever a chunk whose end matches the left
    pattern is immediately followed by a strip whose beginning matches
    the right pattern, the chunk is expanded to absorb that strip
    material.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandRightRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: Pattern that the end of the chunk must
            match for this rule to apply.
        :type right_tag_pattern: str
        :param right_tag_pattern: Pattern that the beginning of the
            following strip must match; matching material is absorbed
            into the chunk on its right.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each pattern on its own first, so that incoherent
        # patterns (e.g. left='(' and right=')') raise an error here.
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match chunk tail, the "}" chunk-close, then strip material;
        # the replacement moves "}" past the strip material.
        expand_re = re.compile(
            r"(?P<left>%s)\}(?P<right>%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, expand_re, r"\g<left>\g<right>}", descr)

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<ExpandRightRule: {!r}, {!r}>".format(
            self._left_tag_pattern, self._right_tag_pattern
        )
805
+
806
+
807
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A rule that creates chunks in a ``ChunkString`` using three tag
    patterns: a left context, the chunk itself, and a right context.
    Any unchunked substring that matches the chunk pattern and is
    surrounded by substrings matching the two context patterns is made
    into a new chunk (covering only the chunk-pattern match).

    Caveat: Both the left and right context are consumed when this
    rule matches; therefore, if you need to find overlapping matches,
    you will need to apply your rule more than once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: Pattern that must match the
            material immediately before the chunk for this rule to
            apply.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: Pattern that identifies the material
            to be chunked when this rule applies.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: Pattern that must match the
            material immediately after the chunk for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each pattern on its own first, so that incoherent
        # patterns (e.g. left='(' and right=')') raise an error here.
        re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
        re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_context_tag_pattern))

        self._left_context_tag_pattern = left_context_tag_pattern
        self._chunk_tag_pattern = chunk_tag_pattern
        self._right_context_tag_pattern = right_context_tag_pattern
        # The whole match (contexts included) must lie outside any
        # existing chunk; the replacement braces only the chunk group.
        context_re = re.compile(
            "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
            % (
                tag_pattern2re_pattern(left_context_tag_pattern),
                tag_pattern2re_pattern(chunk_tag_pattern),
                tag_pattern2re_pattern(right_context_tag_pattern),
                ChunkString.IN_STRIP_PATTERN,
            )
        )
        RegexpChunkRule.__init__(
            self, context_re, r"\g<left>{\g<chunk>}\g<right>", descr
        )

    def __repr__(self):
        """
        Return a concise string representation of this rule, e.g.::

            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>

        The description string is not included; it can be retrieved
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<ChunkRuleWithContext: {!r}, {!r}, {!r}>".format(
            self._left_context_tag_pattern,
            self._chunk_tag_pattern,
            self._right_context_tag_pattern,
        )
887
+
888
+
889
+ # //////////////////////////////////////////////////////
890
+ # Tag Pattern Format Conversion
891
+ # //////////////////////////////////////////////////////
892
+
893
# Validates a whole tag pattern: zero or more elements, each either
# bare pattern text (no braces/angle-brackets, but allowing regex
# repetition counts like {3} or {2,5}) or an angle-bracketed tag
# expression <...>.  Used by tag_pattern2re_pattern() to reject
# malformed tag patterns before they are converted to real regexps.
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
CHUNK_TAG_PATTERN = re.compile(
    r"^(({}|<{}>)*)$".format(r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+")
)
898
+
899
+
900
def tag_pattern2re_pattern(tag_pattern):
    """
    Convert a tag pattern to a regular expression pattern.  A "tag
    pattern" is a modified version of a regular expression, designed
    for matching sequences of tags.  The differences between regular
    expression patterns and tag patterns are:

    - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
      ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
      ``'<NN'`` followed by one or more repetitions of ``'>'``.
    - Whitespace in tag patterns is ignored.  So
      ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
    - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
      ``'<NN.*>'`` matches any single tag starting with ``'NN'``.

    In particular, ``tag_pattern2re_pattern`` performs the following
    transformations on the given pattern:

    - Replace '.' with '[^<>{}]'
    - Remove any whitespace
    - Add extra parens around '<' and '>', to make '<' and '>' act
      like parentheses.  E.g., so that in '<NN>+', the '+' has scope
      over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
      scope over 'NN' and 'IN', but not '<' or '>'.
    - Check to make sure the resulting pattern is valid.

    :type tag_pattern: str
    :param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern.
        In particular, ``tag_pattern`` should not include braces; and it
        should not contain nested or mismatched angle-brackets.
    :rtype: str
    :return: A regular expression pattern corresponding to
        ``tag_pattern``.
    """
    # Clean up the regular expression: strip whitespace and make the
    # angle brackets act as grouping parentheses.
    tag_pattern = re.sub(r"\s", "", tag_pattern)
    tag_pattern = re.sub(r"<", "(<(", tag_pattern)
    tag_pattern = re.sub(r">", ")>)", tag_pattern)

    # Check the regular expression
    if not CHUNK_TAG_PATTERN.match(tag_pattern):
        raise ValueError("Bad tag pattern: %r" % tag_pattern)

    # Replace "." with CHUNK_TAG_CHAR.
    # We have to do this after, since it adds {}[]<>s, which would
    # confuse CHUNK_TAG_PATTERN.
    # PRE doesn't have lookback assertions, so reverse the string and
    # apply the pattern backwards (with lookahead assertions); an
    # unescaped "." is one not preceded by an odd number of backslashes.
    # (This could be made much cleaner with a lookbehind under SRE.)
    tc_rev = ChunkString.CHUNK_TAG_CHAR[::-1]
    rev_pattern = tag_pattern[::-1]
    rev_pattern = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, rev_pattern)
    return rev_pattern[::-1]
962
+
963
+
964
+ # //////////////////////////////////////////////////////
965
+ # RegexpChunkParser
966
+ # //////////////////////////////////////////////////////
967
+
968
+
969
class RegexpChunkParser(ChunkParserI):
    """
    A regular expression based chunk parser.  ``RegexpChunkParser``
    applies a sequence of "rules" to find chunks of a single type
    within a text.  The chunking is encoded as a ``ChunkString``, and
    each rule modifies that encoding via regular expression matching
    and substitution.

    The rules themselves are instances of ``RegexpChunkRule`` and its
    subclasses (``ChunkRule``, ``StripRule``, ``UnChunkRule``,
    ``MergeRule``, and ``SplitRule``); each defines an ``apply()``
    method that rewrites a given ``ChunkString``.

    :type _rules: list(RegexpChunkRule)
    :ivar _rules: The list of rules that should be applied to a text.
    :type _trace: int
    :ivar _trace: The default level of tracing.

    """

    def __init__(self, rules, chunk_label="NP", root_label="S", trace=0):
        """
        Construct a new ``RegexpChunkParser``.

        :type rules: list(RegexpChunkRule)
        :param rules: The sequence of rules used to generate the
            chunking for a tagged text.
        :type chunk_label: str
        :param chunk_label: Node value for chunk subtrees; typically a
            short string naming the chunk type, such as ``"NP"``.
        :type root_label: str
        :param root_label: Node value for the top node of the chunk
            structure.
        :type trace: int
        :param trace: Tracing level used when parsing: ``0`` for no
            output, ``1`` for normal output, ``2`` or higher for
            verbose output.
        """
        self._rules = rules
        self._trace = trace
        self._chunk_label = chunk_label
        self._root_label = root_label

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply each of this parser's rules to ``chunkstr`` in turn,
        printing trace output after every rule.  If ``verbose`` is
        true, each rule's repr is included in the trace.

        :type chunkstr: ChunkString
        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type verbose: bool
        :param verbose: Whether output should be verbose.
        :rtype: None
        """
        print("# Input:")
        print(chunkstr)
        for rule in self._rules:
            rule.apply(chunkstr)
            suffix = " (" + repr(rule) + "):" if verbose else ":"
            print("#", rule.descr() + suffix)
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply each of this parser's rules to ``chunkstr`` in turn,
        with no trace output.

        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type chunkstr: ChunkString
        :rtype: None
        """

        for rule in self._rules:
            rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: Tracing level used when parsing: ``0`` for no
            output, ``1`` for normal output, ``2`` or higher for
            verbose output.  Overrides the trace level given to the
            constructor.
        :rtype: Tree
        :return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  The set of chunks
            identified depends on the rules used to define this
            ``RegexpChunkParser``.
        """
        if len(chunk_struct) == 0:
            print("Warning: parsing empty text")
            return Tree(self._root_label, [])

        # Wrap a bare token list in a root tree if necessary.
        try:
            chunk_struct.label()
        except AttributeError:
            chunk_struct = Tree(self._root_label, chunk_struct)

        # Use the default trace value?
        if trace is None:
            trace = self._trace

        chunkstr = ChunkString(chunk_struct)

        # Apply the sequence of rules to the chunkstring.
        if trace:
            self._trace_apply(chunkstr, trace > 1)
        else:
            self._notrace_apply(chunkstr)

        # Use the chunkstring to create a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_label)

    def rules(self):
        """
        :return: the sequence of rules used by ``RegexpChunkParser``.
        :rtype: list(RegexpChunkRule)
        """
        return self._rules

    def __repr__(self):
        """
        :return: a concise string representation of this
            ``RegexpChunkParser``.
        :rtype: str
        """
        return "<RegexpChunkParser with %d rules>" % len(self._rules)

    def __str__(self):
        """
        :return: a verbose string representation of this ``RegexpChunkParser``.
        :rtype: str
        """
        header = "RegexpChunkParser with %d rules:\n" % len(self._rules)
        # Pad descriptions to a shared margin unless they get too wide,
        # in which case each rule takes two lines.
        margin = max((len(rule.descr()) for rule in self._rules), default=0)
        if margin < 35:
            line_format = " %" + repr(-(margin + 3)) + "s%s\n"
        else:
            line_format = " %s\n %s\n"
        body = "".join(
            line_format % (rule.descr(), repr(rule)) for rule in self._rules
        )
        return (header + body)[:-1]
1128
+
1129
+
1130
+ # //////////////////////////////////////////////////////
1131
+ # Chunk Grammar
1132
+ # //////////////////////////////////////////////////////
1133
+
1134
+
1135
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

        NP:
          {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # strip any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _start: str
    :ivar _start: The start symbol of the grammar (the root node of
        resulting trees)
    :type _stages: list(RegexpChunkParser)
    :ivar _stages: The list of parsing stages corresponding to the grammar

    """

    def __init__(self, grammar, root_label="S", loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, str):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = (
                "Expected string or list of RegexpChunkParsers " "for the grammar."
            )
            try:
                grammar = list(grammar)
            except Exception as e:
                # Exception (not BaseException): KeyboardInterrupt and
                # SystemExit must not be converted into TypeError.
                raise TypeError(type_err) from e
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.
        """
        rules = []
        lhs = None
        pattern = regex.compile("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))")
        for line in grammar.split("\n"):
            line = line.strip()

            # New stage begins if there's an unescaped ':'
            m = pattern.match(line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group("nonterminal").strip()
                rules = []
                line = m.group("rule").strip()

            # Skip blank & comment-only lines
            if line == "" or line.startswith("#"):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.
        """
        if rules != []:
            if not lhs:
                raise ValueError("Expected stage marker (eg NP:)")
            parser = RegexpChunkParser(
                rules, chunk_label=lhs, root_label=root_label, trace=trace
            )
            self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
            (this tree is modified, and is also returned)
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output.
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        for i in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``.
        :rtype: str
        """
        s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
        for parser in self._stages:
            s += "%s\n" % parser
        return s[:-1]
1300
+
1301
+
1302
+ # //////////////////////////////////////////////////////
1303
+ # Demonstration code
1304
+ # //////////////////////////////////////////////////////
1305
+
1306
+
1307
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    ``ChunkScore``.  This function assumes that ``text`` contains one
    sentence per line, and that each sentence has the form expected by
    ``tree.chunk``.  It runs the given chunk parser on each sentence in
    the text, and scores the result.  It prints the final score
    (precision, recall, and f-measure); and reports the set of chunks
    that were missed and the set of chunks that were incorrect.  (At
    most 10 missing chunks and 10 incorrect chunks are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split("\n"):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree("S", tokens), trace=1)
        chunkscore.score(gold, test)
        print()

    print("/" + ("=" * 75) + "\\")
    print("Scoring", chunkparser)
    print("-" * 77)
    print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ")
    print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ")
    print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100))

    # Missed chunks.
    # Bug fix: the loop variables below were originally named ``chunk``,
    # which shadowed the ``nltk.chunk`` module imported at the top of
    # this function; renamed to avoid the shadowing.
    if chunkscore.missed():
        print("Missed:")
        missed = chunkscore.missed()
        for missed_chunk in missed[:10]:
            print("  ", " ".join(map(str, missed_chunk)))
        if len(chunkscore.missed()) > 10:
            print("  ...")

    # Incorrect chunks.
    if chunkscore.incorrect():
        print("Incorrect:")
        incorrect = chunkscore.incorrect()
        for bad_chunk in incorrect[:10]:
            print("  ", " ".join(map(str, bad_chunk)))
        if len(chunkscore.incorrect()) > 10:
            print("  ...")

    print("\\" + ("=" * 75) + "/")
    print()
1368
+
1369
+
1370
def demo():
    """
    A demonstration for the ``RegexpChunkParser`` class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """
    # Bug fix: ``Tree`` was imported here alongside ``chunk`` but never
    # used anywhere in this function; the unused import is removed.
    from nltk import chunk

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print("*" * 75)
    print("Evaluation text:")
    print(text)
    print("*" * 75)
    print()

    # 1: a single NP stage with two chunk rules.
    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # 2: chunk everything, then strip and merge.
    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # 3: two independent stages (NP and VP).
    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}     # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # 4: cascaded stages building PPs and VPs on top of NPs.
    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # strip any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5]))

    print()
    print("Demonstration of tagged token input")

    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # strip any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(
        cp.parse(
            [
                ("the", "DT"),
                ("little", "JJ"),
                ("cat", "NN"),
                ("sat", "VBD"),
                ("on", "IN"),
                ("the", "DT"),
                ("mat", "NN"),
                (".", "."),
            ]
        )
    )
1472
+
1473
+
1474
# Run the demonstration when this module is invoked as a script.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chunk format conversions
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ import re
10
+
11
+ from nltk.metrics import accuracy as _accuracy
12
+ from nltk.tag.mapping import map_tag
13
+ from nltk.tag.util import str2tuple
14
+ from nltk.tree import Tree
15
+
16
+ ##//////////////////////////////////////////////////////
17
+ ## EVALUATION
18
+ ##//////////////////////////////////////////////////////
19
+
20
+
21
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for reference_tree in gold:
        # Flatten away the gold chunks, rechunk with the chunker under test.
        predicted_tree = chunker.parse(reference_tree.flatten())
        gold_tags.extend(tree2conlltags(reference_tree))
        test_tags.extend(tree2conlltags(predicted_tree))
    # Per-token accuracy over the IOB tag sequences.
    return _accuracy(gold_tags, test_tags)
44
+
45
+
46
+ # Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
47
+ # -- statistics are evaluated only on demand, instead of at every sentence evaluation
48
+ #
49
+ # SB: use nltk.metrics for precision/recall scoring?
50
+ #
51
class ChunkScore:
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        # NOTE: the max_* limits are stored but the accessor methods
        # below do not currently truncate their results with them.
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0  # number of sentences scored so far
        self._tags_correct = 0.0
        self._tags_total = 0.0

        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        # Statistics are recomputed lazily, only when an accessor needs
        # them (performance patch by Yoav Goldberg, 2006-01-13).
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fn)
        return [c[1] for c in chunks]  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fp)
        return [c[1] for c in chunks]  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._correct)
        return [c[1] for c in chunks]  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._guessed)
        return [c[1] for c in chunks]  # discard position information

    def __len__(self):
        # Total number of gold-standard chunks (tp + fn).
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # Bug fix: inside an f-string "%%" is NOT an escape sequence
        # (unlike %-formatting), so the original printed a literal
        # double percent sign after every score.  A single "%" is the
        # intended output.
        return (
            "ChunkParse score:\n"
            + (f"    IOB Accuracy: {self.accuracy() * 100:5.1f}%\n")
            + (f"    Precision:    {self.precision() * 100:5.1f}%\n")
            + (f"    Recall:       {self.recall() * 100:5.1f}%\n")
            + (f"    F-Measure:    {self.f_measure() * 100:5.1f}%")
        )
302
+
303
+
304
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
    """Return the set of ``((sent_id, word_offset), frozen_chunk)`` pairs
    for the chunks in *t* whose labels match the *chunk_label* regexp;
    *count* is the sentence id used in each position key."""
    chunks = set()
    offset = 0
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_label, child.label()):
                chunks.add(((count, offset), child.freeze()))
            offset += len(child.leaves())
        else:
            # A bare token outside any chunk occupies one word position.
            offset += 1
    return chunks
317
+
318
+
319
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The separator between word and tag; if None, tokens
        are stored as raw strings instead of (word, tag) tuples.
    :param source_tagset: If given together with ``target_tagset``,
        tags are translated between tagsets via ``map_tag``.
    :param target_tagset: See ``source_tagset``.
    :rtype: Tree
    :raises ValueError: If the brackets in ``s`` are unbalanced or nested.
    """
    # Matches a single bracket, or a maximal run of non-bracket,
    # non-whitespace characters (one word/tag token).
    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    # stack[0] is the root; stack[-1] is the node new tokens attach to.
    # Chunks may not nest, so the stack never holds more than two nodes.
    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            if len(stack) != 1:
                # Already inside a chunk: nested chunks are not allowed.
                raise ValueError(f"Unexpected [ at char {match.start():d}")
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                # Closing bracket with no chunk currently open.
                raise ValueError(f"Unexpected ] at char {match.start():d}")
            stack.pop()
        else:
            if sep is None:
                # Store the raw token without splitting off a tag.
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    # Optionally translate the tag to another tagset.
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        # A chunk was opened but never closed.
        raise ValueError(f"Expected ] at char {len(s):d}")
    return stack[0]
366
+
367
+
368
### CONLL

# One line of CoNLL-2000 data: word, part-of-speech tag, and an IOB tag.
# The IOB tag is either a bare "O" (the chunk-type group is then None)
# or of the form "B-NP" / "I-NP".
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
371
+
372
+
373
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; chunks of any
        other type are treated as unchunked ("O") tokens.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    :raises ValueError: If a non-blank line does not match the CoNLL
        line format.
    """
    # stack[0] is the root node; stack[1] (when present) is the chunk
    # currently being built.  CoNLL chunks never nest, so the stack
    # holds at most two nodes.
    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError(f"Error on line {lineno:d}")
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == "I" and chunk_type != stack[-1].label()
        if state in "BO" or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.  A mismatched "I" tag is
        # repaired by starting a fresh chunk as well.
        if state == "B" or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
424
+
425
+
426
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    :raises ValueError: If a chunk contains a nested Tree.
    """
    tags = []
    for child in t:
        try:
            category = child.label()
        except AttributeError:
            # A bare (word, tag) leaf outside of any chunk.
            tags.append((child[0], child[1], "O"))
            continue
        # A chunk subtree: first token gets B-, the rest get I-.
        prefix = "B-"
        for contents in child:
            if isinstance(contents, Tree):
                raise ValueError(
                    "Tree is too deeply nested to be printed in CoNLL format"
                )
            tags.append((contents[0], contents[1], prefix + category))
            prefix = "I-"
    return tags
451
+
452
+
453
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: A list of ``(word, postag, chunktag)`` triples, as
        produced by ``tree2conlltags``.
    :param chunk_types: Kept for API symmetry with ``conllstr2tree``;
        note that it is not consulted anywhere in the body below.
    :param root_label: The node label to use for the root.
    :param strict: If true, raise ValueError on ill-formed IOB
        sequences instead of silently repairing them.
    :rtype: Tree
    :raises ValueError: If a chunk tag is malformed, or (when
        ``strict``) the IOB sequence is inconsistent.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith("B-"):
            # Start a new chunk.
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith("I-"):
            # An I- tag continues the preceding chunk only if one is
            # open and carries the same label.
            if (
                len(tree) == 0
                or not isinstance(tree[-1], Tree)
                or tree[-1].label() != chunktag[2:]
            ):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == "O":
            tree.append((word, postag))
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
487
+
488
+
489
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    # One "word tag IOB" line per token, joined with newlines.
    return "\n".join(" ".join(token) for token in tree2conlltags(t))
500
+
501
+
502
### IEER

# Matches one complete IEER <DOC> element, capturing the optional DOCNO,
# DOCTYPE, DATE_TIME and HEADLINE fields plus the mandatory TEXT body.
_IEER_DOC_RE = re.compile(
    r"<DOC>\s*"
    r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
    r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
    r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
    r"<BODY>\s*"
    r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
    r"<TEXT>(?P<text>.*?)</TEXT>\s*"
    r"</BODY>\s*</DOC>\s*",
    re.DOTALL,
)

# Extracts the entity type from an IEER begin tag, e.g. the "PERSON"
# in '<b_enamex type="PERSON">'.
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
517
+
518
+
519
def _ieer_read_text(s, root_label):
    """
    Parse the body of an IEER document into a Tree rooted at
    ``root_label``.  Named-entity spans are delimited by ``<b_...>`` /
    ``<e_...>`` tags and become subtrees labeled with the entity type.
    Returns the empty list (not a Tree) when ``s`` is None.
    """
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text
    # return the empty list in place of a Tree
    if s is None:
        return []
    # Tokenize into SGML-style tags and whitespace-separated words.
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # NOTE(review): looks like debug residue -- if the
                    # begin tag lacks a type attribute, this prints and
                    # the next line then raises AttributeError (m is
                    # None), which the except clause below does NOT
                    # catch.  Confirm intended behavior before changing.
                    print("XXXX", piece)
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                # End tag: close the current entity chunk.
                stack.pop()
            # elif piece.startswith('<'):
            #    print "ERROR:", piece
            #    raise ValueError # Unexpected HTML
            else:
                # Plain word: attach to the current node.
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            # An end tag with no open chunk pops the root (IndexError).
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        # A begin tag was never closed.
        raise ValueError("Bad IEER string")
    return stack[0]
549
+
550
+
551
def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    Note: when ``s`` contains a full ``<DOC>`` element, the return
    value is a dict with keys ``text``, ``docno``, ``doctype``,
    ``date_time`` and ``headline``; otherwise it is a single Tree.
    ``chunk_types`` is part of the public signature but is not
    consulted anywhere in the body below.

    :rtype: Tree
    """
    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            "text": _ieer_read_text(m.group("text"), root_label),
            "docno": m.group("docno"),
            "doctype": m.group("doctype"),
            "date_time": m.group("date_time"),
            #'headline': m.group('headline')
            # we want to capture NEs in the headline too!
            "headline": _ieer_read_text(m.group("headline"), root_label),
        }
    else:
        return _ieer_read_text(s, root_label)
592
+
593
+
594
def demo():
    """Demonstrate the chunk-format conversion functions in this module."""

    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    # Bracketed tagged text -> Tree.
    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
    t.pprint()
    print()

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    # CoNLL IOB string -> Tree (only NP and PP chunks are kept).
    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()
640
+
641
+
642
# Run the demonstration when this module is invoked as a script.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/__init__.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Classifiers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Classes and interfaces for labeling tokens with category labels (or
10
+ "class labels"). Typically, labels are represented with strings
11
+ (such as ``'health'`` or ``'sports'``). Classifiers can be used to
12
+ perform a wide range of classification tasks. For example,
13
+ classifiers can be used...
14
+
15
+ - to classify documents by topic
16
+ - to classify ambiguous words by which word sense is intended
17
+ - to classify acoustic signals by which phoneme they represent
18
+ - to classify sentences by their author
19
+
20
+ Features
21
+ ========
22
+ In order to decide which category label is appropriate for a given
23
+ token, classifiers examine one or more 'features' of the token. These
24
+ "features" are typically chosen by hand, and indicate which aspects
25
+ of the token are relevant to the classification decision. For
26
+ example, a document classifier might use a separate feature for each
27
+ word, recording how often that word occurred in the document.
28
+
29
+ Featuresets
30
+ ===========
31
+ The features describing a token are encoded using a "featureset",
32
+ which is a dictionary that maps from "feature names" to "feature
33
+ values". Feature names are unique strings that indicate what aspect
34
+ of the token is encoded by the feature. Examples include
35
+ ``'prevword'``, for a feature whose value is the previous word; and
36
+ ``'contains-word(library)'`` for a feature that is true when a document
37
+ contains the word ``'library'``. Feature values are typically
38
+ booleans, numbers, or strings, depending on which feature they
39
+ describe.
40
+
41
+ Featuresets are typically constructed using a "feature detector"
42
+ (also known as a "feature extractor"). A feature detector is a
43
+ function that takes a token (and sometimes information about its
44
+ context) as its input, and returns a featureset describing that token.
45
+ For example, the following feature detector converts a document
46
+ (stored as a list of words) to a featureset describing the set of
47
+ words included in the document:
48
+
49
+ >>> # Define a feature detector function.
50
+ >>> def document_features(document):
51
+ ... return dict([('contains-word(%s)' % w, True) for w in document])
52
+
53
+ Feature detectors are typically applied to each token before it is fed
54
+ to the classifier:
55
+
56
+ >>> # Classify each Gutenberg document.
57
+ >>> from nltk.corpus import gutenberg
58
+ >>> for fileid in gutenberg.fileids(): # doctest: +SKIP
59
+ ... doc = gutenberg.words(fileid) # doctest: +SKIP
60
+ ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
61
+
62
+ The parameters that a feature detector expects will vary, depending on
63
+ the task and the needs of the feature detector. For example, a
64
+ feature detector for word sense disambiguation (WSD) might take as its
65
+ input a sentence, and the index of a word that should be classified,
66
+ and return a featureset for that word. The following feature detector
67
+ for WSD includes features describing the left and right contexts of
68
+ the target word:
69
+
70
+ >>> def wsd_features(sentence, index):
71
+ ... featureset = {}
72
+ ... for i in range(max(0, index-3), index):
73
+ ... featureset['left-context(%s)' % sentence[i]] = True
74
+ ... for i in range(index, max(index+3, len(sentence))):
75
+ ... featureset['right-context(%s)' % sentence[i]] = True
76
+ ... return featureset
77
+
78
+ Training Classifiers
79
+ ====================
80
+ Most classifiers are built by training them on a list of hand-labeled
81
+ examples, known as the "training set". Training sets are represented
82
+ as lists of ``(featuredict, label)`` tuples.
83
+ """
84
+
85
+ from nltk.classify.api import ClassifierI, MultiClassifierI
86
+ from nltk.classify.decisiontree import DecisionTreeClassifier
87
+ from nltk.classify.maxent import (
88
+ BinaryMaxentFeatureEncoding,
89
+ ConditionalExponentialClassifier,
90
+ MaxentClassifier,
91
+ TypedMaxentFeatureEncoding,
92
+ )
93
+ from nltk.classify.megam import call_megam, config_megam
94
+ from nltk.classify.naivebayes import NaiveBayesClassifier
95
+ from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
96
+ from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
97
+ from nltk.classify.scikitlearn import SklearnClassifier
98
+ from nltk.classify.senna import Senna
99
+ from nltk.classify.textcat import TextCat
100
+ from nltk.classify.util import accuracy, apply_features, log_likelihood
101
+ from nltk.classify.weka import WekaClassifier, config_weka
.eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Classifier Interface
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Interfaces for labeling tokens with category labels (or "class labels").
11
+
12
+ ``ClassifierI`` is a standard interface for "single-category
13
+ classification", in which the set of categories is known, the number
14
+ of categories is finite, and each text belongs to exactly one
15
+ category.
16
+
17
+ ``MultiClassifierI`` is a standard interface for "multi-category
18
+ classification", which is like single-category classification except
19
+ that each text belongs to zero or more categories.
20
+ """
21
+ from nltk.internals import overridden
22
+
23
+ ##//////////////////////////////////////////////////////
24
+ # { Classification Interfaces
25
+ ##//////////////////////////////////////////////////////
26
+
27
+
28
class ClassifierI:
    """
    Interface for classifiers that assign exactly one category label
    (or "class") to each token.  Labels may be any immutable value,
    most commonly a str or an int; the set of labels a classifier
    chooses from must be fixed and finite.

    Concrete subclasses must provide:

    - ``labels()``
    - at least one of ``classify()`` / ``classify_many()``

    They may additionally provide:

    - at least one of ``prob_classify()`` / ``prob_classify_many()``
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        # Delegate to the batch method when a subclass supplied one;
        # otherwise the subclass gave us neither entry point.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return list(map(self.classify, featuresets))

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return list(map(self.prob_classify, featuresets))
90
+
91
+
92
class MultiClassifierI:
    """
    Interface for classifiers that assign zero or more category
    labels to each token.  Labels may be any immutable value, most
    commonly a str or an int; the set of labels a multi-classifier
    chooses from must be fixed and finite.

    Concrete subclasses must provide:

    - ``labels()``
    - at least one of ``classify()`` / ``classify_many()``

    They may additionally provide:

    - at least one of ``prob_classify()`` / ``prob_classify_many()``
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        # Fall back on the batch method when the subclass defined it.
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(one_set) for one_set in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(one_set) for one_set in featuresets]
154
+
155
+
156
+ # # [XX] IN PROGRESS:
157
+ # class SequenceClassifierI:
158
+ # """
159
+ # A processing interface for labeling sequences of tokens with a
160
+ # single category label (or "class"). Labels are typically
161
+ # strs or ints, but can be any immutable type. The set
162
+ # of labels that the classifier chooses from must be fixed and
163
+ # finite.
164
+ # """
165
+ # def labels(self):
166
+ # """
167
+ # :return: the list of category labels used by this classifier.
168
+ # :rtype: list of (immutable)
169
+ # """
170
+ # raise NotImplementedError()
171
+
172
+ # def prob_classify(self, featureset):
173
+ # """
174
+ # Return a probability distribution over labels for the given
175
+ # featureset.
176
+
177
+ # If ``featureset`` is a list of featuresets, then return a
178
+ # corresponding list containing the probability distribution
179
+ # over labels for each of the given featuresets, where the
180
+ # *i*\ th element of this list is the most appropriate label for
181
+ # the *i*\ th element of ``featuresets``.
182
+ # """
183
+ # raise NotImplementedError()
184
+
185
+ # def classify(self, featureset):
186
+ # """
187
+ # Return the most appropriate label for the given featureset.
188
+
189
+ # If ``featureset`` is a list of featuresets, then return a
190
+ # corresponding list containing the most appropriate label for
191
+ # each of the given featuresets, where the *i*\ th element of
192
+ # this list is the most appropriate label for the *i*\ th element
193
+ # of ``featuresets``.
194
+ # """
195
+ # raise NotImplementedError()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/decisiontree.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Decision Tree Classifiers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A classifier model that decides which label to assign to a token on
10
+ the basis of a tree structure, where branches correspond to conditions
11
+ on feature values, and leaves correspond to label assignments.
12
+ """
13
+
14
+ from collections import defaultdict
15
+
16
+ from nltk.classify.api import ClassifierI
17
+ from nltk.probability import FreqDist, MLEProbDist, entropy
18
+
19
+
20
class DecisionTreeClassifier(ClassifierI):
    """
    A classifier model that decides which label to assign to a token on
    the basis of a tree structure, where branches correspond to conditions
    on feature values, and leaves correspond to label assignments.
    """

    def __init__(self, label, feature_name=None, decisions=None, default=None):
        """
        :param label: The most likely label for tokens that reach
            this node in the decision tree.  If this decision tree
            has no children, then this label will be assigned to
            any token that reaches this decision tree.
        :param feature_name: The name of the feature that this
            decision tree selects for.
        :param decisions: A dictionary mapping from feature values
            for the feature identified by ``feature_name`` to
            child decision trees.
        :param default: The child that will be used if the value of
            feature ``feature_name`` does not match any of the keys in
            ``decisions``.  This is used when constructing binary
            decision trees.
        """
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
        self._default = default

    def labels(self):
        """
        :return: all labels that can be assigned by this (sub)tree,
            with duplicates removed (order is unspecified).
        :rtype: list
        """
        labels = [self._label]
        if self._decisions is not None:
            for dt in self._decisions.values():
                labels.extend(dt.labels())
        if self._default is not None:
            labels.extend(self._default.labels())
        return list(set(labels))

    def classify(self, featureset):
        """
        Walk the tree, following at each node the branch selected by the
        featureset's value for that node's feature, and return the label
        of the node that is reached.
        """
        # Decision leaf:
        if self._fname is None:
            return self._label

        # Decision tree:
        fval = featureset.get(self._fname)
        if fval in self._decisions:
            return self._decisions[fval].classify(featureset)
        elif self._default is not None:
            return self._default.classify(featureset)
        else:
            # No matching branch and no default child: fall back on this
            # node's own (majority) label.
            return self._label

    def error(self, labeled_featuresets):
        """
        :return: the fraction of ``labeled_featuresets`` that this tree
            misclassifies.  ``labeled_featuresets`` must be non-empty.
        """
        errors = 0
        for featureset, label in labeled_featuresets:
            if self.classify(featureset) != label:
                errors += 1
        return errors / len(labeled_featuresets)

    def pretty_format(self, width=70, prefix="", depth=4):
        """
        Return a string containing a pretty-printed version of this
        decision tree.  Each line in this string corresponds to a
        single decision tree node or leaf, and indentation is used to
        display the structure of the decision tree.

        :param width: Total column width of each line.
        :param prefix: Indentation prefix for this subtree.
        :param depth: Maximum number of tree levels to display.
        """
        # [xx] display default!!
        if self._fname is None:
            n = width - len(prefix) - 15
            return "{}{} {}\n".format(prefix, "." * n, self._label)
        s = ""
        # Sort branches alphabetically, pushing None/False/True last.
        for i, (fval, result) in enumerate(
            sorted(
                self._decisions.items(),
                key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
            )
        ):
            hdr = f"{prefix}{self._fname}={fval}? "
            n = width - 15 - len(hdr)
            s += "{}{} {}\n".format(hdr, "." * (n), result._label)
            if result._fname is not None and depth > 1:
                s += result.pretty_format(width, prefix + "  ", depth - 1)
        if self._default is not None:
            n = width - len(prefix) - 21
            s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
            if self._default._fname is not None and depth > 1:
                s += self._default.pretty_format(width, prefix + "  ", depth - 1)
        return s

    def pseudocode(self, prefix="", depth=4):
        """
        Return a string representation of this decision tree that
        expresses the decisions it makes as a nested set of pseudocode
        if statements.
        """
        if self._fname is None:
            return f"{prefix}return {self._label!r}\n"
        s = ""
        for (fval, result) in sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        ):
            s += f"{prefix}if {self._fname} == {fval!r}: "
            if result._fname is not None and depth > 1:
                s += "\n" + result.pseudocode(prefix + "  ", depth - 1)
            else:
                s += f"return {result._label!r}\n"
        if self._default is not None:
            # For a binary stump, "!=" reads better than "else".
            if len(self._decisions) == 1:
                s += "{}if {} != {!r}: ".format(
                    prefix, self._fname, list(self._decisions.keys())[0]
                )
            else:
                s += f"{prefix}else: "
            if self._default._fname is not None and depth > 1:
                s += "\n" + self._default.pseudocode(prefix + "  ", depth - 1)
            else:
                s += f"return {self._default._label!r}\n"
        return s

    def __str__(self):
        return self.pretty_format()

    @staticmethod
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Train a decision tree by greedily picking the best (binary)
        stump and recursively refining its children.

        :param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, label in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose
            )
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose
            )

        # Refine the stump.
        tree.refine(
            labeled_featuresets,
            entropy_cutoff,
            depth_cutoff - 1,
            support_cutoff,
            binary,
            feature_values,
            verbose,
        )

        # Return it
        return tree

    @staticmethod
    def leaf(labeled_featuresets):
        """Return a leaf labeled with the majority label of the data."""
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(label)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        """
        Build a depth-one tree that branches on ``feature_name``,
        assigning each observed value its majority label.
        """
        # Majority label over the whole training set; used as the stump's
        # own (fallback) label.  NB: the loop below deliberately uses a
        # distinct variable name -- previously the loop variable was also
        # called ``label`` and clobbered this value, so the fallback label
        # ended up being the label of the *last* training example.
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, example_label in labeled_featuresets:
            feature_value = featureset.get(feature_name)
            freqs[feature_value][example_label] += 1

        decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
        return DecisionTreeClassifier(label, feature_name, decisions)

    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Recursively replace high-entropy children of this node with
        subtrees trained on the matching subset of the data.  Stops
        when the data is too small, the node is a leaf, or the depth
        budget is exhausted.
        """
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:
            return
        if depth_cutoff <= 0:
            return
        for fval in self._decisions:
            fval_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == fval
            ]

            label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )

    @staticmethod
    def best_stump(feature_names, labeled_featuresets, verbose=False):
        """Return the stump (over any feature) with the lowest training error."""
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            stump_error = stump.error(labeled_featuresets)
            if stump_error < best_error:
                best_error = stump_error
                best_stump = stump
        if verbose:
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), best_stump._fname, best_error
                )
            )
        return best_stump

    @staticmethod
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        """
        Build a binary stump testing ``feature_name == feature_value``,
        with a default child for the negative case.
        """
        # Majority label over the whole training set (the stump's own
        # label).  As in ``stump()``, the loop variable below must not
        # shadow this name, or the fallback label would be the label of
        # the last example rather than the most frequent one.
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Count labels on the positive and negative side of the test.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, example_label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[example_label] += 1
            else:
                neg_fdist[example_label] += 1

        decisions = {}
        default = None
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())
        # NB: when there are no negative observations, ``default`` stays
        # None so ``classify()`` falls back on this node's own label.
        # (Previously a raw label was stored here, which crashed with an
        # AttributeError when an unseen feature value was classified.)

        return DecisionTreeClassifier(label, feature_name, decisions, default)

    @staticmethod
    def best_binary_stump(
        feature_names, labeled_featuresets, feature_values, verbose=False
    ):
        """Return the binary stump (over any feature/value pair) with the
        lowest training error."""
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            for fval in feature_values[fname]:
                stump = DecisionTreeClassifier.binary_stump(
                    fname, fval, labeled_featuresets
                )
                stump_error = stump.error(labeled_featuresets)
                if stump_error < best_error:
                    best_error = stump_error
                    best_stump = stump
        if verbose:
            if best_stump._decisions:
                descr = "{}={}".format(
                    best_stump._fname, list(best_stump._decisions.keys())[0]
                )
            else:
                descr = "(default)"
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), descr, best_error
                )
            )
        return best_stump
327
+
328
+
329
+ ##//////////////////////////////////////////////////////
330
+ ## Demo
331
+ ##//////////////////////////////////////////////////////
332
+
333
+
334
def f(x):
    """Train a binary decision tree on *x* with verbose tracing enabled."""
    return DecisionTreeClassifier.train(x, verbose=True, binary=True)
336
+
337
+
338
def demo():
    """Train a decision tree on the names corpus and display it in both
    the pretty-printed and pseudocode renderings."""
    from nltk.classify.util import binary_names_demo_features, names_demo

    # ``f`` wraps DecisionTreeClassifier.train with binary=True.
    tree = names_demo(f, binary_names_demo_features)
    print(tree.pretty_format(depth=7))
    print(tree.pseudocode(depth=7))
346
+
347
+
348
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/maxent.py ADDED
@@ -0,0 +1,1569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Maximum Entropy Classifiers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ A classifier model based on maximum entropy modeling framework. This
11
+ framework considers all of the probability distributions that are
12
+ empirically consistent with the training data; and chooses the
13
+ distribution with the highest entropy. A probability distribution is
14
+ "empirically consistent" with a set of training data if its estimated
15
+ frequency with which a class and a feature vector value co-occur is
16
+ equal to the actual frequency in the data.
17
+
18
+ Terminology: 'feature'
19
+ ======================
20
+ The term *feature* is usually used to refer to some property of an
21
+ unlabeled token. For example, when performing word sense
22
+ disambiguation, we might define a ``'prevword'`` feature whose value is
23
+ the word preceding the target word. However, in the context of
24
+ maxent modeling, the term *feature* is typically used to refer to a
25
+ property of a "labeled" token. In order to prevent confusion, we
26
+ will introduce two distinct terms to disambiguate these two different
27
+ concepts:
28
+
29
+ - An "input-feature" is a property of an unlabeled token.
30
+ - A "joint-feature" is a property of a labeled token.
31
+
32
+ In the rest of the ``nltk.classify`` module, the term "features" is
33
+ used to refer to what we will call "input-features" in this module.
34
+
35
+ In literature that describes and discusses maximum entropy models,
36
+ input-features are typically called "contexts", and joint-features
37
+ are simply referred to as "features".
38
+
39
+ Converting Input-Features to Joint-Features
40
+ -------------------------------------------
41
+ In maximum entropy models, joint-features are required to have numeric
42
+ values. Typically, each input-feature ``input_feat`` is mapped to a
43
+ set of joint-features of the form:
44
+
45
+ | joint_feat(token, label) = { 1 if input_feat(token) == feat_val
46
+ | { and label == some_label
47
+ | {
48
+ | { 0 otherwise
49
+
50
+ For all values of ``feat_val`` and ``some_label``. This mapping is
51
+ performed by classes that implement the ``MaxentFeatureEncodingI``
52
+ interface.
53
+ """
54
+ try:
55
+ import numpy
56
+ except ImportError:
57
+ pass
58
+
59
+ import os
60
+ import tempfile
61
+ from collections import defaultdict
62
+
63
+ from nltk.classify.api import ClassifierI
64
+ from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file
65
+ from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file
66
+ from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
67
+ from nltk.data import gzip_open_unicode
68
+ from nltk.probability import DictionaryProbDist
69
+ from nltk.util import OrderedDict
70
+
71
+ __docformat__ = "epytext en"
72
+
73
+ ######################################################################
74
+ # { Classifier Model
75
+ ######################################################################
76
+
77
+
78
+ class MaxentClassifier(ClassifierI):
79
+ """
80
+ A maximum entropy classifier (also known as a "conditional
81
+ exponential classifier"). This classifier is parameterized by a
82
+ set of "weights", which are used to combine the joint-features
83
+ that are generated from a featureset by an "encoding". In
84
+ particular, the encoding maps each ``(featureset, label)`` pair to
85
+ a vector. The probability of each label is then computed using
86
+ the following equation::
87
+
88
+ dotprod(weights, encode(fs,label))
89
+ prob(fs|label) = ---------------------------------------------------
90
+ sum(dotprod(weights, encode(fs,l)) for l in labels)
91
+
92
+ Where ``dotprod`` is the dot product::
93
+
94
+ dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
95
+ """
96
+
97
+ def __init__(self, encoding, weights, logarithmic=True):
98
+ """
99
+ Construct a new maxent classifier model. Typically, new
100
+ classifier models are created using the ``train()`` method.
101
+
102
+ :type encoding: MaxentFeatureEncodingI
103
+ :param encoding: An encoding that is used to convert the
104
+ featuresets that are given to the ``classify`` method into
105
+ joint-feature vectors, which are used by the maxent
106
+ classifier model.
107
+
108
+ :type weights: list of float
109
+ :param weights: The feature weight vector for this classifier.
110
+
111
+ :type logarithmic: bool
112
+ :param logarithmic: If false, then use non-logarithmic weights.
113
+ """
114
+ self._encoding = encoding
115
+ self._weights = weights
116
+ self._logarithmic = logarithmic
117
+ # self._logarithmic = False
118
+ assert encoding.length() == len(weights)
119
+
120
+ def labels(self):
121
+ return self._encoding.labels()
122
+
123
+ def set_weights(self, new_weights):
124
+ """
125
+ Set the feature weight vector for this classifier.
126
+ :param new_weights: The new feature weight vector.
127
+ :type new_weights: list of float
128
+ """
129
+ self._weights = new_weights
130
+ assert self._encoding.length() == len(new_weights)
131
+
132
+ def weights(self):
133
+ """
134
+ :return: The feature weight vector for this classifier.
135
+ :rtype: list of float
136
+ """
137
+ return self._weights
138
+
139
+ def classify(self, featureset):
140
+ return self.prob_classify(featureset).max()
141
+
142
+ def prob_classify(self, featureset):
143
+ prob_dict = {}
144
+ for label in self._encoding.labels():
145
+ feature_vector = self._encoding.encode(featureset, label)
146
+
147
+ if self._logarithmic:
148
+ total = 0.0
149
+ for (f_id, f_val) in feature_vector:
150
+ total += self._weights[f_id] * f_val
151
+ prob_dict[label] = total
152
+
153
+ else:
154
+ prod = 1.0
155
+ for (f_id, f_val) in feature_vector:
156
+ prod *= self._weights[f_id] ** f_val
157
+ prob_dict[label] = prod
158
+
159
+ # Normalize the dictionary to give a probability distribution
160
+ return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)
161
+
162
+ def explain(self, featureset, columns=4):
163
+ """
164
+ Print a table showing the effect of each of the features in
165
+ the given feature set, and how they combine to determine the
166
+ probabilities of each label for that featureset.
167
+ """
168
+ descr_width = 50
169
+ TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f"
170
+
171
+ pdist = self.prob_classify(featureset)
172
+ labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
173
+ labels = labels[:columns]
174
+ print(
175
+ " Feature".ljust(descr_width)
176
+ + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
177
+ )
178
+ print(" " + "-" * (descr_width - 2 + 8 * len(labels)))
179
+ sums = defaultdict(int)
180
+ for i, label in enumerate(labels):
181
+ feature_vector = self._encoding.encode(featureset, label)
182
+ feature_vector.sort(
183
+ key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
184
+ )
185
+ for (f_id, f_val) in feature_vector:
186
+ if self._logarithmic:
187
+ score = self._weights[f_id] * f_val
188
+ else:
189
+ score = self._weights[f_id] ** f_val
190
+ descr = self._encoding.describe(f_id)
191
+ descr = descr.split(" and label is ")[0] # hack
192
+ descr += " (%s)" % f_val # hack
193
+ if len(descr) > 47:
194
+ descr = descr[:44] + "..."
195
+ print(TEMPLATE % (descr, i * 8 * " ", score))
196
+ sums[label] += score
197
+ print(" " + "-" * (descr_width - 1 + 8 * len(labels)))
198
+ print(
199
+ " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
200
+ )
201
+ print(
202
+ " PROBS:".ljust(descr_width)
203
+ + "".join("%8.3f" % pdist.prob(l) for l in labels)
204
+ )
205
+
206
+ def most_informative_features(self, n=10):
207
+ """
208
+ Generates the ranked list of informative features from most to least.
209
+ """
210
+ if hasattr(self, "_most_informative_features"):
211
+ return self._most_informative_features[:n]
212
+ else:
213
+ self._most_informative_features = sorted(
214
+ list(range(len(self._weights))),
215
+ key=lambda fid: abs(self._weights[fid]),
216
+ reverse=True,
217
+ )
218
+ return self._most_informative_features[:n]
219
+
220
+ def show_most_informative_features(self, n=10, show="all"):
221
+ """
222
+ :param show: all, neg, or pos (for negative-only or positive-only)
223
+ :type show: str
224
+ :param n: The no. of top features
225
+ :type n: int
226
+ """
227
+ # Use None the full list of ranked features.
228
+ fids = self.most_informative_features(None)
229
+ if show == "pos":
230
+ fids = [fid for fid in fids if self._weights[fid] > 0]
231
+ elif show == "neg":
232
+ fids = [fid for fid in fids if self._weights[fid] < 0]
233
+ for fid in fids[:n]:
234
+ print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}")
235
+
236
+ def __repr__(self):
237
+ return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
238
+ len(self._encoding.labels()),
239
+ self._encoding.length(),
240
+ )
241
+
242
+ #: A list of the algorithm names that are accepted for the
243
+ #: ``train()`` method's ``algorithm`` parameter.
244
+ ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]
245
+
246
@classmethod
def train(
    cls,
    train_toks,
    algorithm=None,
    trace=3,
    encoding=None,
    labels=None,
    gaussian_prior_sigma=0,
    **cutoffs,
):
    """
    Train a new maxent classifier based on the given corpus of
    training samples.  This classifier will have its weights
    chosen to maximize entropy while remaining empirically
    consistent with the training corpus.

    :rtype: MaxentClassifier
    :return: The new maxent classifier

    :type train_toks: list
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a featureset,
        and the second of which is a classification label.

    :type algorithm: str
    :param algorithm: A case-insensitive string, specifying which
        algorithm should be used to train the classifier.  The
        following algorithms are currently available.

        - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
          Improved Iterative Scaling (``'IIS'``)
        - External Libraries (requiring megam):
          LM-BFGS algorithm, with training performed by Megam (``'megam'``)
        - External Libraries (requiring tadm):
          training performed by the TADM toolkit (``'tadm'``)

        The default algorithm is ``'IIS'``.

    :type trace: int
    :param trace: The level of diagnostic tracing output to produce.
        Higher values produce more verbose output.
    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  If none is specified, then a
        ``BinaryMaxentFeatureEncoding`` will be built based on the
        features that are attested in the training corpus.
    :type labels: list(str)
    :param labels: The set of possible labels.  If none is given, then
        the set of all labels attested in the training data will be
        used instead.
    :param gaussian_prior_sigma: The sigma value for a gaussian
        prior on model weights.  Currently, this is supported by
        ``megam``.  For other algorithms, its value is ignored.
    :param cutoffs: Arguments specifying various conditions under
        which the training should be halted.  (Some of the cutoff
        conditions are not supported by some algorithms.)

        - ``max_iter=v``: Terminate after ``v`` iterations.
        - ``min_ll=v``: Terminate after the negative average
          log-likelihood drops under ``v``.
        - ``min_lldelta=v``: Terminate if a single iteration improves
          log likelihood by less than ``v``.
    """
    if algorithm is None:
        algorithm = "iis"
    # Reject unknown cutoff keywords early, before any (potentially
    # expensive) training work begins.
    for key in cutoffs:
        if key not in (
            "max_iter",
            "min_ll",
            "min_lldelta",
            "max_acc",
            "min_accdelta",
            "count_cutoff",
            "norm",
            "explicit",
            "bernoulli",
        ):
            raise TypeError("Unexpected keyword arg %r" % key)
    # Dispatch on the lower-cased algorithm name (see ALGORITHMS).
    algorithm = algorithm.lower()
    if algorithm == "iis":
        return train_maxent_classifier_with_iis(
            train_toks, trace, encoding, labels, **cutoffs
        )
    elif algorithm == "gis":
        return train_maxent_classifier_with_gis(
            train_toks, trace, encoding, labels, **cutoffs
        )
    elif algorithm == "megam":
        return train_maxent_classifier_with_megam(
            train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
        )
    elif algorithm == "tadm":
        # TADM takes keyword arguments rather than positional ones.
        kwargs = cutoffs
        kwargs["trace"] = trace
        kwargs["encoding"] = encoding
        kwargs["labels"] = labels
        kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
        return TadmMaxentClassifier.train(train_toks, **kwargs)
    else:
        raise ValueError("Unknown algorithm %s" % algorithm)
345
+
346
+
347
#: Alias for MaxentClassifier: a conditional exponential model is the
#: same thing as a maximum entropy model.  The GIS/IIS trainers below
#: construct their classifiers through this name.
ConditionalExponentialClassifier = MaxentClassifier
349
+
350
+
351
+ ######################################################################
352
+ # { Feature Encodings
353
+ ######################################################################
354
+
355
+
356
class MaxentFeatureEncodingI:
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the \"known labels\" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    @classmethod
    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        Declared as a classmethod to match the concrete encodings
        (e.g. ``BinaryMaxentFeatureEncoding.train``), which all
        implement ``train`` as a classmethod; the decorator was
        previously missing here even though the first parameter was
        named ``cls``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()
427
+
428
+
429
class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that delegates to a user-supplied callable to
    map a given featureset/label pair to a sparse joint-feature
    vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding around ``func``.

        :type func: (callable)
        :param func: Called as ``func(featureset, label)``; must
            return the sparse joint feature vector that encodes the
            pair, as a list of ``(index, value)`` tuples.

        :type length: int
        :param length: The fixed size of the joint-feature vectors
            that ``func`` produces.

        :type labels: list
        :param labels: The \"known labels\" for this encoding --
            i.e., every label ``l`` for which ``self.encode(fs, l)``
            can be a nonzero joint-feature vector for some value of
            ``fs``.
        """
        self._func = func
        self._labels = labels
        self._length = length

    def encode(self, featureset, label):
        # All encoding work is delegated to the user's callable.
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        # A bare function carries no per-feature metadata to report.
        return "no description available"
474
+
475
+
476
class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing a binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form::

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        # Feature ids must be dense (0..len-1): the trainers index
        # weight arrays directly by feature id.
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        # Optional feature blocks are appended after the base mapping,
        # extending the vector length accordingly.
        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                # (for/else: runs only when the loop found no match).
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        # Lazily build and cache the fid -> (fname, fval, label)
        # reverse mapping on first use.
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  A joint-feature is only added to the
            encoding once its ``(fname, fval)`` pair has been observed
            at least ``count_cutoff`` times in the training corpus.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
677
+
678
+
679
class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            # Default C: one more than the number of distinct feature
            # names, so any encoded vector's sum stays strictly below C
            # and the correction feature is always positive.
            C = len({fname for (fname, fval, label) in mapping}) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature (at index base_length) so that the
        # full vector sums to exactly C.
        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError("Correction feature is not high enough!")
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding

    def length(self):
        # One extra slot for the correction feature.
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return "Correction feature (%s)" % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)
738
+
739
+
740
class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    """
    A feature encoding in the event-based format used by the external
    TADM trainer.  Unlike its base class, ``encode()`` grows the
    mapping on demand: encoding an unseen (feature, label) pair
    assigns it a new feature id as a side effect.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        # OrderedDicts keep feature ids in insertion order, giving a
        # stable, reproducible feature numbering.
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        # NOTE(review): mutates self._mapping and self._label_mapping --
        # encoding new data changes this encoding's length().
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    # Non-integer feature values are replaced by a dense
                    # integer id; integer values pass through unchanged.
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        return self._labels

    def describe(self, fid):
        # NOTE(review): returns the (feature, label) tuple rather than a
        # string (unlike MaxentFeatureEncodingI.describe), and falls
        # through to None for an unknown fid.
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Build an encoding covering the full cross-product: each feature
        name is paired with *every* label seen in (or passed via)
        ``labels``, not only the label it occurred with.

        Note: ``count_cutoff`` is accepted for interface compatibility
        with the base class but is not used here.  Also note that a
        caller-supplied ``labels`` list is extended in place with any
        additional labels found in ``train_toks``.
        """
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)
794
+
795
+
796
class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if (fs[fname] == type(fval))
    |                      {      and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...``, label)[id]} is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        # Feature ids must be dense (0..len-1): trainers index weight
        # arrays directly by feature id.
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        # Optional feature blocks are appended after the base mapping,
        # extending the vector length accordingly.
        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            if isinstance(fval, (int, float)):
                # Typed (numeric) feature: keyed by the value's *type*,
                # with the value itself carried in the vector.  Note
                # that the unseen-value fallback below applies only to
                # the binary branch.
                if (fname, type(fval), label) in self._mapping:
                    encoding.append((self._mapping[fname, type(fval), label], fval))
            else:
                # Known feature name & value:
                if (fname, fval, label) in self._mapping:
                    encoding.append((self._mapping[fname, fval, label], 1))

                # Otherwise, we might want to fire an "unseen-value feature".
                elif self._unseen:
                    # Have we seen this fname/fval combination with any label?
                    for label2 in self._labels:
                        if (fname, fval, label2) in self._mapping:
                            break  # we've seen this fname/fval combo
                    # We haven't -- fire the unseen-value feature
                    # (for/else: runs only when the loop found no match).
                    else:
                        if fname in self._unseen:
                            encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        # Lazily build and cache the fid -> (fname, fval, label)
        # reverse mapping on first use.
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature value types are (int, float); other
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  A joint-feature is only added to the
            encoding once its ``(fname, fval)`` pair has been observed
            at least ``count_cutoff`` times in the training corpus.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():
                # Numeric features are keyed by type, not by value.
                if type(fval) in (int, float):
                    fval = type(fval)
                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
1022
+
1023
+
1024
+ ######################################################################
1025
+ # { Classifier Trainer: Generalized Iterative Scaling
1026
+ ######################################################################
1027
+
1028
+
1029
def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, "C"):
        raise TypeError(
            "The GIS algorithm requires an encoding that "
            "defines C (e.g., GISEncoding)."
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), "d")
    for fid in unattested:
        # -numpy.inf instead of numpy.NINF: the NINF alias was removed
        # in NumPy 2.0; the value is identical.
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print(" ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print(" Iteration Log Likelihood Accuracy")
        print(" ---------------------------------------")

    # Train the classifier.  A keyboard interrupt stops training early
    # but still returns the partially-trained model.  (A redundant
    # ``except: raise`` clause was removed -- re-raising all other
    # exceptions unchanged is the default behavior.)
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print(" %9d %14.5f %9.3f" % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print(" Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f" Final {ll:14.5f} {acc:9.3f}")

    # Return the classifier.
    return classifier
1125
+
1126
+
1127
def calculate_empirical_fcount(train_toks, encoding):
    """
    Compute the observed total of every joint-feature over a corpus.

    :param train_toks: a list of ``(featureset, label)`` pairs.
    :param encoding: the feature encoding used to translate each pair
        into a sparse joint-feature vector.
    :return: a dense numpy array where entry ``i`` is the sum of
        joint-feature ``i``'s values across all training tokens.
    """
    totals = numpy.zeros(encoding.length(), "d")
    for featureset, label in train_toks:
        for feature_id, feature_val in encoding.encode(featureset, label):
            totals[feature_id] += feature_val
    return totals
1135
+
1136
+
1137
def calculate_estimated_fcount(classifier, train_toks, encoding):
    """
    Compute the model's expected total of every joint-feature over a
    corpus: for each token, each label's joint-feature values are
    weighted by the probability the classifier assigns to that label.

    :param classifier: a trained (or partially trained) classifier
        providing ``prob_classify(featureset)``.
    :param train_toks: a list of ``(featureset, label)`` pairs; the
        gold labels are not used here.
    :param encoding: the feature encoding used to translate each
        (featureset, label) pair into a sparse joint-feature vector.
    :return: a dense numpy array of expected joint-feature totals.
    """
    expected = numpy.zeros(encoding.length(), "d")
    # The gold label is deliberately ignored: expectations are taken
    # over the model's own label distribution.
    for featureset, _gold in train_toks:
        pdist = classifier.prob_classify(featureset)
        for candidate in pdist.samples():
            p = pdist.prob(candidate)
            for feature_id, feature_val in encoding.encode(featureset, candidate):
                expected[feature_id] += p * feature_val
    return expected
1148
+
1149
+
1150
+ ######################################################################
1151
+ # { Classifier Trainer: Improved Iterative Scaling
1152
+ ######################################################################
1153
+
1154
+
1155
def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Empirical frequency (not count) of each feature in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # nf(feature_vector) is the sum of feature values for a labeled text.
    # nfmap compresses the sparse set of attested nf values to a dense
    # range 0..N; nfarray/nftranspose invert that mapping for the matrix
    # computations inside calculate_deltas().
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), "d")
    for fid in unattested:
        # Use -numpy.inf rather than numpy.NINF: the NINF alias was
        # removed in NumPy 2.0.
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier; Ctrl-C stops early but still returns the
    # partially-trained model.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")
    # NOTE: the original trailing "except: raise" clause was a no-op
    # (bare re-raise) and has been removed.

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f} {acc:9.3f}")

    # Return the classifier.
    return classifier
1245
+
1246
+
1247
def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector* -- i.e., the amount of "feature mass" active for a
    given labeled text.  This function collects every nf value attested
    by any training token under any label, and maps those attested
    values onto a dense range *0...N*.  E.g., if only 3, 5, and 7 are
    attested, the result might be ``{3: 0, 5: 1, 7: 2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    attested = set()
    for featureset, _ in train_toks:
        for label in encoding.labels():
            attested.add(sum(fval for _, fval in encoding.encode(featureset, label)))
    return {nf: idx for idx, nf in enumerate(attested)}
1273
+
1274
+
1275
def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    r"""
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
        - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                        feature_vector(fs,l)[i] .
    |                        exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==0`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    # Initial guess: delta[i] = 1 for every feature.
    deltas = numpy.ones(encoding.length(), "d")

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), "d")

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        # NOTE(review): the loop variable deliberately shadows the
        # gold `label` from the outer tuple -- A sums over *all*
        # candidate labels, not just the observed one.
        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features
            nf = sum(val for (id, val) in feature_vector)
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf exp(delta[i]nf)
    # Base-2 exponent (2**nf_delta) matches the base-2 log weights
    # used elsewhere in this module.
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2**nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero for features with no empirical mass.
        for fid in unattested:
            sum2[fid] += 1

        # Newton step: delta -= f(delta)/f'(delta).
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas

    # Newton iteration cap reached; return the best estimate so far.
    return deltas
1399
+
1400
+
1401
+ ######################################################################
1402
+ # { Classifier Trainer: megam
1403
+ ######################################################################
1404
+
1405
+ # [xx] possible extension: add support for using implicit file format;
1406
+ # this would need to put requirements on what encoding is used. But
1407
+ # we may need this for other maxent classifier trainers that require
1408
+ # implicit formats anyway.
1409
def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """

    # File-format flags, overridable via kwargs; both default to the
    # 'explicit' + 'bernoulli' megam input format.
    explicit = True
    bernoulli = True
    if "explicit" in kwargs:
        explicit = kwargs["explicit"]
    if "bernoulli" in kwargs:
        bernoulli = kwargs["bernoulli"]

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option. Not sure where the best place for it is.
        count_cutoff = kwargs.get("count_cutoff", 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError("Specify encoding or labels, not both")

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
        with open(trainfile_name, "w") as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, ValueError) as e:
        raise ValueError("Error while creating megam training file: %s" % e) from e

    # Build the megam command-line options.
    options = []
    options += ["-nobias", "-repeat", "10"]
    if explicit:
        options += ["-explicit"]
    if not bernoulli:
        options += ["-fvals"]
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf
        inv_variance = 1.0 / gaussian_prior_sigma**2
    else:
        inv_variance = 0
    options += ["-lambda", "%.2f" % inv_variance, "-tune"]
    if trace < 3:
        options += ["-quiet"]
    if "max_iter" in kwargs:
        options += ["-maxi", "%s" % kwargs["max_iter"]]
    if "ll_delta" in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
    if hasattr(encoding, "cost"):
        # A cost-aware encoding means weighted multiclass training.
        options += ["-multilabel"]  # each possible la
    options += ["multiclass", trainfile_name]
    stdout = call_megam(options)
    # print('./megam_i686.opt ', ' '.join(options))
    # Delete the training file (best-effort; warn rather than fail).
    try:
        os.remove(trainfile_name)
    except OSError as e:
        print(f"Warning: unable to delete {trainfile_name}: {e}")

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights, since MaxentClassifier
    # works in log base 2.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)
1494
+
1495
+
1496
+ ######################################################################
1497
+ # { Classifier Trainer: tadm
1498
+ ######################################################################
1499
+
1500
+
1501
class TadmMaxentClassifier(MaxentClassifier):
    """A ``MaxentClassifier`` trained via the external ``tadm`` estimator."""

    @classmethod
    def train(cls, train_toks, **kwargs):
        """
        Train a new classifier by writing the training events to a
        gzipped temp file, invoking the ``tadm`` binary on it, and
        parsing the resulting weight file.

        Recognized keyword arguments: ``algorithm`` (tadm -method,
        default ``tao_lmvm``), ``trace``, ``encoding``, ``labels``,
        ``gaussian_prior_sigma``, ``count_cutoff``, ``max_iter``,
        ``min_lldelta``.
        """
        algorithm = kwargs.get("algorithm", "tao_lmvm")
        trace = kwargs.get("trace", 3)
        encoding = kwargs.get("encoding", None)
        labels = kwargs.get("labels", None)
        sigma = kwargs.get("gaussian_prior_sigma", 0)
        count_cutoff = kwargs.get("count_cutoff", 0)
        max_iter = kwargs.get("max_iter")
        ll_delta = kwargs.get("min_lldelta")

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        # Temp files for tadm's input events and output weights.
        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix="nltk-tadm-events-", suffix=".gz"
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")

        trainfile = gzip_open_unicode(trainfile_name, "w")
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        # Assemble the tadm command-line options.
        options = []
        options.extend(["-monitor"])
        options.extend(["-method", algorithm])
        if sigma:
            # tadm takes the variance (sigma**2) for its L2 penalty.
            options.extend(["-l2", "%.6f" % sigma**2])
        if max_iter:
            options.extend(["-max_it", "%d" % max_iter])
        if ll_delta:
            options.extend(["-fatol", "%.6f" % abs(ll_delta)])
        options.extend(["-events_in", trainfile_name])
        options.extend(["-params_out", weightfile_name])
        if trace < 3:
            options.extend(["2>&1"])
        else:
            options.extend(["-summary"])

        call_tadm(options)

        with open(weightfile_name) as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights, matching the
        # MaxentClassifier convention.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)
1557
+
1558
+
1559
+ ######################################################################
1560
+ # { Demo
1561
+ ######################################################################
1562
def demo():
    """Train a MaxentClassifier on the names-gender demo corpus."""
    from nltk.classify.util import names_demo

    names_demo(MaxentClassifier.train)
1566
+
1567
+
1568
+ if __name__ == "__main__":
1569
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Interface to Megam Classifier
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A set of functions used to interface with the external megam_ maxent
10
+ optimization package. Before megam can be used, you should tell NLTK where it
11
+ can find the megam binary, using the ``config_megam()`` function. Typical
12
+ usage:
13
+
14
+ >>> from nltk.classify import megam
15
+ >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
16
+ [Found megam: ...]
17
+
18
+ Use with MaxentClassifier. Example below, see MaxentClassifier documentation
19
+ for details.
20
+
21
+ nltk.classify.MaxentClassifier.train(corpus, 'megam')
22
+
23
+ .. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
24
+ """
25
+ import subprocess
26
+
27
+ from nltk.internals import find_binary
28
+
29
+ try:
30
+ import numpy
31
+ except ImportError:
32
+ numpy = None
33
+
34
+ ######################################################################
35
+ # { Configuration
36
+ ######################################################################
37
+
38
+ _megam_bin = None
39
+
40
+
41
def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    located = find_binary(
        "megam",
        bin,
        env_vars=["MEGAM"],
        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        url="https://www.umiacs.umd.edu/~hal/megam/index.html",
    )
    _megam_bin = located
+ )
59
+
60
+
61
+ ######################################################################
62
+ # { Megam Interface Functions
63
+ ######################################################################
64
+
65
+
66
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens, one line per training instance.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    all_labels = encoding.labels()
    label_index = {lbl: pos for pos, lbl in enumerate(all_labels)}
    costed = hasattr(encoding, "cost")

    for featureset, gold in train_toks:
        # Leading field: per-label costs (weighted multiclass) if the
        # encoding supports them, otherwise the gold label's index.
        if costed:
            stream.write(
                ":".join(str(encoding.cost(featureset, gold, lbl)) for lbl in all_labels)
            )
        else:
            stream.write(str(label_index[gold]))

        if explicit:
            # Explicit format: features that would fire under every
            # candidate label, each group introduced by " #".
            for lbl in all_labels:
                stream.write(" #")
                _write_megam_features(encoding.encode(featureset, lbl), stream, bernoulli)
        else:
            # Implicit format: only the gold label's active features.
            _write_megam_features(encoding.encode(featureset, gold), stream, bernoulli)

        # End of the instance.
        stream.write("\n")
+ stream.write("\n")
124
+
125
+
126
def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"
    weights = numpy.zeros(features_count, "d")
    # Each non-blank line is "<feature-id> <weight>".
    for entry in s.strip().split("\n"):
        if not entry.strip():
            continue
        fid, weight = entry.split()
        weights[int(fid)] = float(weight)
    return weights
+
143
+
144
+ def _write_megam_features(vector, stream, bernoulli):
145
+ if not vector:
146
+ raise ValueError(
147
+ "MEGAM classifier requires the use of an " "always-on feature."
148
+ )
149
+ for (fid, fval) in vector:
150
+ if bernoulli:
151
+ if fval == 1:
152
+ stream.write(" %s" % fid)
153
+ elif fval != 0:
154
+ raise ValueError(
155
+ "If bernoulli=True, then all" "features must be binary."
156
+ )
157
+ else:
158
+ stream.write(f" {fid} {fval}")
159
+
160
+
161
def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    :param args: command-line arguments to pass to ``megam``
        (must be a list of strings, not a single string).
    :return: megam's stdout, decoded to ``str``.
    :raises OSError: if megam exits with a nonzero return code.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    # Lazily locate the binary on first use.
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess
    cmd = [_megam_bin] + args
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # NOTE(review): stderr is not piped, so `stderr` below is always
    # None here -- the print on failure shows nothing useful; verify
    # whether stderr=subprocess.PIPE was intended.
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr)
        raise OSError("megam command failed!")

    # Popen gives bytes; normalize to str for the weight parser.
    if isinstance(stdout, str):
        return stdout
    else:
        return stdout.decode("utf-8")
.eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Naive Bayes Classifiers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A classifier based on the Naive Bayes algorithm. In order to find the
10
+ probability for a label, this algorithm first uses the Bayes rule to
11
+ express P(label|features) in terms of P(label) and P(features|label):
12
+
13
+ | P(label) * P(features|label)
14
+ | P(label|features) = ------------------------------
15
+ | P(features)
16
+
17
+ The algorithm then makes the 'naive' assumption that all features are
18
+ independent, given the label:
19
+
20
+ | P(label) * P(f1|label) * ... * P(fn|label)
21
+ | P(label|features) = --------------------------------------------
22
+ | P(features)
23
+
24
+ Rather than computing P(features) explicitly, the algorithm just
25
+ calculates the numerator for each label, and normalizes them so they
26
+ sum to one:
27
+
28
+ | P(label) * P(f1|label) * ... * P(fn|label)
29
+ | P(label|features) = --------------------------------------------
30
+ | SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
31
+ """
32
+
33
+ from collections import defaultdict
34
+
35
+ from nltk.classify.api import ClassifierI
36
+ from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
37
+
38
+ ##//////////////////////////////////////////////////////
39
+ ## Naive Bayes Classifier
40
+ ##//////////////////////////////////////////////////////
41
+
42
+
43
class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

    - P(label) gives the probability that an input will receive each
      label, given no information about the input's features.

    - P(fname=fval|label) gives the probability that a given feature
      (fname) will receive a given value (fval), given that the
      label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        # Materialize the label set once; used by classify/prob_classify.
        self._labels = list(label_probdist.samples())

    def labels(self):
        """Return the list of labels this classifier can assign."""
        return self._labels

    def classify(self, featureset):
        """Return the single most probable label for *featureset*."""
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """
        Return a ``DictionaryProbDist`` giving P(label|featureset)
        for every known label.
        """
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        """Print the *n* most informative features with their label-odds ratios."""
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            # Sort labels so that labels[0] has the lowest
            # P(fname=fval|label) and labels[-1] the highest.
            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        if hasattr(self, "_most_informative_features"):
            # Cached from a previous call; lists are cheap to slice.
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    # A zero minimum would make the ratio infinite;
                    # such features are dropped rather than ranked.
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are (smallest min/max ratio first; ties broken
            # deterministically by name and value).
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        :param estimator: factory turning a ``FreqDist`` into a
            ``ProbDistI`` (defaults to expected-likelihood estimation).
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
245
+
246
+
247
+ ##//////////////////////////////////////////////////////
248
+ ## Demo
249
+ ##//////////////////////////////////////////////////////
250
+
251
+
252
def demo():
    """Train a NaiveBayesClassifier on the names demo and show its top features."""
    from nltk.classify.util import names_demo

    trained = names_demo(NaiveBayesClassifier.train)
    trained.show_most_informative_features()
257
+
258
+
259
+ if __name__ == "__main__":
260
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Positive Naive Bayes Classifier
2
+ #
3
+ # Copyright (C) 2012 NLTK Project
4
+ # Author: Alessandro Presta <alessandro.presta@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ A variant of the Naive Bayes Classifier that performs binary classification with
10
+ partially-labeled training sets. In other words, assume we want to build a classifier
11
+ that assigns each example to one of two complementary classes (e.g., male names and
12
+ female names).
13
+ If we have a training set with labeled examples for both classes, we can use a
14
+ standard Naive Bayes Classifier. However, consider the case when we only have labeled
15
+ examples for one of the classes, and other, unlabeled, examples.
16
+ Then, assuming a prior distribution on the two labels, we can use the unlabeled set
17
+ to estimate the frequencies of the various features.
18
+
19
+ Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
20
+ and unlabeled examples. We are also given an estimate of P(1).
21
+
22
+ We compute P(feature|1) exactly as in the standard case.
23
+
24
+ To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
25
+ assuming that the unlabeled examples are drawn according to the given prior distribution)
26
+ and then express the conditional probability as:
27
+
28
+ | P(feature) - P(feature|1) * P(1)
29
+ | P(feature|0) = ----------------------------------
30
+ | P(0)
31
+
32
+ Example:
33
+
34
+ >>> from nltk.classify import PositiveNaiveBayesClassifier
35
+
36
+ Some sentences about sports:
37
+
38
+ >>> sports_sentences = [ 'The team dominated the game',
39
+ ... 'They lost the ball',
40
+ ... 'The game was intense',
41
+ ... 'The goalkeeper catched the ball',
42
+ ... 'The other team controlled the ball' ]
43
+
44
+ Mixed topics, including sports:
45
+
46
+ >>> various_sentences = [ 'The President did not comment',
47
+ ... 'I lost the keys',
48
+ ... 'The team won the game',
49
+ ... 'Sara has two kids',
50
+ ... 'The ball went off the court',
51
+ ... 'They had the ball for the whole game',
52
+ ... 'The show is over' ]
53
+
54
+ The features of a sentence are simply the words it contains:
55
+
56
+ >>> def features(sentence):
57
+ ... words = sentence.lower().split()
58
+ ... return dict(('contains(%s)' % w, True) for w in words)
59
+
60
+ We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
61
+
62
+ >>> positive_featuresets = map(features, sports_sentences)
63
+ >>> unlabeled_featuresets = map(features, various_sentences)
64
+ >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
65
+ ... unlabeled_featuresets)
66
+
67
+ Is the following sentence about sports?
68
+
69
+ >>> classifier.classify(features('The cat is on the table'))
70
+ False
71
+
72
+ What about this one?
73
+
74
+ >>> classifier.classify(features('My team lost the game'))
75
+ True
76
+ """
77
+
78
+ from collections import defaultdict
79
+
80
+ from nltk.classify.naivebayes import NaiveBayesClassifier
81
+ from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
82
+
83
+ ##//////////////////////////////////////////////////////
84
+ ## Positive Naive Bayes Classifier
85
+ ##//////////////////////////////////////////////////////
86
+
87
+
88
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    """Binary naive Bayes classifier trained from positive-only and unlabeled
    examples, given a prior estimate of P(True).

    See the module docstring for how P(feature|False) is derived from the
    unlabeled data and the prior.
    """

    @classmethod
    def train(
        cls,
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        :param positive_featuresets: An iterable of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).

        :param estimator: frequency-to-probability estimator used for the
            positive feature distributions (default ``ELEProbDist``).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        num_positive_examples = 0
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_positive_examples += 1

        # Count up how many times each feature value occurred in unlabeled examples.
        num_unlabeled_examples = 0
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_unlabeled_examples += 1

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname][None] += num_positive_examples - count
            feature_values[fname].add(None)

        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: negative_prob_prior}
        )

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                # P(feature|0) = (P(feature) - P(feature|1) * P(1)) / P(0)
                prob = (
                    global_probdist.prob(fval)
                    - positive_prob_prior * feature_probdist[True, fname].prob(fval)
                ) / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True
            )

        # Return cls(...) rather than naming the class so that subclasses
        # train to their own type; @classmethod replaces the previous
        # @staticmethod (call sites are unaffected), matching the parent's
        # NaiveBayesClassifier.train convention.
        return cls(label_probdist, feature_probdist)
169
+
170
+
171
+ ##//////////////////////////////////////////////////////
172
+ ## Demo
173
+ ##//////////////////////////////////////////////////////
174
+
175
+
176
def demo():
    """Run the partially-labeled names demo and print the most informative
    features of the resulting classifier."""
    from nltk.classify.util import partial_names_demo

    trained = partial_names_demo(PositiveNaiveBayesClassifier.train)
    trained.show_most_informative_features()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/util.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Classifier Utility Functions
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Utility functions and classes for classifiers.
11
+ """
12
+
13
+ import math
14
+
15
+ # from nltk.util import Deprecated
16
+ import nltk.classify.util # for accuracy & log_likelihood
17
+ from nltk.util import LazyMap
18
+
19
+ ######################################################################
20
+ # { Helper Functions
21
+ ######################################################################
22
+
23
+ # alternative name possibility: 'map_featurefunc()'?
24
+ # alternative name possibility: 'detect_features()'?
25
+ # alternative name possibility: 'map_featuredetect()'?
26
+ # or.. just have users use LazyMap directly?
27
def apply_features(feature_func, toks, labeled=None):
    """
    Lazily map ``feature_func`` over ``toks`` using ``LazyMap``, analogous to
    ``map(feature_func, toks)`` but without materializing the results.

    When ``labeled`` is false, the returned list-like object's values equal::

        [feature_func(tok) for tok in toks]

    When ``labeled`` is true, each element of ``toks`` must be a
    ``(tok, label)`` pair and the values equal::

        [(feature_func(tok), label) for (tok, label) in toks]

    This avoids the memory overhead of storing every featureset up front,
    which matters most when ``toks`` itself is lazy (e.g. a corpus reader).

    :param feature_func: Function applied to each token; returns a featureset
        (a dict mapping feature names to feature values).
    :param toks: The tokens to featurize.  With ``labeled=True`` these are
        ``(tok, label)`` pairs; with ``labeled=False`` they are passed to
        ``feature_func`` directly.
    :param labeled: Whether ``toks`` contains labeled tokens.  Default:
        auto-detect by checking whether the first element is a tuple/list.
    """
    if labeled is None:
        # Auto-detect: a leading tuple/list is taken to mean (tok, label) pairs.
        labeled = toks and isinstance(toks[0], (tuple, list))
    if not labeled:
        return LazyMap(feature_func, toks)

    def featurize_pair(pair):
        return (feature_func(pair[0]), pair[1])

    return LazyMap(featurize_pair, toks)
70
+
71
+
72
def attested_labels(tokens):
    """
    Return a tuple of the distinct labels attested in *tokens*.

    :param tokens: classified tokens of the form ``(token, label)``.
    :type tokens: list
    :rtype: tuple of (immutable)
    """
    seen = set()
    for _, label in tokens:
        seen.add(label)
    return tuple(seen)
82
+
83
+
84
def log_likelihood(classifier, gold):
    """Return the log of the mean probability that *classifier* assigns to
    the gold labels of the ``(featureset, label)`` pairs in *gold*."""
    featuresets = [featureset for featureset, _ in gold]
    dists = classifier.prob_classify_many(featuresets)
    probs = [dist.prob(label) for (_, label), dist in zip(gold, dists)]
    return math.log(sum(probs) / len(probs))
88
+
89
+
90
def accuracy(classifier, gold):
    """Return the fraction of ``(featureset, label)`` pairs in *gold* that
    *classifier* labels correctly; 0 when *gold* is empty."""
    predictions = classifier.classify_many([featureset for featureset, _ in gold])
    matches = [guess == expected for (_, expected), guess in zip(gold, predictions)]
    if not matches:
        return 0
    return sum(matches) / len(matches)
97
+
98
+
99
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        """
        :param cutoffs: dict of cutoff thresholds.  Recognized keys:
            ``max_iter``, ``min_ll``, ``min_lldelta``, ``max_acc``,
            ``min_accdelta``.
        """
        # Normalize on our private copy.  The previous version normalized the
        # caller's dict in place, which both mutated the caller's data and
        # left self.cutoffs with the raw (un-negated) min_ll value.
        self.cutoffs = cutoffs.copy()
        if "min_ll" in self.cutoffs:
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None      # last observed log likelihood
        self.acc = None     # last observed accuracy
        self.iter = 1       # iteration counter (first check() sees iter == 2)

    def check(self, classifier, train_toks):
        """Return True if any configured cutoff has been reached."""
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # Bug fix: this previously recomputed log_likelihood here, so the
            # accuracy cutoffs compared against the wrong quantity.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
152
+
153
+
154
+ ######################################################################
155
+ # { Demos
156
+ ######################################################################
157
+
158
+
159
def names_demo_features(name):
    """Return a simple feature dict for *name*: an always-on feature, the
    first and last letters, and per-letter count/presence features."""
    lowered = name.lower()
    feats = {
        "alwayson": True,
        "startswith": name[0].lower(),
        "endswith": name[-1].lower(),
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        feats["count(%s)" % letter] = lowered.count(letter)
        feats["has(%s)" % letter] = letter in lowered
    return feats
168
+
169
+
170
def binary_names_demo_features(name):
    """Return boolean/count features for *name* suited to binary classifiers:
    vowel start/end flags plus per-letter count, presence, and start/end tests."""
    lowered = name.lower()
    first = name[0].lower()
    last = name[-1].lower()
    feats = {
        "alwayson": True,
        "startswith(vowel)": first in "aeiouy",
        "endswith(vowel)": last in "aeiouy",
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        feats["count(%s)" % letter] = lowered.count(letter)
        feats["has(%s)" % letter] = letter in lowered
        feats["startswith(%s)" % letter] = letter == first
        feats["endswith(%s)" % letter] = letter == last
    return feats
181
+
182
+
183
def names_demo(trainer, features=names_demo_features):
    """Train and evaluate a male/female name classifier on the NLTK names corpus.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a trained classifier.
    :param features: feature-extraction function applied to each name.
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    # Randomly split the names into a test & train set.
    # Fixed seed so repeated runs produce the same split.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == "male":
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        # Classifiers without probability support simply skip the report.
        pass

    # Return the classifier
    return classifier
228
+
229
+
230
def partial_names_demo(trainer, features=names_demo_features):
    """Train and evaluate a binary name classifier from positive-only plus
    unlabeled data (e.g. ``PositiveNaiveBayesClassifier.train``).

    :param trainer: callable taking ``(positive_featuresets,
        unlabeled_featuresets)`` and returning a trained classifier.
    :param features: feature-extraction function applied to each name.
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import names

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    # Fixed seed so repeated runs produce the same split.
    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Bug fix: zip() objects are not subscriptable in Python 3; the old
        # ``zip(test, pdists)[:5]`` raised TypeError (which the except clause
        # below does not catch).  Materialize before slicing.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
284
+
285
+
286
# Module-level cache of senseval instances, keyed by word (used by wsd_demo
# to avoid re-parsing the corpus on repeated calls).
_inst_cache = {}
287
+
288
+
289
def wsd_demo(trainer, word, features, n=1000):
    """Train and evaluate a word-sense classifier on the senseval corpus.

    :param trainer: callable taking a list of ``(featureset, label)`` pairs
        and returning a trained classifier.
    :param word: senseval lemma whose instances are loaded (passed to
        ``senseval.instances``).
    :param features: feature-extraction function applied to each instance.
    :param n: maximum number of instances to use (capped at corpus size).
    :return: the trained classifier.
    """
    import random

    from nltk.corpus import senseval

    # Get the instances.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        # Cache per word: parsing senseval instances is relatively expensive.
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list({l for (i, l) in instances})
    print(" Senses: " + " ".join(senses))

    # Randomly split the names into a test & train set.
    # Fixed seed so repeated runs produce the same split.
    print("Splitting into test & train...")
    random.seed(123456)
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        # NOTE(review): the comprehension below rebinds the parameter ``n``;
        # harmless since ``n`` is not used afterwards, but worth renaming.
        test_featuresets = [features(i) for (i, n) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
333
+
334
+
335
def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.

    :raises NameError: if ``_megam_bin`` has not been defined.  Presumably
        ``nltk.config_megam()`` is expected to inject it into this module's
        globals — TODO confirm; nothing in this file assigns it.
    """
    try:
        _megam_bin
    except NameError as e:
        # The literal was previously wrapped in a redundant str() call.
        err_msg = (
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
        raise NameError(err_msg) from e
.eggs/nltk-3.8-py3.10.egg/nltk/classify/weka.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Interface to Weka Classsifiers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ Classifiers that make use of the external 'Weka' package.
10
+ """
11
+
12
+ import os
13
+ import re
14
+ import subprocess
15
+ import tempfile
16
+ import time
17
+ import zipfile
18
+ from sys import stdin
19
+
20
+ from nltk.classify.api import ClassifierI
21
+ from nltk.internals import config_java, java
22
+ from nltk.probability import DictionaryProbDist
23
+
24
# Cached path to weka.jar; populated by config_weka().
_weka_classpath = None
# Default directories searched for weka.jar ($WEKAHOME, when set, is
# consulted first by config_weka()).
_weka_search = [
    ".",
    "/usr/share/weka",
    "/usr/local/share/weka",
    "/usr/lib/weka",
    "/usr/local/lib/weka",
]
32
+
33
+
34
def config_weka(classpath=None):
    """Locate weka.jar and cache its path in the module-level ``_weka_classpath``.

    :param classpath: explicit path to weka.jar.  When None, search
        ``$WEKAHOME`` followed by a list of standard install directories.
    :raises LookupError: if weka.jar cannot be found anywhere.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Work on a copy: the previous version mutated the module-level
        # _weka_search list, prepending $WEKAHOME again on every call.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(f"[Found Weka: {_weka_classpath} (version {version})]")
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                # (A redundant second _check_weka_version() call was removed.)

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "https://www.cs.waikato.ac.nz/ml/weka/"
        )
65
+
66
+
67
+ def _check_weka_version(jar):
68
+ try:
69
+ zf = zipfile.ZipFile(jar)
70
+ except (SystemExit, KeyboardInterrupt):
71
+ raise
72
+ except:
73
+ return None
74
+ try:
75
+ try:
76
+ return zf.read("weka/core/version.txt")
77
+ except KeyError:
78
+ return None
79
+ finally:
80
+ zf.close()
81
+
82
+
83
class WekaClassifier(ClassifierI):
    """Classifier backed by an external Weka installation.

    The model lives in a file produced by :meth:`train`; classification
    serializes the featuresets to a temporary ARFF file, runs Weka as a
    subprocess, and parses its stdout.
    """

    def __init__(self, formatter, model_filename):
        # formatter: ARFF_Formatter used to serialize featuresets.
        # model_filename: path to a model file previously written by Weka.
        self._formatter = formatter
        self._model = model_filename

    def prob_classify_many(self, featuresets):
        """Return a list of probability distributions, one per featureset."""
        return self._classify_many(featuresets, ["-p", "0", "-distribution"])

    def classify_many(self, featuresets):
        """Return a list of predicted labels, one per featureset."""
        return self._classify_many(featuresets, ["-p", "0"])

    def _classify_many(self, featuresets, options):
        """Run Weka over *featuresets* with the given command-line *options*."""
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = [
                "weka.classifiers.bayes.NaiveBayes",
                "-l",
                self._model,
                "-T",
                test_filename,
            ] + options
            (stdout, stderr) = java(
                cmd,
                classpath=_weka_classpath,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError(
                        "The installed version of weka does "
                        "not support probability distribution "
                        "output."
                    )
                else:
                    raise ValueError("Weka failed to generate output:\n%s" % stderr)

            # Parse weka's output.  Bug fix: when stdin is not attached to a
            # terminal, stdin.encoding can be None and decode(None) raises;
            # fall back to UTF-8 in that case.
            encoding = stdin.encoding or "utf-8"
            return self.parse_weka_output(stdout.decode(encoding).split("\n"))

        finally:
            # Best-effort cleanup of the flat temporary directory.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)

    def parse_weka_distribution(self, s):
        """Parse Weka's '*'-marked probability column into a DictionaryProbDist."""
        probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
        probs = dict(zip(self._formatter.labels(), probs))
        return DictionaryProbDist(probs)

    def parse_weka_output(self, lines):
        """Parse Weka prediction output lines into labels or distributions.

        :raises ValueError: if the output header is not a recognized format.
        """
        # Strip unwanted text from stdout
        for i, line in enumerate(lines):
            if line.strip().startswith("inst#"):
                lines = lines[i:]
                break

        if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
            return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
        elif lines[0].split() == [
            "inst#",
            "actual",
            "predicted",
            "error",
            "distribution",
        ]:
            return [
                self.parse_weka_distribution(line.split()[-1])
                for line in lines[1:]
                if line.strip()
            ]

        # is this safe:?
        elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
            return [line.split()[1] for line in lines if line.strip()]

        else:
            for line in lines[:10]:
                print(line)
            raise ValueError(
                "Unhandled output format -- your version "
                "of weka may not be supported.\n"
                " Header: %s" % lines[0]
            )

    # [xx] full list of classifiers (some may be abstract?):
    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
    # LogisticBase, M5Base, MultilayerPerceptron,
    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
    # PreConstructedLinearModel, Prism, RandomForest,
    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
    # RuleNode, SimpleLinearRegression, SimpleLogistic,
    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
    # VotedPerceptron, Winnow, ZeroR

    # Friendly aliases for a handful of Weka classifier classes.
    _CLASSIFIER_CLASS = {
        "naivebayes": "weka.classifiers.bayes.NaiveBayes",
        "C4.5": "weka.classifiers.trees.J48",
        "log_regression": "weka.classifiers.functions.Logistic",
        "svm": "weka.classifiers.functions.SMO",
        "kstar": "weka.classifiers.lazy.KStar",
        "ripper": "weka.classifiers.rules.JRip",
    }

    @classmethod
    def train(
        cls,
        model_filename,
        featuresets,
        classifier="naivebayes",
        options=None,
        quiet=True,
    ):
        """Train a Weka model on *featuresets* and write it to *model_filename*.

        :param model_filename: path where Weka stores the trained model.
        :param featuresets: list of ``(featureset, label)`` pairs.
        :param classifier: alias from ``_CLASSIFIER_CLASS`` or a fully
            qualified Weka class name.
        :param options: extra command-line options for the Weka trainer.
            Bug fix: the default was previously a shared mutable ``[]``.
        :param quiet: suppress Weka's stdout when True.
        :raises ValueError: if *classifier* is not recognized.
        """
        # Make sure we can find java & weka.
        config_weka()

        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, "train.arff")
            formatter.write(train_filename, featuresets)

            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError("Unknown classifier %s" % classifier)

            # Train the weka model.
            cmd = [javaclass, "-d", model_filename, "-t", train_filename]
            cmd += list(options or [])
            if quiet:
                stdout = subprocess.PIPE
            else:
                stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier (cls(...) so subclasses train to
            # their own type).
            return cls(formatter, model_filename)

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
244
+
245
+
246
class ARFF_Formatter:
    """
    Converts featuresets and labeled featuresets to ARFF-formatted
    strings, appropriate for input into Weka.

    Features and classes can be specified manually in the constructor, or may
    be determined from data using ``from_train``.
    """

    def __init__(self, labels, features):
        """
        :param labels: A list of all class labels that can be generated.
        :param features: A list of feature specifications, where
            each feature specification is a tuple (fname, ftype);
            and ftype is an ARFF type string such as NUMERIC or
            STRING.
        """
        self._labels = labels
        self._features = features

    def format(self, tokens):
        """Returns a string representation of ARFF output for the given data."""
        return self.header_section() + self.data_section(tokens)

    def labels(self):
        """Returns the list of classes."""
        return list(self._labels)

    def write(self, outfile, tokens):
        """Writes ARFF data to a file (path or writable object) for the given data."""
        if not hasattr(outfile, "write"):
            outfile = open(outfile, "w")
        # Bug fix: close in a finally clause so the handle is not leaked when
        # format() raises.  (The stream is always closed, matching the
        # previous behavior for caller-supplied file objects.)
        try:
            outfile.write(self.format(tokens))
        finally:
            outfile.close()

    @staticmethod
    def from_train(tokens):
        """
        Constructs an ARFF_Formatter instance with class labels and feature
        types determined from the given data. Handles boolean, numeric and
        string (note: not nominal) types.

        :raises ValueError: for unsupported feature value types or when a
            feature's type is inconsistent across tokens.
        """
        # Find the set of all attested labels.
        labels = {label for (tok, label) in tokens}

        # Determine the types of all features.
        features = {}
        for tok, label in tokens:
            for (fname, fval) in tok.items():
                if issubclass(type(fval), bool):
                    ftype = "{True, False}"
                elif issubclass(type(fval), (int, float, bool)):
                    ftype = "NUMERIC"
                elif issubclass(type(fval), str):
                    ftype = "STRING"
                elif fval is None:
                    continue  # can't tell the type.
                else:
                    # Bug fix: this previously interpolated ``ftype``, which
                    # is unbound on the first unsupported value (NameError
                    # instead of the intended ValueError).
                    raise ValueError("Unsupported value type %r" % fval)

                if features.get(fname, ftype) != ftype:
                    raise ValueError("Inconsistent type for %s" % fname)
                features[fname] = ftype
        features = sorted(features.items())

        return ARFF_Formatter(labels, features)

    def header_section(self):
        """Returns an ARFF header as a string."""
        # Header comment.
        s = (
            "% Weka ARFF file\n"
            + "% Generated automatically by NLTK\n"
            + "%% %s\n\n" % time.ctime()
        )

        # Relation name
        s += "@RELATION rel\n\n"

        # Input attribute specifications
        for fname, ftype in self._features:
            s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)

        # Label attribute specification
        s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))

        return s

    def data_section(self, tokens, labeled=None):
        """
        Returns the ARFF data section for the given data.

        :param tokens: a list of featuresets (dicts) or labelled featuresets
            which are tuples (featureset, label).
        :param labeled: Indicates whether the given tokens are labeled
            or not. If None, then the tokens will be assumed to be
            labeled if the first token's value is a tuple or list.
        """
        # Check if the tokens are labeled or unlabeled. If unlabeled,
        # then use 'None'
        if labeled is None:
            labeled = tokens and isinstance(tokens[0], (tuple, list))
        if not labeled:
            tokens = [(tok, None) for tok in tokens]

        # Data section
        s = "\n@DATA\n"
        for (tok, label) in tokens:
            for fname, ftype in self._features:
                s += "%s," % self._fmt_arff_val(tok.get(fname))
            s += "%s\n" % self._fmt_arff_val(label)

        return s

    def _fmt_arff_val(self, fval):
        # Missing values are '?' in ARFF.
        if fval is None:
            return "?"
        elif isinstance(fval, (bool, int)):
            return "%s" % fval
        else:
            # Floats and everything else use the repr form (the previous
            # float branch was an exact duplicate of this one).
            return "%r" % fval
369
+
370
+
371
if __name__ == "__main__":
    from nltk.classify.util import binary_names_demo_features, names_demo

    # Smoke test: train a C4.5 (Weka J48) decision tree on the names corpus.
    def make_classifier(featuresets):
        return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")

    classifier = names_demo(make_classifier, binary_names_demo_features)
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/__init__.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Clusterers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ This module contains a number of basic clustering algorithms. Clustering
10
+ describes the task of discovering groups of similar items with a large
11
+ collection. It is also describe as unsupervised machine learning, as the data
12
+ from which it learns is unannotated with class information, as is the case for
13
+ supervised learning. Annotated data is difficult and expensive to obtain in
14
+ the quantities required for the majority of supervised learning algorithms.
15
+ This problem, the knowledge acquisition bottleneck, is common to most natural
16
+ language processing tasks, thus fueling the need for quality unsupervised
17
+ approaches.
18
+
19
+ This module contains a k-means clusterer, E-M clusterer and a group average
20
+ agglomerative clusterer (GAAC). All these clusterers involve finding good
21
+ cluster groupings for a set of vectors in multi-dimensional space.
22
+
23
+ The K-means clusterer starts with k arbitrary chosen means then allocates each
24
+ vector to the cluster with the closest mean. It then recalculates the means of
25
+ each cluster as the centroid of the vectors in the cluster. This process
26
+ repeats until the cluster memberships stabilise. This is a hill-climbing
27
+ algorithm which may converge to a local maximum. Hence the clustering is
28
+ often repeated with random initial means and the most commonly occurring
29
+ output means are chosen.
30
+
31
+ The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
32
+ It then iteratively merges pairs of clusters which have the closest centroids.
33
+ This continues until there is only one cluster. The order of merges gives rise
34
+ to a dendrogram - a tree with the earlier merges lower than later merges. The
35
+ membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
36
+ cutting the dendrogram at depth *c*.
37
+
38
+ The Gaussian EM clusterer models the vectors as being produced by a mixture
39
+ of k Gaussian sources. The parameters of these sources (prior probability,
40
+ mean and covariance matrix) are then found to maximise the likelihood of the
41
+ given data. This is done with the expectation maximisation algorithm. It
42
+ starts with k arbitrarily chosen means, priors and covariance matrices. It
43
+ then calculates the membership probabilities for each vector in each of the
44
+ clusters - this is the 'E' step. The cluster parameters are then updated in
45
+ the 'M' step using the maximum likelihood estimate from the cluster membership
46
+ probabilities. This process continues until the likelihood of the data does
47
+ not significantly increase.
48
+
49
+ They all extend the ClusterI interface which defines common operations
50
+ available with each clusterer. These operations include:
51
+
52
+ - cluster: clusters a sequence of vectors
53
+ - classify: assign a vector to a cluster
54
+ - classification_probdist: give the probability distribution over cluster memberships
55
+
56
+ The current existing classifiers also extend cluster.VectorSpace, an
57
+ abstract class which allows for singular value decomposition (SVD) and vector
58
+ normalisation. SVD is used to reduce the dimensionality of the vector space in
59
+ such a manner as to preserve as much of the variation as possible, by
60
+ reparameterising the axes in order of variability and discarding all bar the
61
+ first d dimensions. Normalisation ensures that vectors fall in the unit
62
+ hypersphere.
63
+
64
+ Usage example (see also demo())::
65
+
66
+ from nltk import cluster
67
+ from nltk.cluster import euclidean_distance
68
+ from numpy import array
69
+
70
+ vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
71
+
72
+ # initialise the clusterer (will also assign the vectors to clusters)
73
+ clusterer = cluster.KMeansClusterer(2, euclidean_distance)
74
+ clusterer.cluster(vectors, True)
75
+
76
+ # classify a new vector
77
+ print(clusterer.classify(array([3, 3])))
78
+
79
+ Note that the vectors must use numpy array-like
80
+ objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
81
+ efficiency when required.
82
+ """
83
+
84
+ from nltk.cluster.em import EMClusterer
85
+ from nltk.cluster.gaac import GAAClusterer
86
+ from nltk.cluster.kmeans import KMeansClusterer
87
+ from nltk.cluster.util import (
88
+ Dendrogram,
89
+ VectorSpaceClusterer,
90
+ cosine_distance,
91
+ euclidean_distance,
92
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/api.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Clusterer Interfaces
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # Porting: Steven Bird <stevenbird1@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ from abc import ABCMeta, abstractmethod
10
+
11
+ from nltk.probability import DictionaryProbDist
12
+
13
+
14
class ClusterI(metaclass=ABCMeta):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.
        """
        # Hard assignment: all probability mass on the predicted cluster.
        return 1.0 if self.classify(vector) == label else 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        # Collect per-cluster likelihoods, then normalise so they sum
        # to one before wrapping in a DictionaryProbDist.
        likelihoods = {
            name: self.likelihood(vector, name) for name in self.cluster_names()
        }
        total = sum(likelihoods.values())
        normalised = {name: lh / total for name, lh in likelihoods.items()}
        return DictionaryProbDist(normalised)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/em.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Expectation Maximization Clusterer
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ try:
9
+ import numpy
10
+ except ImportError:
11
+ pass
12
+
13
+ from nltk.cluster.util import VectorSpaceClusterer
14
+
15
+
16
class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            # default: uniform prior over clusters
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            # default: identity covariance for every cluster
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", lastl)
            # E-step, calculate hidden variables, h[i,j]: the normalised
            # responsibility of cluster j for vector i
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            likelihood = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - likelihood) < self._conv_threshold:
                converged = True
            lastl = likelihood

    def classify_vectorspace(self, vector):
        # Return the index of the cluster with the highest weighted density.
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        # NOTE: cluster names are the integer indices 0..k-1 (see
        # ClusterI.cluster_names), so ``cluster`` indexes directly; the
        # original computed an unused ``cid`` via .index() first.
        return self._priors[cluster] * self._gaussian(
            self._means[cluster], self._covariance_matrices[cluster], vector
        )

    def _gaussian(self, mean, cvm, x):
        # Multivariate gaussian density of x under N(mean, cvm).
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            # (removed a leftover debug ``print(dx, inv)`` that wrote to
            # stdout on every single density evaluation)
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        # Sum of log mixture densities over all vectors.
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)
177
+
178
+
179
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze
    points = [numpy.array(p) for p in ([0.5, 0.5], [1.5, 0.5], [1, 3])]
    initial_means = [[4, 2], [4, 2.01]]

    # Fit the mixture and report the hard cluster assignments.
    clusterer = cluster.EMClusterer(initial_means, bias=0.1)
    assignments = clusterer.cluster(points, True, trace=True)

    print("Clustered:", points)
    print("As: ", assignments)
    print()

    # Learned parameters for each of the two clusters.
    for idx in range(2):
        print("Cluster:", idx)
        print("Prior: ", clusterer._priors[idx])
        print("Mean: ", clusterer._means[idx])
        print("Covar: ", clusterer._covariance_matrices[idx])
        print()

    # classify a new vector
    query = numpy.array([2, 2])
    print("classify(%s):" % query, end=" ")
    print(clusterer.classify(query))

    # show the classification probabilities
    query = numpy.array([2, 2])
    print("classification_probdist(%s):" % query)
    pdist = clusterer.classification_probdist(query)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Group Average Agglomerative Clusterer
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ try:
9
+ import numpy
10
+ except ImportError:
11
+ pass
12
+
13
+ from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
14
+
15
+
16
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative starts with each of the N vectors as singleton
    clusters. It then iteratively merges pairs of clusters which have the
    closest centroids. This continues until there is only one cluster. The
    order of merges gives rise to a dendrogram: a tree with the earlier merges
    lower than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendrogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        :param num_clusters: number of clusters to cut the dendrogram at
        :param normalise: should vectors be normalised to length 1
        :param svd_dimensions: number of dimensions for SVD reduction, or None
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        # Built lazily by cluster(); records the merge history.
        self._dendrogram = None
        # NOTE(review): set but never read in this class - appears vestigial.
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Greedily merge the closest pair of clusters (cosine distance between
        group-average similarities) until only num_clusters remain, recording
        each merge in the dendrogram.
        """
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N  # number of vectors currently in each cluster
        cluster_count = N
        # index_map[i] is the dendrogram item index for matrix row i; rows
        # are never physically deleted, only marked with inf below.
        index_map = numpy.arange(N)

        # construct the similarity matrix (upper triangle only; the rest
        # stays at +inf so argmin never picks it)
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            # closest remaining pair (flat argmin unravelled to (i, j))
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j: mark its whole row and column unreachable
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # Only the upper triangle is populated, so the entry for a pair
        # (x, y) lives at dist[min, max]; hence the three cases below.
        # update for x<i
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        # normalise both row segments updated above in one slice
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        """
        Cut the dendrogram at num_clusters groups and recompute the
        per-cluster centroids used by classify_vectorspace().
        """
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        # Return the index of the centroid closest in cosine distance.
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype: Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        return self._num_clusters

    def __repr__(self):
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
138
+
139
+
140
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    points = [
        numpy.array(coords)
        for coords in ([3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1])
    ]

    # test the GAAC clusterer with 4 clusters
    gaac = GAAClusterer(4)
    assignments = gaac.cluster(points, True)

    print("Clusterer:", gaac)
    print("Clustered:", points)
    print("As:", assignments)
    print()

    # show the dendrogram
    gaac.dendrogram().show()

    # classify a new vector
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(gaac.classify(query))
    print()


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/kmeans.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: K-Means Clusterer
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ import copy
9
+ import random
10
+ import sys
11
+
12
+ try:
13
+ import numpy
14
+ except ImportError:
15
+ pass
16
+
17
+
18
+ from nltk.cluster.util import VectorSpaceClusterer
19
+
20
+
21
class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrary chosen means then allocates
    each vector to the cluster with the closest mean. It then recalculates the
    means of each cluster as the centroid of the vectors in the cluster. This
    process repeats until the cluster memberships stabilise. This is a
    hill-climbing algorithm which may converge to a local maximum. Hence the
    clustering is often repeated with random initial means and the most
    commonly occurring output means are chosen.
    """

    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param num_means: the number of means to use (may use fewer)
        :type num_means: int
        :param distance: measure of distance between two vectors
        :type distance: function taking two vectors and returning a float
        :param repeats: number of randomised clustering trials to use
        :type repeats: int
        :param conv_test: maximum variation in mean differences before
            deemed convergent
        :type conv_test: number
        :param initial_means: set of k initial means
        :type initial_means: sequence of vectors
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        :param rng: random number generator (or None)
        :type rng: Random
        :param avoid_empty_clusters: include current centroid in computation
            of next one; avoids undefined behavior
            when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # multiple trials with fixed initial means would be pointless
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            # BUGFIX: this condition used to read ``trial > 1``, which made
            # trial 1 restart from trial 0's *converged* means instead of a
            # fresh random sample - so with repeats == 2 the second trial
            # was wasted.  Every trial after the first must draw new means.
            if not self._means or trial >= 1:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # effect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means

    def _cluster_vectorspace(self, vectors, trace=False):
        # Single k-means run: alternate assignment and mean re-estimation
        # until the means move less than the convergence threshold.
        if self._num_means < len(vectors):
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace:
                    print("iteration")

                # recalculate cluster means by computing the centroid of each cluster
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        # Total distance between corresponding vectors of the two sequences.
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean):
        if self._avoid_empty_clusters:
            # Fold the previous mean in as a pseudo-observation so an empty
            # cluster keeps (close to) its old centroid instead of failing.
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid += vector
            return centroid / (1 + len(cluster))
        else:
            if not len(cluster):
                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
                sys.stderr.write(
                    "Try setting argument 'avoid_empty_clusters' to True\n"
                )
                # raise explicitly (same exception type as the old
                # ``assert False``) so the check survives ``python -O``
                raise AssertionError(
                    "no centroid defined for empty cluster"
                )
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid += vector
            return centroid / len(cluster)

    def __repr__(self):
        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
190
+
191
+
192
+ #################################################################################
193
+
194
+
195
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    points = [numpy.array(coords) for coords in ([2, 1], [1, 3], [4, 7], [6, 7])]
    seed_means = [[4, 3], [5, 5]]

    # Cluster with fixed initial means so the outcome is reproducible.
    seeded = KMeansClusterer(2, euclidean_distance, initial_means=seed_means)
    assignments = seeded.cluster(points, True, trace=True)

    print("Clustered:", points)
    print("As:", assignments)
    print("Means:", seeded.means())
    print()

    points = [
        numpy.array(coords)
        for coords in ([3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1])
    ]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    repeated = KMeansClusterer(2, euclidean_distance, repeats=10)
    assignments = repeated.cluster(points, True)
    print("Clustered:", points)
    print("As:", assignments)
    print("Means:", repeated.means())
    print()

    # classify a new vector
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(repeated.classify(query))
    print()


if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Corpus Readers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Edward Loper <edloper@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ # TODO this docstring isn't up-to-date!
9
+ """
10
+ NLTK corpus readers. The modules in this package provide functions
11
+ that can be used to read corpus files in a variety of formats. These
12
+ functions can be used to read both the corpus files that are
13
+ distributed in the NLTK corpus package, and corpus files that are part
14
+ of external corpora.
15
+
16
+ Available Corpora
17
+ =================
18
+
19
+ Please see https://www.nltk.org/nltk_data/ for a complete list.
20
+ Install corpora using nltk.download().
21
+
22
+ Corpus Reader Functions
23
+ =======================
24
+ Each corpus module defines one or more "corpus reader functions",
25
+ which can be used to read documents from that corpus. These functions
26
+ take an argument, ``item``, which is used to indicate which document
27
+ should be read from the corpus:
28
+
29
+ - If ``item`` is one of the unique identifiers listed in the corpus
30
+ module's ``items`` variable, then the corresponding document will
31
+ be loaded from the NLTK corpus package.
32
+ - If ``item`` is a filename, then that file will be read.
33
+
34
+ Additionally, corpus reader functions can be given lists of item
35
+ names; in which case, they will return a concatenation of the
36
+ corresponding documents.
37
+
38
+ Corpus reader functions are named based on the type of information
39
+ they return. Some common examples, and their return types, are:
40
+
41
+ - words(): list of str
42
+ - sents(): list of (list of str)
43
+ - paras(): list of (list of (list of str))
44
+ - tagged_words(): list of (str,str) tuple
45
+ - tagged_sents(): list of (list of (str,str))
46
+ - tagged_paras(): list of (list of (list of (str,str)))
47
+ - chunked_sents(): list of (Tree w/ (str,str) leaves)
48
+ - parsed_sents(): list of (Tree with str leaves)
49
+ - parsed_paras(): list of (list of (Tree with str leaves))
50
+ - xml(): A single xml ElementTree
51
+ - raw(): unprocessed corpus contents
52
+
53
+ For example, to read a list of the words in the Brown Corpus, use
54
+ ``nltk.corpus.brown.words()``:
55
+
56
+ >>> from nltk.corpus import brown
57
+ >>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
58
+ The, Fulton, County, Grand, Jury, said, ...
59
+
60
+ """
61
+
62
+ import re
63
+
64
+ from nltk.corpus.reader import *
65
+ from nltk.corpus.util import LazyCorpusLoader
66
+ from nltk.tokenize import RegexpTokenizer
67
+
68
+ abc: PlaintextCorpusReader = LazyCorpusLoader(
69
+ "abc",
70
+ PlaintextCorpusReader,
71
+ r"(?!\.).*\.txt",
72
+ encoding=[("science", "latin_1"), ("rural", "utf8")],
73
+ )
74
+ alpino: AlpinoCorpusReader = LazyCorpusLoader(
75
+ "alpino", AlpinoCorpusReader, tagset="alpino"
76
+ )
77
+ bcp47: BCP47CorpusReader = LazyCorpusLoader(
78
+ "bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
79
+ )
80
+ brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
81
+ "brown",
82
+ CategorizedTaggedCorpusReader,
83
+ r"c[a-z]\d\d",
84
+ cat_file="cats.txt",
85
+ tagset="brown",
86
+ encoding="ascii",
87
+ )
88
+ cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
89
+ "cess_cat",
90
+ BracketParseCorpusReader,
91
+ r"(?!\.).*\.tbf",
92
+ tagset="unknown",
93
+ encoding="ISO-8859-15",
94
+ )
95
+ cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
96
+ "cess_esp",
97
+ BracketParseCorpusReader,
98
+ r"(?!\.).*\.tbf",
99
+ tagset="unknown",
100
+ encoding="ISO-8859-15",
101
+ )
102
+ cmudict: CMUDictCorpusReader = LazyCorpusLoader(
103
+ "cmudict", CMUDictCorpusReader, ["cmudict"]
104
+ )
105
+ comtrans: AlignedCorpusReader = LazyCorpusLoader(
106
+ "comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
107
+ )
108
+ comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
109
+ "comparative_sentences",
110
+ ComparativeSentencesCorpusReader,
111
+ r"labeledSentences\.txt",
112
+ encoding="latin-1",
113
+ )
114
+ conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
115
+ "conll2000",
116
+ ConllChunkCorpusReader,
117
+ ["train.txt", "test.txt"],
118
+ ("NP", "VP", "PP"),
119
+ tagset="wsj",
120
+ encoding="ascii",
121
+ )
122
+ conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
123
+ "conll2002",
124
+ ConllChunkCorpusReader,
125
+ r".*\.(test|train).*",
126
+ ("LOC", "PER", "ORG", "MISC"),
127
+ encoding="utf-8",
128
+ )
129
+ conll2007: DependencyCorpusReader = LazyCorpusLoader(
130
+ "conll2007",
131
+ DependencyCorpusReader,
132
+ r".*\.(test|train).*",
133
+ encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
134
+ )
135
+ crubadan: CrubadanCorpusReader = LazyCorpusLoader(
136
+ "crubadan", CrubadanCorpusReader, r".*\.txt"
137
+ )
138
+ dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
139
+ "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
140
+ )
141
+ extended_omw: CorpusReader = LazyCorpusLoader(
142
+ "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
143
+ )
144
+ floresta: BracketParseCorpusReader = LazyCorpusLoader(
145
+ "floresta",
146
+ BracketParseCorpusReader,
147
+ r"(?!\.).*\.ptb",
148
+ "#",
149
+ tagset="unknown",
150
+ encoding="ISO-8859-15",
151
+ )
152
+ framenet15: FramenetCorpusReader = LazyCorpusLoader(
153
+ "framenet_v15",
154
+ FramenetCorpusReader,
155
+ [
156
+ "frRelation.xml",
157
+ "frameIndex.xml",
158
+ "fulltextIndex.xml",
159
+ "luIndex.xml",
160
+ "semTypes.xml",
161
+ ],
162
+ )
163
+ framenet: FramenetCorpusReader = LazyCorpusLoader(
164
+ "framenet_v17",
165
+ FramenetCorpusReader,
166
+ [
167
+ "frRelation.xml",
168
+ "frameIndex.xml",
169
+ "fulltextIndex.xml",
170
+ "luIndex.xml",
171
+ "semTypes.xml",
172
+ ],
173
+ )
174
+ gazetteers: WordListCorpusReader = LazyCorpusLoader(
175
+ "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
176
+ )
177
+ genesis: PlaintextCorpusReader = LazyCorpusLoader(
178
+ "genesis",
179
+ PlaintextCorpusReader,
180
+ r"(?!\.).*\.txt",
181
+ encoding=[
182
+ ("finnish|french|german", "latin_1"),
183
+ ("swedish", "cp865"),
184
+ (".*", "utf_8"),
185
+ ],
186
+ )
187
+ gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
188
+ "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
189
+ )
190
+ ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
191
+ inaugural: PlaintextCorpusReader = LazyCorpusLoader(
192
+ "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
193
+ )
194
+ # [XX] This should probably just use TaggedCorpusReader:
195
+ indian: IndianCorpusReader = LazyCorpusLoader(
196
+ "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
197
+ )
198
+
199
+ jeita: ChasenCorpusReader = LazyCorpusLoader(
200
+ "jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
201
+ )
202
+ knbc: KNBCorpusReader = LazyCorpusLoader(
203
+ "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
204
+ )
205
+ lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
206
+ "lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
207
+ )
208
+ mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
209
+ "mac_morpho",
210
+ MacMorphoCorpusReader,
211
+ r"(?!\.).*\.txt",
212
+ tagset="unknown",
213
+ encoding="latin-1",
214
+ )
215
+ machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
216
+ "machado",
217
+ PortugueseCategorizedPlaintextCorpusReader,
218
+ r"(?!\.).*\.txt",
219
+ cat_pattern=r"([a-z]*)/.*",
220
+ encoding="latin-1",
221
+ )
222
+ masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
223
+ "masc_tagged",
224
+ CategorizedTaggedCorpusReader,
225
+ r"(spoken|written)/.*\.txt",
226
+ cat_file="categories.txt",
227
+ tagset="wsj",
228
+ encoding="utf-8",
229
+ sep="_",
230
+ )
231
+ movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
232
+ "movie_reviews",
233
+ CategorizedPlaintextCorpusReader,
234
+ r"(?!\.).*\.txt",
235
+ cat_pattern=r"(neg|pos)/.*",
236
+ encoding="ascii",
237
+ )
238
+ multext_east: MTECorpusReader = LazyCorpusLoader(
239
+ "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
240
+ )
241
+ names: WordListCorpusReader = LazyCorpusLoader(
242
+ "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
243
+ )
244
+ nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
245
+ "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
246
+ )
247
+ opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
248
+ "opinion_lexicon",
249
+ OpinionLexiconCorpusReader,
250
+ r"(\w+)\-words\.txt",
251
+ encoding="ISO-8859-2",
252
+ )
253
+ ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
254
+ "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
255
+ )
256
+ product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
257
+ "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
258
+ )
259
+ product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
260
+ "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
261
+ )
262
+ pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
263
+ "pros_cons",
264
+ ProsConsCorpusReader,
265
+ r"Integrated(Cons|Pros)\.txt",
266
+ cat_pattern=r"Integrated(Cons|Pros)\.txt",
267
+ encoding="ISO-8859-2",
268
+ )
269
+ ptb: CategorizedBracketParseCorpusReader = (
270
+ LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
271
+ "ptb",
272
+ CategorizedBracketParseCorpusReader,
273
+ r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
274
+ cat_file="allcats.txt",
275
+ tagset="wsj",
276
+ )
277
+ )
278
+ qc: StringCategoryCorpusReader = LazyCorpusLoader(
279
+ "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
280
+ )
281
+ reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
282
+ "reuters",
283
+ CategorizedPlaintextCorpusReader,
284
+ "(training|test).*",
285
+ cat_file="cats.txt",
286
+ encoding="ISO-8859-2",
287
+ )
288
+ rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
289
+ senseval: SensevalCorpusReader = LazyCorpusLoader(
290
+ "senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
291
+ )
292
+ sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
293
+ "sentence_polarity",
294
+ CategorizedSentencesCorpusReader,
295
+ r"rt-polarity\.(neg|pos)",
296
+ cat_pattern=r"rt-polarity\.(neg|pos)",
297
+ encoding="utf-8",
298
+ )
299
+ sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
300
+ "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
301
+ )
302
+ shakespeare: XMLCorpusReader = LazyCorpusLoader(
303
+ "shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
304
+ )
305
+ sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
306
+ "sinica_treebank",
307
+ SinicaTreebankCorpusReader,
308
+ ["parsed"],
309
+ tagset="unknown",
310
+ encoding="utf-8",
311
+ )
312
+ state_union: PlaintextCorpusReader = LazyCorpusLoader(
313
+ "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
314
+ )
315
+ stopwords: WordListCorpusReader = LazyCorpusLoader(
316
+ "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
317
+ )
318
+ subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
319
+ "subjectivity",
320
+ CategorizedSentencesCorpusReader,
321
+ r"(quote.tok.gt9|plot.tok.gt9)\.5000",
322
+ cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
323
+ encoding="latin-1",
324
+ )
325
+ swadesh: SwadeshCorpusReader = LazyCorpusLoader(
326
+ "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
327
+ )
328
+ swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
329
+ "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
330
+ )
331
+ swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
332
+ "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
333
+ )
334
+ switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
335
+ "switchboard", SwitchboardCorpusReader, tagset="wsj"
336
+ )
337
+ timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
338
+ timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
339
+ "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
340
+ )
341
+ toolbox: ToolboxCorpusReader = LazyCorpusLoader(
342
+ "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
343
+ )
344
+ treebank: BracketParseCorpusReader = LazyCorpusLoader(
345
+ "treebank/combined",
346
+ BracketParseCorpusReader,
347
+ r"wsj_.*\.mrg",
348
+ tagset="wsj",
349
+ encoding="ascii",
350
+ )
351
+ treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
352
+ "treebank/tagged",
353
+ ChunkedCorpusReader,
354
+ r"wsj_.*\.pos",
355
+ sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
356
+ para_block_reader=tagged_treebank_para_block_reader,
357
+ tagset="wsj",
358
+ encoding="ascii",
359
+ )
360
+ treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
361
+ "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
362
+ )
363
+ twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
364
+ "twitter_samples", TwitterCorpusReader, r".*\.json"
365
+ )
366
+ udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
367
+ udhr2: PlaintextCorpusReader = LazyCorpusLoader(
368
+ "udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
369
+ )
370
+ universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
371
+ "universal_treebanks_v20",
372
+ ConllCorpusReader,
373
+ r".*\.conll",
374
+ columntypes=(
375
+ "ignore",
376
+ "words",
377
+ "ignore",
378
+ "ignore",
379
+ "pos",
380
+ "ignore",
381
+ "ignore",
382
+ "ignore",
383
+ "ignore",
384
+ "ignore",
385
+ ),
386
+ )
387
+ verbnet: VerbnetCorpusReader = LazyCorpusLoader(
388
+ "verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
389
+ )
390
+ webtext: PlaintextCorpusReader = LazyCorpusLoader(
391
+ "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
392
+ )
393
+ wordnet: WordNetCorpusReader = LazyCorpusLoader(
394
+ "wordnet",
395
+ WordNetCorpusReader,
396
+ LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
397
+ )
398
+ wordnet31: WordNetCorpusReader = LazyCorpusLoader(
399
+ "wordnet31",
400
+ WordNetCorpusReader,
401
+ LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
402
+ )
403
+ wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
404
+ "wordnet2021",
405
+ WordNetCorpusReader,
406
+ LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
407
+ )
408
+ wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
409
+ "wordnet_ic", WordNetICCorpusReader, r".*\.dat"
410
+ )
411
+ words: WordListCorpusReader = LazyCorpusLoader(
412
+ "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
413
+ )
414
+
415
+ # defined after treebank
416
+ propbank: PropbankCorpusReader = LazyCorpusLoader(
417
+ "propbank",
418
+ PropbankCorpusReader,
419
+ "prop.txt",
420
+ r"frames/.*\.xml",
421
+ "verbs.txt",
422
+ lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
423
+ treebank,
424
+ ) # Must be defined *after* treebank corpus.
425
+ nombank: NombankCorpusReader = LazyCorpusLoader(
426
+ "nombank.1.0",
427
+ NombankCorpusReader,
428
+ "nombank.1.0",
429
+ r"frames/.*\.xml",
430
+ "nombank.1.0.words",
431
+ lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
432
+ treebank,
433
+ ) # Must be defined *after* treebank corpus.
434
+ propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
435
+ "propbank",
436
+ PropbankCorpusReader,
437
+ "prop.txt",
438
+ r"frames/.*\.xml",
439
+ "verbs.txt",
440
+ lambda filename: filename.upper(),
441
+ ptb,
442
+ ) # Must be defined *after* ptb corpus.
443
+ nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
444
+ "nombank.1.0",
445
+ NombankCorpusReader,
446
+ "nombank.1.0",
447
+ r"frames/.*\.xml",
448
+ "nombank.1.0.words",
449
+ lambda filename: filename.upper(),
450
+ ptb,
451
+ ) # Must be defined *after* ptb corpus.
452
+ semcor: SemcorCorpusReader = LazyCorpusLoader(
453
+ "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
454
+ ) # Must be defined *after* wordnet corpus.
455
+
456
+ nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
457
+ "nonbreaking_prefixes",
458
+ NonbreakingPrefixesCorpusReader,
459
+ r"(?!README|\.).*",
460
+ encoding="utf8",
461
+ )
462
+ perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
463
+ "perluniprops",
464
+ UnicharsCorpusReader,
465
+ r"(?!README|\.).*",
466
+ nltk_data_subdir="misc",
467
+ encoding="utf8",
468
+ )
469
+
470
+ # mwa_ppdb = LazyCorpusLoader(
471
+ # 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
472
+
473
+ # See https://github.com/nltk/nltk/issues/1579
474
+ # and https://github.com/nltk/nltk/issues/1716
475
+ #
476
+ # pl196x = LazyCorpusLoader(
477
+ # 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
478
+ # cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
479
+ #
480
+ # ipipan = LazyCorpusLoader(
481
+ # 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
482
+ #
483
+ # nkjp = LazyCorpusLoader(
484
+ # 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
485
+ #
486
+ # panlex_lite = LazyCorpusLoader(
487
+ # 'panlex_lite', PanLexLiteCorpusReader)
488
+ #
489
+ # ycoe = LazyCorpusLoader(
490
+ # 'ycoe', YCOECorpusReader)
491
+ #
492
+ # corpus not available with NLTK; these lines caused help(nltk.corpus) to break
493
+ # hebrew_treebank = LazyCorpusLoader(
494
+ # 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
495
+
496
+ # FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
497
+ def demo():
498
+ # This is out-of-date:
499
+ abc.demo()
500
+ brown.demo()
501
+ # chat80.demo()
502
+ cmudict.demo()
503
+ conll2000.demo()
504
+ conll2002.demo()
505
+ genesis.demo()
506
+ gutenberg.demo()
507
+ ieer.demo()
508
+ inaugural.demo()
509
+ indian.demo()
510
+ names.demo()
511
+ ppattach.demo()
512
+ senseval.demo()
513
+ shakespeare.demo()
514
+ sinica_treebank.demo()
515
+ state_union.demo()
516
+ stopwords.demo()
517
+ timit.demo()
518
+ toolbox.demo()
519
+ treebank.demo()
520
+ udhr.demo()
521
+ webtext.demo()
522
+ words.demo()
523
+
524
+
525
+ # ycoe.demo()
526
+
527
+ if __name__ == "__main__":
528
+ # demo()
529
+ pass
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/europarl_raw.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Europarl Corpus Readers
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ import re
9
+
10
+ from nltk.corpus.reader import *
11
+ from nltk.corpus.util import LazyCorpusLoader
12
+
13
+ # Create a new corpus reader instance for each European language
14
+ danish: EuroparlCorpusReader = LazyCorpusLoader(
15
+ "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
16
+ )
17
+
18
+ dutch: EuroparlCorpusReader = LazyCorpusLoader(
19
+ "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
20
+ )
21
+
22
+ english: EuroparlCorpusReader = LazyCorpusLoader(
23
+ "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
24
+ )
25
+
26
+ finnish: EuroparlCorpusReader = LazyCorpusLoader(
27
+ "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
28
+ )
29
+
30
+ french: EuroparlCorpusReader = LazyCorpusLoader(
31
+ "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
32
+ )
33
+
34
+ german: EuroparlCorpusReader = LazyCorpusLoader(
35
+ "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
36
+ )
37
+
38
+ greek: EuroparlCorpusReader = LazyCorpusLoader(
39
+ "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
40
+ )
41
+
42
+ italian: EuroparlCorpusReader = LazyCorpusLoader(
43
+ "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
44
+ )
45
+
46
+ portuguese: EuroparlCorpusReader = LazyCorpusLoader(
47
+ "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
48
+ )
49
+
50
+ spanish: EuroparlCorpusReader = LazyCorpusLoader(
51
+ "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
52
+ )
53
+
54
+ swedish: EuroparlCorpusReader = LazyCorpusLoader(
55
+ "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
56
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/childes.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CHILDES XML Corpus Reader
2
+
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Tomonori Nagano <tnagano@gc.cuny.edu>
5
+ # Alexis Dimitriadis <A.Dimitriadis@uu.nl>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Corpus reader for the XML version of the CHILDES corpus.
11
+ """
12
+
13
+ __docformat__ = "epytext en"
14
+
15
+ import re
16
+ from collections import defaultdict
17
+
18
+ from nltk.corpus.reader.util import concat
19
+ from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
20
+ from nltk.util import LazyConcatenation, LazyMap, flatten
21
+
22
+ # to resolve the namespace issue
23
+ NS = "http://www.talkbank.org/ns/talkbank"
24
+
25
+
26
+ class CHILDESCorpusReader(XMLCorpusReader):
27
+ """
28
+ Corpus reader for the XML version of the CHILDES corpus.
29
+ The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
30
+ version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
31
+ Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
32
+ (``nltk_data/corpora/CHILDES/``).
33
+
34
+ For access to the file text use the usual nltk functions,
35
+ ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
36
+ """
37
+
38
+ def __init__(self, root, fileids, lazy=True):
39
+ XMLCorpusReader.__init__(self, root, fileids)
40
+ self._lazy = lazy
41
+
42
+ def words(
43
+ self,
44
+ fileids=None,
45
+ speaker="ALL",
46
+ stem=False,
47
+ relation=False,
48
+ strip_space=True,
49
+ replace=False,
50
+ ):
51
+ """
52
+ :return: the given file(s) as a list of words
53
+ :rtype: list(str)
54
+
55
+ :param speaker: If specified, select specific speaker(s) defined
56
+ in the corpus. Default is 'ALL' (all participants). Common choices
57
+ are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
58
+ researchers)
59
+ :param stem: If true, then use word stems instead of word strings.
60
+ :param relation: If true, then return tuples of (stem, index,
61
+ dependent_index)
62
+ :param strip_space: If true, then strip trailing spaces from word
63
+ tokens. Otherwise, leave the spaces on the tokens.
64
+ :param replace: If true, then use the replaced (intended) word instead
65
+ of the original word (e.g., 'wat' will be replaced with 'watch')
66
+ """
67
+ sent = None
68
+ pos = False
69
+ if not self._lazy:
70
+ return [
71
+ self._get_words(
72
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
73
+ )
74
+ for fileid in self.abspaths(fileids)
75
+ ]
76
+
77
+ get_words = lambda fileid: self._get_words(
78
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
79
+ )
80
+ return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
81
+
82
+ def tagged_words(
83
+ self,
84
+ fileids=None,
85
+ speaker="ALL",
86
+ stem=False,
87
+ relation=False,
88
+ strip_space=True,
89
+ replace=False,
90
+ ):
91
+ """
92
+ :return: the given file(s) as a list of tagged
93
+ words and punctuation symbols, encoded as tuples
94
+ ``(word,tag)``.
95
+ :rtype: list(tuple(str,str))
96
+
97
+ :param speaker: If specified, select specific speaker(s) defined
98
+ in the corpus. Default is 'ALL' (all participants). Common choices
99
+ are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
100
+ researchers)
101
+ :param stem: If true, then use word stems instead of word strings.
102
+ :param relation: If true, then return tuples of (stem, index,
103
+ dependent_index)
104
+ :param strip_space: If true, then strip trailing spaces from word
105
+ tokens. Otherwise, leave the spaces on the tokens.
106
+ :param replace: If true, then use the replaced (intended) word instead
107
+ of the original word (e.g., 'wat' will be replaced with 'watch')
108
+ """
109
+ sent = None
110
+ pos = True
111
+ if not self._lazy:
112
+ return [
113
+ self._get_words(
114
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
115
+ )
116
+ for fileid in self.abspaths(fileids)
117
+ ]
118
+
119
+ get_words = lambda fileid: self._get_words(
120
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
121
+ )
122
+ return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
123
+
124
+ def sents(
125
+ self,
126
+ fileids=None,
127
+ speaker="ALL",
128
+ stem=False,
129
+ relation=None,
130
+ strip_space=True,
131
+ replace=False,
132
+ ):
133
+ """
134
+ :return: the given file(s) as a list of sentences or utterances, each
135
+ encoded as a list of word strings.
136
+ :rtype: list(list(str))
137
+
138
+ :param speaker: If specified, select specific speaker(s) defined
139
+ in the corpus. Default is 'ALL' (all participants). Common choices
140
+ are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
141
+ researchers)
142
+ :param stem: If true, then use word stems instead of word strings.
143
+ :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
144
+ If there is manually-annotated relation info, it will return
145
+ tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
146
+ :param strip_space: If true, then strip trailing spaces from word
147
+ tokens. Otherwise, leave the spaces on the tokens.
148
+ :param replace: If true, then use the replaced (intended) word instead
149
+ of the original word (e.g., 'wat' will be replaced with 'watch')
150
+ """
151
+ sent = True
152
+ pos = False
153
+ if not self._lazy:
154
+ return [
155
+ self._get_words(
156
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
157
+ )
158
+ for fileid in self.abspaths(fileids)
159
+ ]
160
+
161
+ get_words = lambda fileid: self._get_words(
162
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
163
+ )
164
+ return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
165
+
166
+ def tagged_sents(
167
+ self,
168
+ fileids=None,
169
+ speaker="ALL",
170
+ stem=False,
171
+ relation=None,
172
+ strip_space=True,
173
+ replace=False,
174
+ ):
175
+ """
176
+ :return: the given file(s) as a list of
177
+ sentences, each encoded as a list of ``(word,tag)`` tuples.
178
+ :rtype: list(list(tuple(str,str)))
179
+
180
+ :param speaker: If specified, select specific speaker(s) defined
181
+ in the corpus. Default is 'ALL' (all participants). Common choices
182
+ are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
183
+ researchers)
184
+ :param stem: If true, then use word stems instead of word strings.
185
+ :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
186
+ If there is manually-annotated relation info, it will return
187
+ tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
188
+ :param strip_space: If true, then strip trailing spaces from word
189
+ tokens. Otherwise, leave the spaces on the tokens.
190
+ :param replace: If true, then use the replaced (intended) word instead
191
+ of the original word (e.g., 'wat' will be replaced with 'watch')
192
+ """
193
+ sent = True
194
+ pos = True
195
+ if not self._lazy:
196
+ return [
197
+ self._get_words(
198
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
199
+ )
200
+ for fileid in self.abspaths(fileids)
201
+ ]
202
+
203
+ get_words = lambda fileid: self._get_words(
204
+ fileid, speaker, sent, stem, relation, pos, strip_space, replace
205
+ )
206
+ return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
207
+
208
+ def corpus(self, fileids=None):
209
+ """
210
+ :return: the given file(s) as a dict of ``(corpus_property_key, value)``
211
+ :rtype: list(dict)
212
+ """
213
+ if not self._lazy:
214
+ return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
215
+ return LazyMap(self._get_corpus, self.abspaths(fileids))
216
+
217
+ def _get_corpus(self, fileid):
218
+ results = dict()
219
+ xmldoc = ElementTree.parse(fileid).getroot()
220
+ for key, value in xmldoc.items():
221
+ results[key] = value
222
+ return results
223
+
224
+ def participants(self, fileids=None):
225
+ """
226
+ :return: the given file(s) as a dict of
227
+ ``(participant_property_key, value)``
228
+ :rtype: list(dict)
229
+ """
230
+ if not self._lazy:
231
+ return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
232
+ return LazyMap(self._get_participants, self.abspaths(fileids))
233
+
234
+ def _get_participants(self, fileid):
235
+ # multidimensional dicts
236
+ def dictOfDicts():
237
+ return defaultdict(dictOfDicts)
238
+
239
+ xmldoc = ElementTree.parse(fileid).getroot()
240
+ # getting participants' data
241
+ pat = dictOfDicts()
242
+ for participant in xmldoc.findall(
243
+ f".//{{{NS}}}Participants/{{{NS}}}participant"
244
+ ):
245
+ for (key, value) in participant.items():
246
+ pat[participant.get("id")][key] = value
247
+ return pat
248
+
249
+ def age(self, fileids=None, speaker="CHI", month=False):
250
+ """
251
+ :return: the given file(s) as string or int
252
+ :rtype: list or int
253
+
254
+ :param month: If true, return months instead of year-month-date
255
+ """
256
+ if not self._lazy:
257
+ return [
258
+ self._get_age(fileid, speaker, month)
259
+ for fileid in self.abspaths(fileids)
260
+ ]
261
+ get_age = lambda fileid: self._get_age(fileid, speaker, month)
262
+ return LazyMap(get_age, self.abspaths(fileids))
263
+
264
+ def _get_age(self, fileid, speaker, month):
265
+ xmldoc = ElementTree.parse(fileid).getroot()
266
+ for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
267
+ try:
268
+ if pat.get("id") == speaker:
269
+ age = pat.get("age")
270
+ if month:
271
+ age = self.convert_age(age)
272
+ return age
273
+ # some files don't have age data
274
+ except (TypeError, AttributeError) as e:
275
+ return None
276
+
277
+ def convert_age(self, age_year):
278
+ "Caclculate age in months from a string in CHILDES format"
279
+ m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
280
+ age_month = int(m.group(1)) * 12 + int(m.group(2))
281
+ try:
282
+ if int(m.group(3)) > 15:
283
+ age_month += 1
284
+ # some corpora don't have age information?
285
+ except ValueError as e:
286
+ pass
287
+ return age_month
288
+
289
+ def MLU(self, fileids=None, speaker="CHI"):
290
+ """
291
+ :return: the given file(s) as a floating number
292
+ :rtype: list(float)
293
+ """
294
+ if not self._lazy:
295
+ return [
296
+ self._getMLU(fileid, speaker=speaker)
297
+ for fileid in self.abspaths(fileids)
298
+ ]
299
+ get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
300
+ return LazyMap(get_MLU, self.abspaths(fileids))
301
+
302
    def _getMLU(self, fileid, speaker):
        """Compute the mean length of utterance (in morphemes) for
        ``speaker`` in one file.

        Follows the usual MLU conventions: sentences containing
        unintelligible material, empty sentences, and immediate
        repetitions are excluded; filler words are not counted as
        morphemes, and sentences consisting of fillers discount the
        sentence count.
        """
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible, skip it
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent (repetition)
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers ("co" tags or missing tags)
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # No countable sentences at all -> MLU of 0 by convention.
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
349
+
350
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Extract words (optionally stemmed, tagged, and/or annotated
        with grammatical relations) for the given speaker(s) from one
        CHILDES XML file.

        Depending on the flags, each item is a plain string, a
        ``(word, tag)`` pair, or a longer tuple carrying dependency
        information; when ``sent`` or ``relation`` is set, items are
        grouped into per-utterance lists.
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc: one <u> element per utterance
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None  # NOTE(review): never assigned elsewhere; appears unused
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words: substitute the corrected form
                    # NOTE(review): the lookup is sentence-wide, so every word
                    # in a sentence that contains a <replacement> is replaced
                    # by the first replacement's word — confirm intended.
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem: replace the surface form with the morphological stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            # no <stem> element: keep the surface form
                            pass
                        # if there is an inflection, append it as "-INFL"
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix (clitic), append it as "~suffix"
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos: build the tag, possibly "main:sub" and "~" suffix tag
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                # automatic parse: (word, tag, "idx|head|rel")
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                # gold-standard parse appended after the
                                # automatic one, duplicating word and tag
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except:
                            pass
                    sents.append(word)
                # group per-utterance when sentence structure was requested
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)
505
+
506
    # Ready-to-use browser opener

    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    # Class-level constant: prefix for all CHILDES browser URLs.
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
514
+
515
+ def webview_file(self, fileid, urlbase=None):
516
+ """Map a corpus file to its web version on the CHILDES website,
517
+ and open it in a web browser.
518
+
519
+ The complete URL to be used is:
520
+ childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
521
+
522
+ If no urlbase is passed, we try to calculate it. This
523
+ requires that the childes corpus was set up to mirror the
524
+ folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
525
+ nltk_data/corpora/childes/Eng-USA/Cornell/??? or
526
+ nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
527
+
528
+ The function first looks (as a special case) if "Eng-USA" is
529
+ on the path consisting of <corpus root>+fileid; then if
530
+ "childes", possibly followed by "data-xml", appears. If neither
531
+ one is found, we use the unmodified fileid and hope for the best.
532
+ If this is not right, specify urlbase explicitly, e.g., if the
533
+ corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
534
+ """
535
+
536
+ import webbrowser
537
+
538
+ if urlbase:
539
+ path = urlbase + "/" + fileid
540
+ else:
541
+ full = self.root + "/" + fileid
542
+ full = re.sub(r"\\", "/", full)
543
+ if "/childes/" in full.lower():
544
+ # Discard /data-xml/ if present
545
+ path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
546
+ elif "eng-usa" in full.lower():
547
+ path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
548
+ else:
549
+ path = fileid
550
+
551
+ # Strip ".xml" and add ".cha", as necessary:
552
+ if path.endswith(".xml"):
553
+ path = path[:-4]
554
+
555
+ if not path.endswith(".cha"):
556
+ path = path + ".cha"
557
+
558
+ url = self.childes_url_base + path
559
+
560
+ webbrowser.open_new_tab(url)
561
+ print("Opening in browser:", url)
562
+ # Pausing is a good idea, but it's up to the user...
563
+ # raw_input("Hit Return to continue")
564
+
565
+
566
def demo(corpus_root=None):
    """
    Demonstrate the CHILDES corpus reader on a handful of files.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: optional path to a portion of the CHILDES
        corpus; when omitted, the Eng-USA section is looked up in the
        NLTK data directories.
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        # Fixed mismatched quotes in the example call of the help text.
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
        Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
627
+
628
+
629
+ if __name__ == "__main__":
630
+ demo()
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chunked.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Chunked Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ A reader for corpora that contain chunked (and optionally tagged)
11
+ documents.
12
+ """
13
+
14
+ import codecs
15
+ import os.path
16
+
17
+ import nltk
18
+ from nltk.chunk import tagstr2tree
19
+ from nltk.corpus.reader.api import *
20
+ from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
21
+ from nltk.corpus.reader.util import *
22
+ from nltk.tokenize import *
23
+ from nltk.tree import Tree
24
+
25
+
26
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split using a block reader. They are then tokenized into
    sentences using a sentence tokenizer. Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function. Each of these steps can be performed using a default
    function or a custom function. By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_tokenizer, tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _view(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        # Build one ChunkedCorpusView per file and concatenate them.
        views = [
            ChunkedCorpusView(
                f,
                enc,
                tagged,
                group_by_sent,
                group_by_para,
                chunked,
                *self._cv_args,
                target_tagset=tagset,
            )
            for (f, enc) in self.abspaths(fileids, True)
        ]
        return concat(views)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._view(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._view(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._view(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._view(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._view(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._view(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks. Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags). Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._view(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree. The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._view(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree. The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._view(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        # Parse one blank-line-separated block into chunk trees.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
203
+
204
+
205
class ChunkedCorpusView(StreamBackedCorpusView):
    """Stream-backed view that parses a chunked-corpus file on demand.

    The boolean flags select the representation: whether tags are kept,
    whether tokens are grouped by sentence and/or paragraph, and whether
    chunk structure is preserved.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and convert it per the view's flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                parsed = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # Throw away the tags and/or the chunk structure if the
                # caller did not ask for them.
                if not self._tagged:
                    parsed = self._untag(parsed)
                if not self._chunked:
                    parsed = parsed.leaves()

                # Keep sentence boundaries, or flatten into `para`.
                if self._group_by_sent:
                    para.append(parsed)
                else:
                    para.extend(parsed)

            # Keep paragraph boundaries, or flatten into `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block

    def _untag(self, tree):
        """Replace each ``(word, tag)`` leaf with just the word, in place."""
        for index, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[index] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/cmudict.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
10
+ ftp://ftp.cs.cmu.edu/project/speech/dict/
11
+ Copyright 1998 Carnegie Mellon University
12
+
13
+ File Format: Each line consists of an uppercased word, a counter
14
+ (for alternative pronunciations), and a transcription. Vowels are
15
+ marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
16
+ NATURAL 1 N AE1 CH ER0 AH0 L
17
+
18
+ The dictionary contains 127069 entries. Of these, 119400 words are assigned
19
+ a unique pronunciation, 6830 words have two pronunciations, and 839 words have
20
+ three or more pronunciations. Many of these are fast-speech variants.
21
+
22
+ Phonemes: There are 39 phonemes, as shown below:
23
+
24
+ Phoneme Example Translation Phoneme Example Translation
25
+ ------- ------- ----------- ------- ------- -----------
26
+ AA odd AA D AE at AE T
27
+ AH hut HH AH T AO ought AO T
28
+ AW cow K AW AY hide HH AY D
29
+ B be B IY CH cheese CH IY Z
30
+ D dee D IY DH thee DH IY
31
+ EH Ed EH D ER hurt HH ER T
32
+ EY ate EY T F fee F IY
33
+ G green G R IY N HH he HH IY
34
+ IH it IH T IY eat IY T
35
+ JH gee JH IY K key K IY
36
+ L lee L IY M me M IY
37
+ N knee N IY NG ping P IH NG
38
+ OW oat OW T OY toy T OY
39
+ P pee P IY R read R IY D
40
+ S sea S IY SH she SH IY
41
+ T tea T IY TH theta TH EY T AH
42
+ UH hood HH UH D UW two T UW
43
+ V vee V IY W we W IY
44
+ Y yield Y IY L D Z zee Z IY
45
+ ZH seizure S IY ZH ER
46
+ """
47
+
48
+ from nltk.corpus.reader.api import *
49
+ from nltk.corpus.reader.util import *
50
+ from nltk.util import Index
51
+
52
+
53
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the CMU Pronouncing Dictionary."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
            for fileid, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry.lower() for (entry, _) in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        indexed = Index(self.entries())
        return dict(indexed)
78
+
79
+
80
def read_cmudict_block(stream):
    """Read up to 100 cmudict entries from ``stream``.

    Each entry is a ``(word, phonemes)`` pair, where the word is
    lowercased and the counter field is dropped. Fewer than 100
    entries are returned only at end of file.
    """
    block = []
    for _ in range(100):  # Read 100 at a time.
        line = stream.readline()
        if line == "":
            break  # end of file.
        fields = line.split()
        block.append((fields[0].lower(), fields[2:]))
    return block
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Comparative Sentence Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
5
+ # URL: <https://www.nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ """
9
+ CorpusReader for the Comparative Sentence Dataset.
10
+
11
+ - Comparative Sentence Dataset information -
12
+
13
+ Annotated by: Nitin Jindal and Bing Liu, 2006.
14
+ Department of Computer Sicence
15
+ University of Illinois at Chicago
16
+
17
+ Contact: Nitin Jindal, njindal@cs.uic.edu
18
+ Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
19
+
20
+ Distributed with permission.
21
+
22
+ Related papers:
23
+
24
+ - Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
25
+ Proceedings of the ACM SIGIR International Conference on Information Retrieval
26
+ (SIGIR-06), 2006.
27
+
28
+ - Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
29
+ Proceedings of Twenty First National Conference on Artificial Intelligence
30
+ (AAAI-2006), 2006.
31
+
32
+ - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
33
+ Proceedings of the 22nd International Conference on Computational Linguistics
34
+ (Coling-2008), Manchester, 18-22 August, 2008.
35
+ """
36
+ import re
37
+
38
+ from nltk.corpus.reader.api import *
39
+ from nltk.tokenize import *
40
+
41
+ # Regular expressions for dataset components
42
+ STARS = re.compile(r"^\*+$")
43
+ COMPARISON = re.compile(r"<cs-[1234]>")
44
+ CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
45
+ GRAD_COMPARISON = re.compile(r"<cs-[123]>")
46
+ NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
47
+ ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
48
+ KEYWORD = re.compile(r"\(([^\(]*)\)$")
49
+
50
+
51
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Same rendering as str.format-based original: every field except
        # comp_type is wrapped in double quotes.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
94
+
95
+
96
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # All public accessors stream their results through this view class.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

        # Keywords may be None for comparisons without an annotated keyword;
        # drop those and normalise case.
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        # Scan forward for the next line that carries an opening comparison
        # tag; the dataset stores the tags, the sentence, the closing tags,
        # and the per-tag relation lines on consecutive lines.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            # Codes: 1 = first entity, 2 = second entity,
                            # 3 = compared feature.
                            for (code, entity_feat) in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        # One keyword (possibly None) per comparison in the next block.
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        # Return the next plain-text sentence line, skipping "****" separator
        # sections and all annotation lines (tags, relations, closings).
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        # Flatten the next sentence block into a plain token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py ADDED
@@ -0,0 +1,579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: CONLL Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # Edward Loper <edloper@gmail.com>
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ Read CoNLL-style chunk fileids.
11
+ """
12
+
13
+ import textwrap
14
+
15
+ from nltk.corpus.reader.api import *
16
+ from nltk.corpus.reader.util import *
17
+ from nltk.tag import map_tag
18
+ from nltk.tree import Tree
19
+ from nltk.util import LazyConcatenation, LazyMap
20
+
21
+
22
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus.  By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``'\t'``).

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param columntypes: sequence of column types (from COLUMN_TYPES),
            in the order they appear in the data files.
        :param chunk_types: chunk type(s) to keep; a single string is
            promoted to a one-element list.
        :param root_label: node label used for the root of chunk trees.
        :param pos_in_tree: if true, parse trees keep POS tags as preterminal
            nodes instead of (word, tag) leaf tuples.
        :param srl_includes_roleset: whether the SRL columns include a
            roleset column before the predicate column.
        :param separator: string to split columns on (default: any run of
            whitespace, via str.split(None)).
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Map each column type to its index in the grid rows.
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return all words as a single flat (lazy) list."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return a (lazy) list of sentences, each a list of words."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a flat (lazy) list of (word, pos_tag) tuples, optionally
        mapped to *tagset*."""
        self._require(self.WORDS, self.POS)

        # Closure captures `tagset` so LazyMap can stay single-argument.
        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a (lazy) list of sentences of (word, pos_tag) tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return a flat (lazy) list of chunk trees / (word, tag) tokens."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return a (lazy) list of chunk trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return a (lazy) list of parse trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return a (lazy) list of SRL span lists, one per sentence."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances; flattened across sentences unless
        *flatten* is false."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """Return a corpus view over sentence grids for *fileids*."""
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-separated block and return its sentence
        grids (one grid per non-empty paragraph)."""
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            # One row per line; columns split on self.sep (None = whitespace).
            grid = [line.split(self.sep) for line in block.split("\n")]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]

            # Check that the grid is consistent: every row must have the
            # same number of columns as the first row.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        """Return the word column of *grid* as a list."""
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        """Return (word, pos_tag) tuples, mapping tags to *tagset* if given."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        """Return (word, pos_tag, iob_chunk_tag) tuples for *grid*."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        """Build a shallow chunk tree from the IOB chunk column of *grid*."""
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[1] (when present) is the open chunk.
        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or O: close any open chunk.
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        """Reassemble the bracketed tree column of *grid* into a Tree."""
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            # Escape literal parentheses so they don't break the bracketing.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            # Each tree cell is "<open-brackets>*<close-brackets>"; the "*"
            # marks where this word's preterminal goes.
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping the fragment in an explicit root node.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            # Collapse each preterminal (POS word) node into a (word, tag)
            # leaf tuple.
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        Return a list (one entry per predicate) of lists of
        ``((start, end), tag)`` tuples describing that predicate's
        argument spans.
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            # Stack of (tag, start_wordnum) for spans that are still open.
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        """Pair each predicate in *grid* with its argument span list and
        return them as a ConllSRLInstanceList."""
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).  A spanlist matches if it has a verb
            # ("V" / "C-V") span covering this predicate's word index.
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ValueError unless every given column type is present."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return the values of one column of *grid*, top to bottom."""
        return [grid[i][column_index] for i in range(len(grid))]
411
+
412
+
413
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        :param tree: parse tree of the sentence containing this instance.
        :param verb_head: word index of the verb's head word.
        :param verb_stem: stem (lemma) of the predicate verb.
        :param roleset: roleset identifier, or None if unavailable.
        :param tagged_spans: list of ((start, end), tag) tuples covering
            both the verb pieces and the arguments.
        """
        self.verb = []
        """A list of the word indices of the words that compose the
        verb whose arguments are identified by this instance.
        This will contain multiple word indices when multi-word
        verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
        are identified by this instance. E.g., for a sentence that
        uses the verb 'turn on,' ``verb_head`` will be the word index
        of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
        and type for each of the arguments identified by this
        instance. ``argspan`` is a tuple ``start, end``, indicating
        that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
        type for each of the arguments, as well as the verb pieces,
        that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
        instance."""

        # Fill in the self.verb and self.arguments values.  Spans tagged
        # "V" / "C-V" are verb pieces; everything else is an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Return a human-readable rendering of the sentence with the
        verb marked ``<<like this>>`` and arguments bracketed."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        s = ""
        for i, word in enumerate(self.words):
            # Leaves may be (word, tag) tuples; display only the word.
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )
489
+
490
+
491
class ConllSRLInstanceList(list):
    """
    Set of SRL instances for a single sentence: all members share the
    same parse tree (``self.tree``).
    """

    def __init__(self, tree, instances=()):
        """
        :param tree: the parse tree shared by all instances.
        :param instances: initial ConllSRLInstance objects.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """Return a CoNLL-style tabular rendering of the instances.

        :param include_tree: if true, prepend word / POS / syntax columns
            derived from ``self.tree``.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # The word list is needed for the row count even when the tree
        # columns are omitted.  (Bug fix: `words` was previously only
        # bound inside the `include_tree` branch, so the default call
        # pprint() raised NameError.)
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column: the predicate stem if this word heads one
            # of the instances, "-" otherwise.
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: one per instance, giving its spans in
            # parenthesized CoNLL notation, e.g. "(A0*" ... "*)".
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """Recursively fill the parallel ``pos`` and ``synt`` column lists
        from *tree*, starting at word index *wordnum*; return the index
        just past this subtree."""
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            # Preterminal over a plain word: record its POS tag.
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Bug fix: unpack the (word, tag) leaf into both column lists.
            # The original assigned to pos[wordnum] twice, discarding the
            # word and leaving words[] holding the raw tuple.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            # Internal node: open a bracket here, recurse, and close the
            # bracket on the subtree's last word.
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
559
+
560
+
561
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ``ConllCorpusReader`` specialized for data files that contain
    exactly three columns: words, pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        # Delegate to the generic reader with a fixed three-column layout.
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: An Crubadan N-grams Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Avital Pekker <avital.pekker@utoronto.ca>
5
+ #
6
+ # URL: <https://www.nltk.org/>
7
+ # For license information, see LICENSE.TXT
8
+
9
+ """
10
+ An NLTK interface for the n-gram statistics gathered from
11
+ the corpora for each language using An Crubadan.
12
+
13
+ There are multiple potential applications for the data but
14
+ this reader was created with the goal of using it in the
15
+ context of language identification.
16
+
17
+ For details about An Crubadan, this data, and its potential uses, see:
18
+ http://borel.slu.edu/crubadan/index.html
19
+ """
20
+
21
+ import re
22
+ from os import path
23
+
24
+ from nltk.corpus.reader import CorpusReader
25
+ from nltk.data import ZipFilePathPointer
26
+ from nltk.probability import FreqDist
27
+
28
+
29
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    _LANG_MAPPER_FILE = "table.txt"

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :param root: path to the corpus root directory.
        :param fileids: the corpus files (or a regexp matching them).
        :param encoding: encoding used to read corpus files.  (Bug fix:
            previously this argument was accepted but ignored — a
            hard-coded "utf8" was always passed to the superclass.)
        :param tagset: unused; kept for signature compatibility.
        """
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance cache of n-gram FreqDists, keyed by ISO 639-3 code.
        # (Bug fix: this was a mutable *class* attribute, so the cache was
        # silently shared and mutated across every reader instance.)
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)

        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (None if the language is unknown)."""
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt"""
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # One "<crubadan_code>\t<iso_code>..." row per line.
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist"""
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            # Each line is "<freq> <ngram>".
            for line in f:
                data = line.split(" ")

                ngram = data[1].strip("\n")
                freq = int(data[0])

                counts[ngram] = freq

        return counts
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Dependency Corpus Reader
2
+ #
3
+ # Copyright (C) 2001-2022 NLTK Project
4
+ # Author: Kepa Sarasola <kepa.sarasola@ehu.es>
5
+ # Iker Manterola <returntothehangar@hotmail.com>
6
+ #
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ from nltk.corpus.reader.api import *
11
+ from nltk.corpus.reader.util import *
12
+ from nltk.parse import DependencyGraph
13
+ from nltk.tokenize import *
14
+
15
+
16
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for dependency-annotated corpora.  Each access method
    builds its own ``DependencyCorpusView`` over the requested fileids.
    """

    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE(review): word_tokenizer, sent_tokenizer and
        # para_block_reader are accepted but never stored or used by this
        # class — confirm whether they were meant to be forwarded.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def words(self, fileids=None):
        """Return all words as a single flat list."""
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_words(self, fileids=None):
        """Return all (word, tag) tuples as a single flat list."""
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_sents(self, fileids=None):
        """Return a list of sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def parsed_sents(self, fileids=None):
        """Return a list of DependencyGraph objects, one per sentence."""
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]
+
71
+
72
class DependencyCorpusView(StreamBackedCorpusView):
    """Stream-backed view over a dependency corpus file, yielding words,
    tagged words, sentences, or raw dependency blocks depending on the
    constructor flags."""

    # Marker line prefix for the start of a document.
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        """
        :param tagged: if true, yield (word, tag) tuples instead of words.
        :param group_by_sent: if true, yield one list per sentence.
        :param dependencies: if true, yield the raw sentence block
            (skipping the word/tag extraction) for dependency parsing.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            # 3/4-column format: word and tag are the first two fields;
            # 10-column (CoNLL-X) format: form is field 1, POS is field 4.
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/framenet.py ADDED
The diff for this file is too large to render. See raw diff